cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

grukservices.c (29524B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * SN Platform GRU Driver
      4 *
      5 *              KERNEL SERVICES THAT USE THE GRU
      6 *
      7 *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
      8 */
      9
     10#include <linux/kernel.h>
     11#include <linux/errno.h>
     12#include <linux/slab.h>
     13#include <linux/mm.h>
     14#include <linux/spinlock.h>
     15#include <linux/device.h>
     16#include <linux/miscdevice.h>
     17#include <linux/proc_fs.h>
     18#include <linux/interrupt.h>
     19#include <linux/sync_core.h>
     20#include <linux/uaccess.h>
     21#include <linux/delay.h>
     22#include <linux/export.h>
     23#include <asm/io_apic.h>
     24#include "gru.h"
     25#include "grulib.h"
     26#include "grutables.h"
     27#include "grukservices.h"
     28#include "gru_instructions.h"
     29#include <asm/uv/uv_hub.h>
     30
     31/*
     32 * Kernel GRU Usage
     33 *
     34 * The following is an interim algorithm for management of kernel GRU
     35 * resources. This will likely be replaced when we better understand the
     36 * kernel/user requirements.
     37 *
     38 * Blade percpu resources reserved for kernel use. These resources are
     39 * reserved whenever the kernel context for the blade is loaded. Note
     40 * that the kernel context is not guaranteed to be always available. It is
     41 * loaded on demand & can be stolen by a user if the user demand exceeds the
     42 * kernel demand. The kernel can always reload the kernel context but
     43 * a SLEEP may be required!
     44 *
     45 * Async Overview:
     46 *
     47 * 	Each blade has one "kernel context" that owns GRU kernel resources
     48 * 	located on the blade. Kernel drivers use GRU resources in this context
     49 * 	for sending messages, zeroing memory, etc.
     50 *
     51 * 	The kernel context is dynamically loaded on demand. If it is not in
     52 * 	use by the kernel, the kernel context can be unloaded & given to a user.
     53 * 	The kernel context will be reloaded when needed. This may require that
     54 * 	a context be stolen from a user.
     55 * 		NOTE: frequent unloading/reloading of the kernel context is
     56 * 		expensive. We are depending on batch schedulers, cpusets, sane
     57 * 		drivers or some other mechanism to prevent the need for frequent
     58 *	 	stealing/reloading.
     59 *
     60 * 	The kernel context consists of two parts:
     61 * 		- 1 CB & a few DSRs that are reserved for each cpu on the blade.
     62 * 		  Each cpu has its own private resources & does not share them
     63 * 		  with other cpus. These resources are used serially, i.e.,
     64 * 		  locked, used & unlocked  on each call to a function in
     65 * 		  grukservices.
     66 * 		  	(Now that we have dynamic loading of kernel contexts, I
     67 * 		  	 may rethink this & allow sharing between cpus....)
     68 *
     69 *		- Additional resources can be reserved long term & used directly
     70 *		  by UV drivers located in the kernel. Drivers using these GRU
     71 *		  resources can use asynchronous GRU instructions that send
     72 *		  interrupts on completion.
     73 *		  	- these resources must be explicitly locked/unlocked
     74 *		  	- locked resources prevent (obviously) the kernel
     75 *		  	  context from being unloaded.
     76 *			- drivers using these resources directly issue their own
     77 *			  GRU instruction and must wait/check completion.
     78 *
     79 * 		  When these resources are reserved, the caller can optionally
     80 * 		  associate a wait_queue with the resources and use asynchronous
     81 * 		  GRU instructions. When an async GRU instruction completes, the
     82 * 		  driver will do a wakeup on the event.
     83 *
     84 */
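
/*
 * Illustrative sketch only (not part of the driver API; example_async_fill()
 * and its arguments are hypothetical): how a kernel driver might use the
 * async resources described above, following the same pattern as
 * quicktest2() at the end of this file. Assumes buf points to at least
 * 4 doublewords and that no other async reservation exists on the blade.
 */
static int __maybe_unused example_async_fill(int blade_id, unsigned long *buf)
{
	static DECLARE_COMPLETION(cmp);
	unsigned long han;
	void *cb;
	int ret;

	/* Reserve one CBR (no DSR bytes) on the blade; 0 means already reserved */
	han = gru_reserve_async_resources(blade_id, 1, 0, &cmp);
	if (!han)
		return -EBUSY;

	/* Pin the kernel context & get a pointer to the reserved CBR */
	gru_lock_async_resource(han, &cb, NULL);

	/* Zero 4 DWs at buf; IMA_INTERRUPT requests an interrupt on completion */
	gru_vset(cb, uv_gpa(buf), 0, XTYPE_DW, 4, 1, IMA_INTERRUPT);

	/* The completion passed to the reserve call is signalled on interrupt */
	gru_wait_async_cbr(han);

	/* A real caller would also handle CBS_CALL_OS, as quicktest2() does */
	ret = gru_check_status(cb) == CBS_IDLE ? 0 : -EIO;

	gru_unlock_async_resource(han);
	gru_release_async_resources(han);
	return ret;
}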
     85
     86
     87#define ASYNC_HAN_TO_BID(h)	((h) - 1)
     88#define ASYNC_BID_TO_HAN(b)	((b) + 1)
     89#define ASYNC_HAN_TO_BS(h)	gru_base[ASYNC_HAN_TO_BID(h)]
     90
     91#define GRU_NUM_KERNEL_CBR	1
     92#define GRU_NUM_KERNEL_DSR_BYTES 256
     93#define GRU_NUM_KERNEL_DSR_CL	(GRU_NUM_KERNEL_DSR_BYTES /		\
     94					GRU_CACHE_LINE_BYTES)
     95
     96/* GRU instruction attributes for all instructions */
     97#define IMA			IMA_CB_DELAY
     98
     99/* GRU cacheline size is always 64 bytes - even on arches with 128 byte lines */
    100#define __gru_cacheline_aligned__                               \
    101	__attribute__((__aligned__(GRU_CACHE_LINE_BYTES)))
    102
    103#define MAGIC	0x1234567887654321UL
    104
    105/* Default retry count for GRU errors on kernel instructions */
    106#define EXCEPTION_RETRY_LIMIT	3
    107
    108/* Status of message queue sections */
    109#define MQS_EMPTY		0
    110#define MQS_FULL		1
    111#define MQS_NOOP		2
    112
    113/*----------------- RESOURCE MANAGEMENT -------------------------------------*/
    114/* optimized for x86_64 */
    115struct message_queue {
    116	union gru_mesqhead	head __gru_cacheline_aligned__;	/* CL 0 */
    117	int			qlines;				/* DW 1 */
    118	long 			hstatus[2];
    119	void 			*next __gru_cacheline_aligned__;/* CL 1 */
    120	void 			*limit;
    121	void 			*start;
    122	void 			*start2;
    123	char			data ____cacheline_aligned;	/* CL 2 */
    124};
    125
    126/* First word in every message - used by mesq interface */
    127struct message_header {
    128	char	present;
    129	char	present2;
    130	char 	lines;
    131	char	fill;
    132};
    133
    134#define HSTATUS(mq, h)	((mq) + offsetof(struct message_queue, hstatus[h]))
    135
    136/*
    137 * Reload the blade's kernel context into a GRU chiplet. Called holding
    138 * the bs_kgts_sema for READ. Will steal user contexts if necessary.
    139 */
    140static void gru_load_kernel_context(struct gru_blade_state *bs, int blade_id)
    141{
    142	struct gru_state *gru;
    143	struct gru_thread_state *kgts;
    144	void *vaddr;
    145	int ctxnum, ncpus;
    146
    147	up_read(&bs->bs_kgts_sema);
    148	down_write(&bs->bs_kgts_sema);
    149
    150	if (!bs->bs_kgts) {
    151		do {
    152			bs->bs_kgts = gru_alloc_gts(NULL, 0, 0, 0, 0, 0);
    153			if (!IS_ERR(bs->bs_kgts))
    154				break;
    155			msleep(1);
    156		} while (true);
    157		bs->bs_kgts->ts_user_blade_id = blade_id;
    158	}
    159	kgts = bs->bs_kgts;
    160
    161	if (!kgts->ts_gru) {
    162		STAT(load_kernel_context);
    163		ncpus = uv_blade_nr_possible_cpus(blade_id);
    164		kgts->ts_cbr_au_count = GRU_CB_COUNT_TO_AU(
    165			GRU_NUM_KERNEL_CBR * ncpus + bs->bs_async_cbrs);
    166		kgts->ts_dsr_au_count = GRU_DS_BYTES_TO_AU(
    167			GRU_NUM_KERNEL_DSR_BYTES * ncpus +
    168				bs->bs_async_dsr_bytes);
    169		while (!gru_assign_gru_context(kgts)) {
    170			msleep(1);
    171			gru_steal_context(kgts);
    172		}
    173		gru_load_context(kgts);
    174		gru = bs->bs_kgts->ts_gru;
    175		vaddr = gru->gs_gru_base_vaddr;
    176		ctxnum = kgts->ts_ctxnum;
    177		bs->kernel_cb = get_gseg_base_address_cb(vaddr, ctxnum, 0);
    178		bs->kernel_dsr = get_gseg_base_address_ds(vaddr, ctxnum, 0);
    179	}
    180	downgrade_write(&bs->bs_kgts_sema);
    181}
    182
    183/*
    184 * Free all kernel contexts that are not currently in use.
    185 *   Returns 0 if all freed, else the number of in-use contexts.
    186 */
    187static int gru_free_kernel_contexts(void)
    188{
    189	struct gru_blade_state *bs;
    190	struct gru_thread_state *kgts;
    191	int bid, ret = 0;
    192
    193	for (bid = 0; bid < GRU_MAX_BLADES; bid++) {
    194		bs = gru_base[bid];
    195		if (!bs)
    196			continue;
    197
    198		/* Ignore busy contexts. Don't want to block here.  */
    199		if (down_write_trylock(&bs->bs_kgts_sema)) {
    200			kgts = bs->bs_kgts;
    201			if (kgts && kgts->ts_gru)
    202				gru_unload_context(kgts, 0);
    203			bs->bs_kgts = NULL;
    204			up_write(&bs->bs_kgts_sema);
    205			kfree(kgts);
    206		} else {
    207			ret++;
    208		}
    209	}
    210	return ret;
    211}
    212
    213/*
    214 * Lock & load the kernel context for the specified blade.
    215 */
    216static struct gru_blade_state *gru_lock_kernel_context(int blade_id)
    217{
    218	struct gru_blade_state *bs;
    219	int bid;
    220
    221	STAT(lock_kernel_context);
    222again:
    223	bid = blade_id < 0 ? uv_numa_blade_id() : blade_id;
    224	bs = gru_base[bid];
    225
    226	/* Handle the case where migration occurred while waiting for the sema */
    227	down_read(&bs->bs_kgts_sema);
    228	if (blade_id < 0 && bid != uv_numa_blade_id()) {
    229		up_read(&bs->bs_kgts_sema);
    230		goto again;
    231	}
    232	if (!bs->bs_kgts || !bs->bs_kgts->ts_gru)
    233		gru_load_kernel_context(bs, bid);
    234	return bs;
    235
    236}
    237
    238/*
    239 * Unlock the kernel context for the specified blade. Context is not
    240 * unloaded but may be stolen before next use.
    241 */
    242static void gru_unlock_kernel_context(int blade_id)
    243{
    244	struct gru_blade_state *bs;
    245
    246	bs = gru_base[blade_id];
    247	up_read(&bs->bs_kgts_sema);
    248	STAT(unlock_kernel_context);
    249}
    250
    251/*
    252 * Reserve & get pointers to the DSR/CBRs reserved for the current cpu.
    253 * 	- returns with preemption disabled
    254 */
    255static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
    256{
    257	struct gru_blade_state *bs;
    258	int lcpu;
    259
    260	BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES);
    261	preempt_disable();
    262	bs = gru_lock_kernel_context(-1);
    263	lcpu = uv_blade_processor_id();
    264	*cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE;
    265	*dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES;
    266	return 0;
    267}
    268
    269/*
    270 * Free the current cpu's reserved DSR/CBR resources.
    271 */
    272static void gru_free_cpu_resources(void *cb, void *dsr)
    273{
    274	gru_unlock_kernel_context(uv_numa_blade_id());
    275	preempt_enable();
    276}
    277
    278/*
    279 * Reserve GRU resources to be used asynchronously.
    280 *   Note: currently supports only 1 reservation per blade.
    281 *
    282 * 	input:
    283 * 		blade_id  - blade on which resources should be reserved
    284 * 		cbrs	  - number of CBRs
    285 * 		dsr_bytes - number of DSR bytes needed
    286 *	output:
    287 *		handle to identify resource
    288 *		(0 = async resources already reserved)
    289 */
    290unsigned long gru_reserve_async_resources(int blade_id, int cbrs, int dsr_bytes,
    291			struct completion *cmp)
    292{
    293	struct gru_blade_state *bs;
    294	struct gru_thread_state *kgts;
    295	int ret = 0;
    296
    297	bs = gru_base[blade_id];
    298
    299	down_write(&bs->bs_kgts_sema);
    300
    301	/* Verify no resources already reserved */
    302	if (bs->bs_async_dsr_bytes + bs->bs_async_cbrs)
    303		goto done;
    304	bs->bs_async_dsr_bytes = dsr_bytes;
    305	bs->bs_async_cbrs = cbrs;
    306	bs->bs_async_wq = cmp;
    307	kgts = bs->bs_kgts;
    308
    309	/* Resources changed. Unload context if already loaded */
    310	if (kgts && kgts->ts_gru)
    311		gru_unload_context(kgts, 0);
    312	ret = ASYNC_BID_TO_HAN(blade_id);
    313
    314done:
    315	up_write(&bs->bs_kgts_sema);
    316	return ret;
    317}
    318
    319/*
    320 * Release async resources previously reserved.
    321 *
    322 *	input:
    323 *		han - handle to identify resources
    324 */
    325void gru_release_async_resources(unsigned long han)
    326{
    327	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
    328
    329	down_write(&bs->bs_kgts_sema);
    330	bs->bs_async_dsr_bytes = 0;
    331	bs->bs_async_cbrs = 0;
    332	bs->bs_async_wq = NULL;
    333	up_write(&bs->bs_kgts_sema);
    334}
    335
    336/*
    337 * Wait for async GRU instructions to complete.
    338 *
    339 *	input:
    340 *		han - handle to identify resources
    341 */
    342void gru_wait_async_cbr(unsigned long han)
    343{
    344	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
    345
    346	wait_for_completion(bs->bs_async_wq);
    347	mb();
    348}
    349
    350/*
    351 * Lock previously reserved async GRU resources
    352 *
    353 *	input:
    354 *		han - handle to identify resources
    355 *	output:
    356 *		cb  - pointer to first CBR
    357 *		dsr - pointer to first DSR
    358 */
    359void gru_lock_async_resource(unsigned long han,  void **cb, void **dsr)
    360{
    361	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
    362	int blade_id = ASYNC_HAN_TO_BID(han);
    363	int ncpus;
    364
    365	gru_lock_kernel_context(blade_id);
    366	ncpus = uv_blade_nr_possible_cpus(blade_id);
    367	if (cb)
    368		*cb = bs->kernel_cb + ncpus * GRU_HANDLE_STRIDE;
    369	if (dsr)
    370		*dsr = bs->kernel_dsr + ncpus * GRU_NUM_KERNEL_DSR_BYTES;
    371}
    372
    373/*
    374 * Unlock previously reserved async GRU resources
    375 *
    376 *	input:
    377 *		han - handle to identify resources
    378 */
    379void gru_unlock_async_resource(unsigned long han)
    380{
    381	int blade_id = ASYNC_HAN_TO_BID(han);
    382
    383	gru_unlock_kernel_context(blade_id);
    384}
    385
    386/*----------------------------------------------------------------------*/
    387int gru_get_cb_exception_detail(void *cb,
    388		struct control_block_extended_exc_detail *excdet)
    389{
    390	struct gru_control_block_extended *cbe;
    391	struct gru_thread_state *kgts = NULL;
    392	unsigned long off;
    393	int cbrnum, bid;
    394
    395	/*
    396	 * Locate kgts for cb. This algorithm is SLOW but
    397	 * this function is rarely called (i.e., almost never).
    398	 * Performance does not matter.
    399	 */
    400	for_each_possible_blade(bid) {
    401		if (!gru_base[bid])
    402			break;
    403		kgts = gru_base[bid]->bs_kgts;
    404		if (!kgts || !kgts->ts_gru)
    405			continue;
    406		off = cb - kgts->ts_gru->gs_gru_base_vaddr;
    407		if (off < GRU_SIZE)
    408			break;
    409		kgts = NULL;
    410	}
    411	BUG_ON(!kgts);
    412	cbrnum = thread_cbr_number(kgts, get_cb_number(cb));
    413	cbe = get_cbe(GRUBASE(cb), cbrnum);
    414	gru_flush_cache(cbe);	/* CBE not coherent */
    415	sync_core();
    416	excdet->opc = cbe->opccpy;
    417	excdet->exopc = cbe->exopccpy;
    418	excdet->ecause = cbe->ecause;
    419	excdet->exceptdet0 = cbe->idef1upd;
    420	excdet->exceptdet1 = cbe->idef3upd;
    421	gru_flush_cache(cbe);
    422	return 0;
    423}
    424
    425static char *gru_get_cb_exception_detail_str(int ret, void *cb,
    426					     char *buf, int size)
    427{
    428	struct gru_control_block_status *gen = (void *)cb;
    429	struct control_block_extended_exc_detail excdet;
    430
    431	if (ret > 0 && gen->istatus == CBS_EXCEPTION) {
    432		gru_get_cb_exception_detail(cb, &excdet);
    433		snprintf(buf, size,
    434			"GRU:%d exception: cb %p, opc %d, exopc %d, ecause 0x%x,"
    435			" excdet0 0x%lx, excdet1 0x%x", smp_processor_id(),
    436			gen, excdet.opc, excdet.exopc, excdet.ecause,
    437			excdet.exceptdet0, excdet.exceptdet1);
    438	} else {
    439		snprintf(buf, size, "No exception");
    440	}
    441	return buf;
    442}
    443
    444static int gru_wait_idle_or_exception(struct gru_control_block_status *gen)
    445{
    446	while (gen->istatus >= CBS_ACTIVE) {
    447		cpu_relax();
    448		barrier();
    449	}
    450	return gen->istatus;
    451}
    452
    453static int gru_retry_exception(void *cb)
    454{
    455	struct gru_control_block_status *gen = (void *)cb;
    456	struct control_block_extended_exc_detail excdet;
    457	int retry = EXCEPTION_RETRY_LIMIT;
    458
    459	while (1)  {
    460		if (gru_wait_idle_or_exception(gen) == CBS_IDLE)
    461			return CBS_IDLE;
    462		if (gru_get_cb_message_queue_substatus(cb))
    463			return CBS_EXCEPTION;
    464		gru_get_cb_exception_detail(cb, &excdet);
    465		if ((excdet.ecause & ~EXCEPTION_RETRY_BITS) ||
    466				(excdet.cbrexecstatus & CBR_EXS_ABORT_OCC))
    467			break;
    468		if (retry-- == 0)
    469			break;
    470		gen->icmd = 1;
    471		gru_flush_cache(gen);
    472	}
    473	return CBS_EXCEPTION;
    474}
    475
    476int gru_check_status_proc(void *cb)
    477{
    478	struct gru_control_block_status *gen = (void *)cb;
    479	int ret;
    480
    481	ret = gen->istatus;
    482	if (ret == CBS_EXCEPTION)
    483		ret = gru_retry_exception(cb);
    484	rmb();
    485	return ret;
    486
    487}
    488
    489int gru_wait_proc(void *cb)
    490{
    491	struct gru_control_block_status *gen = (void *)cb;
    492	int ret;
    493
    494	ret = gru_wait_idle_or_exception(gen);
    495	if (ret == CBS_EXCEPTION)
    496		ret = gru_retry_exception(cb);
    497	rmb();
    498	return ret;
    499}
    500
    501static void gru_abort(int ret, void *cb, char *str)
    502{
    503	char buf[GRU_EXC_STR_SIZE];
    504
    505	panic("GRU FATAL ERROR: %s - %s\n", str,
    506	      gru_get_cb_exception_detail_str(ret, cb, buf, sizeof(buf)));
    507}
    508
    509void gru_wait_abort_proc(void *cb)
    510{
    511	int ret;
    512
    513	ret = gru_wait_proc(cb);
    514	if (ret)
    515		gru_abort(ret, cb, "gru_wait_abort");
    516}
    517
    518
    519/*------------------------------ MESSAGE QUEUES -----------------------------*/
    520
    521/* Internal status. These are NOT returned to the user. */
    522#define MQIE_AGAIN		-1	/* try again */
    523
    524
    525/*
    526 * Save/restore the "present" flag that is in the second line of 2-line
    527 * messages
    528 */
    529static inline int get_present2(void *p)
    530{
    531	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
    532	return mhdr->present;
    533}
    534
    535static inline void restore_present2(void *p, int val)
    536{
    537	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
    538	mhdr->present = val;
    539}
    540
    541/*
    542 * Create a message queue.
    543 * 	qlines - message queue size in cache lines. Includes 2-line header.
    544 */
    545int gru_create_message_queue(struct gru_message_queue_desc *mqd,
    546		void *p, unsigned int bytes, int nasid, int vector, int apicid)
    547{
    548	struct message_queue *mq = p;
    549	unsigned int qlines;
    550
    551	qlines = bytes / GRU_CACHE_LINE_BYTES - 2;
    552	memset(mq, 0, bytes);
    553	mq->start = &mq->data;
    554	mq->start2 = &mq->data + (qlines / 2 - 1) * GRU_CACHE_LINE_BYTES;
    555	mq->next = &mq->data;
    556	mq->limit = &mq->data + (qlines - 2) * GRU_CACHE_LINE_BYTES;
    557	mq->qlines = qlines;
    558	mq->hstatus[0] = 0;
    559	mq->hstatus[1] = 1;
    560	mq->head = gru_mesq_head(2, qlines / 2 + 1);
    561	mqd->mq = mq;
    562	mqd->mq_gpa = uv_gpa(mq);
    563	mqd->qlines = qlines;
    564	mqd->interrupt_pnode = nasid >> 1;
    565	mqd->interrupt_vector = vector;
    566	mqd->interrupt_apicid = apicid;
    567	return 0;
    568}
    569EXPORT_SYMBOL_GPL(gru_create_message_queue);
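
/*
 * Illustrative sketch only (example_create_mq() is a hypothetical caller, not
 * part of this driver): building a small queue the way quicktest1() below
 * does. The backing memory must be cacheline aligned and must not cross a
 * page boundary; passing nasid/vector/apicid of 0 requests no interrupt
 * delivery for the queue.
 */
static int __maybe_unused example_create_mq(struct gru_message_queue_desc *mqd,
					    void **backing)
{
	void *p, *mq;

	/* A 4K allocation keeps the small, 1K-aligned queue within one page */
	p = kmalloc(4096, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	mq = PTR_ALIGN(p, 1024);

	/* 8 cachelines total: 2-line header plus message lines */
	gru_create_message_queue(mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);

	*backing = p;	/* caller kfree()s this when done with the queue */
	return 0;
}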
    570
    571/*
    572 * Send a NOOP message to a message queue
    573 * 	Returns:
    574 * 		 0 - if queue is full after the send. This is the normal case
    575 * 		     but various races can change this.
    576 *		-1 - if mesq sent successfully but queue not full
    577 *		>0 - unexpected error. MQE_xxx returned
    578 */
    579static int send_noop_message(void *cb, struct gru_message_queue_desc *mqd,
    580				void *mesg)
    581{
    582	const struct message_header noop_header = {
    583					.present = MQS_NOOP, .lines = 1};
    584	unsigned long m;
    585	int substatus, ret;
    586	struct message_header save_mhdr, *mhdr = mesg;
    587
    588	STAT(mesq_noop);
    589	save_mhdr = *mhdr;
    590	*mhdr = noop_header;
    591	gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), 1, IMA);
    592	ret = gru_wait(cb);
    593
    594	if (ret) {
    595		substatus = gru_get_cb_message_queue_substatus(cb);
    596		switch (substatus) {
    597		case CBSS_NO_ERROR:
    598			STAT(mesq_noop_unexpected_error);
    599			ret = MQE_UNEXPECTED_CB_ERR;
    600			break;
    601		case CBSS_LB_OVERFLOWED:
    602			STAT(mesq_noop_lb_overflow);
    603			ret = MQE_CONGESTION;
    604			break;
    605		case CBSS_QLIMIT_REACHED:
    606			STAT(mesq_noop_qlimit_reached);
    607			ret = 0;
    608			break;
    609		case CBSS_AMO_NACKED:
    610			STAT(mesq_noop_amo_nacked);
    611			ret = MQE_CONGESTION;
    612			break;
    613		case CBSS_PUT_NACKED:
    614			STAT(mesq_noop_put_nacked);
    615			m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
    616			gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1,
    617						IMA);
    618			if (gru_wait(cb) == CBS_IDLE)
    619				ret = MQIE_AGAIN;
    620			else
    621				ret = MQE_UNEXPECTED_CB_ERR;
    622			break;
    623		case CBSS_PAGE_OVERFLOW:
    624			STAT(mesq_noop_page_overflow);
    625			fallthrough;
    626		default:
    627			BUG();
    628		}
    629	}
    630	*mhdr = save_mhdr;
    631	return ret;
    632}
    633
    634/*
    635 * Handle a gru_mesq full.
    636 */
    637static int send_message_queue_full(void *cb, struct gru_message_queue_desc *mqd,
    638				void *mesg, int lines)
    639{
    640	union gru_mesqhead mqh;
    641	unsigned int limit, head;
    642	unsigned long avalue;
    643	int half, qlines;
    644
    645	/* Determine if switching to first/second half of q */
    646	avalue = gru_get_amo_value(cb);
    647	head = gru_get_amo_value_head(cb);
    648	limit = gru_get_amo_value_limit(cb);
    649
    650	qlines = mqd->qlines;
    651	half = (limit != qlines);
    652
    653	if (half)
    654		mqh = gru_mesq_head(qlines / 2 + 1, qlines);
    655	else
    656		mqh = gru_mesq_head(2, qlines / 2 + 1);
    657
    658	/* Try to get lock for switching head pointer */
    659	gru_gamir(cb, EOP_IR_CLR, HSTATUS(mqd->mq_gpa, half), XTYPE_DW, IMA);
    660	if (gru_wait(cb) != CBS_IDLE)
    661		goto cberr;
    662	if (!gru_get_amo_value(cb)) {
    663		STAT(mesq_qf_locked);
    664		return MQE_QUEUE_FULL;
    665	}
    666
    667	/* Got the lock. Send optional NOP if queue not full. */
    668	if (head != limit) {
    669		if (send_noop_message(cb, mqd, mesg)) {
    670			gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half),
    671					XTYPE_DW, IMA);
    672			if (gru_wait(cb) != CBS_IDLE)
    673				goto cberr;
    674			STAT(mesq_qf_noop_not_full);
    675			return MQIE_AGAIN;
    676		}
    677		avalue++;
    678	}
    679
    680	/* Then flip queuehead to other half of queue. */
    681	gru_gamer(cb, EOP_ERR_CSWAP, mqd->mq_gpa, XTYPE_DW, mqh.val, avalue,
    682							IMA);
    683	if (gru_wait(cb) != CBS_IDLE)
    684		goto cberr;
    685
    686	/* If the queue head was not successfully swapped, clear the hstatus lock */
    687	if (gru_get_amo_value(cb) != avalue) {
    688		STAT(mesq_qf_switch_head_failed);
    689		gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half), XTYPE_DW,
    690							IMA);
    691		if (gru_wait(cb) != CBS_IDLE)
    692			goto cberr;
    693	}
    694	return MQIE_AGAIN;
    695cberr:
    696	STAT(mesq_qf_unexpected_error);
    697	return MQE_UNEXPECTED_CB_ERR;
    698}
    699
    700/*
    701 * Handle a PUT failure. Note: if message was a 2-line message, one of the
    702 * lines might have successfully been written. Before sending the
    703 * message, "present" must be cleared in BOTH lines to prevent the receiver
    704 * from prematurely seeing the full message.
    705 */
    706static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd,
    707			void *mesg, int lines)
    708{
    709	unsigned long m;
    710	int ret, loops = 200;	/* experimentally determined */
    711
    712	m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
    713	if (lines == 2) {
    714		gru_vset(cb, m, 0, XTYPE_CL, lines, 1, IMA);
    715		if (gru_wait(cb) != CBS_IDLE)
    716			return MQE_UNEXPECTED_CB_ERR;
    717	}
    718	gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
    719	if (gru_wait(cb) != CBS_IDLE)
    720		return MQE_UNEXPECTED_CB_ERR;
    721
    722	if (!mqd->interrupt_vector)
    723		return MQE_OK;
    724
    725	/*
    726	 * Send a noop message in order to deliver a cross-partition interrupt
    727	 * to the SSI that contains the target message queue. Normally, the
    728	 * interrupt is automatically delivered by hardware following mesq
    729	 * operations, but some error conditions require explicit delivery.
    730	 * The noop message will trigger delivery. Otherwise partition failures
    731	 * could cause unrecovered errors.
    732	 */
    733	do {
    734		ret = send_noop_message(cb, mqd, mesg);
    735	} while ((ret == MQIE_AGAIN || ret == MQE_CONGESTION) && (loops-- > 0));
    736
    737	if (ret == MQIE_AGAIN || ret == MQE_CONGESTION) {
    738		/*
    739		 * Don't indicate to the app to resend the message, as it's
    740		 * already been successfully sent.  We simply send an OK
    741		 * (rather than fail the send with MQE_UNEXPECTED_CB_ERR),
    742		 * assuming that the other side is receiving enough
    743		 * interrupts to get this message processed anyway.
    744		 */
    745		ret = MQE_OK;
    746	}
    747	return ret;
    748}
    749
    750/*
    751 * Handle a gru_mesq failure. Some of these failures are software recoverable
    752 * or retryable.
    753 */
    754static int send_message_failure(void *cb, struct gru_message_queue_desc *mqd,
    755				void *mesg, int lines)
    756{
    757	int substatus, ret = 0;
    758
    759	substatus = gru_get_cb_message_queue_substatus(cb);
    760	switch (substatus) {
    761	case CBSS_NO_ERROR:
    762		STAT(mesq_send_unexpected_error);
    763		ret = MQE_UNEXPECTED_CB_ERR;
    764		break;
    765	case CBSS_LB_OVERFLOWED:
    766		STAT(mesq_send_lb_overflow);
    767		ret = MQE_CONGESTION;
    768		break;
    769	case CBSS_QLIMIT_REACHED:
    770		STAT(mesq_send_qlimit_reached);
    771		ret = send_message_queue_full(cb, mqd, mesg, lines);
    772		break;
    773	case CBSS_AMO_NACKED:
    774		STAT(mesq_send_amo_nacked);
    775		ret = MQE_CONGESTION;
    776		break;
    777	case CBSS_PUT_NACKED:
    778		STAT(mesq_send_put_nacked);
    779		ret = send_message_put_nacked(cb, mqd, mesg, lines);
    780		break;
    781	case CBSS_PAGE_OVERFLOW:
    782		STAT(mesq_page_overflow);
    783		fallthrough;
    784	default:
    785		BUG();
    786	}
    787	return ret;
    788}
    789
    790/*
    791 * Send a message to a message queue
    792 * 	mqd	message queue descriptor
    793 * 	mesg	message. Must be vaddr within a GSEG
    794 * 	bytes	message size (<= 2 CL)
    795 */
    796int gru_send_message_gpa(struct gru_message_queue_desc *mqd, void *mesg,
    797				unsigned int bytes)
    798{
    799	struct message_header *mhdr;
    800	void *cb;
    801	void *dsr;
    802	int istatus, clines, ret;
    803
    804	STAT(mesq_send);
    805	BUG_ON(bytes < sizeof(int) || bytes > 2 * GRU_CACHE_LINE_BYTES);
    806
    807	clines = DIV_ROUND_UP(bytes, GRU_CACHE_LINE_BYTES);
    808	if (gru_get_cpu_resources(bytes, &cb, &dsr))
    809		return MQE_BUG_NO_RESOURCES;
    810	memcpy(dsr, mesg, bytes);
    811	mhdr = dsr;
    812	mhdr->present = MQS_FULL;
    813	mhdr->lines = clines;
    814	if (clines == 2) {
    815		mhdr->present2 = get_present2(mhdr);
    816		restore_present2(mhdr, MQS_FULL);
    817	}
    818
    819	do {
    820		ret = MQE_OK;
    821		gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), clines, IMA);
    822		istatus = gru_wait(cb);
    823		if (istatus != CBS_IDLE)
    824			ret = send_message_failure(cb, mqd, dsr, clines);
    825	} while (ret == MQIE_AGAIN);
    826	gru_free_cpu_resources(cb, dsr);
    827
    828	if (ret)
    829		STAT(mesq_send_failed);
    830	return ret;
    831}
    832EXPORT_SYMBOL_GPL(gru_send_message_gpa);
    833
    834/*
    835 * Advance the receive pointer for the queue to the next message.
    836 */
    837void gru_free_message(struct gru_message_queue_desc *mqd, void *mesg)
    838{
    839	struct message_queue *mq = mqd->mq;
    840	struct message_header *mhdr = mq->next;
    841	void *next, *pnext;
    842	int half = -1;
    843	int lines = mhdr->lines;
    844
    845	if (lines == 2)
    846		restore_present2(mhdr, MQS_EMPTY);
    847	mhdr->present = MQS_EMPTY;
    848
    849	pnext = mq->next;
    850	next = pnext + GRU_CACHE_LINE_BYTES * lines;
    851	if (next == mq->limit) {
    852		next = mq->start;
    853		half = 1;
    854	} else if (pnext < mq->start2 && next >= mq->start2) {
    855		half = 0;
    856	}
    857
    858	if (half >= 0)
    859		mq->hstatus[half] = 1;
    860	mq->next = next;
    861}
    862EXPORT_SYMBOL_GPL(gru_free_message);
    863
    864/*
    865 * Get next message from message queue. Return NULL if no message
    866 * present. User must call gru_free_message() to move to the next message.
    867 * 	rmq	message queue
    868 */
    869void *gru_get_next_message(struct gru_message_queue_desc *mqd)
    870{
    871	struct message_queue *mq = mqd->mq;
    872	struct message_header *mhdr = mq->next;
    873	int present = mhdr->present;
    874
    875	/* skip NOOP messages */
    876	while (present == MQS_NOOP) {
    877		gru_free_message(mqd, mhdr);
    878		mhdr = mq->next;
    879		present = mhdr->present;
    880	}
    881
    882	/* Wait for both halves of 2 line messages */
    883	if (present == MQS_FULL && mhdr->lines == 2 &&
    884				get_present2(mhdr) == MQS_EMPTY)
    885		present = MQS_EMPTY;
    886
    887	if (!present) {
    888		STAT(mesq_receive_none);
    889		return NULL;
    890	}
    891
    892	if (mhdr->lines == 2)
    893		restore_present2(mhdr, mhdr->present2);
    894
    895	STAT(mesq_receive);
    896	return mhdr;
    897}
    898EXPORT_SYMBOL_GPL(gru_get_next_message);
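
/*
 * Illustrative sketch only (example_mq_roundtrip() is hypothetical): a send/
 * receive round trip over a queue such as the one built in the creation
 * sketch above. MQE_CONGESTION is transient and normally retried; every
 * received message must be passed to gru_free_message() to advance the
 * receive pointer.
 */
static int __maybe_unused example_mq_roundtrip(struct gru_message_queue_desc *mqd)
{
	char mesg[GRU_CACHE_LINE_BYTES];	/* one-cacheline message */
	void *m;
	int ret;

	memset(mesg, 0, sizeof(mesg));
	mesg[8] = 0x5a;		/* payload; the first bytes hold struct message_header */

	do {
		ret = gru_send_message_gpa(mqd, mesg, sizeof(mesg));
	} while (ret == MQE_CONGESTION);
	if (ret != MQE_OK)
		return -EIO;

	m = gru_get_next_message(mqd);	/* NULL if nothing has arrived yet */
	if (!m)
		return -EAGAIN;
	/* ... consume the message here ... */
	gru_free_message(mqd, m);
	return 0;
}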
    899
    900/* ---------------------- GRU DATA COPY FUNCTIONS ---------------------------*/
    901
    902/*
    903 * Load a DW from a global GPA. The GPA can be a memory or MMR address.
    904 */
    905int gru_read_gpa(unsigned long *value, unsigned long gpa)
    906{
    907	void *cb;
    908	void *dsr;
    909	int ret, iaa;
    910
    911	STAT(read_gpa);
    912	if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
    913		return MQE_BUG_NO_RESOURCES;
    914	iaa = gpa >> 62;
    915	gru_vload_phys(cb, gpa, gru_get_tri(dsr), iaa, IMA);
    916	ret = gru_wait(cb);
    917	if (ret == CBS_IDLE)
    918		*value = *(unsigned long *)dsr;
    919	gru_free_cpu_resources(cb, dsr);
    920	return ret;
    921}
    922EXPORT_SYMBOL_GPL(gru_read_gpa);
    923
    924
    925/*
    926 * Copy a block of data using the GRU resources
    927 */
    928int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
    929				unsigned int bytes)
    930{
    931	void *cb;
    932	void *dsr;
    933	int ret;
    934
    935	STAT(copy_gpa);
    936	if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
    937		return MQE_BUG_NO_RESOURCES;
    938	gru_bcopy(cb, src_gpa, dest_gpa, gru_get_tri(dsr),
    939		  XTYPE_B, bytes, GRU_NUM_KERNEL_DSR_CL, IMA);
    940	ret = gru_wait(cb);
    941	gru_free_cpu_resources(cb, dsr);
    942	return ret;
    943}
    944EXPORT_SYMBOL_GPL(gru_copy_gpa);
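
/*
 * Illustrative sketch only (example_copy_and_readback() is hypothetical):
 * copy a kernel buffer with gru_copy_gpa() and read back the first DW with
 * gru_read_gpa(). uv_gpa() converts a kernel vaddr to a global address;
 * CBS_IDLE indicates success, anything else is treated as an error here.
 */
static int __maybe_unused example_copy_and_readback(void *dst, void *src,
						    unsigned int bytes)
{
	unsigned long dw;

	if (gru_copy_gpa(uv_gpa(dst), uv_gpa(src), bytes) != CBS_IDLE)
		return -EIO;
	if (gru_read_gpa(&dw, uv_gpa(dst)) != CBS_IDLE)
		return -EIO;

	/* dw now holds the first doubleword at the destination */
	return 0;
}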
    945
    946/* ------------------- KERNEL QUICKTESTS RUN AT STARTUP ----------------*/
    947/* 	Temp - will delete after we gain confidence in the GRU		*/
    948
    949static int quicktest0(unsigned long arg)
    950{
    951	unsigned long word0;
    952	unsigned long word1;
    953	void *cb;
    954	void *dsr;
    955	unsigned long *p;
    956	int ret = -EIO;
    957
    958	if (gru_get_cpu_resources(GRU_CACHE_LINE_BYTES, &cb, &dsr))
    959		return MQE_BUG_NO_RESOURCES;
    960	p = dsr;
    961	word0 = MAGIC;
    962	word1 = 0;
    963
    964	gru_vload(cb, uv_gpa(&word0), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
    965	if (gru_wait(cb) != CBS_IDLE) {
    966		printk(KERN_DEBUG "GRU:%d quicktest0: CBR failure 1\n", smp_processor_id());
    967		goto done;
    968	}
    969
    970	if (*p != MAGIC) {
    971		printk(KERN_DEBUG "GRU:%d quicktest0 bad magic 0x%lx\n", smp_processor_id(), *p);
    972		goto done;
    973	}
    974	gru_vstore(cb, uv_gpa(&word1), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
    975	if (gru_wait(cb) != CBS_IDLE) {
    976		printk(KERN_DEBUG "GRU:%d quicktest0: CBR failure 2\n", smp_processor_id());
    977		goto done;
    978	}
    979
    980	if (word0 != word1 || word1 != MAGIC) {
    981		printk(KERN_DEBUG
    982		       "GRU:%d quicktest0 err: found 0x%lx, expected 0x%lx\n",
    983		     smp_processor_id(), word1, MAGIC);
    984		goto done;
    985	}
    986	ret = 0;
    987
    988done:
    989	gru_free_cpu_resources(cb, dsr);
    990	return ret;
    991}
    992
    993#define ALIGNUP(p, q)	((void *)(((unsigned long)(p) + (q) - 1) & ~(q - 1)))
    994
    995static int quicktest1(unsigned long arg)
    996{
    997	struct gru_message_queue_desc mqd;
    998	void *p, *mq;
    999	int i, ret = -EIO;
   1000	char mes[GRU_CACHE_LINE_BYTES], *m;
   1001
   1002	/* Need a 1K cacheline-aligned buffer that does not cross a page boundary */
   1003	p = kmalloc(4096, 0);
   1004	if (p == NULL)
   1005		return -ENOMEM;
   1006	mq = ALIGNUP(p, 1024);
   1007	memset(mes, 0xee, sizeof(mes));
   1008
   1009	gru_create_message_queue(&mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);
   1010	for (i = 0; i < 6; i++) {
   1011		mes[8] = i;
   1012		do {
   1013			ret = gru_send_message_gpa(&mqd, mes, sizeof(mes));
   1014		} while (ret == MQE_CONGESTION);
   1015		if (ret)
   1016			break;
   1017	}
   1018	if (ret != MQE_QUEUE_FULL || i != 4) {
   1019		printk(KERN_DEBUG "GRU:%d quicktest1: unexpected status %d, i %d\n",
   1020		       smp_processor_id(), ret, i);
   1021		goto done;
   1022	}
   1023
   1024	for (i = 0; i < 6; i++) {
   1025		m = gru_get_next_message(&mqd);
   1026		if (!m || m[8] != i)
   1027			break;
   1028		gru_free_message(&mqd, m);
   1029	}
   1030	if (i != 4) {
   1031		printk(KERN_DEBUG "GRU:%d quicktest1: bad message, i %d, m %p, m8 %d\n",
   1032			smp_processor_id(), i, m, m ? m[8] : -1);
   1033		goto done;
   1034	}
   1035	ret = 0;
   1036
   1037done:
   1038	kfree(p);
   1039	return ret;
   1040}
   1041
   1042static int quicktest2(unsigned long arg)
   1043{
   1044	static DECLARE_COMPLETION(cmp);
   1045	unsigned long han;
   1046	int blade_id = 0;
   1047	int numcb = 4;
   1048	int ret = 0;
   1049	unsigned long *buf;
   1050	void *cb0, *cb;
   1051	struct gru_control_block_status *gen;
   1052	int i, k, istatus, bytes;
   1053
   1054	bytes = numcb * 4 * 8;
   1055	buf = kmalloc(bytes, GFP_KERNEL);
   1056	if (!buf)
   1057		return -ENOMEM;
   1058
   1059	ret = -EBUSY;
   1060	han = gru_reserve_async_resources(blade_id, numcb, 0, &cmp);
   1061	if (!han)
   1062		goto done;
   1063
   1064	gru_lock_async_resource(han, &cb0, NULL);
   1065	memset(buf, 0xee, bytes);
   1066	for (i = 0; i < numcb; i++)
   1067		gru_vset(cb0 + i * GRU_HANDLE_STRIDE, uv_gpa(&buf[i * 4]), 0,
   1068				XTYPE_DW, 4, 1, IMA_INTERRUPT);
   1069
   1070	ret = 0;
   1071	k = numcb;
   1072	do {
   1073		gru_wait_async_cbr(han);
   1074		for (i = 0; i < numcb; i++) {
   1075			cb = cb0 + i * GRU_HANDLE_STRIDE;
   1076			istatus = gru_check_status(cb);
   1077			if (istatus != CBS_ACTIVE && istatus != CBS_CALL_OS)
   1078				break;
   1079		}
   1080		if (i == numcb)
   1081			continue;
   1082		if (istatus != CBS_IDLE) {
   1083			printk(KERN_DEBUG "GRU:%d quicktest2: cb %d, exception\n", smp_processor_id(), i);
   1084			ret = -EFAULT;
   1085		} else if (buf[4 * i] || buf[4 * i + 1] || buf[4 * i + 2] ||
   1086				buf[4 * i + 3]) {
   1087			printk(KERN_DEBUG "GRU:%d quicktest2:cb %d,  buf 0x%lx, 0x%lx, 0x%lx, 0x%lx\n",
   1088			       smp_processor_id(), i, buf[4 * i], buf[4 * i + 1], buf[4 * i + 2], buf[4 * i + 3]);
   1089			ret = -EIO;
   1090		}
   1091		k--;
   1092		gen = cb;
   1093		gen->istatus = CBS_CALL_OS; /* don't handle this CBR again */
   1094	} while (k);
   1095	BUG_ON(cmp.done);
   1096
   1097	gru_unlock_async_resource(han);
   1098	gru_release_async_resources(han);
   1099done:
   1100	kfree(buf);
   1101	return ret;
   1102}
   1103
   1104#define BUFSIZE 200
   1105static int quicktest3(unsigned long arg)
   1106{
   1107	char buf1[BUFSIZE], buf2[BUFSIZE];
   1108	int ret = 0;
   1109
   1110	memset(buf2, 0, sizeof(buf2));
   1111	memset(buf1, get_cycles() & 255, sizeof(buf1));
   1112	gru_copy_gpa(uv_gpa(buf2), uv_gpa(buf1), BUFSIZE);
   1113	if (memcmp(buf1, buf2, BUFSIZE)) {
   1114		printk(KERN_DEBUG "GRU:%d quicktest3 error\n", smp_processor_id());
   1115		ret = -EIO;
   1116	}
   1117	return ret;
   1118}
   1119
   1120/*
   1121 * Debugging only. User hook for various kernel tests
   1122 * of driver & gru.
   1123 */
   1124int gru_ktest(unsigned long arg)
   1125{
   1126	int ret = -EINVAL;
   1127
   1128	switch (arg & 0xff) {
   1129	case 0:
   1130		ret = quicktest0(arg);
   1131		break;
   1132	case 1:
   1133		ret = quicktest1(arg);
   1134		break;
   1135	case 2:
   1136		ret = quicktest2(arg);
   1137		break;
   1138	case 3:
   1139		ret = quicktest3(arg);
   1140		break;
   1141	case 99:
   1142		ret = gru_free_kernel_contexts();
   1143		break;
   1144	}
   1145	return ret;
   1146
   1147}
   1148
   1149int gru_kservices_init(void)
   1150{
   1151	return 0;
   1152}
   1153
   1154void gru_kservices_exit(void)
   1155{
   1156	if (gru_free_kernel_contexts())
   1157		BUG();
   1158}
   1159