cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pio.c (57904B)


      1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
      2/*
      3 * Copyright(c) 2015-2018 Intel Corporation.
      4 */
      5
      6#include <linux/delay.h>
      7#include "hfi.h"
      8#include "qp.h"
      9#include "trace.h"
     10
     11#define SC(name) SEND_CTXT_##name
     12/*
     13 * Send Context functions
     14 */
     15static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
     16
     17/*
     18 * Set the CM reset bit and wait for it to clear.  Use the provided
     19 * sendctrl register.  This routine has no locking.
     20 */
     21void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
     22{
     23	write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
     24	while (1) {
     25		udelay(1);
     26		sendctrl = read_csr(dd, SEND_CTRL);
     27		if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
     28			break;
     29	}
     30}
     31
     32/* global control of PIO send */
     33void pio_send_control(struct hfi1_devdata *dd, int op)
     34{
     35	u64 reg, mask;
     36	unsigned long flags;
     37	int write = 1;	/* write sendctrl back */
     38	int flush = 0;	/* re-read sendctrl to make sure it is flushed */
     39	int i;
     40
     41	spin_lock_irqsave(&dd->sendctrl_lock, flags);
     42
     43	reg = read_csr(dd, SEND_CTRL);
     44	switch (op) {
     45	case PSC_GLOBAL_ENABLE:
     46		reg |= SEND_CTRL_SEND_ENABLE_SMASK;
     47		fallthrough;
     48	case PSC_DATA_VL_ENABLE:
     49		mask = 0;
     50		for (i = 0; i < ARRAY_SIZE(dd->vld); i++)
     51			if (!dd->vld[i].mtu)
     52				mask |= BIT_ULL(i);
     53		/* Disallow sending on VLs not enabled */
     54		mask = (mask & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
     55			SEND_CTRL_UNSUPPORTED_VL_SHIFT;
     56		reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
     57		break;
     58	case PSC_GLOBAL_DISABLE:
     59		reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
     60		break;
     61	case PSC_GLOBAL_VLARB_ENABLE:
     62		reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
     63		break;
     64	case PSC_GLOBAL_VLARB_DISABLE:
     65		reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
     66		break;
     67	case PSC_CM_RESET:
     68		__cm_reset(dd, reg);
     69		write = 0; /* CSR already written (and flushed) */
     70		break;
     71	case PSC_DATA_VL_DISABLE:
     72		reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
     73		flush = 1;
     74		break;
     75	default:
     76		dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
     77		break;
     78	}
     79
     80	if (write) {
     81		write_csr(dd, SEND_CTRL, reg);
     82		if (flush)
     83			(void)read_csr(dd, SEND_CTRL); /* flush write */
     84	}
     85
     86	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
     87}
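/*
 * Illustrative usage (a sketch, not a call site from this file): enabling PIO
 * sends globally would look like pio_send_control(dd, PSC_GLOBAL_ENABLE),
 * with PSC_GLOBAL_DISABLE as the matching shutdown call.
 */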
     88
     89/* number of send context memory pools */
     90#define NUM_SC_POOLS 2
     91
     92/* Send Context Size (SCS) wildcards */
     93#define SCS_POOL_0 -1
     94#define SCS_POOL_1 -2
     95
     96/* Send Context Count (SCC) wildcards */
     97#define SCC_PER_VL -1
     98#define SCC_PER_CPU  -2
     99#define SCC_PER_KRCVQ  -3
    100
    101/* Send Context Size (SCS) constants */
    102#define SCS_ACK_CREDITS  32
    103#define SCS_VL15_CREDITS 102	/* 3 pkts of 2048B data + 128B header */
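/*
 * Worked example for the VL15 value above, assuming a 64-byte PIO block
 * (PIO_BLOCK_SIZE == 64): 3 * (2048 + 128) / 64 = 6528 / 64 = 102 blocks.
 */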
    104
    105#define PIO_THRESHOLD_CEILING 4096
    106
    107#define PIO_WAIT_BATCH_SIZE 5
    108
    109/* default send context sizes */
    110static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
    111	[SC_KERNEL] = { .size  = SCS_POOL_0,	/* even divide, pool 0 */
    112			.count = SCC_PER_VL },	/* one per NUMA */
    113	[SC_ACK]    = { .size  = SCS_ACK_CREDITS,
    114			.count = SCC_PER_KRCVQ },
    115	[SC_USER]   = { .size  = SCS_POOL_0,	/* even divide, pool 0 */
    116			.count = SCC_PER_CPU },	/* one per CPU */
    117	[SC_VL15]   = { .size  = SCS_VL15_CREDITS,
    118			.count = 1 },
    119
    120};
    121
    122/* send context memory pool configuration */
    123struct mem_pool_config {
    124	int centipercent;	/* % of memory, in 100ths of 1% */
    125	int absolute_blocks;	/* absolute block count */
    126};
    127
    128/* default memory pool configuration: 100% in pool 0 */
    129static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
    130	/* centi%, abs blocks */
    131	{  10000,     -1 },		/* pool 0 */
    132	{      0,     -1 },		/* pool 1 */
    133};
    134
    135/* memory pool information, used when calculating final sizes */
    136struct mem_pool_info {
    137	int centipercent;	/*
    138				 * 100th of 1% of memory to use, -1 if blocks
    139				 * already set
    140				 */
    141	int count;		/* count of contexts in the pool */
    142	int blocks;		/* block size of the pool */
    143	int size;		/* context size, in blocks */
    144};
    145
    146/*
    147 * Convert a pool wildcard to a valid pool index.  The wildcards
    148 * start at -1 and increase negatively.  Map them as:
    149 *	-1 => 0
    150 *	-2 => 1
    151 *	etc.
    152 *
    153 * Return -1 on non-wildcard input, otherwise convert to a pool number.
    154 */
    155static int wildcard_to_pool(int wc)
    156{
    157	if (wc >= 0)
    158		return -1;	/* non-wildcard */
    159	return -wc - 1;
    160}
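/*
 * For example: wildcard_to_pool(SCS_POOL_0) == 0, wildcard_to_pool(SCS_POOL_1)
 * == 1, and a fixed (non-negative) size such as wildcard_to_pool(32) == -1.
 */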
    161
    162static const char *sc_type_names[SC_MAX] = {
    163	"kernel",
    164	"ack",
    165	"user",
    166	"vl15"
    167};
    168
    169static const char *sc_type_name(int index)
    170{
    171	if (index < 0 || index >= SC_MAX)
    172		return "unknown";
    173	return sc_type_names[index];
    174}
    175
    176/*
    177 * Read the send context memory pool configuration and send context
    178 * size configuration.  Replace any wildcards and come up with final
    179 * counts and sizes for the send context types.
    180 */
    181int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
    182{
    183	struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
    184	int total_blocks = (chip_pio_mem_size(dd) / PIO_BLOCK_SIZE) - 1;
    185	int total_contexts = 0;
    186	int fixed_blocks;
    187	int pool_blocks;
    188	int used_blocks;
    189	int cp_total;		/* centipercent total */
    190	int ab_total;		/* absolute block total */
    191	int extra;
    192	int i;
    193
    194	/*
    195	 * When SDMA is enabled, kernel context pio packet size is capped by
    196	 * "piothreshold". Reduce pio buffer allocation for kernel context by
    197	 * setting it to a fixed size. The allocation allows 3-deep buffering
    198	 * of the largest pio packets plus up to 128 bytes header, sufficient
    199	 * to maintain verbs performance.
    200	 *
    201	 * When SDMA is disabled, keep the default pooling allocation.
    202	 */
    203	if (HFI1_CAP_IS_KSET(SDMA)) {
    204		u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
    205					 piothreshold : PIO_THRESHOLD_CEILING;
    206		sc_config_sizes[SC_KERNEL].size =
    207			3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
    208	}
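	/*
	 * Worked example, assuming a 64-byte PIO block: with piothreshold at
	 * or above PIO_THRESHOLD_CEILING, max_pkt_size is 4096 and the kernel
	 * context size becomes 3 * (4096 + 128) / 64 = 198 blocks.
	 */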
    209
    210	/*
    211	 * Step 0:
    212	 *	- copy the centipercents/absolute sizes from the pool config
    213	 *	- sanity check these values
    214	 *	- add up centipercents, then later check for full value
    215	 *	- add up absolute blocks, then later check for over-commit
    216	 */
    217	cp_total = 0;
    218	ab_total = 0;
    219	for (i = 0; i < NUM_SC_POOLS; i++) {
    220		int cp = sc_mem_pool_config[i].centipercent;
    221		int ab = sc_mem_pool_config[i].absolute_blocks;
    222
    223		/*
    224		 * A negative value is "unused" or "invalid".  Both *can*
    225		 * be valid, but centipercent wins, so check that first
    226		 */
    227		if (cp >= 0) {			/* centipercent valid */
    228			cp_total += cp;
    229		} else if (ab >= 0) {		/* absolute blocks valid */
    230			ab_total += ab;
    231		} else {			/* neither valid */
    232			dd_dev_err(
    233				dd,
    234				"Send context memory pool %d: both the block count and centipercent are invalid\n",
    235				i);
    236			return -EINVAL;
    237		}
    238
    239		mem_pool_info[i].centipercent = cp;
    240		mem_pool_info[i].blocks = ab;
    241	}
    242
    243	/* do not use both % and absolute blocks for different pools */
    244	if (cp_total != 0 && ab_total != 0) {
    245		dd_dev_err(
    246			dd,
    247			"All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
    248		return -EINVAL;
    249	}
    250
    251	/* if any percentages are present, they must add up to 100% x 100 */
    252	if (cp_total != 0 && cp_total != 10000) {
    253		dd_dev_err(
    254			dd,
    255			"Send context memory pool centipercent is %d, expecting 10000\n",
    256			cp_total);
    257		return -EINVAL;
    258	}
    259
    260	/* the absolute pool total cannot be more than the mem total */
    261	if (ab_total > total_blocks) {
    262		dd_dev_err(
    263			dd,
    264			"Send context memory pool absolute block count %d is larger than the memory size %d\n",
    265			ab_total, total_blocks);
    266		return -EINVAL;
    267	}
    268
    269	/*
    270	 * Step 2:
    271	 *	- copy from the context size config
    272	 *	- replace context type wildcard counts with real values
    273	 *	- add up non-memory pool block sizes
    274	 *	- add up memory pool user counts
    275	 */
    276	fixed_blocks = 0;
    277	for (i = 0; i < SC_MAX; i++) {
    278		int count = sc_config_sizes[i].count;
    279		int size = sc_config_sizes[i].size;
    280		int pool;
    281
    282		/*
    283		 * Sanity check count: Either a positive value or
    284		 * one of the expected wildcards is valid.  The positive
    285		 * value is checked later when we compare against total
    286		 * memory available.
    287		 */
    288		if (i == SC_ACK) {
    289			count = dd->n_krcv_queues;
    290		} else if (i == SC_KERNEL) {
    291			count = INIT_SC_PER_VL * num_vls;
    292		} else if (count == SCC_PER_CPU) {
    293			count = dd->num_rcv_contexts - dd->n_krcv_queues;
    294		} else if (count < 0) {
    295			dd_dev_err(
    296				dd,
    297				"%s send context invalid count wildcard %d\n",
    298				sc_type_name(i), count);
    299			return -EINVAL;
    300		}
    301		if (total_contexts + count > chip_send_contexts(dd))
    302			count = chip_send_contexts(dd) - total_contexts;
    303
    304		total_contexts += count;
    305
    306		/*
    307		 * Sanity check pool: The conversion will return a pool
    308		 * number or -1 if a fixed (non-negative) value.  The fixed
    309		 * value is checked later when we compare against
    310		 * total memory available.
    311		 */
    312		pool = wildcard_to_pool(size);
    313		if (pool == -1) {			/* non-wildcard */
    314			fixed_blocks += size * count;
    315		} else if (pool < NUM_SC_POOLS) {	/* valid wildcard */
    316			mem_pool_info[pool].count += count;
    317		} else {				/* invalid wildcard */
    318			dd_dev_err(
    319				dd,
    320				"%s send context invalid pool wildcard %d\n",
    321				sc_type_name(i), size);
    322			return -EINVAL;
    323		}
    324
    325		dd->sc_sizes[i].count = count;
    326		dd->sc_sizes[i].size = size;
    327	}
    328	if (fixed_blocks > total_blocks) {
    329		dd_dev_err(
    330			dd,
    331			"Send context fixed block count, %u, larger than total block count %u\n",
    332			fixed_blocks, total_blocks);
    333		return -EINVAL;
    334	}
    335
    336	/* step 3: calculate the blocks in the pools, and pool context sizes */
    337	pool_blocks = total_blocks - fixed_blocks;
    338	if (ab_total > pool_blocks) {
    339		dd_dev_err(
    340			dd,
    341			"Send context fixed pool sizes, %u, larger than pool block count %u\n",
    342			ab_total, pool_blocks);
    343		return -EINVAL;
    344	}
    345	/* subtract off the fixed pool blocks */
    346	pool_blocks -= ab_total;
    347
    348	for (i = 0; i < NUM_SC_POOLS; i++) {
    349		struct mem_pool_info *pi = &mem_pool_info[i];
    350
    351		/* % beats absolute blocks */
    352		if (pi->centipercent >= 0)
    353			pi->blocks = (pool_blocks * pi->centipercent) / 10000;
    354
    355		if (pi->blocks == 0 && pi->count != 0) {
    356			dd_dev_err(
    357				dd,
    358				"Send context memory pool %d has %u contexts, but no blocks\n",
    359				i, pi->count);
    360			return -EINVAL;
    361		}
    362		if (pi->count == 0) {
    363			/* warn about wasted blocks */
    364			if (pi->blocks != 0)
    365				dd_dev_err(
    366					dd,
    367					"Send context memory pool %d has %u blocks, but zero contexts\n",
    368					i, pi->blocks);
    369			pi->size = 0;
    370		} else {
    371			pi->size = pi->blocks / pi->count;
    372		}
    373	}
    374
    375	/* step 4: fill in the context type sizes from the pool sizes */
    376	used_blocks = 0;
    377	for (i = 0; i < SC_MAX; i++) {
    378		if (dd->sc_sizes[i].size < 0) {
    379			unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
    380
    381			WARN_ON_ONCE(pool >= NUM_SC_POOLS);
    382			dd->sc_sizes[i].size = mem_pool_info[pool].size;
    383		}
    384		/* make sure we are not larger than what is allowed by the HW */
    385#define PIO_MAX_BLOCKS 1024
    386		if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
    387			dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
    388
    389		/* calculate our total usage */
    390		used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
    391	}
    392	extra = total_blocks - used_blocks;
    393	if (extra != 0)
    394		dd_dev_info(dd, "unused send context blocks: %d\n", extra);
    395
    396	return total_contexts;
    397}
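/*
 * With the defaults above this works out roughly as follows (a sketch of the
 * common case, not a guarantee for every configuration): SC_ACK and SC_VL15
 * take their fixed credit counts, 100% of the remaining blocks land in
 * pool 0, and pool 0 is divided evenly among the kernel and user contexts
 * that reference it.
 */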
    398
    399int init_send_contexts(struct hfi1_devdata *dd)
    400{
    401	u16 base;
    402	int ret, i, j, context;
    403
    404	ret = init_credit_return(dd);
    405	if (ret)
    406		return ret;
    407
    408	dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
    409					GFP_KERNEL);
    410	dd->send_contexts = kcalloc(dd->num_send_contexts,
    411				    sizeof(struct send_context_info),
    412				    GFP_KERNEL);
    413	if (!dd->send_contexts || !dd->hw_to_sw) {
    414		kfree(dd->hw_to_sw);
    415		kfree(dd->send_contexts);
    416		free_credit_return(dd);
    417		return -ENOMEM;
    418	}
    419
    420	/* hardware context map starts with invalid send context indices */
    421	for (i = 0; i < TXE_NUM_CONTEXTS; i++)
    422		dd->hw_to_sw[i] = INVALID_SCI;
    423
    424	/*
    425	 * All send contexts have their credit sizes.  Allocate credits
    426	 * for each context one after another from the global space.
    427	 */
    428	context = 0;
    429	base = 1;
    430	for (i = 0; i < SC_MAX; i++) {
    431		struct sc_config_sizes *scs = &dd->sc_sizes[i];
    432
    433		for (j = 0; j < scs->count; j++) {
    434			struct send_context_info *sci =
    435						&dd->send_contexts[context];
    436			sci->type = i;
    437			sci->base = base;
    438			sci->credits = scs->size;
    439
    440			context++;
    441			base += scs->size;
    442		}
    443	}
    444
    445	return 0;
    446}
    447
    448/*
    449 * Allocate a software index and hardware context of the given type.
    450 *
    451 * Must be called with dd->sc_lock held.
    452 */
    453static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
    454		       u32 *hw_context)
    455{
    456	struct send_context_info *sci;
    457	u32 index;
    458	u32 context;
    459
    460	for (index = 0, sci = &dd->send_contexts[0];
    461			index < dd->num_send_contexts; index++, sci++) {
    462		if (sci->type == type && sci->allocated == 0) {
    463			sci->allocated = 1;
    464			/* use a 1:1 mapping, but make them non-equal */
    465			context = chip_send_contexts(dd) - index - 1;
    466			dd->hw_to_sw[context] = index;
    467			*sw_index = index;
    468			*hw_context = context;
    469			return 0; /* success */
    470		}
    471	}
    472	dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
    473	return -ENOSPC;
    474}
    475
    476/*
    477 * Free the send context given by its software index.
    478 *
    479 * Must be called with dd->sc_lock held.
    480 */
    481static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
    482{
    483	struct send_context_info *sci;
    484
    485	sci = &dd->send_contexts[sw_index];
    486	if (!sci->allocated) {
    487		dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
    488			   __func__, sw_index, hw_context);
    489	}
    490	sci->allocated = 0;
    491	dd->hw_to_sw[hw_context] = INVALID_SCI;
    492}
    493
    494/* return the base context of a context in a group */
    495static inline u32 group_context(u32 context, u32 group)
    496{
    497	return (context >> group) << group;
    498}
    499
    500/* return the size of a group */
    501static inline u32 group_size(u32 group)
    502{
    503	return 1 << group;
    504}
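/*
 * Example: with sc->group == 2, group_size(2) == 4, and hardware contexts
 * 12..15 all map to base context group_context(hw_context, 2) == 12.
 */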
    505
    506/*
    507 * Obtain the credit return addresses, kernel virtual and bus, for the
    508 * given sc.
    509 *
    510 * To understand this routine:
    511 * o va and dma are arrays of struct credit_return.  One for each physical
    512 *   send context, per NUMA.
    513 * o Each send context always looks in its relative location in a struct
    514 *   credit_return for its credit return.
    515 * o Each send context in a group must have its return address CSR programmed
    516 *   with the same value.  Use the address of the first send context in the
    517 *   group.
    518 */
    519static void cr_group_addresses(struct send_context *sc, dma_addr_t *dma)
    520{
    521	u32 gc = group_context(sc->hw_context, sc->group);
    522	u32 index = sc->hw_context & 0x7;
    523
    524	sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
    525	*dma = (unsigned long)
    526	       &((struct credit_return *)sc->dd->cr_base[sc->node].dma)[gc];
    527}
    528
    529/*
    530 * Work queue function triggered in error interrupt routine for
    531 * kernel contexts.
    532 */
    533static void sc_halted(struct work_struct *work)
    534{
    535	struct send_context *sc;
    536
    537	sc = container_of(work, struct send_context, halt_work);
    538	sc_restart(sc);
    539}
    540
    541/*
    542 * Calculate PIO block threshold for this send context using the given MTU.
    543 * Trigger a return when one MTU plus optional header of credits remain.
    544 *
    545 * Parameter mtu is in bytes.
    546 * Parameter hdrqentsize is in DWORDs.
    547 *
    548 * Return value is what to write into the CSR: trigger return when
    549 * unreturned credits pass this count.
    550 */
    551u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
    552{
    553	u32 release_credits;
    554	u32 threshold;
    555
    556	/* add in the header size, then divide by the PIO block size */
    557	mtu += hdrqentsize << 2;
    558	release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
    559
    560	/* check against this context's credits */
    561	if (sc->credits <= release_credits)
    562		threshold = 1;
    563	else
    564		threshold = sc->credits - release_credits;
    565
    566	return threshold;
    567}
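/*
 * Worked example, assuming a 64-byte PIO block: with mtu == 2048 bytes and
 * hdrqentsize == 32 DWORDs (128 bytes), release_credits is
 * DIV_ROUND_UP(2176, 64) == 34.  A context with 102 credits gets a threshold
 * of 68; a context with 34 or fewer credits gets the minimum threshold of 1.
 */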
    568
    569/*
    570 * Calculate credit threshold in terms of percent of the allocated credits.
    571 * Trigger when unreturned credits equal or exceed the percentage of the whole.
    572 *
    573 * Return value is what to write into the CSR: trigger return when
    574 * unreturned credits pass this count.
    575 */
    576u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
    577{
    578	return (sc->credits * percent) / 100;
    579}
    580
    581/*
    582 * Set the credit return threshold.
    583 */
    584void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
    585{
    586	unsigned long flags;
    587	u32 old_threshold;
    588	int force_return = 0;
    589
    590	spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
    591
    592	old_threshold = (sc->credit_ctrl >>
    593				SC(CREDIT_CTRL_THRESHOLD_SHIFT))
    594			 & SC(CREDIT_CTRL_THRESHOLD_MASK);
    595
    596	if (new_threshold != old_threshold) {
    597		sc->credit_ctrl =
    598			(sc->credit_ctrl
    599				& ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
    600			| ((new_threshold
    601				& SC(CREDIT_CTRL_THRESHOLD_MASK))
    602			   << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
    603		write_kctxt_csr(sc->dd, sc->hw_context,
    604				SC(CREDIT_CTRL), sc->credit_ctrl);
    605
    606		/* force a credit return on change to avoid a possible stall */
    607		force_return = 1;
    608	}
    609
    610	spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
    611
    612	if (force_return)
    613		sc_return_credits(sc);
    614}
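/*
 * Illustrative usage (a sketch): callers typically compute the new value with
 * one of the helpers above, e.g.
 *	sc_set_cr_threshold(sc, sc_percent_to_threshold(sc, 50));
 * or
 *	sc_set_cr_threshold(sc, sc_mtu_to_threshold(sc, mtu, hdrqentsize));
 * where mtu and hdrqentsize stand in for the caller's values.
 */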
    615
    616/*
    617 * set_pio_integrity
    618 *
    619 * Set the CHECK_ENABLE register for the send context 'sc'.
    620 */
    621void set_pio_integrity(struct send_context *sc)
    622{
    623	struct hfi1_devdata *dd = sc->dd;
    624	u32 hw_context = sc->hw_context;
    625	int type = sc->type;
    626
    627	write_kctxt_csr(dd, hw_context,
    628			SC(CHECK_ENABLE),
    629			hfi1_pkt_default_send_ctxt_mask(dd, type));
    630}
    631
    632static u32 get_buffers_allocated(struct send_context *sc)
    633{
    634	int cpu;
    635	u32 ret = 0;
    636
    637	for_each_possible_cpu(cpu)
    638		ret += *per_cpu_ptr(sc->buffers_allocated, cpu);
    639	return ret;
    640}
    641
    642static void reset_buffers_allocated(struct send_context *sc)
    643{
    644	int cpu;
    645
    646	for_each_possible_cpu(cpu)
    647		(*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0;
    648}
    649
    650/*
    651 * Allocate a NUMA relative send context structure of the given type along
    652 * with a HW context.
    653 */
    654struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
    655			      uint hdrqentsize, int numa)
    656{
    657	struct send_context_info *sci;
    658	struct send_context *sc = NULL;
    659	dma_addr_t dma;
    660	unsigned long flags;
    661	u64 reg;
    662	u32 thresh;
    663	u32 sw_index;
    664	u32 hw_context;
    665	int ret;
    666	u8 opval, opmask;
    667
    668	/* do not allocate while frozen */
    669	if (dd->flags & HFI1_FROZEN)
    670		return NULL;
    671
    672	sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa);
    673	if (!sc)
    674		return NULL;
    675
    676	sc->buffers_allocated = alloc_percpu(u32);
    677	if (!sc->buffers_allocated) {
    678		kfree(sc);
    679		dd_dev_err(dd,
    680			   "Cannot allocate buffers_allocated per cpu counters\n"
    681			  );
    682		return NULL;
    683	}
    684
    685	spin_lock_irqsave(&dd->sc_lock, flags);
    686	ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
    687	if (ret) {
    688		spin_unlock_irqrestore(&dd->sc_lock, flags);
    689		free_percpu(sc->buffers_allocated);
    690		kfree(sc);
    691		return NULL;
    692	}
    693
    694	sci = &dd->send_contexts[sw_index];
    695	sci->sc = sc;
    696
    697	sc->dd = dd;
    698	sc->node = numa;
    699	sc->type = type;
    700	spin_lock_init(&sc->alloc_lock);
    701	spin_lock_init(&sc->release_lock);
    702	spin_lock_init(&sc->credit_ctrl_lock);
    703	seqlock_init(&sc->waitlock);
    704	INIT_LIST_HEAD(&sc->piowait);
    705	INIT_WORK(&sc->halt_work, sc_halted);
    706	init_waitqueue_head(&sc->halt_wait);
    707
    708	/* grouping is always single context for now */
    709	sc->group = 0;
    710
    711	sc->sw_index = sw_index;
    712	sc->hw_context = hw_context;
    713	cr_group_addresses(sc, &dma);
    714	sc->credits = sci->credits;
    715	sc->size = sc->credits * PIO_BLOCK_SIZE;
    716
    717/* PIO Send Memory Address details */
    718#define PIO_ADDR_CONTEXT_MASK 0xfful
    719#define PIO_ADDR_CONTEXT_SHIFT 16
    720	sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
    721					<< PIO_ADDR_CONTEXT_SHIFT);
    722
    723	/* set base and credits */
    724	reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
    725					<< SC(CTRL_CTXT_DEPTH_SHIFT))
    726		| ((sci->base & SC(CTRL_CTXT_BASE_MASK))
    727					<< SC(CTRL_CTXT_BASE_SHIFT));
    728	write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
    729
    730	set_pio_integrity(sc);
    731
    732	/* unmask all errors */
    733	write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
    734
    735	/* set the default partition key */
    736	write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
    737			(SC(CHECK_PARTITION_KEY_VALUE_MASK) &
    738			 DEFAULT_PKEY) <<
    739			SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
    740
    741	/* per context type checks */
    742	if (type == SC_USER) {
    743		opval = USER_OPCODE_CHECK_VAL;
    744		opmask = USER_OPCODE_CHECK_MASK;
    745	} else {
    746		opval = OPCODE_CHECK_VAL_DISABLED;
    747		opmask = OPCODE_CHECK_MASK_DISABLED;
    748	}
    749
    750	/* set the send context check opcode mask and value */
    751	write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
    752			((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
    753			((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
    754
    755	/* set up credit return */
    756	reg = dma & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
    757	write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
    758
    759	/*
    760	 * Calculate the initial credit return threshold.
    761	 *
    762	 * For Ack contexts, set a threshold for half the credits.
    763	 * For User contexts use the given percentage.  This has been
    764	 * sanitized on driver start-up.
    765	 * For Kernel contexts, use the default MTU plus a header
    766	 * or half the credits, whichever is smaller. This should
    767	 * work for both the 3-deep buffering allocation and the
    768	 * pooling allocation.
    769	 */
    770	if (type == SC_ACK) {
    771		thresh = sc_percent_to_threshold(sc, 50);
    772	} else if (type == SC_USER) {
    773		thresh = sc_percent_to_threshold(sc,
    774						 user_credit_return_threshold);
    775	} else { /* kernel */
    776		thresh = min(sc_percent_to_threshold(sc, 50),
    777			     sc_mtu_to_threshold(sc, hfi1_max_mtu,
    778						 hdrqentsize));
    779	}
    780	reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
    781	/* add in early return */
    782	if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
    783		reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
    784	else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
    785		reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
    786
    787	/* set up write-through credit_ctrl */
    788	sc->credit_ctrl = reg;
    789	write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
    790
    791	/* User send contexts should not allow sending on VL15 */
    792	if (type == SC_USER) {
    793		reg = 1ULL << 15;
    794		write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
    795	}
    796
    797	spin_unlock_irqrestore(&dd->sc_lock, flags);
    798
    799	/*
    800	 * Allocate shadow ring to track outstanding PIO buffers _after_
    801	 * unlocking.  We don't know the size until the lock is held and
    802	 * we can't allocate while the lock is held.  No one is using
    803	 * the context yet, so allocate it now.
    804	 *
    805	 * User contexts do not get a shadow ring.
    806	 */
    807	if (type != SC_USER) {
    808		/*
    809		 * Size the shadow ring 1 larger than the number of credits
    810		 * so head == tail can mean empty.
    811		 */
    812		sc->sr_size = sci->credits + 1;
    813		sc->sr = kcalloc_node(sc->sr_size,
    814				      sizeof(union pio_shadow_ring),
    815				      GFP_KERNEL, numa);
    816		if (!sc->sr) {
    817			sc_free(sc);
    818			return NULL;
    819		}
    820	}
    821
    822	hfi1_cdbg(PIO,
    823		  "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
    824		  sw_index,
    825		  hw_context,
    826		  sc_type_name(type),
    827		  sc->group,
    828		  sc->credits,
    829		  sc->credit_ctrl,
    830		  thresh);
    831
    832	return sc;
    833}
    834
    835/* free a per-NUMA send context structure */
    836void sc_free(struct send_context *sc)
    837{
    838	struct hfi1_devdata *dd;
    839	unsigned long flags;
    840	u32 sw_index;
    841	u32 hw_context;
    842
    843	if (!sc)
    844		return;
    845
    846	sc->flags |= SCF_IN_FREE;	/* ensure no restarts */
    847	dd = sc->dd;
    848	if (!list_empty(&sc->piowait))
    849		dd_dev_err(dd, "piowait list not empty!\n");
    850	sw_index = sc->sw_index;
    851	hw_context = sc->hw_context;
    852	sc_disable(sc);	/* make sure the HW is disabled */
    853	flush_work(&sc->halt_work);
    854
    855	spin_lock_irqsave(&dd->sc_lock, flags);
    856	dd->send_contexts[sw_index].sc = NULL;
    857
    858	/* clear/disable all registers set in sc_alloc */
    859	write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
    860	write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
    861	write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
    862	write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
    863	write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
    864	write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
    865	write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
    866
    867	/* release the index and context for re-use */
    868	sc_hw_free(dd, sw_index, hw_context);
    869	spin_unlock_irqrestore(&dd->sc_lock, flags);
    870
    871	kfree(sc->sr);
    872	free_percpu(sc->buffers_allocated);
    873	kfree(sc);
    874}
    875
    876/* disable the context */
    877void sc_disable(struct send_context *sc)
    878{
    879	u64 reg;
    880	struct pio_buf *pbuf;
    881	LIST_HEAD(wake_list);
    882
    883	if (!sc)
    884		return;
    885
    886	/* do all steps, even if already disabled */
    887	spin_lock_irq(&sc->alloc_lock);
    888	reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
    889	reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
    890	sc->flags &= ~SCF_ENABLED;
    891	sc_wait_for_packet_egress(sc, 1);
    892	write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
    893
    894	/*
    895	 * Flush any waiters.  Once the context is disabled,
    896	 * credit return interrupts are stopped (although there
    897	 * could be one in-process when the context is disabled).
    898	 * Wait one microsecond for any lingering interrupts, then
    899	 * proceed with the flush.
    900	 */
    901	udelay(1);
    902	spin_lock(&sc->release_lock);
    903	if (sc->sr) {	/* this context has a shadow ring */
    904		while (sc->sr_tail != sc->sr_head) {
    905			pbuf = &sc->sr[sc->sr_tail].pbuf;
    906			if (pbuf->cb)
    907				(*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
    908			sc->sr_tail++;
    909			if (sc->sr_tail >= sc->sr_size)
    910				sc->sr_tail = 0;
    911		}
    912	}
    913	spin_unlock(&sc->release_lock);
    914
    915	write_seqlock(&sc->waitlock);
    916	if (!list_empty(&sc->piowait))
    917		list_move(&sc->piowait, &wake_list);
    918	write_sequnlock(&sc->waitlock);
    919	while (!list_empty(&wake_list)) {
    920		struct iowait *wait;
    921		struct rvt_qp *qp;
    922		struct hfi1_qp_priv *priv;
    923
    924		wait = list_first_entry(&wake_list, struct iowait, list);
    925		qp = iowait_to_qp(wait);
    926		priv = qp->priv;
    927		list_del_init(&priv->s_iowait.list);
    928		priv->s_iowait.lock = NULL;
    929		hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
    930	}
    931
    932	spin_unlock_irq(&sc->alloc_lock);
    933}
    934
    935/* return SendEgressCtxtStatus.PacketOccupancy */
    936static u64 packet_occupancy(u64 reg)
    937{
    938	return (reg &
    939		SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)
    940		>> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT;
    941}
    942
    943/* is egress halted on the context? */
    944static bool egress_halted(u64 reg)
    945{
    946	return !!(reg & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK);
    947}
    948
    949/* is the send context halted? */
    950static bool is_sc_halted(struct hfi1_devdata *dd, u32 hw_context)
    951{
    952	return !!(read_kctxt_csr(dd, hw_context, SC(STATUS)) &
    953		  SC(STATUS_CTXT_HALTED_SMASK));
    954}
    955
    956/**
     957 * sc_wait_for_packet_egress - wait for packet egress
    958 * @sc: valid send context
    959 * @pause: wait for credit return
    960 *
    961 * Wait for packet egress, optionally pause for credit return
    962 *
    963 * Egress halt and Context halt are not necessarily the same thing, so
    964 * check for both.
    965 *
    966 * NOTE: The context halt bit may not be set immediately.  Because of this,
     967 * it is necessary to check the SW SCF_HALTED bit (set in the IRQ) and the HW
    968 * context bit to determine if the context is halted.
    969 */
    970static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
    971{
    972	struct hfi1_devdata *dd = sc->dd;
    973	u64 reg = 0;
    974	u64 reg_prev;
    975	u32 loop = 0;
    976
    977	while (1) {
    978		reg_prev = reg;
    979		reg = read_csr(dd, sc->hw_context * 8 +
    980			       SEND_EGRESS_CTXT_STATUS);
    981		/* done if any halt bits, SW or HW are set */
    982		if (sc->flags & SCF_HALTED ||
    983		    is_sc_halted(dd, sc->hw_context) || egress_halted(reg))
    984			break;
    985		reg = packet_occupancy(reg);
    986		if (reg == 0)
    987			break;
    988		/* counter is reset if occupancy count changes */
    989		if (reg != reg_prev)
    990			loop = 0;
    991		if (loop > 50000) {
    992			/* timed out - bounce the link */
    993			dd_dev_err(dd,
    994				   "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
    995				   __func__, sc->sw_index,
    996				   sc->hw_context, (u32)reg);
    997			queue_work(dd->pport->link_wq,
    998				   &dd->pport->link_bounce_work);
    999			break;
   1000		}
   1001		loop++;
   1002		udelay(1);
   1003	}
   1004
   1005	if (pause)
   1006		/* Add additional delay to ensure chip returns all credits */
   1007		pause_for_credit_return(dd);
   1008}
   1009
   1010void sc_wait(struct hfi1_devdata *dd)
   1011{
   1012	int i;
   1013
   1014	for (i = 0; i < dd->num_send_contexts; i++) {
   1015		struct send_context *sc = dd->send_contexts[i].sc;
   1016
   1017		if (!sc)
   1018			continue;
   1019		sc_wait_for_packet_egress(sc, 0);
   1020	}
   1021}
   1022
   1023/*
   1024 * Restart a context after it has been halted due to error.
   1025 *
    1026 * If the first step (waiting for the halt to be asserted) fails, return
    1027 * early.  Otherwise complain about timeouts but keep going.
   1028 *
   1029 * It is expected that allocations (enabled flag bit) have been shut off
   1030 * already (only applies to kernel contexts).
   1031 */
   1032int sc_restart(struct send_context *sc)
   1033{
   1034	struct hfi1_devdata *dd = sc->dd;
   1035	u64 reg;
   1036	u32 loop;
   1037	int count;
   1038
   1039	/* bounce off if not halted, or being free'd */
   1040	if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
   1041		return -EINVAL;
   1042
   1043	dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
   1044		    sc->hw_context);
   1045
   1046	/*
   1047	 * Step 1: Wait for the context to actually halt.
   1048	 *
   1049	 * The error interrupt is asynchronous to actually setting halt
   1050	 * on the context.
   1051	 */
   1052	loop = 0;
   1053	while (1) {
   1054		reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
   1055		if (reg & SC(STATUS_CTXT_HALTED_SMASK))
   1056			break;
   1057		if (loop > 100) {
   1058			dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
   1059				   __func__, sc->sw_index, sc->hw_context);
   1060			return -ETIME;
   1061		}
   1062		loop++;
   1063		udelay(1);
   1064	}
   1065
   1066	/*
   1067	 * Step 2: Ensure no users are still trying to write to PIO.
   1068	 *
   1069	 * For kernel contexts, we have already turned off buffer allocation.
   1070	 * Now wait for the buffer count to go to zero.
   1071	 *
   1072	 * For user contexts, the user handling code has cut off write access
   1073	 * to the context's PIO pages before calling this routine and will
   1074	 * restore write access after this routine returns.
   1075	 */
   1076	if (sc->type != SC_USER) {
   1077		/* kernel context */
   1078		loop = 0;
   1079		while (1) {
   1080			count = get_buffers_allocated(sc);
   1081			if (count == 0)
   1082				break;
   1083			if (loop > 100) {
   1084				dd_dev_err(dd,
   1085					   "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
   1086					   __func__, sc->sw_index,
   1087					   sc->hw_context, count);
   1088			}
   1089			loop++;
   1090			udelay(1);
   1091		}
   1092	}
   1093
   1094	/*
   1095	 * Step 3: Wait for all packets to egress.
   1096	 * This is done while disabling the send context
   1097	 *
   1098	 * Step 4: Disable the context
   1099	 *
   1100	 * This is a superset of the halt.  After the disable, the
   1101	 * errors can be cleared.
   1102	 */
   1103	sc_disable(sc);
   1104
   1105	/*
   1106	 * Step 5: Enable the context
   1107	 *
   1108	 * This enable will clear the halted flag and per-send context
   1109	 * error flags.
   1110	 */
   1111	return sc_enable(sc);
   1112}
   1113
   1114/*
   1115 * PIO freeze processing.  To be called after the TXE block is fully frozen.
   1116 * Go through all frozen send contexts and disable them.  The contexts are
   1117 * already stopped by the freeze.
   1118 */
   1119void pio_freeze(struct hfi1_devdata *dd)
   1120{
   1121	struct send_context *sc;
   1122	int i;
   1123
   1124	for (i = 0; i < dd->num_send_contexts; i++) {
   1125		sc = dd->send_contexts[i].sc;
   1126		/*
   1127		 * Don't disable unallocated, unfrozen, or user send contexts.
   1128		 * User send contexts will be disabled when the process
   1129		 * calls into the driver to reset its context.
   1130		 */
   1131		if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
   1132			continue;
   1133
   1134		/* only need to disable, the context is already stopped */
   1135		sc_disable(sc);
   1136	}
   1137}
   1138
   1139/*
   1140 * Unfreeze PIO for kernel send contexts.  The precondition for calling this
   1141 * is that all PIO send contexts have been disabled and the SPC freeze has
   1142 * been cleared.  Now perform the last step and re-enable each kernel context.
   1143 * User (PSM) processing will occur when PSM calls into the kernel to
   1144 * acknowledge the freeze.
   1145 */
   1146void pio_kernel_unfreeze(struct hfi1_devdata *dd)
   1147{
   1148	struct send_context *sc;
   1149	int i;
   1150
   1151	for (i = 0; i < dd->num_send_contexts; i++) {
   1152		sc = dd->send_contexts[i].sc;
   1153		if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
   1154			continue;
   1155		if (sc->flags & SCF_LINK_DOWN)
   1156			continue;
   1157
   1158		sc_enable(sc);	/* will clear the sc frozen flag */
   1159	}
   1160}
   1161
   1162/**
   1163 * pio_kernel_linkup() - Re-enable send contexts after linkup event
    1164 * @dd: valid device data
   1165 *
   1166 * When the link goes down, the freeze path is taken.  However, a link down
   1167 * event is different from a freeze because if the send context is re-enabled
    1168 * whoever is sending data will start sending again, which will hang
   1169 * any QP that is sending data.
   1170 *
   1171 * The freeze path now looks at the type of event that occurs and takes this
    1172 * path for a link down event.
   1173 */
   1174void pio_kernel_linkup(struct hfi1_devdata *dd)
   1175{
   1176	struct send_context *sc;
   1177	int i;
   1178
   1179	for (i = 0; i < dd->num_send_contexts; i++) {
   1180		sc = dd->send_contexts[i].sc;
   1181		if (!sc || !(sc->flags & SCF_LINK_DOWN) || sc->type == SC_USER)
   1182			continue;
   1183
   1184		sc_enable(sc);	/* will clear the sc link down flag */
   1185	}
   1186}
   1187
   1188/*
   1189 * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
   1190 * Returns:
   1191 *	-ETIMEDOUT - if we wait too long
   1192 *	-EIO	   - if there was an error
   1193 */
   1194static int pio_init_wait_progress(struct hfi1_devdata *dd)
   1195{
   1196	u64 reg;
   1197	int max, count = 0;
   1198
   1199	/* max is the longest possible HW init time / delay */
   1200	max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
   1201	while (1) {
   1202		reg = read_csr(dd, SEND_PIO_INIT_CTXT);
   1203		if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
   1204			break;
   1205		if (count >= max)
   1206			return -ETIMEDOUT;
   1207		udelay(5);
   1208		count++;
   1209	}
   1210
   1211	return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
   1212}
   1213
   1214/*
   1215 * Reset all of the send contexts to their power-on state.  Used
   1216 * only during manual init - no lock against sc_enable needed.
   1217 */
   1218void pio_reset_all(struct hfi1_devdata *dd)
   1219{
   1220	int ret;
   1221
   1222	/* make sure the init engine is not busy */
   1223	ret = pio_init_wait_progress(dd);
   1224	/* ignore any timeout */
   1225	if (ret == -EIO) {
   1226		/* clear the error */
   1227		write_csr(dd, SEND_PIO_ERR_CLEAR,
   1228			  SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
   1229	}
   1230
   1231	/* reset init all */
   1232	write_csr(dd, SEND_PIO_INIT_CTXT,
   1233		  SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
   1234	udelay(2);
   1235	ret = pio_init_wait_progress(dd);
   1236	if (ret < 0) {
   1237		dd_dev_err(dd,
   1238			   "PIO send context init %s while initializing all PIO blocks\n",
   1239			   ret == -ETIMEDOUT ? "is stuck" : "had an error");
   1240	}
   1241}
   1242
   1243/* enable the context */
   1244int sc_enable(struct send_context *sc)
   1245{
   1246	u64 sc_ctrl, reg, pio;
   1247	struct hfi1_devdata *dd;
   1248	unsigned long flags;
   1249	int ret = 0;
   1250
   1251	if (!sc)
   1252		return -EINVAL;
   1253	dd = sc->dd;
   1254
   1255	/*
   1256	 * Obtain the allocator lock to guard against any allocation
   1257	 * attempts (which should not happen prior to context being
   1258	 * enabled). On the release/disable side we don't need to
   1259	 * worry about locking since the releaser will not do anything
   1260	 * if the context accounting values have not changed.
   1261	 */
   1262	spin_lock_irqsave(&sc->alloc_lock, flags);
   1263	sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
   1264	if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
   1265		goto unlock; /* already enabled */
   1266
   1267	/* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
   1268
   1269	*sc->hw_free = 0;
   1270	sc->free = 0;
   1271	sc->alloc_free = 0;
   1272	sc->fill = 0;
   1273	sc->fill_wrap = 0;
   1274	sc->sr_head = 0;
   1275	sc->sr_tail = 0;
   1276	sc->flags = 0;
    1277	/* the alloc lock ensures no fast path allocation */
   1278	reset_buffers_allocated(sc);
   1279
   1280	/*
   1281	 * Clear all per-context errors.  Some of these will be set when
   1282	 * we are re-enabling after a context halt.  Now that the context
   1283	 * is disabled, the halt will not clear until after the PIO init
   1284	 * engine runs below.
   1285	 */
   1286	reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
   1287	if (reg)
   1288		write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg);
   1289
   1290	/*
   1291	 * The HW PIO initialization engine can handle only one init
   1292	 * request at a time. Serialize access to each device's engine.
   1293	 */
   1294	spin_lock(&dd->sc_init_lock);
   1295	/*
   1296	 * Since access to this code block is serialized and
   1297	 * each access waits for the initialization to complete
   1298	 * before releasing the lock, the PIO initialization engine
   1299	 * should not be in use, so we don't have to wait for the
   1300	 * InProgress bit to go down.
   1301	 */
   1302	pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
   1303	       SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
   1304		SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
   1305	write_csr(dd, SEND_PIO_INIT_CTXT, pio);
   1306	/*
   1307	 * Wait until the engine is done.  Give the chip the required time
   1308	 * so, hopefully, we read the register just once.
   1309	 */
   1310	udelay(2);
   1311	ret = pio_init_wait_progress(dd);
   1312	spin_unlock(&dd->sc_init_lock);
   1313	if (ret) {
   1314		dd_dev_err(dd,
   1315			   "sctxt%u(%u): Context not enabled due to init failure %d\n",
   1316			   sc->sw_index, sc->hw_context, ret);
   1317		goto unlock;
   1318	}
   1319
   1320	/*
   1321	 * All is well. Enable the context.
   1322	 */
   1323	sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
   1324	write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
   1325	/*
   1326	 * Read SendCtxtCtrl to force the write out and prevent a timing
   1327	 * hazard where a PIO write may reach the context before the enable.
   1328	 */
   1329	read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
   1330	sc->flags |= SCF_ENABLED;
   1331
   1332unlock:
   1333	spin_unlock_irqrestore(&sc->alloc_lock, flags);
   1334
   1335	return ret;
   1336}
   1337
   1338/* force a credit return on the context */
   1339void sc_return_credits(struct send_context *sc)
   1340{
   1341	if (!sc)
   1342		return;
   1343
   1344	/* a 0->1 transition schedules a credit return */
   1345	write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
   1346			SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
   1347	/*
   1348	 * Ensure that the write is flushed and the credit return is
   1349	 * scheduled. We care more about the 0 -> 1 transition.
   1350	 */
   1351	read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
   1352	/* set back to 0 for next time */
   1353	write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
   1354}
   1355
   1356/* allow all in-flight packets to drain on the context */
   1357void sc_flush(struct send_context *sc)
   1358{
   1359	if (!sc)
   1360		return;
   1361
   1362	sc_wait_for_packet_egress(sc, 1);
   1363}
   1364
   1365/* drop all packets on the context, no waiting until they are sent */
   1366void sc_drop(struct send_context *sc)
   1367{
   1368	if (!sc)
   1369		return;
   1370
   1371	dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
   1372		    __func__, sc->sw_index, sc->hw_context);
   1373}
   1374
   1375/*
   1376 * Start the software reaction to a context halt or SPC freeze:
   1377 *	- mark the context as halted or frozen
   1378 *	- stop buffer allocations
   1379 *
   1380 * Called from the error interrupt.  Other work is deferred until
   1381 * out of the interrupt.
   1382 */
   1383void sc_stop(struct send_context *sc, int flag)
   1384{
   1385	unsigned long flags;
   1386
   1387	/* stop buffer allocations */
   1388	spin_lock_irqsave(&sc->alloc_lock, flags);
   1389	/* mark the context */
   1390	sc->flags |= flag;
   1391	sc->flags &= ~SCF_ENABLED;
   1392	spin_unlock_irqrestore(&sc->alloc_lock, flags);
   1393	wake_up(&sc->halt_wait);
   1394}
   1395
   1396#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32))
   1397#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
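/*
 * Example, assuming a 64-byte PIO block: BLOCK_DWORDS is 16, so
 * dwords_to_blocks(16) == 1 and dwords_to_blocks(18) == 2.
 */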
   1398
   1399/*
   1400 * The send context buffer "allocator".
   1401 *
   1402 * @sc: the PIO send context we are allocating from
    1403 * @dw_len: length of the whole packet, including the PBC, in dwords
   1404 * @cb: optional callback to call when the buffer is finished sending
   1405 * @arg: argument for cb
   1406 *
    1407 * Return a pointer to a PIO buffer on success, NULL if there is not enough
    1408 * room, or ERR_PTR(-ECOMM) when the link is down.
   1409 */
   1410struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
   1411				pio_release_cb cb, void *arg)
   1412{
   1413	struct pio_buf *pbuf = NULL;
   1414	unsigned long flags;
   1415	unsigned long avail;
   1416	unsigned long blocks = dwords_to_blocks(dw_len);
   1417	u32 fill_wrap;
   1418	int trycount = 0;
   1419	u32 head, next;
   1420
   1421	spin_lock_irqsave(&sc->alloc_lock, flags);
   1422	if (!(sc->flags & SCF_ENABLED)) {
   1423		spin_unlock_irqrestore(&sc->alloc_lock, flags);
   1424		return ERR_PTR(-ECOMM);
   1425	}
   1426
   1427retry:
   1428	avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
   1429	if (blocks > avail) {
   1430		/* not enough room */
   1431		if (unlikely(trycount))	{ /* already tried to get more room */
   1432			spin_unlock_irqrestore(&sc->alloc_lock, flags);
   1433			goto done;
   1434		}
   1435		/* copy from receiver cache line and recalculate */
   1436		sc->alloc_free = READ_ONCE(sc->free);
   1437		avail =
   1438			(unsigned long)sc->credits -
   1439			(sc->fill - sc->alloc_free);
   1440		if (blocks > avail) {
   1441			/* still no room, actively update */
   1442			sc_release_update(sc);
   1443			sc->alloc_free = READ_ONCE(sc->free);
   1444			trycount++;
   1445			goto retry;
   1446		}
   1447	}
   1448
   1449	/* there is enough room */
   1450
   1451	preempt_disable();
   1452	this_cpu_inc(*sc->buffers_allocated);
   1453
   1454	/* read this once */
   1455	head = sc->sr_head;
   1456
   1457	/* "allocate" the buffer */
   1458	sc->fill += blocks;
   1459	fill_wrap = sc->fill_wrap;
   1460	sc->fill_wrap += blocks;
   1461	if (sc->fill_wrap >= sc->credits)
   1462		sc->fill_wrap = sc->fill_wrap - sc->credits;
   1463
   1464	/*
   1465	 * Fill the parts that the releaser looks at before moving the head.
   1466	 * The only necessary piece is the sent_at field.  The credits
   1467	 * we have just allocated cannot have been returned yet, so the
   1468	 * cb and arg will not be looked at for a "while".  Put them
   1469	 * on this side of the memory barrier anyway.
   1470	 */
   1471	pbuf = &sc->sr[head].pbuf;
   1472	pbuf->sent_at = sc->fill;
   1473	pbuf->cb = cb;
   1474	pbuf->arg = arg;
   1475	pbuf->sc = sc;	/* could be filled in at sc->sr init time */
   1476	/* make sure this is in memory before updating the head */
   1477
   1478	/* calculate next head index, do not store */
   1479	next = head + 1;
   1480	if (next >= sc->sr_size)
   1481		next = 0;
   1482	/*
   1483	 * update the head - must be last! - the releaser can look at fields
   1484	 * in pbuf once we move the head
   1485	 */
   1486	smp_wmb();
   1487	sc->sr_head = next;
   1488	spin_unlock_irqrestore(&sc->alloc_lock, flags);
   1489
   1490	/* finish filling in the buffer outside the lock */
   1491	pbuf->start = sc->base_addr + fill_wrap * PIO_BLOCK_SIZE;
   1492	pbuf->end = sc->base_addr + sc->size;
   1493	pbuf->qw_written = 0;
   1494	pbuf->carry_bytes = 0;
   1495	pbuf->carry.val64 = 0;
   1496done:
   1497	return pbuf;
   1498}
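/*
 * Caller-side sketch (illustrative only): the return value can be a valid
 * pointer, NULL (no room), or an ERR_PTR (link down), so a caller might do
 *
 *	pbuf = sc_buffer_alloc(sc, plen, cb, arg);
 *	if (IS_ERR_OR_NULL(pbuf))
 *		return IS_ERR(pbuf) ? PTR_ERR(pbuf) : -EBUSY;
 *
 * where plen, cb, arg and the -EBUSY fallback are placeholders rather than
 * the driver's actual error handling.
 */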
   1499
   1500/*
   1501 * There are at least two entities that can turn on credit return
   1502 * interrupts and they can overlap.  Avoid problems by implementing
   1503 * a count scheme that is enforced by a lock.  The lock is needed because
   1504 * the count and CSR write must be paired.
   1505 */
   1506
   1507/*
   1508 * Start credit return interrupts.  This is managed by a count.  If already
   1509 * on, just increment the count.
   1510 */
   1511void sc_add_credit_return_intr(struct send_context *sc)
   1512{
   1513	unsigned long flags;
   1514
   1515	/* lock must surround both the count change and the CSR update */
   1516	spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
   1517	if (sc->credit_intr_count == 0) {
   1518		sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
   1519		write_kctxt_csr(sc->dd, sc->hw_context,
   1520				SC(CREDIT_CTRL), sc->credit_ctrl);
   1521	}
   1522	sc->credit_intr_count++;
   1523	spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
   1524}
   1525
   1526/*
   1527 * Stop credit return interrupts.  This is managed by a count.  Decrement the
   1528 * count, if the last user, then turn the credit interrupts off.
   1529 */
   1530void sc_del_credit_return_intr(struct send_context *sc)
   1531{
   1532	unsigned long flags;
   1533
   1534	WARN_ON(sc->credit_intr_count == 0);
   1535
   1536	/* lock must surround both the count change and the CSR update */
   1537	spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
   1538	sc->credit_intr_count--;
   1539	if (sc->credit_intr_count == 0) {
   1540		sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
   1541		write_kctxt_csr(sc->dd, sc->hw_context,
   1542				SC(CREDIT_CTRL), sc->credit_ctrl);
   1543	}
   1544	spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
   1545}
   1546
   1547/*
    1548 * The caller must be careful when calling this.  Every call with needint
    1549 * set must eventually be paired with a call with needint clear.
   1550 */
   1551void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
   1552{
   1553	if (needint)
   1554		sc_add_credit_return_intr(sc);
   1555	else
   1556		sc_del_credit_return_intr(sc);
   1557	trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
   1558	if (needint)
   1559		sc_return_credits(sc);
   1560}
   1561
   1562/**
   1563 * sc_piobufavail - callback when a PIO buffer is available
   1564 * @sc: the send context
   1565 *
   1566 * This is called from the interrupt handler when a PIO buffer is
   1567 * available after hfi1_verbs_send() returned an error that no buffers were
   1568 * available. Disable the interrupt if there are no more QPs waiting.
   1569 */
   1570static void sc_piobufavail(struct send_context *sc)
   1571{
   1572	struct hfi1_devdata *dd = sc->dd;
   1573	struct list_head *list;
   1574	struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
   1575	struct rvt_qp *qp;
   1576	struct hfi1_qp_priv *priv;
   1577	unsigned long flags;
   1578	uint i, n = 0, top_idx = 0;
   1579
   1580	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
   1581	    dd->send_contexts[sc->sw_index].type != SC_VL15)
   1582		return;
   1583	list = &sc->piowait;
   1584	/*
   1585	 * Note: checking that the piowait list is empty and clearing
   1586	 * the buffer available interrupt needs to be atomic or we
   1587	 * could end up with QPs on the wait list with the interrupt
   1588	 * disabled.
   1589	 */
   1590	write_seqlock_irqsave(&sc->waitlock, flags);
   1591	while (!list_empty(list)) {
   1592		struct iowait *wait;
   1593
   1594		if (n == ARRAY_SIZE(qps))
   1595			break;
   1596		wait = list_first_entry(list, struct iowait, list);
   1597		iowait_get_priority(wait);
   1598		qp = iowait_to_qp(wait);
   1599		priv = qp->priv;
   1600		list_del_init(&priv->s_iowait.list);
   1601		priv->s_iowait.lock = NULL;
   1602		if (n) {
   1603			priv = qps[top_idx]->priv;
   1604			top_idx = iowait_priority_update_top(wait,
   1605							     &priv->s_iowait,
   1606							     n, top_idx);
   1607		}
   1608
   1609		/* refcount held until actual wake up */
   1610		qps[n++] = qp;
   1611	}
   1612	/*
   1613	 * If there had been waiters and there are more
   1614	 * insure that we redo the force to avoid a potential hang.
   1615	 */
   1616	if (n) {
   1617		hfi1_sc_wantpiobuf_intr(sc, 0);
   1618		if (!list_empty(list))
   1619			hfi1_sc_wantpiobuf_intr(sc, 1);
   1620	}
   1621	write_sequnlock_irqrestore(&sc->waitlock, flags);
   1622
   1623	/* Wake up the top-priority one first */
   1624	if (n)
   1625		hfi1_qp_wakeup(qps[top_idx],
   1626			       RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
   1627	for (i = 0; i < n; i++)
   1628		if (i != top_idx)
   1629			hfi1_qp_wakeup(qps[i],
   1630				       RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
   1631}
   1632
   1633/* translate a send credit update to a bit code of reasons */
   1634static inline int fill_code(u64 hw_free)
   1635{
   1636	int code = 0;
   1637
   1638	if (hw_free & CR_STATUS_SMASK)
   1639		code |= PRC_STATUS_ERR;
   1640	if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
   1641		code |= PRC_PBC;
   1642	if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
   1643		code |= PRC_THRESHOLD;
   1644	if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
   1645		code |= PRC_FILL_ERR;
   1646	if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
   1647		code |= PRC_SC_DISABLE;
   1648	return code;
   1649}
   1650
   1651/* use the jiffies compare to get the wrap right */
   1652#define sent_before(a, b) time_before(a, b)	/* a < b */
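/*
 * Like the jiffies helpers, this comparison is done on the signed difference,
 * so it stays correct across counter wrap: e.g. sent_before(ULONG_MAX - 2, 3)
 * is true even though the raw unsigned values compare the other way.
 */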
   1653
   1654/*
   1655 * The send context buffer "releaser".
   1656 */
   1657void sc_release_update(struct send_context *sc)
   1658{
   1659	struct pio_buf *pbuf;
   1660	u64 hw_free;
   1661	u32 head, tail;
   1662	unsigned long old_free;
   1663	unsigned long free;
   1664	unsigned long extra;
   1665	unsigned long flags;
   1666	int code;
   1667
   1668	if (!sc)
   1669		return;
   1670
   1671	spin_lock_irqsave(&sc->release_lock, flags);
   1672	/* update free */
   1673	hw_free = le64_to_cpu(*sc->hw_free);		/* volatile read */
   1674	old_free = sc->free;
   1675	extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
   1676			- (old_free & CR_COUNTER_MASK))
   1677				& CR_COUNTER_MASK;
   1678	free = old_free + extra;
   1679	trace_hfi1_piofree(sc, extra);
   1680
   1681	/* call sent buffer callbacks */
   1682	code = -1;				/* code not yet set */
   1683	head = READ_ONCE(sc->sr_head);	/* snapshot the head */
   1684	tail = sc->sr_tail;
   1685	while (head != tail) {
   1686		pbuf = &sc->sr[tail].pbuf;
   1687
   1688		if (sent_before(free, pbuf->sent_at)) {
   1689			/* not sent yet */
   1690			break;
   1691		}
   1692		if (pbuf->cb) {
   1693			if (code < 0) /* fill in code on first user */
   1694				code = fill_code(hw_free);
   1695			(*pbuf->cb)(pbuf->arg, code);
   1696		}
   1697
   1698		tail++;
   1699		if (tail >= sc->sr_size)
   1700			tail = 0;
   1701	}
   1702	sc->sr_tail = tail;
   1703	/* make sure tail is updated before free */
   1704	smp_wmb();
   1705	sc->free = free;
   1706	spin_unlock_irqrestore(&sc->release_lock, flags);
   1707	sc_piobufavail(sc);
   1708}
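
       /*
        * Worked example of the "extra" computation in sc_release_update()
        * (illustrative; assume a 16-bit hardware counter field, i.e.
        * CR_COUNTER_MASK == 0xffff):
        *
        *   old_free (software, full width) = 0x0001fff0   (low bits 0xfff0)
        *   hardware counter                = 0x0005
        *
        *   extra = (0x0005 - 0xfff0) & 0xffff = 0x0015 = 21
        *   free  = old_free + extra = 0x00020005
        *
        * The masked subtraction yields the correct delta even across a
        * hardware counter wrap, and adding it to the wider software counter
        * keeps sc->free monotonically increasing for sent_before().
        */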
   1709
   1710/*
   1711 * Send context group releaser.  The argument is the hardware context that
   1712 * caused the interrupt.  Called from the send context interrupt handler.
   1713 *
   1714 * Call release on all contexts in the group.
   1715 *
   1716 * This routine takes the sc_lock without an irqsave because it is only
   1717 * called from an interrupt handler.  Adjust if that changes.
   1718 */
   1719void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
   1720{
   1721	struct send_context *sc;
   1722	u32 sw_index;
   1723	u32 gc, gc_end;
   1724
   1725	spin_lock(&dd->sc_lock);
   1726	sw_index = dd->hw_to_sw[hw_context];
   1727	if (unlikely(sw_index >= dd->num_send_contexts)) {
   1728		dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
   1729			   __func__, hw_context, sw_index);
   1730		goto done;
   1731	}
   1732	sc = dd->send_contexts[sw_index].sc;
   1733	if (unlikely(!sc))
   1734		goto done;
   1735
   1736	gc = group_context(hw_context, sc->group);
   1737	gc_end = gc + group_size(sc->group);
   1738	for (; gc < gc_end; gc++) {
   1739		sw_index = dd->hw_to_sw[gc];
   1740		if (unlikely(sw_index >= dd->num_send_contexts)) {
   1741			dd_dev_err(dd,
   1742				   "%s: invalid hw (%u) to sw (%u) mapping\n",
   1743				   __func__, hw_context, sw_index);
   1744			continue;
   1745		}
   1746		sc_release_update(dd->send_contexts[sw_index].sc);
   1747	}
   1748done:
   1749	spin_unlock(&dd->sc_lock);
   1750}
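
       /*
        * Worked example of the group fan-out above (assuming, as in pio.h,
        * that group_context() rounds a hardware context down to its group
        * boundary and group_size() is 1 << group):
        *
        *   sc->group = 2, interrupting hw_context = 6
        *   gc     = group_context(6, 2) = 4
        *   gc_end = gc + group_size(2) = 8
        *
        * so sc_release_update() runs on hardware contexts 4, 5, 6 and 7,
        * i.e. on every member of the 4-context group that shares the
        * credit-return update.
        */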
   1751
   1752/*
   1753 * pio_select_send_context_vl() - select send context
   1754 * @dd: devdata
   1755 * @selector: a spreading factor
   1756 * @vl: this vl
   1757 *
   1758 * This function returns a send context based on the selector and a vl.
   1759 * The mapping fields are protected by RCU.
   1760 */
   1761struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
   1762						u32 selector, u8 vl)
   1763{
   1764	struct pio_vl_map *m;
   1765	struct pio_map_elem *e;
   1766	struct send_context *rval;
   1767
   1768	/*
   1769	 * NOTE: This should only happen if SC->VL changed after the initial
   1770	 * checks on the QP/AH.
   1771	 * The default path below returns VL0's send context.
   1772	 */
   1773	if (unlikely(vl >= num_vls)) {
   1774		rval = NULL;
   1775		goto done;
   1776	}
   1777
   1778	rcu_read_lock();
   1779	m = rcu_dereference(dd->pio_map);
   1780	if (unlikely(!m)) {
   1781		rcu_read_unlock();
   1782		return dd->vld[0].sc;
   1783	}
   1784	e = m->map[vl & m->mask];
   1785	rval = e->ksc[selector & e->mask];
   1786	rcu_read_unlock();
   1787
   1788done:
   1789	rval = !rval ? dd->vld[0].sc : rval;
   1790	return rval;
   1791}
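
       /*
        * Worked example of the two-level masked lookup above: with 8 VLs
        * (m->mask = 0x7) and, say, 4 kernel send contexts assigned to VL2
        * (e->mask = 0x3), a selector of 13 for a packet on VL2 resolves to
        *
        *   e    = m->map[2 & 0x7]  = m->map[2]
        *   rval = e->ksc[13 & 0x3] = e->ksc[1]
        *
        * Both table sizes are rounded up to powers of two by pio_map_init(),
        * so the masks act as cheap modulo operations that spread senders
        * across a VL's send contexts.
        */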
   1792
   1793/*
   1794 * pio_select_send_context_sc() - select send context
   1795 * @dd: devdata
   1796 * @selector: a spreading factor
   1797 * @sc5: the 5 bit sc
   1798 *
   1799 * This function returns a send context based on the selector and an sc5.
   1800 */
   1801struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
   1802						u32 selector, u8 sc5)
   1803{
   1804	u8 vl = sc_to_vlt(dd, sc5);
   1805
   1806	return pio_select_send_context_vl(dd, selector, vl);
   1807}
   1808
   1809/*
   1810 * Free the indicated map struct
   1811 */
   1812static void pio_map_free(struct pio_vl_map *m)
   1813{
   1814	int i;
   1815
   1816	for (i = 0; m && i < m->actual_vls; i++)
   1817		kfree(m->map[i]);
   1818	kfree(m);
   1819}
   1820
   1821/*
   1822 * Handle RCU callback
   1823 */
   1824static void pio_map_rcu_callback(struct rcu_head *list)
   1825{
   1826	struct pio_vl_map *m = container_of(list, struct pio_vl_map, list);
   1827
   1828	pio_map_free(m);
   1829}
   1830
   1831/*
   1832 * Set credit return threshold for the kernel send context
   1833 */
   1834static void set_threshold(struct hfi1_devdata *dd, int scontext, int i)
   1835{
   1836	u32 thres;
   1837
   1838	thres = min(sc_percent_to_threshold(dd->kernel_send_context[scontext],
   1839					    50),
   1840		    sc_mtu_to_threshold(dd->kernel_send_context[scontext],
   1841					dd->vld[i].mtu,
   1842					dd->rcd[0]->rcvhdrqentsize));
   1843	sc_set_cr_threshold(dd->kernel_send_context[scontext], thres);
   1844}
   1845
   1846/*
   1847 * pio_map_init - called when #vls change
   1848 * @dd: hfi1_devdata
   1849 * @port: port number
   1850 * @num_vls: number of vls
   1851 * @vl_scontexts: per vl send context mapping (optional)
   1852 *
   1853 * This routine changes the mapping based on the number of vls.
   1854 *
   1855 * vl_scontexts is used to specify a non-uniform vl/send context
   1856 * loading. NULL implies auto computing the loading and giving each
   1857 * VL a uniform distribution of send contexts per VL.
   1858 *
   1859 * The auto algorithm computes the sc_per_vl and the number of extra
   1860 * send contexts. Any extra send contexts are added from the last VL
   1861 * on down (see the worked example after this routine).
   1862 *
   1863 * rcu locking is used here to control access to the mapping fields.
   1864 *
   1865 * If either num_vls or num_send_contexts is not a power of 2, the
   1866 * array sizes in struct pio_vl_map and struct pio_map_elem are
   1867 * rounded up to the next highest power of 2 and the first entry is
   1868 * reused in a round-robin fashion.
   1869 *
   1870 * If an error occurs, the map change is not done and the mapping is not
   1871 * changed.
   1872 *
   1873 */
   1874int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts)
   1875{
   1876	int i, j;
   1877	int extra, sc_per_vl;
   1878	int scontext = 1;
   1879	int num_kernel_send_contexts = 0;
   1880	u8 lvl_scontexts[OPA_MAX_VLS];
   1881	struct pio_vl_map *oldmap, *newmap;
   1882
   1883	if (!vl_scontexts) {
   1884		for (i = 0; i < dd->num_send_contexts; i++)
   1885			if (dd->send_contexts[i].type == SC_KERNEL)
   1886				num_kernel_send_contexts++;
   1887		/* truncate divide */
   1888		sc_per_vl = num_kernel_send_contexts / num_vls;
   1889		/* extras */
   1890		extra = num_kernel_send_contexts % num_vls;
   1891		vl_scontexts = lvl_scontexts;
   1892		/* add extras from last vl down */
   1893		for (i = num_vls - 1; i >= 0; i--, extra--)
   1894			vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0);
   1895	}
   1896	/* build new map */
   1897	newmap = kzalloc(sizeof(*newmap) +
   1898			 roundup_pow_of_two(num_vls) *
   1899			 sizeof(struct pio_map_elem *),
   1900			 GFP_KERNEL);
   1901	if (!newmap)
   1902		goto bail;
   1903	newmap->actual_vls = num_vls;
   1904	newmap->vls = roundup_pow_of_two(num_vls);
   1905	newmap->mask = (1 << ilog2(newmap->vls)) - 1;
   1906	for (i = 0; i < newmap->vls; i++) {
   1907		/* save for wrap around */
   1908		int first_scontext = scontext;
   1909
   1910		if (i < newmap->actual_vls) {
   1911			int sz = roundup_pow_of_two(vl_scontexts[i]);
   1912
   1913			/* only allocate once */
   1914			newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) +
   1915						 sz * sizeof(struct
   1916							     send_context *),
   1917						 GFP_KERNEL);
   1918			if (!newmap->map[i])
   1919				goto bail;
   1920			newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
   1921			/*
   1922			 * assign send contexts and
   1923			 * adjust credit return threshold
   1924			 */
   1925			for (j = 0; j < sz; j++) {
   1926				if (dd->kernel_send_context[scontext]) {
   1927					newmap->map[i]->ksc[j] =
   1928					dd->kernel_send_context[scontext];
   1929					set_threshold(dd, scontext, i);
   1930				}
   1931				if (++scontext >= first_scontext +
   1932						  vl_scontexts[i])
   1933					/* wrap back to first send context */
   1934					scontext = first_scontext;
   1935			}
   1936		} else {
   1937			/* just re-use entry without allocating */
   1938			newmap->map[i] = newmap->map[i % num_vls];
   1939		}
   1940		scontext = first_scontext + vl_scontexts[i];
   1941	}
   1942	/* newmap in hand, save old map */
   1943	spin_lock_irq(&dd->pio_map_lock);
   1944	oldmap = rcu_dereference_protected(dd->pio_map,
   1945					   lockdep_is_held(&dd->pio_map_lock));
   1946
   1947	/* publish newmap */
   1948	rcu_assign_pointer(dd->pio_map, newmap);
   1949
   1950	spin_unlock_irq(&dd->pio_map_lock);
   1951	/* success, free any old map after grace period */
   1952	if (oldmap)
   1953		call_rcu(&oldmap->list, pio_map_rcu_callback);
   1954	return 0;
   1955bail:
   1956	/* free any partial allocation */
   1957	pio_map_free(newmap);
   1958	return -ENOMEM;
   1959}
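
       /*
        * Worked example of the auto distribution in pio_map_init()
        * (illustrative numbers: 18 kernel send contexts, num_vls = 8):
        *
        *   sc_per_vl = 18 / 8 = 2, extra = 18 % 8 = 2
        *   vl_scontexts[] = { 2, 2, 2, 2, 2, 2, 3, 3 }   (extras from VL7 down)
        *
        * A VL with 3 contexts gets a map rounded up to 4 slots (mask 0x3);
        * the wrap back to first_scontext fills the 4th slot with the first
        * of that VL's contexts, so selection remains a simple
        * "selector & mask" over a power-of-2 sized table.
        */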
   1960
   1961void free_pio_map(struct hfi1_devdata *dd)
   1962{
   1963	/* Free PIO map if allocated */
   1964	if (rcu_access_pointer(dd->pio_map)) {
   1965		spin_lock_irq(&dd->pio_map_lock);
   1966		pio_map_free(rcu_access_pointer(dd->pio_map));
   1967		RCU_INIT_POINTER(dd->pio_map, NULL);
   1968		spin_unlock_irq(&dd->pio_map_lock);
   1969		synchronize_rcu();
   1970	}
   1971	kfree(dd->kernel_send_context);
   1972	dd->kernel_send_context = NULL;
   1973}
   1974
   1975int init_pervl_scs(struct hfi1_devdata *dd)
   1976{
   1977	int i;
   1978	u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */
   1979	u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */
   1980	u32 ctxt;
   1981	struct hfi1_pportdata *ppd = dd->pport;
   1982
   1983	dd->vld[15].sc = sc_alloc(dd, SC_VL15,
   1984				  dd->rcd[0]->rcvhdrqentsize, dd->node);
   1985	if (!dd->vld[15].sc)
   1986		return -ENOMEM;
   1987
   1988	hfi1_init_ctxt(dd->vld[15].sc);
   1989	dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
   1990
   1991	dd->kernel_send_context = kcalloc_node(dd->num_send_contexts,
   1992					       sizeof(struct send_context *),
   1993					       GFP_KERNEL, dd->node);
   1994	if (!dd->kernel_send_context)
   1995		goto freesc15;
   1996
   1997	dd->kernel_send_context[0] = dd->vld[15].sc;
   1998
   1999	for (i = 0; i < num_vls; i++) {
   2000		/*
   2001		 * Since this function does not deal with a specific
   2002		 * receive context but we need the RcvHdrQ entry size,
   2003		 * use the size from rcd[0]. It is guaranteed to be
   2004		 * valid at this point and will remain the same for all
   2005		 * receive contexts.
   2006		 */
   2007		dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
   2008					 dd->rcd[0]->rcvhdrqentsize, dd->node);
   2009		if (!dd->vld[i].sc)
   2010			goto nomem;
   2011		dd->kernel_send_context[i + 1] = dd->vld[i].sc;
   2012		hfi1_init_ctxt(dd->vld[i].sc);
   2013		/* non-VL15 contexts start with the max MTU */
   2014		dd->vld[i].mtu = hfi1_max_mtu;
   2015	}
   2016	for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
   2017		dd->kernel_send_context[i + 1] =
   2018		sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node);
   2019		if (!dd->kernel_send_context[i + 1])
   2020			goto nomem;
   2021		hfi1_init_ctxt(dd->kernel_send_context[i + 1]);
   2022	}
   2023
   2024	sc_enable(dd->vld[15].sc);
   2025	ctxt = dd->vld[15].sc->hw_context;
   2026	mask = all_vl_mask & ~(1LL << 15);
   2027	write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
   2028	dd_dev_info(dd,
   2029		    "Using send context %u(%u) for VL15\n",
   2030		    dd->vld[15].sc->sw_index, ctxt);
   2031
   2032	for (i = 0; i < num_vls; i++) {
   2033		sc_enable(dd->vld[i].sc);
   2034		ctxt = dd->vld[i].sc->hw_context;
   2035		mask = all_vl_mask & ~(data_vls_mask);
   2036		write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
   2037	}
   2038	for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
   2039		sc_enable(dd->kernel_send_context[i + 1]);
   2040		ctxt = dd->kernel_send_context[i + 1]->hw_context;
   2041		mask = all_vl_mask & ~(data_vls_mask);
   2042		write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
   2043	}
   2044
   2045	if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
   2046		goto nomem;
   2047	return 0;
   2048
   2049nomem:
   2050	for (i = 0; i < num_vls; i++) {
   2051		sc_free(dd->vld[i].sc);
   2052		dd->vld[i].sc = NULL;
   2053	}
   2054
   2055	for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
   2056		sc_free(dd->kernel_send_context[i + 1]);
   2057
   2058	kfree(dd->kernel_send_context);
   2059	dd->kernel_send_context = NULL;
   2060
   2061freesc15:
   2062	sc_free(dd->vld[15].sc);
   2063	return -ENOMEM;
   2064}
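
       /*
        * Worked example of the SC(CHECK_VL) masks programmed above:
        *
        *   all_vl_mask   = 0x80ff                        (VLs 0-7 and VL 15)
        *   VL15 context:   0x80ff & ~(1LL << 15) = 0x00ff   (the data VLs)
        *   data contexts:  0x80ff & ~0x00ff      = 0x8000   (VL 15 only)
        *
        * i.e. (assuming the check register names the VLs a context must not
        * carry) the VL15 context is fenced off from the data VLs and every
        * kernel data context is fenced off from VL15.
        */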
   2065
   2066int init_credit_return(struct hfi1_devdata *dd)
   2067{
   2068	int ret;
   2069	int i;
   2070
   2071	dd->cr_base = kcalloc(
   2072		node_affinity.num_possible_nodes,
   2073		sizeof(struct credit_return_base),
   2074		GFP_KERNEL);
   2075	if (!dd->cr_base) {
   2076		ret = -ENOMEM;
   2077		goto done;
   2078	}
   2079	for_each_node_with_cpus(i) {
   2080		int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
   2081
   2082		set_dev_node(&dd->pcidev->dev, i);
   2083		dd->cr_base[i].va = dma_alloc_coherent(&dd->pcidev->dev,
   2084						       bytes,
   2085						       &dd->cr_base[i].dma,
   2086						       GFP_KERNEL);
   2087		if (!dd->cr_base[i].va) {
   2088			set_dev_node(&dd->pcidev->dev, dd->node);
   2089			dd_dev_err(dd,
   2090				   "Unable to allocate credit return DMA range for NUMA %d\n",
   2091				   i);
   2092			ret = -ENOMEM;
   2093			goto done;
   2094		}
   2095	}
   2096	set_dev_node(&dd->pcidev->dev, dd->node);
   2097
   2098	ret = 0;
   2099done:
   2100	return ret;
   2101}
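
       /*
        * Minimal sketch (helper name is illustrative, not driver code) of the
        * per-NUMA-node allocation pattern used in init_credit_return():
        * temporarily point the device at the target node so
        * dma_alloc_coherent() allocates the credit-return range there, then
        * restore the device's home node.
        */
       static void *demo_alloc_on_node(struct device *dev, int node, int home_node,
       				size_t bytes, dma_addr_t *dma)
       {
       	void *va;

       	set_dev_node(dev, node);
       	va = dma_alloc_coherent(dev, bytes, dma, GFP_KERNEL);
       	set_dev_node(dev, home_node);	/* restore, even on failure */
       	return va;
       }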
   2102
   2103void free_credit_return(struct hfi1_devdata *dd)
   2104{
   2105	int i;
   2106
   2107	if (!dd->cr_base)
   2108		return;
   2109	for (i = 0; i < node_affinity.num_possible_nodes; i++) {
   2110		if (dd->cr_base[i].va) {
   2111			dma_free_coherent(&dd->pcidev->dev,
   2112					  TXE_NUM_CONTEXTS *
   2113					  sizeof(struct credit_return),
   2114					  dd->cr_base[i].va,
   2115					  dd->cr_base[i].dma);
   2116		}
   2117	}
   2118	kfree(dd->cr_base);
   2119	dd->cr_base = NULL;
   2120}
   2121
   2122void seqfile_dump_sci(struct seq_file *s, u32 i,
   2123		      struct send_context_info *sci)
   2124{
   2125	struct send_context *sc = sci->sc;
   2126	u64 reg;
   2127
   2128	seq_printf(s, "SCI %u: type %u base %u credits %u\n",
   2129		   i, sci->type, sci->base, sci->credits);
   2130	seq_printf(s, "  flags 0x%x sw_inx %u hw_ctxt %u grp %u\n",
   2131		   sc->flags,  sc->sw_index, sc->hw_context, sc->group);
   2132	seq_printf(s, "  sr_size %u credits %u sr_head %u sr_tail %u\n",
   2133		   sc->sr_size, sc->credits, sc->sr_head, sc->sr_tail);
   2134	seq_printf(s, "  fill %lu free %lu fill_wrap %u alloc_free %lu\n",
   2135		   sc->fill, sc->free, sc->fill_wrap, sc->alloc_free);
   2136	seq_printf(s, "  credit_intr_count %u credit_ctrl 0x%llx\n",
   2137		   sc->credit_intr_count, sc->credit_ctrl);
   2138	reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_STATUS));
   2139	seq_printf(s, "  *hw_free %llu CurrentFree %llu LastReturned %llu\n",
   2140		   (le64_to_cpu(*sc->hw_free) & CR_COUNTER_SMASK) >>
   2141		    CR_COUNTER_SHIFT,
   2142		   (reg >> SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT)) &
   2143		    SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK),
   2144		   reg & SC(CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK));
   2145}