cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

init.c (52970B)


      1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
      2/*
      3 * Copyright(c) 2015 - 2020 Intel Corporation.
      4 * Copyright(c) 2021 Cornelis Networks.
      5 */
      6
      7#include <linux/pci.h>
      8#include <linux/netdevice.h>
      9#include <linux/vmalloc.h>
     10#include <linux/delay.h>
     11#include <linux/xarray.h>
     12#include <linux/module.h>
     13#include <linux/printk.h>
     14#include <linux/hrtimer.h>
     15#include <linux/bitmap.h>
     16#include <linux/numa.h>
     17#include <rdma/rdma_vt.h>
     18
     19#include "hfi.h"
     20#include "device.h"
     21#include "common.h"
     22#include "trace.h"
     23#include "mad.h"
     24#include "sdma.h"
     25#include "debugfs.h"
     26#include "verbs.h"
     27#include "aspm.h"
     28#include "affinity.h"
     29#include "vnic.h"
     30#include "exp_rcv.h"
     31#include "netdev.h"
     32
     33#undef pr_fmt
     34#define pr_fmt(fmt) DRIVER_NAME ": " fmt
     35
     36/*
      37 * Minimum number of buffers we want per user context, after the driver takes its share
     38 */
     39#define HFI1_MIN_USER_CTXT_BUFCNT 7
     40
     41#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
     42#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
     43
     44#define NUM_IB_PORTS 1
     45
     46/*
     47 * Number of user receive contexts we are configured to use (to allow for more
     48 * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
     49 */
     50int num_user_contexts = -1;
     51module_param_named(num_user_contexts, num_user_contexts, int, 0444);
     52MODULE_PARM_DESC(
     53	num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)");
     54
     55uint krcvqs[RXE_NUM_DATA_VL];
     56int krcvqsset;
     57module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
     58MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
     59
     60/* computed based on above array */
     61unsigned long n_krcvqs;
     62
     63static unsigned hfi1_rcvarr_split = 25;
     64module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
     65MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
     66
     67static uint eager_buffer_size = (8 << 20); /* 8MB */
     68module_param(eager_buffer_size, uint, S_IRUGO);
     69MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB");
     70
     71static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
     72module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
     73MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
     74
     75static uint hfi1_hdrq_entsize = 32;
     76module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444);
     77MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
     78
     79unsigned int user_credit_return_threshold = 33;	/* default is 33% */
     80module_param(user_credit_return_threshold, uint, S_IRUGO);
      81MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits pass this many blocks (in percent of allocated blocks, 0 is off)");
     82
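/*
 * Example (illustrative only, values are arbitrary): the parameters above
 * are set at module load time, e.g.
 *
 *   modprobe hfi1 num_user_contexts=16 krcvqs=2,2 rcvarr_split=25 \
 *           eager_buffer_size=8388608 rcvhdrcnt=2048
 *
 * Read-only copies of the 0444/S_IRUGO parameters can then be inspected
 * under /sys/module/hfi1/parameters/.
 */
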
     83DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
     84
     85static int hfi1_create_kctxt(struct hfi1_devdata *dd,
     86			     struct hfi1_pportdata *ppd)
     87{
     88	struct hfi1_ctxtdata *rcd;
     89	int ret;
     90
     91	/* Control context has to be always 0 */
     92	BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
     93
     94	ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd);
     95	if (ret < 0) {
     96		dd_dev_err(dd, "Kernel receive context allocation failed\n");
     97		return ret;
     98	}
     99
    100	/*
    101	 * Set up the kernel context flags here and now because they use
    102	 * default values for all receive side memories.  User contexts will
    103	 * be handled as they are created.
    104	 */
    105	rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
    106		HFI1_CAP_KGET(NODROP_RHQ_FULL) |
    107		HFI1_CAP_KGET(NODROP_EGR_FULL) |
    108		HFI1_CAP_KGET(DMA_RTAIL);
    109
    110	/* Control context must use DMA_RTAIL */
    111	if (rcd->ctxt == HFI1_CTRL_CTXT)
    112		rcd->flags |= HFI1_CAP_DMA_RTAIL;
    113	rcd->fast_handler = get_dma_rtail_setting(rcd) ?
    114				handle_receive_interrupt_dma_rtail :
    115				handle_receive_interrupt_nodma_rtail;
    116
    117	hfi1_set_seq_cnt(rcd, 1);
    118
    119	rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
    120	if (!rcd->sc) {
    121		dd_dev_err(dd, "Kernel send context allocation failed\n");
    122		return -ENOMEM;
    123	}
    124	hfi1_init_ctxt(rcd->sc);
    125
    126	return 0;
    127}
    128
    129/*
    130 * Create the receive context array and one or more kernel contexts
    131 */
    132int hfi1_create_kctxts(struct hfi1_devdata *dd)
    133{
    134	u16 i;
    135	int ret;
    136
    137	dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd),
    138			       GFP_KERNEL, dd->node);
    139	if (!dd->rcd)
    140		return -ENOMEM;
    141
    142	for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
    143		ret = hfi1_create_kctxt(dd, dd->pport);
    144		if (ret)
    145			goto bail;
    146	}
    147
    148	return 0;
    149bail:
    150	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i)
    151		hfi1_free_ctxt(dd->rcd[i]);
    152
    153	/* All the contexts should be freed, free the array */
    154	kfree(dd->rcd);
    155	dd->rcd = NULL;
    156	return ret;
    157}
    158
    159/*
    160 * Helper routines for the receive context reference count (rcd and uctxt).
    161 */
    162static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd)
    163{
    164	kref_init(&rcd->kref);
    165}
    166
    167/**
     168 * hfi1_rcd_free - Clean up when the reference count reaches zero.
    169 * @kref: pointer to an initialized rcd data structure
    170 *
    171 */
    172static void hfi1_rcd_free(struct kref *kref)
    173{
    174	unsigned long flags;
    175	struct hfi1_ctxtdata *rcd =
    176		container_of(kref, struct hfi1_ctxtdata, kref);
    177
    178	spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
    179	rcd->dd->rcd[rcd->ctxt] = NULL;
    180	spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);
    181
    182	hfi1_free_ctxtdata(rcd->dd, rcd);
    183
    184	kfree(rcd);
    185}
    186
    187/**
    188 * hfi1_rcd_put - decrement reference for rcd
    189 * @rcd: pointer to an initialized rcd data structure
    190 *
    191 * Use this to put a reference after the init.
    192 */
    193int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
    194{
    195	if (rcd)
    196		return kref_put(&rcd->kref, hfi1_rcd_free);
    197
    198	return 0;
    199}
    200
    201/**
    202 * hfi1_rcd_get - increment reference for rcd
    203 * @rcd: pointer to an initialized rcd data structure
    204 *
    205 * Use this to get a reference after the init.
    206 *
     207 * Return: reflects kref_get_unless_zero(), which returns non-zero when
     208 * the reference was incremented, otherwise 0.
    209 */
    210int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
    211{
    212	return kref_get_unless_zero(&rcd->kref);
    213}
    214
    215/**
    216 * allocate_rcd_index - allocate an rcd index from the rcd array
    217 * @dd: pointer to a valid devdata structure
    218 * @rcd: rcd data structure to assign
    219 * @index: pointer to index that is allocated
    220 *
    221 * Find an empty index in the rcd array, and assign the given rcd to it.
     222 * If the array is full, return -EBUSY.
    223 *
    224 */
    225static int allocate_rcd_index(struct hfi1_devdata *dd,
    226			      struct hfi1_ctxtdata *rcd, u16 *index)
    227{
    228	unsigned long flags;
    229	u16 ctxt;
    230
    231	spin_lock_irqsave(&dd->uctxt_lock, flags);
    232	for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
    233		if (!dd->rcd[ctxt])
    234			break;
    235
    236	if (ctxt < dd->num_rcv_contexts) {
    237		rcd->ctxt = ctxt;
    238		dd->rcd[ctxt] = rcd;
    239		hfi1_rcd_init(rcd);
    240	}
    241	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
    242
    243	if (ctxt >= dd->num_rcv_contexts)
    244		return -EBUSY;
    245
    246	*index = ctxt;
    247
    248	return 0;
    249}
    250
    251/**
    252 * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
    253 * array
    254 * @dd: pointer to a valid devdata structure
     255 * @ctxt: the index of a possible rcd
    256 *
    257 * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
    258 * ctxt index is valid.
    259 *
    260 * The caller is responsible for making the _put().
    261 *
    262 */
    263struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
    264						 u16 ctxt)
    265{
    266	if (ctxt < dd->num_rcv_contexts)
    267		return hfi1_rcd_get_by_index(dd, ctxt);
    268
    269	return NULL;
    270}
    271
    272/**
    273 * hfi1_rcd_get_by_index - get by index
    274 * @dd: pointer to a valid devdata structure
     275 * @ctxt: the index of a possible rcd
    276 *
    277 * We need to protect access to the rcd array.  If access is needed to
     278 * one or more indices, get the protecting spinlock and then increment the
    279 * kref.
    280 *
    281 * The caller is responsible for making the _put().
    282 *
    283 */
    284struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
    285{
    286	unsigned long flags;
    287	struct hfi1_ctxtdata *rcd = NULL;
    288
    289	spin_lock_irqsave(&dd->uctxt_lock, flags);
    290	if (dd->rcd[ctxt]) {
    291		rcd = dd->rcd[ctxt];
    292		if (!hfi1_rcd_get(rcd))
    293			rcd = NULL;
    294	}
    295	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
    296
    297	return rcd;
    298}
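
/*
 * Usage sketch for the reference helpers above (illustrative only, not a
 * driver entry point): look a context up, use it while the kref is held,
 * then drop the reference with the matching put.
 *
 *	rcd = hfi1_rcd_get_by_index_safe(dd, ctxt);
 *	if (rcd) {
 *		... use rcd while the reference is held ...
 *		hfi1_rcd_put(rcd);
 *	}
 */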
    299
    300/*
    301 * Common code for user and kernel context create and setup.
     302 * NOTE: the initial kref is done here (hfi1_rcd_init()).
    303 */
    304int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
    305			 struct hfi1_ctxtdata **context)
    306{
    307	struct hfi1_devdata *dd = ppd->dd;
    308	struct hfi1_ctxtdata *rcd;
    309	unsigned kctxt_ngroups = 0;
    310	u32 base;
    311
    312	if (dd->rcv_entries.nctxt_extra >
    313	    dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
    314		kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
    315			 (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
    316	rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa);
    317	if (rcd) {
    318		u32 rcvtids, max_entries;
    319		u16 ctxt;
    320		int ret;
    321
    322		ret = allocate_rcd_index(dd, rcd, &ctxt);
    323		if (ret) {
    324			*context = NULL;
    325			kfree(rcd);
    326			return ret;
    327		}
    328
    329		INIT_LIST_HEAD(&rcd->qp_wait_list);
    330		hfi1_exp_tid_group_init(rcd);
    331		rcd->ppd = ppd;
    332		rcd->dd = dd;
    333		rcd->numa_id = numa;
    334		rcd->rcv_array_groups = dd->rcv_entries.ngroups;
    335		rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
    336		rcd->slow_handler = handle_receive_interrupt;
    337		rcd->do_interrupt = rcd->slow_handler;
    338		rcd->msix_intr = CCE_NUM_MSIX_VECTORS;
    339
    340		mutex_init(&rcd->exp_mutex);
    341		spin_lock_init(&rcd->exp_lock);
    342		INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
    343		INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
    344
    345		hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
    346
    347		/*
    348		 * Calculate the context's RcvArray entry starting point.
    349		 * We do this here because we have to take into account all
     350		 * the RcvArray entries that previous contexts would have
    351		 * taken and we have to account for any extra groups assigned
    352		 * to the static (kernel) or dynamic (vnic/user) contexts.
    353		 */
    354		if (ctxt < dd->first_dyn_alloc_ctxt) {
    355			if (ctxt < kctxt_ngroups) {
    356				base = ctxt * (dd->rcv_entries.ngroups + 1);
    357				rcd->rcv_array_groups++;
    358			} else {
    359				base = kctxt_ngroups +
    360					(ctxt * dd->rcv_entries.ngroups);
    361			}
    362		} else {
    363			u16 ct = ctxt - dd->first_dyn_alloc_ctxt;
    364
    365			base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
    366				kctxt_ngroups);
    367			if (ct < dd->rcv_entries.nctxt_extra) {
    368				base += ct * (dd->rcv_entries.ngroups + 1);
    369				rcd->rcv_array_groups++;
    370			} else {
    371				base += dd->rcv_entries.nctxt_extra +
    372					(ct * dd->rcv_entries.ngroups);
    373			}
    374		}
    375		rcd->eager_base = base * dd->rcv_entries.group_size;
    376
    377		rcd->rcvhdrq_cnt = rcvhdrcnt;
    378		rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
    379		rcd->rhf_offset =
    380			rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
    381		/*
    382		 * Simple Eager buffer allocation: we have already pre-allocated
    383		 * the number of RcvArray entry groups. Each ctxtdata structure
    384		 * holds the number of groups for that context.
    385		 *
    386		 * To follow CSR requirements and maintain cacheline alignment,
    387		 * make sure all sizes and bases are multiples of group_size.
    388		 *
     389		 * The expected (TID) entry count is what is left after
     390		 * assigning Eager entries.
    391		 */
    392		max_entries = rcd->rcv_array_groups *
    393			dd->rcv_entries.group_size;
    394		rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
    395		rcd->egrbufs.count = round_down(rcvtids,
    396						dd->rcv_entries.group_size);
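		/*
		 * Worked example with made-up numbers: a context that owns
		 * 64 groups of 8 entries has max_entries = 512; with the
		 * default rcvarr_split of 25, rcvtids = 128 and
		 * egrbufs.count = round_down(128, 8) = 128.
		 */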
    397		if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
    398			dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
    399				   rcd->ctxt);
    400			rcd->egrbufs.count = MAX_EAGER_ENTRIES;
    401		}
    402		hfi1_cdbg(PROC,
    403			  "ctxt%u: max Eager buffer RcvArray entries: %u\n",
    404			  rcd->ctxt, rcd->egrbufs.count);
    405
    406		/*
    407		 * Allocate array that will hold the eager buffer accounting
    408		 * data.
    409		 * This will allocate the maximum possible buffer count based
    410		 * on the value of the RcvArray split parameter.
    411		 * The resulting value will be rounded down to the closest
    412		 * multiple of dd->rcv_entries.group_size.
    413		 */
    414		rcd->egrbufs.buffers =
    415			kcalloc_node(rcd->egrbufs.count,
    416				     sizeof(*rcd->egrbufs.buffers),
    417				     GFP_KERNEL, numa);
    418		if (!rcd->egrbufs.buffers)
    419			goto bail;
    420		rcd->egrbufs.rcvtids =
    421			kcalloc_node(rcd->egrbufs.count,
    422				     sizeof(*rcd->egrbufs.rcvtids),
    423				     GFP_KERNEL, numa);
    424		if (!rcd->egrbufs.rcvtids)
    425			goto bail;
    426		rcd->egrbufs.size = eager_buffer_size;
    427		/*
    428		 * The size of the buffers programmed into the RcvArray
    429		 * entries needs to be big enough to handle the highest
    430		 * MTU supported.
    431		 */
    432		if (rcd->egrbufs.size < hfi1_max_mtu) {
    433			rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
    434			hfi1_cdbg(PROC,
    435				  "ctxt%u: eager bufs size too small. Adjusting to %u\n",
    436				    rcd->ctxt, rcd->egrbufs.size);
    437		}
    438		rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
    439
    440		/* Applicable only for statically created kernel contexts */
    441		if (ctxt < dd->first_dyn_alloc_ctxt) {
    442			rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
    443						    GFP_KERNEL, numa);
    444			if (!rcd->opstats)
    445				goto bail;
    446
    447			/* Initialize TID flow generations for the context */
    448			hfi1_kern_init_ctxt_generations(rcd);
    449		}
    450
    451		*context = rcd;
    452		return 0;
    453	}
    454
    455bail:
    456	*context = NULL;
    457	hfi1_free_ctxt(rcd);
    458	return -ENOMEM;
    459}
    460
    461/**
    462 * hfi1_free_ctxt - free context
    463 * @rcd: pointer to an initialized rcd data structure
    464 *
    465 * This wrapper is the free function that matches hfi1_create_ctxtdata().
    466 * When a context is done being used (kernel or user), this function is called
     467 * for the "final" put to match the kref init from hfi1_create_ctxtdata().
    468 * Other users of the context do a get/put sequence to make sure that the
    469 * structure isn't removed while in use.
    470 */
    471void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd)
    472{
    473	hfi1_rcd_put(rcd);
    474}
    475
    476/*
    477 * Select the largest ccti value over all SLs to determine the intra-
    478 * packet gap for the link.
    479 *
    480 * called with cca_timer_lock held (to protect access to cca_timer
    481 * array), and rcu_read_lock() (to protect access to cc_state).
    482 */
    483void set_link_ipg(struct hfi1_pportdata *ppd)
    484{
    485	struct hfi1_devdata *dd = ppd->dd;
    486	struct cc_state *cc_state;
    487	int i;
    488	u16 cce, ccti_limit, max_ccti = 0;
    489	u16 shift, mult;
    490	u64 src;
     491	u32 current_egress_rate; /* Mbits/sec */
    492	u64 max_pkt_time;
    493	/*
    494	 * max_pkt_time is the maximum packet egress time in units
    495	 * of the fabric clock period 1/(805 MHz).
    496	 */
    497
    498	cc_state = get_cc_state(ppd);
    499
    500	if (!cc_state)
    501		/*
    502		 * This should _never_ happen - rcu_read_lock() is held,
    503		 * and set_link_ipg() should not be called if cc_state
    504		 * is NULL.
    505		 */
    506		return;
    507
    508	for (i = 0; i < OPA_MAX_SLS; i++) {
    509		u16 ccti = ppd->cca_timer[i].ccti;
    510
    511		if (ccti > max_ccti)
    512			max_ccti = ccti;
    513	}
    514
    515	ccti_limit = cc_state->cct.ccti_limit;
    516	if (max_ccti > ccti_limit)
    517		max_ccti = ccti_limit;
    518
    519	cce = cc_state->cct.entries[max_ccti].entry;
    520	shift = (cce & 0xc000) >> 14;
    521	mult = (cce & 0x3fff);
    522
    523	current_egress_rate = active_egress_rate(ppd);
    524
    525	max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
    526
    527	src = (max_pkt_time >> shift) * mult;
    528
    529	src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
    530	src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
    531
    532	write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
    533}
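
/*
 * Worked example for the CCT entry decode above (hypothetical entry):
 * cce = 0x8064 gives shift = (0x8064 & 0xc000) >> 14 = 2 and
 * mult = 0x8064 & 0x3fff = 100, so the value written to
 * SEND_STATIC_RATE_CONTROL is (max_pkt_time >> 2) * 100, masked and
 * shifted into the SRC_RELOAD field.
 */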
    534
    535static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
    536{
    537	struct cca_timer *cca_timer;
    538	struct hfi1_pportdata *ppd;
    539	int sl;
    540	u16 ccti_timer, ccti_min;
    541	struct cc_state *cc_state;
    542	unsigned long flags;
    543	enum hrtimer_restart ret = HRTIMER_NORESTART;
    544
    545	cca_timer = container_of(t, struct cca_timer, hrtimer);
    546	ppd = cca_timer->ppd;
    547	sl = cca_timer->sl;
    548
    549	rcu_read_lock();
    550
    551	cc_state = get_cc_state(ppd);
    552
    553	if (!cc_state) {
    554		rcu_read_unlock();
    555		return HRTIMER_NORESTART;
    556	}
    557
    558	/*
    559	 * 1) decrement ccti for SL
    560	 * 2) calculate IPG for link (set_link_ipg())
    561	 * 3) restart timer, unless ccti is at min value
    562	 */
    563
    564	ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
    565	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
    566
    567	spin_lock_irqsave(&ppd->cca_timer_lock, flags);
    568
    569	if (cca_timer->ccti > ccti_min) {
    570		cca_timer->ccti--;
    571		set_link_ipg(ppd);
    572	}
    573
    574	if (cca_timer->ccti > ccti_min) {
    575		unsigned long nsec = 1024 * ccti_timer;
    576		/* ccti_timer is in units of 1.024 usec */
    577		hrtimer_forward_now(t, ns_to_ktime(nsec));
    578		ret = HRTIMER_RESTART;
    579	}
    580
    581	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
    582	rcu_read_unlock();
    583	return ret;
    584}
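
/*
 * Timer period example (hypothetical setting): ccti_timer is in units of
 * 1.024 usec, so ccti_timer = 977 re-arms the hrtimer above roughly every
 * 977 * 1024 ns ~= 1 ms for as long as ccti stays above ccti_min.
 */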
    585
    586/*
    587 * Common code for initializing the physical port structure.
    588 */
    589void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
    590			 struct hfi1_devdata *dd, u8 hw_pidx, u32 port)
    591{
    592	int i;
    593	uint default_pkey_idx;
    594	struct cc_state *cc_state;
    595
    596	ppd->dd = dd;
    597	ppd->hw_pidx = hw_pidx;
    598	ppd->port = port; /* IB port number, not index */
    599	ppd->prev_link_width = LINK_WIDTH_DEFAULT;
    600	/*
    601	 * There are C_VL_COUNT number of PortVLXmitWait counters.
    602	 * Adding 1 to C_VL_COUNT to include the PortXmitWait counter.
    603	 */
    604	for (i = 0; i < C_VL_COUNT + 1; i++) {
    605		ppd->port_vl_xmit_wait_last[i] = 0;
    606		ppd->vl_xmit_flit_cnt[i] = 0;
    607	}
    608
    609	default_pkey_idx = 1;
    610
    611	ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
    612	ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
    613	ppd->pkeys[0] = 0x8001;
    614
    615	INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
    616	INIT_WORK(&ppd->link_up_work, handle_link_up);
    617	INIT_WORK(&ppd->link_down_work, handle_link_down);
    618	INIT_WORK(&ppd->freeze_work, handle_freeze);
    619	INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
    620	INIT_WORK(&ppd->sma_message_work, handle_sma_message);
    621	INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
    622	INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link);
    623	INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
    624	INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
    625
    626	mutex_init(&ppd->hls_lock);
    627	spin_lock_init(&ppd->qsfp_info.qsfp_lock);
    628
    629	ppd->qsfp_info.ppd = ppd;
    630	ppd->sm_trap_qp = 0x0;
    631	ppd->sa_qp = 0x1;
    632
    633	ppd->hfi1_wq = NULL;
    634
    635	spin_lock_init(&ppd->cca_timer_lock);
    636
    637	for (i = 0; i < OPA_MAX_SLS; i++) {
    638		hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
    639			     HRTIMER_MODE_REL);
    640		ppd->cca_timer[i].ppd = ppd;
    641		ppd->cca_timer[i].sl = i;
    642		ppd->cca_timer[i].ccti = 0;
    643		ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
    644	}
    645
    646	ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
    647
    648	spin_lock_init(&ppd->cc_state_lock);
    649	spin_lock_init(&ppd->cc_log_lock);
    650	cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
    651	RCU_INIT_POINTER(ppd->cc_state, cc_state);
    652	if (!cc_state)
    653		goto bail;
    654	return;
    655
    656bail:
    657	dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
    658}
    659
    660/*
    661 * Do initialization for device that is only needed on
    662 * first detect, not on resets.
    663 */
    664static int loadtime_init(struct hfi1_devdata *dd)
    665{
    666	return 0;
    667}
    668
    669/**
    670 * init_after_reset - re-initialize after a reset
    671 * @dd: the hfi1_ib device
    672 *
     673 * Sanity check at least some of the values after reset, and ensure
     674 * there is no receive or transmit activity (explicitly, in case the
     675 * reset failed).
    676 */
    677static int init_after_reset(struct hfi1_devdata *dd)
    678{
    679	int i;
    680	struct hfi1_ctxtdata *rcd;
    681	/*
    682	 * Ensure chip does no sends or receives, tail updates, or
    683	 * pioavail updates while we re-initialize.  This is mostly
    684	 * for the driver data structures, not chip registers.
    685	 */
    686	for (i = 0; i < dd->num_rcv_contexts; i++) {
    687		rcd = hfi1_rcd_get_by_index(dd, i);
    688		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
    689			     HFI1_RCVCTRL_INTRAVAIL_DIS |
    690			     HFI1_RCVCTRL_TAILUPD_DIS, rcd);
    691		hfi1_rcd_put(rcd);
    692	}
    693	pio_send_control(dd, PSC_GLOBAL_DISABLE);
    694	for (i = 0; i < dd->num_send_contexts; i++)
    695		sc_disable(dd->send_contexts[i].sc);
    696
    697	return 0;
    698}
    699
    700static void enable_chip(struct hfi1_devdata *dd)
    701{
    702	struct hfi1_ctxtdata *rcd;
    703	u32 rcvmask;
    704	u16 i;
    705
    706	/* enable PIO send */
    707	pio_send_control(dd, PSC_GLOBAL_ENABLE);
    708
    709	/*
    710	 * Enable kernel ctxts' receive and receive interrupt.
    711	 * Other ctxts done as user opens and initializes them.
    712	 */
    713	for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
    714		rcd = hfi1_rcd_get_by_index(dd, i);
    715		if (!rcd)
    716			continue;
    717		rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
    718		rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
    719			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
    720		if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
    721			rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
    722		if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
    723			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
    724		if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
    725			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
    726		if (HFI1_CAP_IS_KSET(TID_RDMA))
    727			rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
    728		hfi1_rcvctrl(dd, rcvmask, rcd);
    729		sc_enable(rcd->sc);
    730		hfi1_rcd_put(rcd);
    731	}
    732}
    733
    734/**
    735 * create_workqueues - create per port workqueues
    736 * @dd: the hfi1_ib device
    737 */
    738static int create_workqueues(struct hfi1_devdata *dd)
    739{
    740	int pidx;
    741	struct hfi1_pportdata *ppd;
    742
    743	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
    744		ppd = dd->pport + pidx;
    745		if (!ppd->hfi1_wq) {
    746			ppd->hfi1_wq =
    747				alloc_workqueue(
    748				    "hfi%d_%d",
    749				    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
    750				    WQ_MEM_RECLAIM,
    751				    HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
    752				    dd->unit, pidx);
    753			if (!ppd->hfi1_wq)
    754				goto wq_error;
    755		}
    756		if (!ppd->link_wq) {
    757			/*
    758			 * Make the link workqueue single-threaded to enforce
    759			 * serialization.
    760			 */
    761			ppd->link_wq =
    762				alloc_workqueue(
    763				    "hfi_link_%d_%d",
    764				    WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND,
    765				    1, /* max_active */
    766				    dd->unit, pidx);
    767			if (!ppd->link_wq)
    768				goto wq_error;
    769		}
    770	}
    771	return 0;
    772wq_error:
    773	pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
    774	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
    775		ppd = dd->pport + pidx;
    776		if (ppd->hfi1_wq) {
    777			destroy_workqueue(ppd->hfi1_wq);
    778			ppd->hfi1_wq = NULL;
    779		}
    780		if (ppd->link_wq) {
    781			destroy_workqueue(ppd->link_wq);
    782			ppd->link_wq = NULL;
    783		}
    784	}
    785	return -ENOMEM;
    786}
    787
    788/**
    789 * destroy_workqueues - destroy per port workqueues
    790 * @dd: the hfi1_ib device
    791 */
    792static void destroy_workqueues(struct hfi1_devdata *dd)
    793{
    794	int pidx;
    795	struct hfi1_pportdata *ppd;
    796
    797	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
    798		ppd = dd->pport + pidx;
    799
    800		if (ppd->hfi1_wq) {
    801			destroy_workqueue(ppd->hfi1_wq);
    802			ppd->hfi1_wq = NULL;
    803		}
    804		if (ppd->link_wq) {
    805			destroy_workqueue(ppd->link_wq);
    806			ppd->link_wq = NULL;
    807		}
    808	}
    809}
    810
    811/**
    812 * enable_general_intr() - Enable the IRQs that will be handled by the
    813 * general interrupt handler.
    814 * @dd: valid devdata
    815 *
    816 */
    817static void enable_general_intr(struct hfi1_devdata *dd)
    818{
    819	set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
    820	set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
    821	set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
    822	set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
    823	set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
    824	set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
    825	set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
    826}
    827
    828/**
    829 * hfi1_init - do the actual initialization sequence on the chip
    830 * @dd: the hfi1_ib device
    831 * @reinit: re-initializing, so don't allocate new memory
    832 *
    833 * Do the actual initialization sequence on the chip.  This is done
    834 * both from the init routine called from the PCI infrastructure, and
    835 * when we reset the chip, or detect that it was reset internally,
    836 * or it's administratively re-enabled.
    837 *
    838 * Memory allocation here and in called routines is only done in
    839 * the first case (reinit == 0).  We have to be careful, because even
    840 * without memory allocation, we need to re-write all the chip registers
    841 * TIDs, etc. after the reset or enable has completed.
    842 */
    843int hfi1_init(struct hfi1_devdata *dd, int reinit)
    844{
    845	int ret = 0, pidx, lastfail = 0;
    846	unsigned long len;
    847	u16 i;
    848	struct hfi1_ctxtdata *rcd;
    849	struct hfi1_pportdata *ppd;
    850
    851	/* Set up send low level handlers */
    852	dd->process_pio_send = hfi1_verbs_send_pio;
    853	dd->process_dma_send = hfi1_verbs_send_dma;
    854	dd->pio_inline_send = pio_copy;
    855	dd->process_vnic_dma_send = hfi1_vnic_send_dma;
    856
    857	if (is_ax(dd)) {
    858		atomic_set(&dd->drop_packet, DROP_PACKET_ON);
    859		dd->do_drop = true;
    860	} else {
    861		atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
    862		dd->do_drop = false;
    863	}
    864
    865	/* make sure the link is not "up" */
    866	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
    867		ppd = dd->pport + pidx;
    868		ppd->linkup = 0;
    869	}
    870
    871	if (reinit)
    872		ret = init_after_reset(dd);
    873	else
    874		ret = loadtime_init(dd);
    875	if (ret)
    876		goto done;
    877
    878	/* dd->rcd can be NULL if early initialization failed */
    879	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
    880		/*
    881		 * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
    882		 * re-init, the simplest way to handle this is to free
    883		 * existing, and re-allocate.
    884		 * Need to re-create rest of ctxt 0 ctxtdata as well.
    885		 */
    886		rcd = hfi1_rcd_get_by_index(dd, i);
    887		if (!rcd)
    888			continue;
    889
    890		lastfail = hfi1_create_rcvhdrq(dd, rcd);
    891		if (!lastfail)
    892			lastfail = hfi1_setup_eagerbufs(rcd);
    893		if (!lastfail)
    894			lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
    895		if (lastfail) {
    896			dd_dev_err(dd,
    897				   "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
    898			ret = lastfail;
    899		}
    900		/* enable IRQ */
    901		hfi1_rcd_put(rcd);
    902	}
    903
    904	/* Allocate enough memory for user event notification. */
    905	len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS *
    906			 sizeof(*dd->events));
    907	dd->events = vmalloc_user(len);
    908	if (!dd->events)
    909		dd_dev_err(dd, "Failed to allocate user events page\n");
    910	/*
    911	 * Allocate a page for device and port status.
    912	 * Page will be shared amongst all user processes.
    913	 */
    914	dd->status = vmalloc_user(PAGE_SIZE);
    915	if (!dd->status)
    916		dd_dev_err(dd, "Failed to allocate dev status page\n");
    917	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
    918		ppd = dd->pport + pidx;
    919		if (dd->status)
    920			/* Currently, we only have one port */
    921			ppd->statusp = &dd->status->port;
    922
    923		set_mtu(ppd);
    924	}
    925
    926	/* enable chip even if we have an error, so we can debug cause */
    927	enable_chip(dd);
    928
    929done:
    930	/*
    931	 * Set status even if port serdes is not initialized
    932	 * so that diags will work.
    933	 */
    934	if (dd->status)
    935		dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
    936			HFI1_STATUS_INITTED;
    937	if (!ret) {
    938		/* enable all interrupts from the chip */
    939		enable_general_intr(dd);
    940		init_qsfp_int(dd);
    941
    942		/* chip is OK for user apps; mark it as initialized */
    943		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
    944			ppd = dd->pport + pidx;
    945
    946			/*
    947			 * start the serdes - must be after interrupts are
    948			 * enabled so we are notified when the link goes up
    949			 */
    950			lastfail = bringup_serdes(ppd);
    951			if (lastfail)
    952				dd_dev_info(dd,
    953					    "Failed to bring up port %u\n",
    954					    ppd->port);
    955
    956			/*
    957			 * Set status even if port serdes is not initialized
    958			 * so that diags will work.
    959			 */
    960			if (ppd->statusp)
    961				*ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
    962							HFI1_STATUS_INITTED;
    963			if (!ppd->link_speed_enabled)
    964				continue;
    965		}
    966	}
    967
    968	/* if ret is non-zero, we probably should do some cleanup here... */
    969	return ret;
    970}
    971
    972struct hfi1_devdata *hfi1_lookup(int unit)
    973{
    974	return xa_load(&hfi1_dev_table, unit);
    975}
    976
    977/*
    978 * Stop the timers during unit shutdown, or after an error late
    979 * in initialization.
    980 */
    981static void stop_timers(struct hfi1_devdata *dd)
    982{
    983	struct hfi1_pportdata *ppd;
    984	int pidx;
    985
    986	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
    987		ppd = dd->pport + pidx;
    988		if (ppd->led_override_timer.function) {
    989			del_timer_sync(&ppd->led_override_timer);
    990			atomic_set(&ppd->led_override_timer_active, 0);
    991		}
    992	}
    993}
    994
    995/**
    996 * shutdown_device - shut down a device
    997 * @dd: the hfi1_ib device
    998 *
    999 * This is called to make the device quiet when we are about to
   1000 * unload the driver, and also when the device is administratively
   1001 * disabled.   It does not free any data structures.
   1002 * Everything it does has to be setup again by hfi1_init(dd, 1)
   1003 */
   1004static void shutdown_device(struct hfi1_devdata *dd)
   1005{
   1006	struct hfi1_pportdata *ppd;
   1007	struct hfi1_ctxtdata *rcd;
   1008	unsigned pidx;
   1009	int i;
   1010
   1011	if (dd->flags & HFI1_SHUTDOWN)
   1012		return;
   1013	dd->flags |= HFI1_SHUTDOWN;
   1014
   1015	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
   1016		ppd = dd->pport + pidx;
   1017
   1018		ppd->linkup = 0;
   1019		if (ppd->statusp)
   1020			*ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
   1021					   HFI1_STATUS_IB_READY);
   1022	}
   1023	dd->flags &= ~HFI1_INITTED;
   1024
   1025	/* mask and clean up interrupts */
   1026	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
   1027	msix_clean_up_interrupts(dd);
   1028
   1029	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
   1030		ppd = dd->pport + pidx;
   1031		for (i = 0; i < dd->num_rcv_contexts; i++) {
   1032			rcd = hfi1_rcd_get_by_index(dd, i);
   1033			hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
   1034				     HFI1_RCVCTRL_CTXT_DIS |
   1035				     HFI1_RCVCTRL_INTRAVAIL_DIS |
   1036				     HFI1_RCVCTRL_PKEY_DIS |
   1037				     HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd);
   1038			hfi1_rcd_put(rcd);
   1039		}
   1040		/*
   1041		 * Gracefully stop all sends allowing any in progress to
   1042		 * trickle out first.
   1043		 */
   1044		for (i = 0; i < dd->num_send_contexts; i++)
   1045			sc_flush(dd->send_contexts[i].sc);
   1046	}
   1047
   1048	/*
    1049	 * Wait long enough for anything that's going to trickle out to
    1050	 * have actually done so.
   1051	 */
   1052	udelay(20);
   1053
   1054	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
   1055		ppd = dd->pport + pidx;
   1056
   1057		/* disable all contexts */
   1058		for (i = 0; i < dd->num_send_contexts; i++)
   1059			sc_disable(dd->send_contexts[i].sc);
   1060		/* disable the send device */
   1061		pio_send_control(dd, PSC_GLOBAL_DISABLE);
   1062
   1063		shutdown_led_override(ppd);
   1064
   1065		/*
   1066		 * Clear SerdesEnable.
   1067		 * We can't count on interrupts since we are stopping.
   1068		 */
   1069		hfi1_quiet_serdes(ppd);
   1070		if (ppd->hfi1_wq)
   1071			flush_workqueue(ppd->hfi1_wq);
   1072		if (ppd->link_wq)
   1073			flush_workqueue(ppd->link_wq);
   1074	}
   1075	sdma_exit(dd);
   1076}
   1077
   1078/**
   1079 * hfi1_free_ctxtdata - free a context's allocated data
   1080 * @dd: the hfi1_ib device
   1081 * @rcd: the ctxtdata structure
   1082 *
    1083 * free up any allocated data for a context.
   1084 * It should never change any chip state, or global driver state.
   1085 */
   1086void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
   1087{
   1088	u32 e;
   1089
   1090	if (!rcd)
   1091		return;
   1092
   1093	if (rcd->rcvhdrq) {
   1094		dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd),
   1095				  rcd->rcvhdrq, rcd->rcvhdrq_dma);
   1096		rcd->rcvhdrq = NULL;
   1097		if (hfi1_rcvhdrtail_kvaddr(rcd)) {
   1098			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
   1099					  (void *)hfi1_rcvhdrtail_kvaddr(rcd),
   1100					  rcd->rcvhdrqtailaddr_dma);
   1101			rcd->rcvhdrtail_kvaddr = NULL;
   1102		}
   1103	}
   1104
   1105	/* all the RcvArray entries should have been cleared by now */
   1106	kfree(rcd->egrbufs.rcvtids);
   1107	rcd->egrbufs.rcvtids = NULL;
   1108
   1109	for (e = 0; e < rcd->egrbufs.alloced; e++) {
   1110		if (rcd->egrbufs.buffers[e].addr)
   1111			dma_free_coherent(&dd->pcidev->dev,
   1112					  rcd->egrbufs.buffers[e].len,
   1113					  rcd->egrbufs.buffers[e].addr,
   1114					  rcd->egrbufs.buffers[e].dma);
   1115	}
   1116	kfree(rcd->egrbufs.buffers);
   1117	rcd->egrbufs.alloced = 0;
   1118	rcd->egrbufs.buffers = NULL;
   1119
   1120	sc_free(rcd->sc);
   1121	rcd->sc = NULL;
   1122
   1123	vfree(rcd->subctxt_uregbase);
   1124	vfree(rcd->subctxt_rcvegrbuf);
   1125	vfree(rcd->subctxt_rcvhdr_base);
   1126	kfree(rcd->opstats);
   1127
   1128	rcd->subctxt_uregbase = NULL;
   1129	rcd->subctxt_rcvegrbuf = NULL;
   1130	rcd->subctxt_rcvhdr_base = NULL;
   1131	rcd->opstats = NULL;
   1132}
   1133
   1134/*
   1135 * Release our hold on the shared asic data.  If we are the last one,
   1136 * return the structure to be finalized outside the lock.  Must be
   1137 * holding hfi1_dev_table lock.
   1138 */
   1139static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
   1140{
   1141	struct hfi1_asic_data *ad;
   1142	int other;
   1143
   1144	if (!dd->asic_data)
   1145		return NULL;
   1146	dd->asic_data->dds[dd->hfi1_id] = NULL;
   1147	other = dd->hfi1_id ? 0 : 1;
   1148	ad = dd->asic_data;
   1149	dd->asic_data = NULL;
   1150	/* return NULL if the other dd still has a link */
   1151	return ad->dds[other] ? NULL : ad;
   1152}
   1153
   1154static void finalize_asic_data(struct hfi1_devdata *dd,
   1155			       struct hfi1_asic_data *ad)
   1156{
   1157	clean_up_i2c(dd, ad);
   1158	kfree(ad);
   1159}
   1160
   1161/**
   1162 * hfi1_free_devdata - cleans up and frees per-unit data structure
   1163 * @dd: pointer to a valid devdata structure
   1164 *
   1165 * It cleans up and frees all data structures set up by
    1166 * hfi1_alloc_devdata().
   1167 */
   1168void hfi1_free_devdata(struct hfi1_devdata *dd)
   1169{
   1170	struct hfi1_asic_data *ad;
   1171	unsigned long flags;
   1172
   1173	xa_lock_irqsave(&hfi1_dev_table, flags);
   1174	__xa_erase(&hfi1_dev_table, dd->unit);
   1175	ad = release_asic_data(dd);
   1176	xa_unlock_irqrestore(&hfi1_dev_table, flags);
   1177
   1178	finalize_asic_data(dd, ad);
   1179	free_platform_config(dd);
   1180	rcu_barrier(); /* wait for rcu callbacks to complete */
   1181	free_percpu(dd->int_counter);
   1182	free_percpu(dd->rcv_limit);
   1183	free_percpu(dd->send_schedule);
   1184	free_percpu(dd->tx_opstats);
   1185	dd->int_counter   = NULL;
   1186	dd->rcv_limit     = NULL;
   1187	dd->send_schedule = NULL;
   1188	dd->tx_opstats    = NULL;
   1189	kfree(dd->comp_vect);
   1190	dd->comp_vect = NULL;
   1191	if (dd->rcvhdrtail_dummy_kvaddr)
   1192		dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
   1193				  (void *)dd->rcvhdrtail_dummy_kvaddr,
   1194				  dd->rcvhdrtail_dummy_dma);
   1195	dd->rcvhdrtail_dummy_kvaddr = NULL;
   1196	sdma_clean(dd, dd->num_sdma);
   1197	rvt_dealloc_device(&dd->verbs_dev.rdi);
   1198}
   1199
   1200/**
   1201 * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
   1202 * @pdev: Valid PCI device
   1203 * @extra: How many bytes to alloc past the default
   1204 *
   1205 * Must be done via verbs allocator, because the verbs cleanup process
   1206 * both does cleanup and free of the data structure.
   1207 * "extra" is for chip-specific data.
   1208 */
   1209static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
   1210					       size_t extra)
   1211{
   1212	struct hfi1_devdata *dd;
   1213	int ret, nports;
   1214
    1215	/* extra is sizeof(struct hfi1_pportdata) * number of ports */
   1216	nports = extra / sizeof(struct hfi1_pportdata);
   1217
   1218	dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
   1219						     nports);
   1220	if (!dd)
   1221		return ERR_PTR(-ENOMEM);
   1222	dd->num_pports = nports;
   1223	dd->pport = (struct hfi1_pportdata *)(dd + 1);
   1224	dd->pcidev = pdev;
   1225	pci_set_drvdata(pdev, dd);
   1226
   1227	ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b,
   1228			GFP_KERNEL);
   1229	if (ret < 0) {
   1230		dev_err(&pdev->dev,
   1231			"Could not allocate unit ID: error %d\n", -ret);
   1232		goto bail;
   1233	}
   1234	rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
   1235	/*
   1236	 * If the BIOS does not have the NUMA node information set, select
   1237	 * NUMA 0 so we get consistent performance.
   1238	 */
   1239	dd->node = pcibus_to_node(pdev->bus);
   1240	if (dd->node == NUMA_NO_NODE) {
   1241		dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
   1242		dd->node = 0;
   1243	}
   1244
   1245	/*
   1246	 * Initialize all locks for the device. This needs to be as early as
   1247	 * possible so locks are usable.
   1248	 */
   1249	spin_lock_init(&dd->sc_lock);
   1250	spin_lock_init(&dd->sendctrl_lock);
   1251	spin_lock_init(&dd->rcvctrl_lock);
   1252	spin_lock_init(&dd->uctxt_lock);
   1253	spin_lock_init(&dd->hfi1_diag_trans_lock);
   1254	spin_lock_init(&dd->sc_init_lock);
   1255	spin_lock_init(&dd->dc8051_memlock);
   1256	seqlock_init(&dd->sc2vl_lock);
   1257	spin_lock_init(&dd->sde_map_lock);
   1258	spin_lock_init(&dd->pio_map_lock);
   1259	mutex_init(&dd->dc8051_lock);
   1260	init_waitqueue_head(&dd->event_queue);
   1261	spin_lock_init(&dd->irq_src_lock);
   1262
   1263	dd->int_counter = alloc_percpu(u64);
   1264	if (!dd->int_counter) {
   1265		ret = -ENOMEM;
   1266		goto bail;
   1267	}
   1268
   1269	dd->rcv_limit = alloc_percpu(u64);
   1270	if (!dd->rcv_limit) {
   1271		ret = -ENOMEM;
   1272		goto bail;
   1273	}
   1274
   1275	dd->send_schedule = alloc_percpu(u64);
   1276	if (!dd->send_schedule) {
   1277		ret = -ENOMEM;
   1278		goto bail;
   1279	}
   1280
   1281	dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx);
   1282	if (!dd->tx_opstats) {
   1283		ret = -ENOMEM;
   1284		goto bail;
   1285	}
   1286
   1287	dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL);
   1288	if (!dd->comp_vect) {
   1289		ret = -ENOMEM;
   1290		goto bail;
   1291	}
   1292
   1293	/* allocate dummy tail memory for all receive contexts */
   1294	dd->rcvhdrtail_dummy_kvaddr =
   1295		dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64),
   1296				   &dd->rcvhdrtail_dummy_dma, GFP_KERNEL);
   1297	if (!dd->rcvhdrtail_dummy_kvaddr) {
   1298		ret = -ENOMEM;
   1299		goto bail;
   1300	}
   1301
   1302	atomic_set(&dd->ipoib_rsm_usr_num, 0);
   1303	return dd;
   1304
   1305bail:
   1306	hfi1_free_devdata(dd);
   1307	return ERR_PTR(ret);
   1308}
   1309
   1310/*
   1311 * Called from freeze mode handlers, and from PCI error
   1312 * reporting code.  Should be paranoid about state of
   1313 * system and data structures.
   1314 */
   1315void hfi1_disable_after_error(struct hfi1_devdata *dd)
   1316{
   1317	if (dd->flags & HFI1_INITTED) {
   1318		u32 pidx;
   1319
   1320		dd->flags &= ~HFI1_INITTED;
   1321		if (dd->pport)
   1322			for (pidx = 0; pidx < dd->num_pports; ++pidx) {
   1323				struct hfi1_pportdata *ppd;
   1324
   1325				ppd = dd->pport + pidx;
   1326				if (dd->flags & HFI1_PRESENT)
   1327					set_link_state(ppd, HLS_DN_DISABLE);
   1328
   1329				if (ppd->statusp)
   1330					*ppd->statusp &= ~HFI1_STATUS_IB_READY;
   1331			}
   1332	}
   1333
   1334	/*
   1335	 * Mark as having had an error for driver, and also
   1336	 * for /sys and status word mapped to user programs.
   1337	 * This marks unit as not usable, until reset.
   1338	 */
   1339	if (dd->status)
   1340		dd->status->dev |= HFI1_STATUS_HWERROR;
   1341}
   1342
   1343static void remove_one(struct pci_dev *);
   1344static int init_one(struct pci_dev *, const struct pci_device_id *);
   1345static void shutdown_one(struct pci_dev *);
   1346
   1347#define DRIVER_LOAD_MSG "Cornelis " DRIVER_NAME " loaded: "
   1348#define PFX DRIVER_NAME ": "
   1349
   1350const struct pci_device_id hfi1_pci_tbl[] = {
   1351	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
   1352	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
   1353	{ 0, }
   1354};
   1355
   1356MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
   1357
   1358static struct pci_driver hfi1_pci_driver = {
   1359	.name = DRIVER_NAME,
   1360	.probe = init_one,
   1361	.remove = remove_one,
   1362	.shutdown = shutdown_one,
   1363	.id_table = hfi1_pci_tbl,
   1364	.err_handler = &hfi1_pci_err_handler,
   1365};
   1366
   1367static void __init compute_krcvqs(void)
   1368{
   1369	int i;
   1370
   1371	for (i = 0; i < krcvqsset; i++)
   1372		n_krcvqs += krcvqs[i];
   1373}
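
/*
 * Example (hypothetical module load): krcvqs=2,2,2 requests two
 * non-control kernel receive queues for each of the first three data VLs,
 * so compute_krcvqs() yields n_krcvqs = 6.
 */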
   1374
   1375/*
   1376 * Do all the generic driver unit- and chip-independent memory
   1377 * allocation and initialization.
   1378 */
   1379static int __init hfi1_mod_init(void)
   1380{
   1381	int ret;
   1382
   1383	ret = dev_init();
   1384	if (ret)
   1385		goto bail;
   1386
   1387	ret = node_affinity_init();
   1388	if (ret)
   1389		goto bail;
   1390
   1391	/* validate max MTU before any devices start */
   1392	if (!valid_opa_max_mtu(hfi1_max_mtu)) {
   1393		pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
   1394		       hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
   1395		hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
   1396	}
   1397	/* valid CUs run from 1-128 in powers of 2 */
   1398	if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
   1399		hfi1_cu = 1;
   1400	/* valid credit return threshold is 0-100, variable is unsigned */
   1401	if (user_credit_return_threshold > 100)
   1402		user_credit_return_threshold = 100;
   1403
   1404	compute_krcvqs();
   1405	/*
    1406	 * Sanitize the receive interrupt count; sanitizing the timeout must
    1407	 * wait until after the hardware type is known.
   1408	 */
   1409	if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
   1410		rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
   1411	/* reject invalid combinations */
   1412	if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
   1413		pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
   1414		rcv_intr_count = 1;
   1415	}
   1416	if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
   1417		/*
   1418		 * Avoid indefinite packet delivery by requiring a timeout
   1419		 * if count is > 1.
   1420		 */
   1421		pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
   1422		rcv_intr_timeout = 1;
   1423	}
   1424	if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
   1425		/*
   1426		 * The dynamic algorithm expects a non-zero timeout
   1427		 * and a count > 1.
   1428		 */
   1429		pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
   1430		rcv_intr_dynamic = 0;
   1431	}
   1432
   1433	/* sanitize link CRC options */
   1434	link_crc_mask &= SUPPORTED_CRCS;
   1435
   1436	ret = opfn_init();
   1437	if (ret < 0) {
   1438		pr_err("Failed to allocate opfn_wq");
   1439		goto bail_dev;
   1440	}
   1441
   1442	/*
   1443	 * These must be called before the driver is registered with
   1444	 * the PCI subsystem.
   1445	 */
   1446	hfi1_dbg_init();
   1447	ret = pci_register_driver(&hfi1_pci_driver);
   1448	if (ret < 0) {
   1449		pr_err("Unable to register driver: error %d\n", -ret);
   1450		goto bail_dev;
   1451	}
   1452	goto bail; /* all OK */
   1453
   1454bail_dev:
   1455	hfi1_dbg_exit();
   1456	dev_cleanup();
   1457bail:
   1458	return ret;
   1459}
   1460
   1461module_init(hfi1_mod_init);
   1462
   1463/*
   1464 * Do the non-unit driver cleanup, memory free, etc. at unload.
   1465 */
   1466static void __exit hfi1_mod_cleanup(void)
   1467{
   1468	pci_unregister_driver(&hfi1_pci_driver);
   1469	opfn_exit();
   1470	node_affinity_destroy_all();
   1471	hfi1_dbg_exit();
   1472
   1473	WARN_ON(!xa_empty(&hfi1_dev_table));
   1474	dispose_firmware();	/* asymmetric with obtain_firmware() */
   1475	dev_cleanup();
   1476}
   1477
   1478module_exit(hfi1_mod_cleanup);
   1479
   1480/* this can only be called after a successful initialization */
   1481static void cleanup_device_data(struct hfi1_devdata *dd)
   1482{
   1483	int ctxt;
   1484	int pidx;
   1485
   1486	/* users can't do anything more with chip */
   1487	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
   1488		struct hfi1_pportdata *ppd = &dd->pport[pidx];
   1489		struct cc_state *cc_state;
   1490		int i;
   1491
   1492		if (ppd->statusp)
   1493			*ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
   1494
   1495		for (i = 0; i < OPA_MAX_SLS; i++)
   1496			hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
   1497
   1498		spin_lock(&ppd->cc_state_lock);
   1499		cc_state = get_cc_state_protected(ppd);
   1500		RCU_INIT_POINTER(ppd->cc_state, NULL);
   1501		spin_unlock(&ppd->cc_state_lock);
   1502
   1503		if (cc_state)
   1504			kfree_rcu(cc_state, rcu);
   1505	}
   1506
   1507	free_credit_return(dd);
   1508
   1509	/*
   1510	 * Free any resources still in use (usually just kernel contexts)
    1511	 * at unload; we loop over num_rcv_contexts, because that's what we allocate.
   1512	 */
   1513	for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) {
   1514		struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
   1515
   1516		if (rcd) {
   1517			hfi1_free_ctxt_rcv_groups(rcd);
   1518			hfi1_free_ctxt(rcd);
   1519		}
   1520	}
   1521
   1522	kfree(dd->rcd);
   1523	dd->rcd = NULL;
   1524
   1525	free_pio_map(dd);
   1526	/* must follow rcv context free - need to remove rcv's hooks */
   1527	for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
   1528		sc_free(dd->send_contexts[ctxt].sc);
   1529	dd->num_send_contexts = 0;
   1530	kfree(dd->send_contexts);
   1531	dd->send_contexts = NULL;
   1532	kfree(dd->hw_to_sw);
   1533	dd->hw_to_sw = NULL;
   1534	kfree(dd->boardname);
   1535	vfree(dd->events);
   1536	vfree(dd->status);
   1537}
   1538
   1539/*
   1540 * Clean up on unit shutdown, or error during unit load after
   1541 * successful initialization.
   1542 */
   1543static void postinit_cleanup(struct hfi1_devdata *dd)
   1544{
   1545	hfi1_start_cleanup(dd);
   1546	hfi1_comp_vectors_clean_up(dd);
   1547	hfi1_dev_affinity_clean_up(dd);
   1548
   1549	hfi1_pcie_ddcleanup(dd);
   1550	hfi1_pcie_cleanup(dd->pcidev);
   1551
   1552	cleanup_device_data(dd);
   1553
   1554	hfi1_free_devdata(dd);
   1555}
   1556
   1557static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
   1558{
   1559	int ret = 0, j, pidx, initfail;
   1560	struct hfi1_devdata *dd;
   1561	struct hfi1_pportdata *ppd;
   1562
   1563	/* First, lock the non-writable module parameters */
   1564	HFI1_CAP_LOCK();
   1565
   1566	/* Validate dev ids */
   1567	if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
   1568	      ent->device == PCI_DEVICE_ID_INTEL1)) {
   1569		dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
   1570			ent->device);
   1571		ret = -ENODEV;
   1572		goto bail;
   1573	}
   1574
   1575	/* Allocate the dd so we can get to work */
   1576	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
   1577				sizeof(struct hfi1_pportdata));
   1578	if (IS_ERR(dd)) {
   1579		ret = PTR_ERR(dd);
   1580		goto bail;
   1581	}
   1582
   1583	/* Validate some global module parameters */
   1584	ret = hfi1_validate_rcvhdrcnt(dd, rcvhdrcnt);
   1585	if (ret)
   1586		goto bail;
   1587
   1588	/* use the encoding function as a sanitization check */
   1589	if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
   1590		dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
   1591			   hfi1_hdrq_entsize);
   1592		ret = -EINVAL;
   1593		goto bail;
   1594	}
   1595
   1596	/* The receive eager buffer size must be set before the receive
   1597	 * contexts are created.
   1598	 *
   1599	 * Set the eager buffer size.  Validate that it falls in a range
   1600	 * allowed by the hardware - all powers of 2 between the min and
   1601	 * max.  The maximum valid MTU is within the eager buffer range
   1602	 * so we do not need to cap the max_mtu by an eager buffer size
   1603	 * setting.
   1604	 */
   1605	if (eager_buffer_size) {
   1606		if (!is_power_of_2(eager_buffer_size))
   1607			eager_buffer_size =
   1608				roundup_pow_of_two(eager_buffer_size);
   1609		eager_buffer_size =
   1610			clamp_val(eager_buffer_size,
   1611				  MIN_EAGER_BUFFER * 8,
   1612				  MAX_EAGER_BUFFER_TOTAL);
   1613		dd_dev_info(dd, "Eager buffer size %u\n",
   1614			    eager_buffer_size);
   1615	} else {
   1616		dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
   1617		ret = -EINVAL;
   1618		goto bail;
   1619	}
   1620
   1621	/* restrict value of hfi1_rcvarr_split */
   1622	hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
   1623
   1624	ret = hfi1_pcie_init(dd);
   1625	if (ret)
   1626		goto bail;
   1627
   1628	/*
   1629	 * Do device-specific initialization, function table setup, dd
   1630	 * allocation, etc.
   1631	 */
   1632	ret = hfi1_init_dd(dd);
   1633	if (ret)
   1634		goto clean_bail; /* error already printed */
   1635
   1636	ret = create_workqueues(dd);
   1637	if (ret)
   1638		goto clean_bail;
   1639
   1640	/* do the generic initialization */
   1641	initfail = hfi1_init(dd, 0);
   1642
   1643	ret = hfi1_register_ib_device(dd);
   1644
   1645	/*
    1646	 * Now ready for use.  This should be cleared whenever we
   1647	 * detect a reset, or initiate one.  If earlier failure,
   1648	 * we still create devices, so diags, etc. can be used
   1649	 * to determine cause of problem.
   1650	 */
   1651	if (!initfail && !ret) {
   1652		dd->flags |= HFI1_INITTED;
    1653		/* create debugfs files after init and ib register */
   1654		hfi1_dbg_ibdev_init(&dd->verbs_dev);
   1655	}
   1656
   1657	j = hfi1_device_create(dd);
   1658	if (j)
   1659		dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
   1660
   1661	if (initfail || ret) {
   1662		msix_clean_up_interrupts(dd);
   1663		stop_timers(dd);
   1664		flush_workqueue(ib_wq);
   1665		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
   1666			hfi1_quiet_serdes(dd->pport + pidx);
   1667			ppd = dd->pport + pidx;
   1668			if (ppd->hfi1_wq) {
   1669				destroy_workqueue(ppd->hfi1_wq);
   1670				ppd->hfi1_wq = NULL;
   1671			}
   1672			if (ppd->link_wq) {
   1673				destroy_workqueue(ppd->link_wq);
   1674				ppd->link_wq = NULL;
   1675			}
   1676		}
   1677		if (!j)
   1678			hfi1_device_remove(dd);
   1679		if (!ret)
   1680			hfi1_unregister_ib_device(dd);
   1681		postinit_cleanup(dd);
   1682		if (initfail)
   1683			ret = initfail;
   1684		goto bail;	/* everything already cleaned */
   1685	}
   1686
   1687	sdma_start(dd);
   1688
   1689	return 0;
   1690
   1691clean_bail:
   1692	hfi1_pcie_cleanup(pdev);
   1693bail:
   1694	return ret;
   1695}
   1696
   1697static void wait_for_clients(struct hfi1_devdata *dd)
   1698{
   1699	/*
    1700	 * Remove the device init value and complete the device if there are
    1701	 * no clients, or wait for active clients to finish.
   1702	 */
   1703	if (refcount_dec_and_test(&dd->user_refcount))
   1704		complete(&dd->user_comp);
   1705
   1706	wait_for_completion(&dd->user_comp);
   1707}
   1708
   1709static void remove_one(struct pci_dev *pdev)
   1710{
   1711	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
   1712
   1713	/* close debugfs files before ib unregister */
   1714	hfi1_dbg_ibdev_exit(&dd->verbs_dev);
   1715
   1716	/* remove the /dev hfi1 interface */
   1717	hfi1_device_remove(dd);
   1718
   1719	/* wait for existing user space clients to finish */
   1720	wait_for_clients(dd);
   1721
   1722	/* unregister from IB core */
   1723	hfi1_unregister_ib_device(dd);
   1724
   1725	/* free netdev data */
   1726	hfi1_free_rx(dd);
   1727
   1728	/*
   1729	 * Disable the IB link, disable interrupts on the device,
   1730	 * clear dma engines, etc.
   1731	 */
   1732	shutdown_device(dd);
   1733	destroy_workqueues(dd);
   1734
   1735	stop_timers(dd);
   1736
   1737	/* wait until all of our (qsfp) queue_work() calls complete */
   1738	flush_workqueue(ib_wq);
   1739
   1740	postinit_cleanup(dd);
   1741}
   1742
   1743static void shutdown_one(struct pci_dev *pdev)
   1744{
   1745	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
   1746
   1747	shutdown_device(dd);
   1748}
   1749
   1750/**
   1751 * hfi1_create_rcvhdrq - create a receive header queue
   1752 * @dd: the hfi1_ib device
   1753 * @rcd: the context data
   1754 *
   1755 * This must be contiguous memory (from an i/o perspective), and must be
   1756 * DMA'able (which means for some systems, it will go through an IOMMU,
   1757 * or be forced into a low address range).
   1758 */
   1759int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
   1760{
   1761	unsigned amt;
   1762
   1763	if (!rcd->rcvhdrq) {
   1764		gfp_t gfp_flags;
   1765
   1766		amt = rcvhdrq_size(rcd);
   1767
   1768		if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic)
   1769			gfp_flags = GFP_KERNEL;
   1770		else
   1771			gfp_flags = GFP_USER;
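       		/*
       		 * Editor's note: kernel and VNIC contexts allocate with
       		 * GFP_KERNEL; user contexts use GFP_USER, so the allocation
       		 * is constrained by the opening process' cpuset (GFP_USER
       		 * includes __GFP_HARDWALL).
       		 */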
   1772		rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
   1773						  &rcd->rcvhdrq_dma,
   1774						  gfp_flags | __GFP_COMP);
   1775
   1776		if (!rcd->rcvhdrq) {
   1777			dd_dev_err(dd,
   1778				   "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
   1779				   amt, rcd->ctxt);
   1780			goto bail;
   1781		}
   1782
   1783		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
   1784		    HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
   1785			rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
   1786								    PAGE_SIZE,
   1787								    &rcd->rcvhdrqtailaddr_dma,
   1788								    gfp_flags);
   1789			if (!rcd->rcvhdrtail_kvaddr)
   1790				goto bail_free;
   1791		}
   1792	}
   1793
   1794	set_hdrq_regs(rcd->dd, rcd->ctxt, rcd->rcvhdrqentsize,
   1795		      rcd->rcvhdrq_cnt);
   1796
   1797	return 0;
   1798
   1799bail_free:
   1800	dd_dev_err(dd,
   1801		   "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
   1802		   rcd->ctxt);
   1803	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
   1804			  rcd->rcvhdrq_dma);
   1805	rcd->rcvhdrq = NULL;
   1806bail:
   1807	return -ENOMEM;
   1808}
   1809
   1810/**
   1811 * hfi1_setup_eagerbufs - allocate eager buffers, for both kernel and user
   1812 * contexts.
   1813 * @rcd: the context we are setting up.
   1814 *
   1815 * Allocate the eager TID buffers and program them into the chip.
   1816 * They are no longer completely contiguous; we do multiple allocation
   1817 * calls.  Otherwise we would get the OOM code involved by asking for
   1818 * too much per call, with disastrous results on some kernels.
   1819 */
   1820int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
   1821{
   1822	struct hfi1_devdata *dd = rcd->dd;
   1823	u32 max_entries, egrtop, alloced_bytes = 0;
   1824	gfp_t gfp_flags;
   1825	u16 order, idx = 0;
   1826	int ret = 0;
   1827	u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
   1828
   1829	/*
   1830	 * GFP_USER, but without GFP_FS, so the buffer cache can be
   1831	 * coalesced (we hope); otherwise, even at order 4, heavy
   1832	 * filesystem activity makes these allocations fail.  We also
   1833	 * request compound pages (__GFP_COMP).
   1834	 */
   1835	gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
   1836
   1837	/*
   1838	 * The minimum size of the eager buffers is one group of MTU-sized
   1839	 * buffers.
   1840	 * The global eager_buffer_size parameter is checked against the
   1841	 * theoretical lower limit of the value. Here, we check against the
   1842	 * MTU.
   1843	 */
   1844	if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
   1845		rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
   1846	/*
   1847	 * If using one-pkt-per-egr-buffer, lower the eager buffer
   1848	 * size to the max MTU (page-aligned).
   1849	 */
   1850	if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
   1851		rcd->egrbufs.rcvtid_size = round_mtu;
   1852
   1853	/*
   1854	 * Eager buffers sizes of 1MB or less require smaller TID sizes
   1855	 * to satisfy the "multiple of 8 RcvArray entries" requirement.
   1856	 */
   1857	if (rcd->egrbufs.size <= (1 << 20))
   1858		rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
   1859			rounddown_pow_of_two(rcd->egrbufs.size / 8));
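       	/*
       	 * Editor's sketch (illustrative numbers): with egrbufs.size = 1 MB
       	 * and round_mtu = 8 KB, size / 8 = 128 KB, which is already a power
       	 * of 2, so rcvtid_size becomes max(8 KB, 128 KB) = 128 KB.
       	 */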
   1860
   1861	while (alloced_bytes < rcd->egrbufs.size &&
   1862	       rcd->egrbufs.alloced < rcd->egrbufs.count) {
   1863		rcd->egrbufs.buffers[idx].addr =
   1864			dma_alloc_coherent(&dd->pcidev->dev,
   1865					   rcd->egrbufs.rcvtid_size,
   1866					   &rcd->egrbufs.buffers[idx].dma,
   1867					   gfp_flags);
   1868		if (rcd->egrbufs.buffers[idx].addr) {
   1869			rcd->egrbufs.buffers[idx].len =
   1870				rcd->egrbufs.rcvtid_size;
   1871			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
   1872				rcd->egrbufs.buffers[idx].addr;
   1873			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma =
   1874				rcd->egrbufs.buffers[idx].dma;
   1875			rcd->egrbufs.alloced++;
   1876			alloced_bytes += rcd->egrbufs.rcvtid_size;
   1877			idx++;
   1878		} else {
   1879			u32 new_size, i, j;
   1880			u64 offset = 0;
   1881
   1882			/*
   1883			 * Fail the eager buffer allocation if:
   1884			 *   - we are already using the lowest acceptable size
   1885			 *   - we are using one-pkt-per-egr-buffer (this implies
   1886			 *     that we are accepting only one size)
   1887			 */
   1888			if (rcd->egrbufs.rcvtid_size == round_mtu ||
   1889			    !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
   1890				dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
   1891					   rcd->ctxt);
   1892				ret = -ENOMEM;
   1893				goto bail_rcvegrbuf_phys;
   1894			}
   1895
   1896			new_size = rcd->egrbufs.rcvtid_size / 2;
   1897
   1898			/*
   1899			 * If the first attempt to allocate memory failed, don't
   1900			 * fail everything but continue with the next lower
   1901			 * size.
   1902			 */
   1903			if (idx == 0) {
   1904				rcd->egrbufs.rcvtid_size = new_size;
   1905				continue;
   1906			}
   1907
   1908			/*
   1909			 * Re-partition already allocated buffers to a smaller
   1910			 * size.
   1911			 */
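       			/*
       			 * Editor's sketch (illustrative numbers): if idx = 2
       			 * buffers of 64 KB each were already allocated and
       			 * new_size = 32 KB, the loop below yields four rcvtid
       			 * entries -- buf0+0, buf0+32 KB, buf1+0, buf1+32 KB --
       			 * and egrbufs.alloced ends up as 4.
       			 */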
   1912			rcd->egrbufs.alloced = 0;
   1913			for (i = 0, j = 0, offset = 0; j < idx; i++) {
   1914				if (i >= rcd->egrbufs.count)
   1915					break;
   1916				rcd->egrbufs.rcvtids[i].dma =
   1917					rcd->egrbufs.buffers[j].dma + offset;
   1918				rcd->egrbufs.rcvtids[i].addr =
   1919					rcd->egrbufs.buffers[j].addr + offset;
   1920				rcd->egrbufs.alloced++;
   1921				if ((rcd->egrbufs.buffers[j].dma + offset +
   1922				     new_size) ==
   1923				    (rcd->egrbufs.buffers[j].dma +
   1924				     rcd->egrbufs.buffers[j].len)) {
   1925					j++;
   1926					offset = 0;
   1927				} else {
   1928					offset += new_size;
   1929				}
   1930			}
   1931			rcd->egrbufs.rcvtid_size = new_size;
   1932		}
   1933	}
   1934	rcd->egrbufs.numbufs = idx;
   1935	rcd->egrbufs.size = alloced_bytes;
   1936
   1937	hfi1_cdbg(PROC,
   1938		  "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB\n",
   1939		  rcd->ctxt, rcd->egrbufs.alloced,
   1940		  rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
   1941
   1942	/*
   1943	 * Set the context's rcv array head update threshold to the closest
   1944	 * power of 2 (so we can use a mask instead of modulo) below half
   1945	 * the allocated entries.
   1946	 */
   1947	rcd->egrbufs.threshold =
   1948		rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
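       	/*
       	 * Editor's sketch (illustrative numbers): with 52 allocated eager
       	 * entries, 52 / 2 = 26 and rounddown_pow_of_two(26) = 16, so
       	 * egrbufs.threshold becomes 16.
       	 */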
   1949	/*
   1950	 * Compute the expected RcvArray entry base. This is done after
   1951	 * allocating the eager buffers in order to maximize the
   1952	 * expected RcvArray entries for the context.
   1953	 */
   1954	max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
   1955	egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
   1956	rcd->expected_count = max_entries - egrtop;
   1957	if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
   1958		rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
   1959
   1960	rcd->expected_base = rcd->eager_base + egrtop;
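       	/*
       	 * Editor's sketch (illustrative numbers only, not hardware facts):
       	 * with group_size = 8, rcv_array_groups = 16 and 52 allocated eager
       	 * entries, max_entries = 128, egrtop = roundup(52, 8) = 56,
       	 * expected_count = 128 - 56 = 72 (subject to the
       	 * MAX_TID_PAIR_ENTRIES * 2 cap), and expected_base = eager_base + 56.
       	 */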
   1961	hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
   1962		  rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
   1963		  rcd->eager_base, rcd->expected_base);
   1964
   1965	if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
   1966		hfi1_cdbg(PROC,
   1967			  "ctxt%u: current Eager buffer size is invalid %u\n",
   1968			  rcd->ctxt, rcd->egrbufs.rcvtid_size);
   1969		ret = -EINVAL;
   1970		goto bail_rcvegrbuf_phys;
   1971	}
   1972
   1973	for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
   1974		hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
   1975			     rcd->egrbufs.rcvtids[idx].dma, order);
   1976		cond_resched();
   1977	}
   1978
   1979	return 0;
   1980
   1981bail_rcvegrbuf_phys:
   1982	for (idx = 0; idx < rcd->egrbufs.alloced &&
   1983	     rcd->egrbufs.buffers[idx].addr;
   1984	     idx++) {
   1985		dma_free_coherent(&dd->pcidev->dev,
   1986				  rcd->egrbufs.buffers[idx].len,
   1987				  rcd->egrbufs.buffers[idx].addr,
   1988				  rcd->egrbufs.buffers[idx].dma);
   1989		rcd->egrbufs.buffers[idx].addr = NULL;
   1990		rcd->egrbufs.buffers[idx].dma = 0;
   1991		rcd->egrbufs.buffers[idx].len = 0;
   1992	}
   1993
   1994	return ret;
   1995}