cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

smc_wr.c (25628B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
      4 *
      5 * Work Requests exploiting Infiniband API
      6 *
      7 * Work requests (WR) of type ib_post_send or ib_post_recv
      8 * are submitted to the RC SQ or RC RQ, respectively
      9 * (reliably connected send/receive queue)
     10 * and become work queue entries (WQEs).
     11 * While an SQ WR/WQE is pending, we track it until transmission completion.
     12 * Through a send or receive completion queue (CQ) respectively,
     13 * we get completion queue entries (CQEs) [aka work completions (WCs)].
     14 * Since the CQ callback is called from IRQ context, we split work by using
     15 * bottom halves implemented by tasklets.
     16 *
     17 * SMC uses this to exchange LLC (link layer control)
     18 * and CDC (connection data control) messages.
     19 *
     20 * Copyright IBM Corp. 2016
     21 *
     22 * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
     23 */
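
/*
 * Editorial sketch (not part of the original file): the send path through
 * this layer, as used by the LLC/CDC code, is assumed to look roughly as
 * follows; "fill_msg" and "tx_handler" are hypothetical placeholders:
 *
 *	struct smc_wr_tx_pend_priv *priv;
 *	struct smc_wr_buf *wr_buf;
 *	int rc;
 *
 *	rc = smc_wr_tx_get_free_slot(link, tx_handler, &wr_buf, NULL, &priv);
 *	if (rc)
 *		return rc;		// no free slot or link not sendable
 *	fill_msg(wr_buf);		// assemble the message in the slot buffer
 *	rc = smc_wr_tx_send(link, priv);	// post the WR to the RC send queue
 *
 * The send completion is polled by smc_wr_tx_tasklet_fn() and dispatched
 * to tx_handler via smc_wr_tx_process_cqe().
 */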
     24
     25#include <linux/atomic.h>
     26#include <linux/hashtable.h>
     27#include <linux/wait.h>
     28#include <rdma/ib_verbs.h>
     29#include <asm/div64.h>
     30
     31#include "smc.h"
     32#include "smc_wr.h"
     33
     34#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */
     35
     36#define SMC_WR_RX_HASH_BITS 4
     37static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
     38static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
     39
     40struct smc_wr_tx_pend {	/* control data for a pending send request */
     41	u64			wr_id;		/* work request id sent */
     42	smc_wr_tx_handler	handler;
     43	enum ib_wc_status	wc_status;	/* CQE status */
     44	struct smc_link		*link;
     45	u32			idx;
     46	struct smc_wr_tx_pend_priv priv;
     47	u8			compl_requested;
     48};
     49
     50/******************************** send queue *********************************/
     51
     52/*------------------------------- completion --------------------------------*/
     53
     54/* returns true if at least one tx work request is pending on the given link */
     55static inline bool smc_wr_is_tx_pend(struct smc_link *link)
     56{
     57	return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt);
     58}
     59
     60/* wait till all pending tx work requests on the given link are completed */
     61void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
     62{
     63	wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
     64}
     65
     66static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
     67{
     68	u32 i;
     69
     70	for (i = 0; i < link->wr_tx_cnt; i++) {
     71		if (link->wr_tx_pends[i].wr_id == wr_id)
     72			return i;
     73	}
     74	return link->wr_tx_cnt;
     75}
     76
     77static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
     78{
     79	struct smc_wr_tx_pend pnd_snd;
     80	struct smc_link *link;
     81	u32 pnd_snd_idx;
     82
     83	link = wc->qp->qp_context;
     84
     85	if (wc->opcode == IB_WC_REG_MR) {
     86		if (wc->status)
     87			link->wr_reg_state = FAILED;
     88		else
     89			link->wr_reg_state = CONFIRMED;
     90		smc_wr_wakeup_reg_wait(link);
     91		return;
     92	}
     93
     94	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
     95	if (pnd_snd_idx == link->wr_tx_cnt) {
     96		if (link->lgr->smc_version != SMC_V2 ||
     97		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
     98			return;
     99		link->wr_tx_v2_pend->wc_status = wc->status;
    100		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
    101		/* clear the full struct smc_wr_tx_pend including .priv */
    102		memset(link->wr_tx_v2_pend, 0,
    103		       sizeof(*link->wr_tx_v2_pend));
    104		memset(link->lgr->wr_tx_buf_v2, 0,
    105		       sizeof(*link->lgr->wr_tx_buf_v2));
    106	} else {
    107		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
    108		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
    109			complete(&link->wr_tx_compl[pnd_snd_idx]);
    110		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
    111		       sizeof(pnd_snd));
    112		/* clear the full struct smc_wr_tx_pend including .priv */
    113		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
    114		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
    115		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
    116		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
    117		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
    118			return;
    119	}
    120
    121	if (wc->status) {
    122		if (link->lgr->smc_version == SMC_V2) {
    123			memset(link->wr_tx_v2_pend, 0,
    124			       sizeof(*link->wr_tx_v2_pend));
    125			memset(link->lgr->wr_tx_buf_v2, 0,
    126			       sizeof(*link->lgr->wr_tx_buf_v2));
    127		}
    128		/* terminate link */
    129		smcr_link_down_cond_sched(link);
    130	}
    131	if (pnd_snd.handler)
    132		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
    133	wake_up(&link->wr_tx_wait);
    134}
    135
    136static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
    137{
    138	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
    139	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
    140	int i = 0, rc;
    141	int polled = 0;
    142
    143again:
    144	polled++;
    145	do {
    146		memset(&wc, 0, sizeof(wc));
    147		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
    148		if (polled == 1) {
    149			ib_req_notify_cq(dev->roce_cq_send,
    150					 IB_CQ_NEXT_COMP |
    151					 IB_CQ_REPORT_MISSED_EVENTS);
    152		}
    153		if (!rc)
    154			break;
    155		for (i = 0; i < rc; i++)
    156			smc_wr_tx_process_cqe(&wc[i]);
    157	} while (rc > 0);
    158	if (polled == 1)
    159		goto again;
    160}
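
/*
 * Editorial note: the extra pass after re-arming the CQ (polled == 1,
 * "goto again") catches completions that raced with ib_req_notify_cq();
 * without it such CQEs would only be reaped on the next CQ interrupt.
 */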
    161
    162void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
    163{
    164	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
    165
    166	tasklet_schedule(&dev->send_tasklet);
    167}
    168
    169/*---------------------------- request submission ---------------------------*/
    170
    171static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
    172{
    173	*idx = link->wr_tx_cnt;
    174	if (!smc_link_sendable(link))
    175		return -ENOLINK;
    176	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
    177		if (!test_and_set_bit(*idx, link->wr_tx_mask))
    178			return 0;
    179	}
    180	*idx = link->wr_tx_cnt;
    181	return -EBUSY;
    182}
    183
    184/**
    185 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
    186 *			and sets info for pending transmit tracking
    187 * @link:		Pointer to smc_link used to later send the message.
    188 * @handler:		Send completion handler function pointer.
    189 * @wr_buf:		Out value returns pointer to message buffer.
    190 * @wr_rdma_buf:	Out value returns pointer to rdma work request.
    191 * @wr_pend_priv:	Out value returns pointer serving as handler context.
    192 *
    193 * Return: 0 on success, or -errno on error.
    194 */
    195int smc_wr_tx_get_free_slot(struct smc_link *link,
    196			    smc_wr_tx_handler handler,
    197			    struct smc_wr_buf **wr_buf,
    198			    struct smc_rdma_wr **wr_rdma_buf,
    199			    struct smc_wr_tx_pend_priv **wr_pend_priv)
    200{
    201	struct smc_link_group *lgr = smc_get_lgr(link);
    202	struct smc_wr_tx_pend *wr_pend;
    203	u32 idx = link->wr_tx_cnt;
    204	struct ib_send_wr *wr_ib;
    205	u64 wr_id;
    206	int rc;
    207
    208	*wr_buf = NULL;
    209	*wr_pend_priv = NULL;
    210	if (in_softirq() || lgr->terminating) {
    211		rc = smc_wr_tx_get_free_slot_index(link, &idx);
    212		if (rc)
    213			return rc;
    214	} else {
    215		rc = wait_event_interruptible_timeout(
    216			link->wr_tx_wait,
    217			!smc_link_sendable(link) ||
    218			lgr->terminating ||
    219			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
    220			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
    221		if (!rc) {
    222			/* timeout - terminate link */
    223			smcr_link_down_cond_sched(link);
    224			return -EPIPE;
    225		}
    226		if (idx == link->wr_tx_cnt)
    227			return -EPIPE;
    228	}
    229	wr_id = smc_wr_tx_get_next_wr_id(link);
    230	wr_pend = &link->wr_tx_pends[idx];
    231	wr_pend->wr_id = wr_id;
    232	wr_pend->handler = handler;
    233	wr_pend->link = link;
    234	wr_pend->idx = idx;
    235	wr_ib = &link->wr_tx_ibs[idx];
    236	wr_ib->wr_id = wr_id;
    237	*wr_buf = &link->wr_tx_bufs[idx];
    238	if (wr_rdma_buf)
    239		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
    240	*wr_pend_priv = &wr_pend->priv;
    241	return 0;
    242}
    243
    244int smc_wr_tx_get_v2_slot(struct smc_link *link,
    245			  smc_wr_tx_handler handler,
    246			  struct smc_wr_v2_buf **wr_buf,
    247			  struct smc_wr_tx_pend_priv **wr_pend_priv)
    248{
    249	struct smc_wr_tx_pend *wr_pend;
    250	struct ib_send_wr *wr_ib;
    251	u64 wr_id;
    252
    253	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
    254		return -EBUSY;
    255
    256	*wr_buf = NULL;
    257	*wr_pend_priv = NULL;
    258	wr_id = smc_wr_tx_get_next_wr_id(link);
    259	wr_pend = link->wr_tx_v2_pend;
    260	wr_pend->wr_id = wr_id;
    261	wr_pend->handler = handler;
    262	wr_pend->link = link;
    263	wr_pend->idx = link->wr_tx_cnt;
    264	wr_ib = link->wr_tx_v2_ib;
    265	wr_ib->wr_id = wr_id;
    266	*wr_buf = link->lgr->wr_tx_buf_v2;
    267	*wr_pend_priv = &wr_pend->priv;
    268	return 0;
    269}
    270
    271int smc_wr_tx_put_slot(struct smc_link *link,
    272		       struct smc_wr_tx_pend_priv *wr_pend_priv)
    273{
    274	struct smc_wr_tx_pend *pend;
    275
    276	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
    277	if (pend->idx < link->wr_tx_cnt) {
    278		u32 idx = pend->idx;
    279
    280		/* clear the full struct smc_wr_tx_pend including .priv */
    281		memset(&link->wr_tx_pends[idx], 0,
    282		       sizeof(link->wr_tx_pends[idx]));
    283		memset(&link->wr_tx_bufs[idx], 0,
    284		       sizeof(link->wr_tx_bufs[idx]));
    285		test_and_clear_bit(idx, link->wr_tx_mask);
    286		wake_up(&link->wr_tx_wait);
    287		return 1;
    288	} else if (link->lgr->smc_version == SMC_V2 &&
    289		   pend->idx == link->wr_tx_cnt) {
    290		/* Large v2 buffer */
    291		memset(link->wr_tx_v2_pend, 0,
    292		       sizeof(*link->wr_tx_v2_pend));
    293		memset(link->lgr->wr_tx_buf_v2, 0,
    294		       sizeof(*link->lgr->wr_tx_buf_v2));
    295		return 1;
    296	}
    297
    298	return 0;
    299}
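
/*
 * Editorial note: smc_wr_tx_put_slot() is the undo operation for
 * smc_wr_tx_get_free_slot(); smc_wr_tx_send() below uses it to release the
 * reserved slot again when ib_post_send() fails.
 */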
    300
    301/* Send prepared WR slot via ib_post_send.
    302 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
    303 */
    304int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
    305{
    306	struct smc_wr_tx_pend *pend;
    307	int rc;
    308
    309	ib_req_notify_cq(link->smcibdev->roce_cq_send,
    310			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
    311	pend = container_of(priv, struct smc_wr_tx_pend, priv);
    312	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
    313	if (rc) {
    314		smc_wr_tx_put_slot(link, priv);
    315		smcr_link_down_cond_sched(link);
    316	}
    317	return rc;
    318}
    319
    320int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
    321		      int len)
    322{
    323	int rc;
    324
    325	link->wr_tx_v2_ib->sg_list[0].length = len;
    326	ib_req_notify_cq(link->smcibdev->roce_cq_send,
    327			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
    328	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
    329	if (rc) {
    330		smc_wr_tx_put_slot(link, priv);
    331		smcr_link_down_cond_sched(link);
    332	}
    333	return rc;
    334}
    335
    336/* Send prepared WR slot via ib_post_send and wait for send completion
    337 * notification.
    338 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
    339 */
    340int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
    341			unsigned long timeout)
    342{
    343	struct smc_wr_tx_pend *pend;
    344	u32 pnd_idx;
    345	int rc;
    346
    347	pend = container_of(priv, struct smc_wr_tx_pend, priv);
    348	pend->compl_requested = 1;
    349	pnd_idx = pend->idx;
    350	init_completion(&link->wr_tx_compl[pnd_idx]);
    351
    352	rc = smc_wr_tx_send(link, priv);
    353	if (rc)
    354		return rc;
    355	/* wait for completion by smc_wr_tx_process_cqe() */
    356	rc = wait_for_completion_interruptible_timeout(
    357					&link->wr_tx_compl[pnd_idx], timeout);
    358	if (rc <= 0)
    359		rc = -ENODATA;
    360	if (rc > 0)
    361		rc = 0;
    362	return rc;
    363}
    364
    365/* Register a memory region and wait for result. */
    366int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
    367{
    368	int rc;
    369
    370	ib_req_notify_cq(link->smcibdev->roce_cq_send,
    371			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
    372	link->wr_reg_state = POSTED;
    373	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
    374	link->wr_reg.mr = mr;
    375	link->wr_reg.key = mr->rkey;
    376	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
    377	if (rc)
    378		return rc;
    379
    380	atomic_inc(&link->wr_reg_refcnt);
    381	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
    382					      (link->wr_reg_state != POSTED),
    383					      SMC_WR_REG_MR_WAIT_TIME);
    384	if (atomic_dec_and_test(&link->wr_reg_refcnt))
    385		wake_up_all(&link->wr_reg_wait);
    386	if (!rc) {
    387		/* timeout - terminate link */
    388		smcr_link_down_cond_sched(link);
    389		return -EPIPE;
    390	}
    391	if (rc == -ERESTARTSYS)
    392		return -EINTR;
    393	switch (link->wr_reg_state) {
    394	case CONFIRMED:
    395		rc = 0;
    396		break;
    397	case FAILED:
    398		rc = -EIO;
    399		break;
    400	case POSTED:
    401		rc = -EPIPE;
    402		break;
    403	}
    404	return rc;
    405}
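
/*
 * Editorial note: wr_reg_state is moved from POSTED to CONFIRMED or FAILED
 * by smc_wr_tx_process_cqe() above once the IB_WC_REG_MR work completion
 * arrives, which is also what wakes up the wait here.
 */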
    406
    407/****************************** receive queue ********************************/
    408
    409int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
    410{
    411	struct smc_wr_rx_handler *h_iter;
    412	int rc = 0;
    413
    414	spin_lock(&smc_wr_rx_hash_lock);
    415	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
    416		if (h_iter->type == handler->type) {
    417			rc = -EEXIST;
    418			goto out_unlock;
    419		}
    420	}
    421	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
    422out_unlock:
    423	spin_unlock(&smc_wr_rx_hash_lock);
    424	return rc;
    425}
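
/*
 * Editorial sketch (an assumption; field names are taken from the lookups
 * in this file, the real users live in smc_llc.c and smc_cdc.c): a message
 * type is hooked up once at init time, before any receive WRs are posted:
 *
 *	static void demo_rx_handler(struct ib_wc *wc, void *buf)
 *	{
 *		struct smc_wr_rx_hdr *hdr = buf;
 *		// handle one received message of this type
 *	}
 *
 *	static struct smc_wr_rx_handler demo_rx = {
 *		.handler	= demo_rx_handler,
 *		.type		= 0x42,		// hypothetical message type
 *	};
 *
 *	rc = smc_wr_rx_register_handler(&demo_rx);	// -EEXIST if type taken
 */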
    426
    427/* Demultiplex a received work request to its handler based on the message type.
    428 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
    429 * and not being modified any more afterwards so we don't need to lock it.
    430 */
    431static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
    432{
    433	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
    434	struct smc_wr_rx_handler *handler;
    435	struct smc_wr_rx_hdr *wr_rx;
    436	u64 temp_wr_id;
    437	u32 index;
    438
    439	if (wc->byte_len < sizeof(*wr_rx))
    440		return; /* short message */
    441	temp_wr_id = wc->wr_id;
    442	index = do_div(temp_wr_id, link->wr_rx_cnt);
    443	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
    444	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
    445		if (handler->type == wr_rx->type)
    446			handler->handler(wc, wr_rx);
    447	}
    448}
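
/*
 * Editorial note: do_div() divides temp_wr_id in place and returns the
 * remainder, so index == wc->wr_id % link->wr_rx_cnt and receive wr_ids map
 * cyclically onto the wr_rx_bufs[] ring.
 */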
    449
    450static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
    451{
    452	struct smc_link *link;
    453	int i;
    454
    455	for (i = 0; i < num; i++) {
    456		link = wc[i].qp->qp_context;
    457		if (wc[i].status == IB_WC_SUCCESS) {
    458			link->wr_rx_tstamp = jiffies;
    459			smc_wr_rx_demultiplex(&wc[i]);
    460			smc_wr_rx_post(link); /* refill WR RX */
    461		} else {
    462			/* handle status errors */
    463			switch (wc[i].status) {
    464			case IB_WC_RETRY_EXC_ERR:
    465			case IB_WC_RNR_RETRY_EXC_ERR:
    466			case IB_WC_WR_FLUSH_ERR:
    467				smcr_link_down_cond_sched(link);
    468				break;
    469			default:
    470				smc_wr_rx_post(link); /* refill WR RX */
    471				break;
    472			}
    473		}
    474	}
    475}
    476
    477static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
    478{
    479	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
    480	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
    481	int polled = 0;
    482	int rc;
    483
    484again:
    485	polled++;
    486	do {
    487		memset(&wc, 0, sizeof(wc));
    488		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
    489		if (polled == 1) {
    490			ib_req_notify_cq(dev->roce_cq_recv,
    491					 IB_CQ_SOLICITED_MASK
    492					 | IB_CQ_REPORT_MISSED_EVENTS);
    493		}
    494		if (!rc)
    495			break;
    496		smc_wr_rx_process_cqes(&wc[0], rc);
    497	} while (rc > 0);
    498	if (polled == 1)
    499		goto again;
    500}
    501
    502void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
    503{
    504	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
    505
    506	tasklet_schedule(&dev->recv_tasklet);
    507}
    508
    509int smc_wr_rx_post_init(struct smc_link *link)
    510{
    511	u32 i;
    512	int rc = 0;
    513
    514	for (i = 0; i < link->wr_rx_cnt; i++)
    515		rc = smc_wr_rx_post(link);
    516	return rc;
    517}
    518
    519/***************************** init, exit, misc ******************************/
    520
    521void smc_wr_remember_qp_attr(struct smc_link *lnk)
    522{
    523	struct ib_qp_attr *attr = &lnk->qp_attr;
    524	struct ib_qp_init_attr init_attr;
    525
    526	memset(attr, 0, sizeof(*attr));
    527	memset(&init_attr, 0, sizeof(init_attr));
    528	ib_query_qp(lnk->roce_qp, attr,
    529		    IB_QP_STATE |
    530		    IB_QP_CUR_STATE |
    531		    IB_QP_PKEY_INDEX |
    532		    IB_QP_PORT |
    533		    IB_QP_QKEY |
    534		    IB_QP_AV |
    535		    IB_QP_PATH_MTU |
    536		    IB_QP_TIMEOUT |
    537		    IB_QP_RETRY_CNT |
    538		    IB_QP_RNR_RETRY |
    539		    IB_QP_RQ_PSN |
    540		    IB_QP_ALT_PATH |
    541		    IB_QP_MIN_RNR_TIMER |
    542		    IB_QP_SQ_PSN |
    543		    IB_QP_PATH_MIG_STATE |
    544		    IB_QP_CAP |
    545		    IB_QP_DEST_QPN,
    546		    &init_attr);
    547
    548	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
    549			       lnk->qp_attr.cap.max_send_wr);
    550	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
    551			       lnk->qp_attr.cap.max_recv_wr);
    552}
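
/*
 * Editorial example: assuming SMC_WR_BUF_CNT is 16 (see smc_wr.h), a device
 * advertising max_send_wr = max_recv_wr = 1024 yields wr_tx_cnt = 16 send
 * slots and wr_rx_cnt = 48 receive buffers per link, while a device with
 * max_recv_wr = 32 would cap wr_rx_cnt at 32.
 */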
    553
    554static void smc_wr_init_sge(struct smc_link *lnk)
    555{
    556	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
    557	bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
    558	u32 i;
    559
    560	for (i = 0; i < lnk->wr_tx_cnt; i++) {
    561		lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
    562			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
    563		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
    564		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
    565		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
    566			lnk->roce_pd->local_dma_lkey;
    567		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
    568			lnk->roce_pd->local_dma_lkey;
    569		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
    570			lnk->roce_pd->local_dma_lkey;
    571		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
    572			lnk->roce_pd->local_dma_lkey;
    573		lnk->wr_tx_ibs[i].next = NULL;
    574		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
    575		lnk->wr_tx_ibs[i].num_sge = 1;
    576		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
    577		lnk->wr_tx_ibs[i].send_flags =
    578			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
    579		if (send_inline)
    580			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
    581		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
    582		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
    583		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
    584			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
    585		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
    586			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
    587	}
    588
    589	if (lnk->lgr->smc_version == SMC_V2) {
    590		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
    591		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
    592		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
    593
    594		lnk->wr_tx_v2_ib->next = NULL;
    595		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
    596		lnk->wr_tx_v2_ib->num_sge = 1;
    597		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
    598		lnk->wr_tx_v2_ib->send_flags =
    599			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
    600	}
    601
    602	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
    603	 * Each ib_recv_wr gets 2 sges; the second one points to a spillover
    604	 * buffer that is shared by all receive WRs. When a larger message
    605	 * arrives, the content of the first small sge is copied to the
    606	 * beginning of the larger spillover buffer, allowing easy data mapping.
    607	 */
    608	for (i = 0; i < lnk->wr_rx_cnt; i++) {
    609		int x = i * sges_per_buf;
    610
    611		lnk->wr_rx_sges[x].addr =
    612			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
    613		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
    614		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
    615		if (lnk->lgr->smc_version == SMC_V2) {
    616			lnk->wr_rx_sges[x + 1].addr =
    617					lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
    618			lnk->wr_rx_sges[x + 1].length =
    619					SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
    620			lnk->wr_rx_sges[x + 1].lkey =
    621					lnk->roce_pd->local_dma_lkey;
    622		}
    623		lnk->wr_rx_ibs[i].next = NULL;
    624		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
    625		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
    626	}
    627	lnk->wr_reg.wr.next = NULL;
    628	lnk->wr_reg.wr.num_sge = 0;
    629	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
    630	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
    631	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
    632}
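
/*
 * Editorial note: with send_inline the sge carries the kernel virtual
 * address of the slot buffer, since IB_SEND_INLINE makes the provider copy
 * the payload at post time; otherwise the DMA address of the mapped
 * wr_tx_bufs region is used together with the local_dma_lkey.
 */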
    633
    634void smc_wr_free_link(struct smc_link *lnk)
    635{
    636	struct ib_device *ibdev;
    637
    638	if (!lnk->smcibdev)
    639		return;
    640	ibdev = lnk->smcibdev->ibdev;
    641
    642	smc_wr_wakeup_reg_wait(lnk);
    643	smc_wr_wakeup_tx_wait(lnk);
    644
    645	smc_wr_tx_wait_no_pending_sends(lnk);
    646	wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
    647	wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
    648
    649	if (lnk->wr_rx_dma_addr) {
    650		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
    651				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
    652				    DMA_FROM_DEVICE);
    653		lnk->wr_rx_dma_addr = 0;
    654	}
    655	if (lnk->wr_rx_v2_dma_addr) {
    656		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
    657				    SMC_WR_BUF_V2_SIZE,
    658				    DMA_FROM_DEVICE);
    659		lnk->wr_rx_v2_dma_addr = 0;
    660	}
    661	if (lnk->wr_tx_dma_addr) {
    662		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
    663				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
    664				    DMA_TO_DEVICE);
    665		lnk->wr_tx_dma_addr = 0;
    666	}
    667	if (lnk->wr_tx_v2_dma_addr) {
    668		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
    669				    SMC_WR_BUF_V2_SIZE,
    670				    DMA_TO_DEVICE);
    671		lnk->wr_tx_v2_dma_addr = 0;
    672	}
    673}
    674
    675void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
    676{
    677	if (lgr->smc_version < SMC_V2)
    678		return;
    679
    680	kfree(lgr->wr_rx_buf_v2);
    681	lgr->wr_rx_buf_v2 = NULL;
    682	kfree(lgr->wr_tx_buf_v2);
    683	lgr->wr_tx_buf_v2 = NULL;
    684}
    685
    686void smc_wr_free_link_mem(struct smc_link *lnk)
    687{
    688	kfree(lnk->wr_tx_v2_ib);
    689	lnk->wr_tx_v2_ib = NULL;
    690	kfree(lnk->wr_tx_v2_sge);
    691	lnk->wr_tx_v2_sge = NULL;
    692	kfree(lnk->wr_tx_v2_pend);
    693	lnk->wr_tx_v2_pend = NULL;
    694	kfree(lnk->wr_tx_compl);
    695	lnk->wr_tx_compl = NULL;
    696	kfree(lnk->wr_tx_pends);
    697	lnk->wr_tx_pends = NULL;
    698	bitmap_free(lnk->wr_tx_mask);
    699	lnk->wr_tx_mask = NULL;
    700	kfree(lnk->wr_tx_sges);
    701	lnk->wr_tx_sges = NULL;
    702	kfree(lnk->wr_tx_rdma_sges);
    703	lnk->wr_tx_rdma_sges = NULL;
    704	kfree(lnk->wr_rx_sges);
    705	lnk->wr_rx_sges = NULL;
    706	kfree(lnk->wr_tx_rdmas);
    707	lnk->wr_tx_rdmas = NULL;
    708	kfree(lnk->wr_rx_ibs);
    709	lnk->wr_rx_ibs = NULL;
    710	kfree(lnk->wr_tx_ibs);
    711	lnk->wr_tx_ibs = NULL;
    712	kfree(lnk->wr_tx_bufs);
    713	lnk->wr_tx_bufs = NULL;
    714	kfree(lnk->wr_rx_bufs);
    715	lnk->wr_rx_bufs = NULL;
    716}
    717
    718int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
    719{
    720	if (lgr->smc_version < SMC_V2)
    721		return 0;
    722
    723	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
    724	if (!lgr->wr_rx_buf_v2)
    725		return -ENOMEM;
    726	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
    727	if (!lgr->wr_tx_buf_v2) {
    728		kfree(lgr->wr_rx_buf_v2);
    729		return -ENOMEM;
    730	}
    731	return 0;
    732}
    733
    734int smc_wr_alloc_link_mem(struct smc_link *link)
    735{
    736	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;
    737
    738	/* allocate link related memory */
    739	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
    740	if (!link->wr_tx_bufs)
    741		goto no_mem;
    742	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
    743				   GFP_KERNEL);
    744	if (!link->wr_rx_bufs)
    745		goto no_mem_wr_tx_bufs;
    746	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
    747				  GFP_KERNEL);
    748	if (!link->wr_tx_ibs)
    749		goto no_mem_wr_rx_bufs;
    750	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
    751				  sizeof(link->wr_rx_ibs[0]),
    752				  GFP_KERNEL);
    753	if (!link->wr_rx_ibs)
    754		goto no_mem_wr_tx_ibs;
    755	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
    756				    sizeof(link->wr_tx_rdmas[0]),
    757				    GFP_KERNEL);
    758	if (!link->wr_tx_rdmas)
    759		goto no_mem_wr_rx_ibs;
    760	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
    761					sizeof(link->wr_tx_rdma_sges[0]),
    762					GFP_KERNEL);
    763	if (!link->wr_tx_rdma_sges)
    764		goto no_mem_wr_tx_rdmas;
    765	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
    766				   GFP_KERNEL);
    767	if (!link->wr_tx_sges)
    768		goto no_mem_wr_tx_rdma_sges;
    769	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
    770				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
    771				   GFP_KERNEL);
    772	if (!link->wr_rx_sges)
    773		goto no_mem_wr_tx_sges;
    774	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
    775	if (!link->wr_tx_mask)
    776		goto no_mem_wr_rx_sges;
    777	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
    778				    sizeof(link->wr_tx_pends[0]),
    779				    GFP_KERNEL);
    780	if (!link->wr_tx_pends)
    781		goto no_mem_wr_tx_mask;
    782	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
    783				    sizeof(link->wr_tx_compl[0]),
    784				    GFP_KERNEL);
    785	if (!link->wr_tx_compl)
    786		goto no_mem_wr_tx_pends;
    787
    788	if (link->lgr->smc_version == SMC_V2) {
    789		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
    790					    GFP_KERNEL);
    791		if (!link->wr_tx_v2_ib)
    792			goto no_mem_tx_compl;
    793		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
    794					     GFP_KERNEL);
    795		if (!link->wr_tx_v2_sge)
    796			goto no_mem_v2_ib;
    797		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
    798					      GFP_KERNEL);
    799		if (!link->wr_tx_v2_pend)
    800			goto no_mem_v2_sge;
    801	}
    802	return 0;
    803
    804no_mem_v2_sge:
    805	kfree(link->wr_tx_v2_sge);
    806no_mem_v2_ib:
    807	kfree(link->wr_tx_v2_ib);
    808no_mem_tx_compl:
    809	kfree(link->wr_tx_compl);
    810no_mem_wr_tx_pends:
    811	kfree(link->wr_tx_pends);
    812no_mem_wr_tx_mask:
    813	kfree(link->wr_tx_mask);
    814no_mem_wr_rx_sges:
    815	kfree(link->wr_rx_sges);
    816no_mem_wr_tx_sges:
    817	kfree(link->wr_tx_sges);
    818no_mem_wr_tx_rdma_sges:
    819	kfree(link->wr_tx_rdma_sges);
    820no_mem_wr_tx_rdmas:
    821	kfree(link->wr_tx_rdmas);
    822no_mem_wr_rx_ibs:
    823	kfree(link->wr_rx_ibs);
    824no_mem_wr_tx_ibs:
    825	kfree(link->wr_tx_ibs);
    826no_mem_wr_rx_bufs:
    827	kfree(link->wr_rx_bufs);
    828no_mem_wr_tx_bufs:
    829	kfree(link->wr_tx_bufs);
    830no_mem:
    831	return -ENOMEM;
    832}
    833
    834void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
    835{
    836	tasklet_kill(&smcibdev->recv_tasklet);
    837	tasklet_kill(&smcibdev->send_tasklet);
    838}
    839
    840void smc_wr_add_dev(struct smc_ib_device *smcibdev)
    841{
    842	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
    843	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
    844}
    845
    846int smc_wr_create_link(struct smc_link *lnk)
    847{
    848	struct ib_device *ibdev = lnk->smcibdev->ibdev;
    849	int rc = 0;
    850
    851	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
    852	lnk->wr_rx_id = 0;
    853	lnk->wr_rx_dma_addr = ib_dma_map_single(
    854		ibdev, lnk->wr_rx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
    855		DMA_FROM_DEVICE);
    856	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
    857		lnk->wr_rx_dma_addr = 0;
    858		rc = -EIO;
    859		goto out;
    860	}
    861	if (lnk->lgr->smc_version == SMC_V2) {
    862		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
    863			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
    864			DMA_FROM_DEVICE);
    865		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
    866			lnk->wr_rx_v2_dma_addr = 0;
    867			rc = -EIO;
    868			goto dma_unmap;
    869		}
    870		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
    871			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
    872			DMA_TO_DEVICE);
    873		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
    874			lnk->wr_tx_v2_dma_addr = 0;
    875			rc = -EIO;
    876			goto dma_unmap;
    877		}
    878	}
    879	lnk->wr_tx_dma_addr = ib_dma_map_single(
    880		ibdev, lnk->wr_tx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
    881		DMA_TO_DEVICE);
    882	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
    883		rc = -EIO;
    884		goto dma_unmap;
    885	}
    886	smc_wr_init_sge(lnk);
    887	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
    888	init_waitqueue_head(&lnk->wr_tx_wait);
    889	atomic_set(&lnk->wr_tx_refcnt, 0);
    890	init_waitqueue_head(&lnk->wr_reg_wait);
    891	atomic_set(&lnk->wr_reg_refcnt, 0);
    892	return rc;
    893
    894dma_unmap:
    895	if (lnk->wr_rx_v2_dma_addr) {
    896		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
    897				    SMC_WR_BUF_V2_SIZE,
    898				    DMA_FROM_DEVICE);
    899		lnk->wr_rx_v2_dma_addr = 0;
    900	}
    901	if (lnk->wr_tx_v2_dma_addr) {
    902		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
    903				    SMC_WR_BUF_V2_SIZE,
    904				    DMA_TO_DEVICE);
    905		lnk->wr_tx_v2_dma_addr = 0;
    906	}
    907	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
    908			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
    909			    DMA_FROM_DEVICE);
    910	lnk->wr_rx_dma_addr = 0;
    911out:
    912	return rc;
    913}
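
/*
 * Editorial summary (assumed call order, driven by smc_ib.c and smc_core.c):
 * smc_wr_add_dev() arms the CQ tasklets when a RoCE device is registered;
 * per link group, smc_wr_alloc_lgr_mem() provides the SMC-Rv2 buffers, and
 * per link, smc_wr_alloc_link_mem(), smc_wr_create_link() and
 * smc_wr_rx_post_init() allocate, DMA-map and pre-post the receive buffers.
 * Teardown runs the mirror image: smc_wr_free_link(), smc_wr_free_link_mem(),
 * smc_wr_free_lgr_mem() and finally smc_wr_remove_dev().
 */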