cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

siw_qp_rx.c (37431B)


      1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
      2
      3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
      4/* Copyright (c) 2008-2019, IBM Corporation */
      5
      6#include <linux/errno.h>
      7#include <linux/types.h>
      8#include <linux/net.h>
      9#include <linux/scatterlist.h>
     10#include <linux/highmem.h>
     11
     12#include <rdma/iw_cm.h>
     13#include <rdma/ib_verbs.h>
     14
     15#include "siw.h"
     16#include "siw_verbs.h"
     17#include "siw_mem.h"
     18
     19/*
     20 * siw_rx_umem()
     21 *
     22 * Receive data of @len into target referenced by @dest_addr.
     23 *
     24 * @srx:	Receive Context
     25 * @umem:	siw representation of target memory
     26 * @dest_addr:	user virtual address
     27 * @len:	number of bytes to place
     28 */
     29static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
     30		       u64 dest_addr, int len)
     31{
     32	int copied = 0;
     33
     34	while (len) {
     35		struct page *p;
     36		int pg_off, bytes, rv;
     37		void *dest;
     38
     39		p = siw_get_upage(umem, dest_addr);
     40		if (unlikely(!p)) {
     41			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
     42				__func__, qp_id(rx_qp(srx)),
     43				(void *)(uintptr_t)dest_addr,
     44				(void *)(uintptr_t)umem->fp_addr);
     45			/* siw internal error */
     46			srx->skb_copied += copied;
     47			srx->skb_new -= copied;
     48
     49			return -EFAULT;
     50		}
     51		pg_off = dest_addr & ~PAGE_MASK;
     52		bytes = min(len, (int)PAGE_SIZE - pg_off);
     53
     54		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
     55
     56		dest = kmap_atomic(p);
     57		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
     58				   bytes);
     59
     60		if (unlikely(rv)) {
     61			kunmap_atomic(dest);
     62			srx->skb_copied += copied;
     63			srx->skb_new -= copied;
     64
     65			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
     66				qp_id(rx_qp(srx)), __func__, len, p, rv);
     67
     68			return -EFAULT;
     69		}
     70		if (srx->mpa_crc_hd) {
     71			if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
     72				crypto_shash_update(srx->mpa_crc_hd,
     73					(u8 *)(dest + pg_off), bytes);
     74				kunmap_atomic(dest);
     75			} else {
     76				kunmap_atomic(dest);
     77				/*
     78				 * Do CRC on original, not target buffer.
     79				 * Some user land applications may
     80				 * concurrently write the target buffer,
     81				 * which would yield a broken CRC.
      82				 * Walking the skb twice is very inefficient.
     83				 * Folding the CRC into skb_copy_bits()
     84				 * would be much better, but is currently
     85				 * not supported.
     86				 */
     87				siw_crc_skb(srx, bytes);
     88			}
     89		} else {
     90			kunmap_atomic(dest);
     91		}
     92		srx->skb_offset += bytes;
     93		copied += bytes;
     94		len -= bytes;
     95		dest_addr += bytes;
     96		pg_off = 0;
     97	}
     98	srx->skb_copied += copied;
     99	srx->skb_new -= copied;
    100
    101	return copied;
    102}
    103
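/*
 * siw_rx_kva()
 *
 * Receive data of @len into target referenced by kernel virtual
 * address @kva. Copies from the current skb position and, if MPA CRC
 * checking is enabled for this stream, folds the copied bytes into
 * the running CRC.
 *
 * @srx:	Receive Context
 * @kva:	target kernel virtual address
 * @len:	number of bytes to place
 */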
    104static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
    105{
    106	int rv;
    107
    108	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
    109
    110	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
    111	if (unlikely(rv)) {
    112		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
    113			qp_id(rx_qp(srx)), __func__, len, kva, rv);
    114
    115		return rv;
    116	}
    117	if (srx->mpa_crc_hd)
    118		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
    119
    120	srx->skb_offset += len;
    121	srx->skb_copied += len;
    122	srx->skb_new -= len;
    123
    124	return len;
    125}
    126
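/*
 * siw_rx_pbl()
 *
 * Receive data of @len into memory referenced by a physical buffer
 * list (PBL). Resolves the PBL element covering the current target
 * address via siw_pbl_get_buffer() and places the data chunk-wise
 * through siw_rx_kva().
 *
 * @srx:	Receive Context
 * @pbl_idx:	current position within the PBL, kept across calls
 * @mem:	siw memory object holding the PBL
 * @addr:	target address within the memory region
 * @len:	number of bytes to place
 */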
    127static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
    128		      struct siw_mem *mem, u64 addr, int len)
    129{
    130	struct siw_pbl *pbl = mem->pbl;
    131	u64 offset = addr - mem->va;
    132	int copied = 0;
    133
    134	while (len) {
    135		int bytes;
    136		dma_addr_t buf_addr =
    137			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
    138		if (!buf_addr)
    139			break;
    140
    141		bytes = min(bytes, len);
    142		if (siw_rx_kva(srx, (void *)(uintptr_t)buf_addr, bytes) ==
    143		    bytes) {
    144			copied += bytes;
    145			offset += bytes;
    146			len -= bytes;
    147		} else {
    148			break;
    149		}
    150	}
    151	return copied;
    152}
    153
    154/*
    155 * siw_rresp_check_ntoh()
    156 *
    157 * Check incoming RRESP fragment header against expected
    158 * header values and update expected values for potential next
    159 * fragment.
    160 *
    161 * NOTE: This function must be called only if a RRESP DDP segment
    162 *       starts but not for fragmented consecutive pieces of an
    163 *       already started DDP segment.
    164 */
    165static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
    166				struct siw_rx_fpdu *frx)
    167{
    168	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
    169	struct siw_wqe *wqe = &frx->wqe_active;
    170	enum ddp_ecode ecode;
    171
    172	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
    173	u64 sink_to = be64_to_cpu(rresp->sink_to);
    174
    175	if (frx->first_ddp_seg) {
    176		srx->ddp_stag = wqe->sqe.sge[0].lkey;
    177		srx->ddp_to = wqe->sqe.sge[0].laddr;
    178		frx->pbl_idx = 0;
    179	}
    180	/* Below checks extend beyond the semantics of DDP, and
    181	 * into RDMAP:
    182	 * We check if the read response matches exactly the
     183	 * read request which was sent to the remote peer to
    184	 * trigger this read response. RFC5040/5041 do not
    185	 * always have a proper error code for the detected
    186	 * error cases. We choose 'base or bounds error' for
    187	 * cases where the inbound STag is valid, but offset
    188	 * or length do not match our response receive state.
    189	 */
    190	if (unlikely(srx->ddp_stag != sink_stag)) {
    191		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
    192			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
    193		ecode = DDP_ECODE_T_INVALID_STAG;
    194		goto error;
    195	}
    196	if (unlikely(srx->ddp_to != sink_to)) {
    197		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
    198			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
    199			(unsigned long long)srx->ddp_to);
    200		ecode = DDP_ECODE_T_BASE_BOUNDS;
    201		goto error;
    202	}
    203	if (unlikely(!frx->more_ddp_segs &&
    204		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
    205		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
    206			qp_id(rx_qp(srx)),
    207			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
    208		ecode = DDP_ECODE_T_BASE_BOUNDS;
    209		goto error;
    210	}
    211	return 0;
    212error:
    213	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
    214			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
    215	return -EINVAL;
    216}
    217
    218/*
    219 * siw_write_check_ntoh()
    220 *
    221 * Check incoming WRITE fragment header against expected
    222 * header values and update expected values for potential next
    223 * fragment
    224 *
    225 * NOTE: This function must be called only if a WRITE DDP segment
    226 *       starts but not for fragmented consecutive pieces of an
    227 *       already started DDP segment.
    228 */
    229static int siw_write_check_ntoh(struct siw_rx_stream *srx,
    230				struct siw_rx_fpdu *frx)
    231{
    232	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
    233	enum ddp_ecode ecode;
    234
    235	u32 sink_stag = be32_to_cpu(write->sink_stag);
    236	u64 sink_to = be64_to_cpu(write->sink_to);
    237
    238	if (frx->first_ddp_seg) {
    239		srx->ddp_stag = sink_stag;
    240		srx->ddp_to = sink_to;
    241		frx->pbl_idx = 0;
    242	} else {
    243		if (unlikely(srx->ddp_stag != sink_stag)) {
    244			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
    245				qp_id(rx_qp(srx)), sink_stag,
    246				srx->ddp_stag);
    247			ecode = DDP_ECODE_T_INVALID_STAG;
    248			goto error;
    249		}
    250		if (unlikely(srx->ddp_to != sink_to)) {
    251			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
    252				qp_id(rx_qp(srx)),
    253				(unsigned long long)sink_to,
    254				(unsigned long long)srx->ddp_to);
    255			ecode = DDP_ECODE_T_BASE_BOUNDS;
    256			goto error;
    257		}
    258	}
    259	return 0;
    260error:
    261	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
    262			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
    263	return -EINVAL;
    264}
    265
    266/*
    267 * siw_send_check_ntoh()
    268 *
    269 * Check incoming SEND fragment header against expected
    270 * header values and update expected MSN if no next
    271 * fragment expected
    272 *
    273 * NOTE: This function must be called only if a SEND DDP segment
    274 *       starts but not for fragmented consecutive pieces of an
    275 *       already started DDP segment.
    276 */
    277static int siw_send_check_ntoh(struct siw_rx_stream *srx,
    278			       struct siw_rx_fpdu *frx)
    279{
    280	struct iwarp_send_inv *send = &srx->hdr.send_inv;
    281	struct siw_wqe *wqe = &frx->wqe_active;
    282	enum ddp_ecode ecode;
    283
    284	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
    285	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
    286	u32 ddp_qn = be32_to_cpu(send->ddp_qn);
    287
    288	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
    289		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
    290			qp_id(rx_qp(srx)), ddp_qn);
    291		ecode = DDP_ECODE_UT_INVALID_QN;
    292		goto error;
    293	}
    294	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
    295		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
    296			qp_id(rx_qp(srx)), ddp_msn,
    297			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
    298		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
    299		goto error;
    300	}
    301	if (unlikely(ddp_mo != wqe->processed)) {
    302		pr_warn("siw: [QP %u], send mo: %u != %u\n",
    303			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
    304		ecode = DDP_ECODE_UT_INVALID_MO;
    305		goto error;
    306	}
    307	if (frx->first_ddp_seg) {
    308		/* initialize user memory write position */
    309		frx->sge_idx = 0;
    310		frx->sge_off = 0;
    311		frx->pbl_idx = 0;
    312
    313		/* only valid for SEND_INV and SEND_SE_INV operations */
    314		srx->inval_stag = be32_to_cpu(send->inval_stag);
    315	}
    316	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
    317		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
    318			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
    319		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
    320		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
    321		goto error;
    322	}
    323	return 0;
    324error:
    325	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
    326			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
    327	return -EINVAL;
    328}
    329
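/*
 * siw_rqe_get()
 *
 * Fetch the next valid receive queue element from the QP's RQ, or
 * from the attached SRQ if present, and set it up as the QP's
 * current untagged rx WQE. For an SRQ, also test the SRQ limit and
 * flag an IB_EVENT_SRQ_LIMIT_REACHED event if armed and reached.
 *
 * Returns the prepared WQE, or NULL if no receive buffer is available.
 */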
    330static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
    331{
    332	struct siw_rqe *rqe;
    333	struct siw_srq *srq;
    334	struct siw_wqe *wqe = NULL;
    335	bool srq_event = false;
    336	unsigned long flags;
    337
    338	srq = qp->srq;
    339	if (srq) {
    340		spin_lock_irqsave(&srq->lock, flags);
    341		if (unlikely(!srq->num_rqe))
    342			goto out;
    343
    344		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
    345	} else {
    346		if (unlikely(!qp->recvq))
    347			goto out;
    348
    349		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
    350	}
    351	if (likely(rqe->flags == SIW_WQE_VALID)) {
    352		int num_sge = rqe->num_sge;
    353
    354		if (likely(num_sge <= SIW_MAX_SGE)) {
    355			int i = 0;
    356
    357			wqe = rx_wqe(&qp->rx_untagged);
    358			rx_type(wqe) = SIW_OP_RECEIVE;
    359			wqe->wr_status = SIW_WR_INPROGRESS;
    360			wqe->bytes = 0;
    361			wqe->processed = 0;
    362
    363			wqe->rqe.id = rqe->id;
    364			wqe->rqe.num_sge = num_sge;
    365
    366			while (i < num_sge) {
    367				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
    368				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
    369				wqe->rqe.sge[i].length = rqe->sge[i].length;
    370				wqe->bytes += wqe->rqe.sge[i].length;
    371				wqe->mem[i] = NULL;
    372				i++;
    373			}
    374			/* can be re-used by appl */
    375			smp_store_mb(rqe->flags, 0);
    376		} else {
    377			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
    378			if (srq)
    379				spin_unlock_irqrestore(&srq->lock, flags);
    380			return NULL;
    381		}
    382		if (!srq) {
    383			qp->rq_get++;
    384		} else {
    385			if (srq->armed) {
    386				/* Test SRQ limit */
    387				u32 off = (srq->rq_get + srq->limit) %
    388					  srq->num_rqe;
    389				struct siw_rqe *rqe2 = &srq->recvq[off];
    390
    391				if (!(rqe2->flags & SIW_WQE_VALID)) {
    392					srq->armed = false;
    393					srq_event = true;
    394				}
    395			}
    396			srq->rq_get++;
    397		}
    398	}
    399out:
    400	if (srq) {
    401		spin_unlock_irqrestore(&srq->lock, flags);
    402		if (srq_event)
    403			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
    404	}
    405	return wqe;
    406}
    407
    408/*
    409 * siw_proc_send:
    410 *
    411 * Process one incoming SEND and place data into memory referenced by
    412 * receive wqe.
    413 *
    414 * Function supports partially received sends (suspending/resuming
    415 * current receive wqe processing)
    416 *
    417 * return value:
    418 *	0:       reached the end of a DDP segment
    419 *	-EAGAIN: to be called again to finish the DDP segment
    420 */
    421int siw_proc_send(struct siw_qp *qp)
    422{
    423	struct siw_rx_stream *srx = &qp->rx_stream;
    424	struct siw_rx_fpdu *frx = &qp->rx_untagged;
    425	struct siw_wqe *wqe;
    426	u32 data_bytes; /* all data bytes available */
    427	u32 rcvd_bytes; /* sum of data bytes rcvd */
    428	int rv = 0;
    429
    430	if (frx->first_ddp_seg) {
    431		wqe = siw_rqe_get(qp);
    432		if (unlikely(!wqe)) {
    433			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    434					   DDP_ETYPE_UNTAGGED_BUF,
    435					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
    436			return -ENOENT;
    437		}
    438	} else {
    439		wqe = rx_wqe(frx);
    440	}
    441	if (srx->state == SIW_GET_DATA_START) {
    442		rv = siw_send_check_ntoh(srx, frx);
    443		if (unlikely(rv)) {
    444			siw_qp_event(qp, IB_EVENT_QP_FATAL);
    445			return rv;
    446		}
    447		if (!srx->fpdu_part_rem) /* zero length SEND */
    448			return 0;
    449	}
    450	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
    451	rcvd_bytes = 0;
    452
    453	/* A zero length SEND will skip below loop */
    454	while (data_bytes) {
    455		struct ib_pd *pd;
    456		struct siw_mem **mem, *mem_p;
    457		struct siw_sge *sge;
    458		u32 sge_bytes; /* data bytes avail for SGE */
    459
    460		sge = &wqe->rqe.sge[frx->sge_idx];
    461
    462		if (!sge->length) {
    463			/* just skip empty sge's */
    464			frx->sge_idx++;
    465			frx->sge_off = 0;
    466			frx->pbl_idx = 0;
    467			continue;
    468		}
    469		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
    470		mem = &wqe->mem[frx->sge_idx];
    471
    472		/*
    473		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
    474		 */
    475		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
    476
    477		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
    478				   frx->sge_off, sge_bytes);
    479		if (unlikely(rv)) {
    480			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    481					   DDP_ETYPE_CATASTROPHIC,
    482					   DDP_ECODE_CATASTROPHIC, 0);
    483
    484			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
    485			break;
    486		}
    487		mem_p = *mem;
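		/*
		 * Place the data according to how the target memory is
		 * represented: kernel virtual address (no memory object),
		 * pinned user pages (umem), or a physical buffer list (PBL).
		 */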
    488		if (mem_p->mem_obj == NULL)
    489			rv = siw_rx_kva(srx,
    490				(void *)(uintptr_t)(sge->laddr + frx->sge_off),
    491				sge_bytes);
    492		else if (!mem_p->is_pbl)
    493			rv = siw_rx_umem(srx, mem_p->umem,
    494					 sge->laddr + frx->sge_off, sge_bytes);
    495		else
    496			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
    497					sge->laddr + frx->sge_off, sge_bytes);
    498
    499		if (unlikely(rv != sge_bytes)) {
    500			wqe->processed += rcvd_bytes;
    501
    502			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    503					   DDP_ETYPE_CATASTROPHIC,
    504					   DDP_ECODE_CATASTROPHIC, 0);
    505			return -EINVAL;
    506		}
    507		frx->sge_off += rv;
    508
    509		if (frx->sge_off == sge->length) {
    510			frx->sge_idx++;
    511			frx->sge_off = 0;
    512			frx->pbl_idx = 0;
    513		}
    514		data_bytes -= rv;
    515		rcvd_bytes += rv;
    516
    517		srx->fpdu_part_rem -= rv;
    518		srx->fpdu_part_rcvd += rv;
    519	}
    520	wqe->processed += rcvd_bytes;
    521
    522	if (!srx->fpdu_part_rem)
    523		return 0;
    524
    525	return (rv < 0) ? rv : -EAGAIN;
    526}
    527
    528/*
    529 * siw_proc_write:
    530 *
    531 * Place incoming WRITE after referencing and checking target buffer
     532 *
    533 * Function supports partially received WRITEs (suspending/resuming
    534 * current receive processing)
    535 *
    536 * return value:
    537 *	0:       reached the end of a DDP segment
    538 *	-EAGAIN: to be called again to finish the DDP segment
    539 */
    540int siw_proc_write(struct siw_qp *qp)
    541{
    542	struct siw_rx_stream *srx = &qp->rx_stream;
    543	struct siw_rx_fpdu *frx = &qp->rx_tagged;
    544	struct siw_mem *mem;
    545	int bytes, rv;
    546
    547	if (srx->state == SIW_GET_DATA_START) {
    548		if (!srx->fpdu_part_rem) /* zero length WRITE */
    549			return 0;
    550
    551		rv = siw_write_check_ntoh(srx, frx);
    552		if (unlikely(rv)) {
    553			siw_qp_event(qp, IB_EVENT_QP_FATAL);
    554			return rv;
    555		}
    556	}
    557	bytes = min(srx->fpdu_part_rem, srx->skb_new);
    558
    559	if (frx->first_ddp_seg) {
    560		struct siw_wqe *wqe = rx_wqe(frx);
    561
    562		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
    563		if (unlikely(!rx_mem(frx))) {
    564			siw_dbg_qp(qp,
    565				   "sink stag not found/invalid, stag 0x%08x\n",
    566				   srx->ddp_stag);
    567
    568			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    569					   DDP_ETYPE_TAGGED_BUF,
    570					   DDP_ECODE_T_INVALID_STAG, 0);
    571			return -EINVAL;
    572		}
    573		wqe->rqe.num_sge = 1;
    574		rx_type(wqe) = SIW_OP_WRITE;
    575		wqe->wr_status = SIW_WR_INPROGRESS;
    576	}
    577	mem = rx_mem(frx);
    578
    579	/*
    580	 * Check if application re-registered memory with different
    581	 * key field of STag.
    582	 */
    583	if (unlikely(mem->stag != srx->ddp_stag)) {
    584		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    585				   DDP_ETYPE_TAGGED_BUF,
    586				   DDP_ECODE_T_INVALID_STAG, 0);
    587		return -EINVAL;
    588	}
    589	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
    590			   IB_ACCESS_REMOTE_WRITE, bytes);
    591	if (unlikely(rv)) {
    592		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    593				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
    594				   0);
    595
    596		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
    597
    598		return -EINVAL;
    599	}
    600
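	/*
	 * Tagged placement: data lands directly at the advertised sink
	 * address (ddp_to) plus what was already received for this DDP
	 * segment; no receive queue element is consumed.
	 */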
    601	if (mem->mem_obj == NULL)
    602		rv = siw_rx_kva(srx,
    603			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
    604			bytes);
    605	else if (!mem->is_pbl)
    606		rv = siw_rx_umem(srx, mem->umem,
    607				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
    608	else
    609		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
    610				srx->ddp_to + srx->fpdu_part_rcvd, bytes);
    611
    612	if (unlikely(rv != bytes)) {
    613		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    614				   DDP_ETYPE_CATASTROPHIC,
    615				   DDP_ECODE_CATASTROPHIC, 0);
    616		return -EINVAL;
    617	}
    618	srx->fpdu_part_rem -= rv;
    619	srx->fpdu_part_rcvd += rv;
    620
    621	if (!srx->fpdu_part_rem) {
    622		srx->ddp_to += srx->fpdu_part_rcvd;
    623		return 0;
    624	}
    625	return -EAGAIN;
    626}
    627
    628/*
    629 * Inbound RREQ's cannot carry user data.
    630 */
    631int siw_proc_rreq(struct siw_qp *qp)
    632{
    633	struct siw_rx_stream *srx = &qp->rx_stream;
    634
    635	if (!srx->fpdu_part_rem)
    636		return 0;
    637
    638	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
    639		be16_to_cpu(srx->hdr.ctrl.mpa_len));
    640
    641	return -EPROTO;
    642}
    643
    644/*
    645 * siw_init_rresp:
    646 *
    647 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
    648 * Put it at the tail of the IRQ, if there is another WQE currently in
    649 * transmit processing. If not, make it the current WQE to be processed
    650 * and schedule transmit processing.
    651 *
    652 * Can be called from softirq context and from process
    653 * context (RREAD socket loopback case!)
    654 *
    655 * return value:
    656 *	0:      success,
    657 *		failure code otherwise
    658 */
    659
    660static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
    661{
    662	struct siw_wqe *tx_work = tx_wqe(qp);
    663	struct siw_sqe *resp;
    664
    665	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
    666		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
    667	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
    668		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
    669		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
    670		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
    671
    672	int run_sq = 1, rv = 0;
    673	unsigned long flags;
    674
    675	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
    676		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    677				   DDP_ETYPE_UNTAGGED_BUF,
    678				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
    679		return -EPROTO;
    680	}
    681	spin_lock_irqsave(&qp->sq_lock, flags);
    682
    683	if (unlikely(!qp->attrs.irq_size)) {
    684		run_sq = 0;
    685		goto error_irq;
    686	}
    687	if (tx_work->wr_status == SIW_WR_IDLE) {
    688		/*
    689		 * immediately schedule READ response w/o
    690		 * consuming IRQ entry: IRQ must be empty.
    691		 */
    692		tx_work->processed = 0;
    693		tx_work->mem[0] = NULL;
    694		tx_work->wr_status = SIW_WR_QUEUED;
    695		resp = &tx_work->sqe;
    696	} else {
    697		resp = irq_alloc_free(qp);
    698		run_sq = 0;
    699	}
    700	if (likely(resp)) {
    701		resp->opcode = SIW_OP_READ_RESPONSE;
    702
    703		resp->sge[0].length = length;
    704		resp->sge[0].laddr = laddr;
    705		resp->sge[0].lkey = lkey;
    706
    707		/* Keep aside message sequence number for potential
    708		 * error reporting during Read Response generation.
    709		 */
    710		resp->sge[1].length = msn;
    711
    712		resp->raddr = raddr;
    713		resp->rkey = rkey;
    714		resp->num_sge = length ? 1 : 0;
    715
    716		/* RRESP now valid as current TX wqe or placed into IRQ */
    717		smp_store_mb(resp->flags, SIW_WQE_VALID);
    718	} else {
    719error_irq:
    720		pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
    721			qp_id(qp), qp->attrs.irq_size);
    722
    723		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
    724				   RDMAP_ETYPE_REMOTE_OPERATION,
    725				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
    726		rv = -EPROTO;
    727	}
    728
    729	spin_unlock_irqrestore(&qp->sq_lock, flags);
    730
    731	if (run_sq)
    732		rv = siw_sq_start(qp);
    733
    734	return rv;
    735}
    736
    737/*
     738 * Only called at start of Read.Response processing.
     739 * Transfer pending Read from tip of ORQ into current rx wqe,
    740 * but keep ORQ entry valid until Read.Response processing done.
    741 * No Queue locking needed.
    742 */
    743static int siw_orqe_start_rx(struct siw_qp *qp)
    744{
    745	struct siw_sqe *orqe;
    746	struct siw_wqe *wqe = NULL;
    747
    748	if (unlikely(!qp->attrs.orq_size))
    749		return -EPROTO;
    750
    751	/* make sure ORQ indices are current */
    752	smp_mb();
    753
    754	orqe = orq_get_current(qp);
    755	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
    756		/* RRESP is a TAGGED RDMAP operation */
    757		wqe = rx_wqe(&qp->rx_tagged);
    758		wqe->sqe.id = orqe->id;
    759		wqe->sqe.opcode = orqe->opcode;
    760		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
    761		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
    762		wqe->sqe.sge[0].length = orqe->sge[0].length;
    763		wqe->sqe.flags = orqe->flags;
    764		wqe->sqe.num_sge = 1;
    765		wqe->bytes = orqe->sge[0].length;
    766		wqe->processed = 0;
    767		wqe->mem[0] = NULL;
    768		/* make sure WQE is completely written before valid */
    769		smp_wmb();
    770		wqe->wr_status = SIW_WR_INPROGRESS;
    771
    772		return 0;
    773	}
    774	return -EPROTO;
    775}
    776
    777/*
    778 * siw_proc_rresp:
    779 *
    780 * Place incoming RRESP data into memory referenced by RREQ WQE
    781 * which is at the tip of the ORQ
    782 *
    783 * Function supports partially received RRESP's (suspending/resuming
    784 * current receive processing)
    785 */
    786int siw_proc_rresp(struct siw_qp *qp)
    787{
    788	struct siw_rx_stream *srx = &qp->rx_stream;
    789	struct siw_rx_fpdu *frx = &qp->rx_tagged;
    790	struct siw_wqe *wqe = rx_wqe(frx);
    791	struct siw_mem **mem, *mem_p;
    792	struct siw_sge *sge;
    793	int bytes, rv;
    794
    795	if (frx->first_ddp_seg) {
    796		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
    797			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
    798				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
    799			rv = -EPROTO;
    800			goto error_term;
    801		}
    802		/*
    803		 * fetch pending RREQ from orq
    804		 */
    805		rv = siw_orqe_start_rx(qp);
    806		if (rv) {
    807			pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
    808				qp_id(qp), qp->attrs.orq_size);
    809			goto error_term;
    810		}
    811		rv = siw_rresp_check_ntoh(srx, frx);
    812		if (unlikely(rv)) {
    813			siw_qp_event(qp, IB_EVENT_QP_FATAL);
    814			return rv;
    815		}
    816	} else {
    817		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
    818			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
    819				qp_id(qp), wqe->wr_status);
    820			rv = -EPROTO;
    821			goto error_term;
    822		}
    823	}
    824	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
    825		return 0;
    826
    827	sge = wqe->sqe.sge; /* there is only one */
    828	mem = &wqe->mem[0];
    829
    830	if (!(*mem)) {
    831		/*
    832		 * check target memory which resolves memory on first fragment
    833		 */
    834		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
    835				   wqe->bytes);
    836		if (unlikely(rv)) {
    837			siw_dbg_qp(qp, "target mem check: %d\n", rv);
    838			wqe->wc_status = SIW_WC_LOC_PROT_ERR;
    839
    840			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
    841					   DDP_ETYPE_TAGGED_BUF,
    842					   siw_tagged_error(-rv), 0);
    843
    844			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
    845
    846			return -EINVAL;
    847		}
    848	}
    849	mem_p = *mem;
    850
    851	bytes = min(srx->fpdu_part_rem, srx->skb_new);
    852
    853	if (mem_p->mem_obj == NULL)
    854		rv = siw_rx_kva(srx,
    855			(void *)(uintptr_t)(sge->laddr + wqe->processed),
    856			bytes);
    857	else if (!mem_p->is_pbl)
    858		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
    859				 bytes);
    860	else
    861		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
    862				sge->laddr + wqe->processed, bytes);
    863	if (rv != bytes) {
    864		wqe->wc_status = SIW_WC_GENERAL_ERR;
    865		rv = -EINVAL;
    866		goto error_term;
    867	}
    868	srx->fpdu_part_rem -= rv;
    869	srx->fpdu_part_rcvd += rv;
    870	wqe->processed += rv;
    871
    872	if (!srx->fpdu_part_rem) {
    873		srx->ddp_to += srx->fpdu_part_rcvd;
    874		return 0;
    875	}
    876	return -EAGAIN;
    877
    878error_term:
    879	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
    880			   DDP_ECODE_CATASTROPHIC, 0);
    881	return rv;
    882}
    883
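/*
 * siw_proc_terminate()
 *
 * Process an inbound TERMINATE message: report the peer-provided
 * error layer/type/code and, if included and not network fragmented,
 * decode the attached copy of the offending DDP/RDMAP header.
 * Always results in closing the connection (-ECONNRESET).
 */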
    884int siw_proc_terminate(struct siw_qp *qp)
    885{
    886	struct siw_rx_stream *srx = &qp->rx_stream;
    887	struct sk_buff *skb = srx->skb;
    888	struct iwarp_terminate *term = &srx->hdr.terminate;
    889	union iwarp_hdr term_info;
    890	u8 *infop = (u8 *)&term_info;
    891	enum rdma_opcode op;
    892	u16 to_copy = sizeof(struct iwarp_ctrl);
    893
    894	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
    895		__rdmap_term_layer(term), __rdmap_term_etype(term),
    896		__rdmap_term_ecode(term));
    897
    898	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
    899	    be32_to_cpu(term->ddp_msn) !=
    900		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
    901	    be32_to_cpu(term->ddp_mo) != 0) {
    902		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
    903			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
    904			be32_to_cpu(term->ddp_mo));
    905		return -ECONNRESET;
    906	}
    907	/*
    908	 * Receive remaining pieces of TERM if indicated
    909	 */
    910	if (!term->flag_m)
    911		return -ECONNRESET;
    912
    913	/* Do not take the effort to reassemble a network fragmented
    914	 * TERM message
    915	 */
    916	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
    917		return -ECONNRESET;
    918
    919	memset(infop, 0, sizeof(term_info));
    920
    921	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
    922
    923	op = __rdmap_get_opcode(&term_info.ctrl);
    924	if (op >= RDMAP_TERMINATE)
    925		goto out;
    926
    927	infop += to_copy;
    928	srx->skb_offset += to_copy;
    929	srx->skb_new -= to_copy;
    930	srx->skb_copied += to_copy;
    931	srx->fpdu_part_rcvd += to_copy;
    932	srx->fpdu_part_rem -= to_copy;
    933
    934	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
    935
    936	/* Again, no network fragmented TERM's */
    937	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
    938		return -ECONNRESET;
    939
    940	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
    941
    942	if (term->flag_r) {
    943		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
    944			   op, be16_to_cpu(term_info.ctrl.mpa_len),
    945			   term->flag_m ? "valid" : "invalid");
    946	} else if (term->flag_d) {
    947		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
    948			   op, be16_to_cpu(term_info.ctrl.mpa_len),
    949			   term->flag_m ? "valid" : "invalid");
    950	}
    951out:
    952	srx->skb_new -= to_copy;
    953	srx->skb_offset += to_copy;
    954	srx->skb_copied += to_copy;
    955	srx->fpdu_part_rcvd += to_copy;
    956	srx->fpdu_part_rem -= to_copy;
    957
    958	return -ECONNRESET;
    959}
    960
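/*
 * siw_get_trailer()
 *
 * Receive the FPDU trailer: any pad bytes plus the 4-byte MPA CRC.
 * If CRC checking is enabled, finalize the running CRC and compare
 * it with the received value, initiating connection termination on
 * mismatch.
 */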
    961static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
    962{
    963	struct sk_buff *skb = srx->skb;
    964	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
    965	__wsum crc_in, crc_own = 0;
    966
    967	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
    968		   srx->fpdu_part_rem, srx->skb_new, srx->pad);
    969
    970	if (srx->skb_new < srx->fpdu_part_rem)
    971		return -EAGAIN;
    972
    973	skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);
    974
    975	if (srx->mpa_crc_hd && srx->pad)
    976		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
    977
    978	srx->skb_new -= srx->fpdu_part_rem;
    979	srx->skb_offset += srx->fpdu_part_rem;
    980	srx->skb_copied += srx->fpdu_part_rem;
    981
    982	if (!srx->mpa_crc_hd)
    983		return 0;
    984
    985	/*
    986	 * CRC32 is computed, transmitted and received directly in NBO,
    987	 * so there's never a reason to convert byte order.
    988	 */
    989	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
    990	crc_in = (__force __wsum)srx->trailer.crc;
    991
    992	if (unlikely(crc_in != crc_own)) {
    993		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
    994			crc_in, crc_own, qp->rx_stream.rdmap_op);
    995
    996		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
    997				   LLP_ETYPE_MPA,
    998				   LLP_ECODE_RECEIVED_CRC, 0);
    999		return -EINVAL;
   1000	}
   1001	return 0;
   1002}
   1003
   1004#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
   1005
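/*
 * siw_get_hdr()
 *
 * Receive and validate the MPA/DDP/RDMAP header of the current FPDU:
 * first the minimum sized (tagged) control part, then any
 * opcode-specific remainder. Checks DDP and RDMAP versions and the
 * opcode, restarts the MPA CRC over the header, and tracks whether
 * this DDP segment starts a new RDMAP message or continues a
 * fragmented one.
 */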
   1006static int siw_get_hdr(struct siw_rx_stream *srx)
   1007{
   1008	struct sk_buff *skb = srx->skb;
   1009	struct siw_qp *qp = rx_qp(srx);
   1010	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
   1011	struct siw_rx_fpdu *frx;
   1012	u8 opcode;
   1013	int bytes;
   1014
   1015	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
   1016		/*
    1017		 * copy a minimum sized (tagged) DDP frame control part
   1018		 */
   1019		bytes = min_t(int, srx->skb_new,
   1020			      MIN_DDP_HDR - srx->fpdu_part_rcvd);
   1021
   1022		skb_copy_bits(skb, srx->skb_offset,
   1023			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
   1024
   1025		srx->fpdu_part_rcvd += bytes;
   1026
   1027		srx->skb_new -= bytes;
   1028		srx->skb_offset += bytes;
   1029		srx->skb_copied += bytes;
   1030
   1031		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
   1032			return -EAGAIN;
   1033
   1034		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
   1035			enum ddp_etype etype;
   1036			enum ddp_ecode ecode;
   1037
   1038			pr_warn("siw: received ddp version unsupported %d\n",
   1039				__ddp_get_version(c_hdr));
   1040
   1041			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
   1042				etype = DDP_ETYPE_TAGGED_BUF;
   1043				ecode = DDP_ECODE_T_VERSION;
   1044			} else {
   1045				etype = DDP_ETYPE_UNTAGGED_BUF;
   1046				ecode = DDP_ECODE_UT_VERSION;
   1047			}
   1048			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
   1049					   etype, ecode, 0);
   1050			return -EINVAL;
   1051		}
   1052		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
   1053			pr_warn("siw: received rdmap version unsupported %d\n",
   1054				__rdmap_get_version(c_hdr));
   1055
   1056			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
   1057					   RDMAP_ETYPE_REMOTE_OPERATION,
   1058					   RDMAP_ECODE_VERSION, 0);
   1059			return -EINVAL;
   1060		}
   1061		opcode = __rdmap_get_opcode(c_hdr);
   1062
   1063		if (opcode > RDMAP_TERMINATE) {
   1064			pr_warn("siw: received unknown packet type %u\n",
   1065				opcode);
   1066
   1067			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
   1068					   RDMAP_ETYPE_REMOTE_OPERATION,
   1069					   RDMAP_ECODE_OPCODE, 0);
   1070			return -EINVAL;
   1071		}
   1072		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
   1073	} else {
   1074		opcode = __rdmap_get_opcode(c_hdr);
   1075	}
   1076	set_rx_fpdu_context(qp, opcode);
   1077	frx = qp->rx_fpdu;
   1078
   1079	/*
   1080	 * Figure out len of current hdr: variable length of
   1081	 * iwarp hdr may force us to copy hdr information in
   1082	 * two steps. Only tagged DDP messages are already
   1083	 * completely received.
   1084	 */
   1085	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
   1086		bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;
   1087
   1088		if (srx->skb_new < bytes)
   1089			return -EAGAIN;
   1090
   1091		skb_copy_bits(skb, srx->skb_offset,
   1092			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
   1093
   1094		srx->fpdu_part_rcvd += bytes;
   1095
   1096		srx->skb_new -= bytes;
   1097		srx->skb_offset += bytes;
   1098		srx->skb_copied += bytes;
   1099	}
   1100
   1101	/*
   1102	 * DDP/RDMAP header receive completed. Check if the current
   1103	 * DDP segment starts a new RDMAP message or continues a previously
   1104	 * started RDMAP message.
   1105	 *
   1106	 * Alternating reception of DDP segments (or FPDUs) from incomplete
   1107	 * tagged and untagged RDMAP messages is supported, as long as
   1108	 * the current tagged or untagged message gets eventually completed
   1109	 * w/o intersection from another message of the same type
   1110	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
   1111	 * but not by a READ RESPONSE etc.
   1112	 */
   1113	if (srx->mpa_crc_hd) {
   1114		/*
   1115		 * Restart CRC computation
   1116		 */
   1117		crypto_shash_init(srx->mpa_crc_hd);
   1118		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
   1119				    srx->fpdu_part_rcvd);
   1120	}
   1121	if (frx->more_ddp_segs) {
   1122		frx->first_ddp_seg = 0;
   1123		if (frx->prev_rdmap_op != opcode) {
   1124			pr_warn("siw: packet intersection: %u : %u\n",
   1125				frx->prev_rdmap_op, opcode);
   1126			/*
   1127			 * The last inbound RDMA operation of same type
   1128			 * (tagged or untagged) is left unfinished.
   1129			 * To complete it in error, make it the current
   1130			 * operation again, even with the header already
   1131			 * overwritten. For error handling, only the opcode
   1132			 * and current rx context are relevant.
   1133			 */
   1134			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
   1135			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
   1136			return -EPROTO;
   1137		}
   1138	} else {
   1139		frx->prev_rdmap_op = opcode;
   1140		frx->first_ddp_seg = 1;
   1141	}
   1142	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
   1143
   1144	return 0;
   1145}
   1146
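/*
 * siw_check_tx_fence()
 *
 * Called after a Read Response has been fully received: free the
 * current ORQ entry and, if SQ processing was fenced (either waiting
 * for a free ORQ slot or for the ORQ to drain), resume transmit
 * processing.
 */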
   1147static int siw_check_tx_fence(struct siw_qp *qp)
   1148{
   1149	struct siw_wqe *tx_waiting = tx_wqe(qp);
   1150	struct siw_sqe *rreq;
   1151	int resume_tx = 0, rv = 0;
   1152	unsigned long flags;
   1153
   1154	spin_lock_irqsave(&qp->orq_lock, flags);
   1155
   1156	/* free current orq entry */
   1157	rreq = orq_get_current(qp);
   1158	WRITE_ONCE(rreq->flags, 0);
   1159
   1160	qp->orq_get++;
   1161
   1162	if (qp->tx_ctx.orq_fence) {
   1163		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
   1164			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
   1165				qp_id(qp), tx_waiting->wr_status);
   1166			rv = -EPROTO;
   1167			goto out;
   1168		}
   1169		/* resume SQ processing, if possible */
   1170		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
   1171		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
   1172
   1173			/* SQ processing was stopped because of a full ORQ */
   1174			rreq = orq_get_free(qp);
   1175			if (unlikely(!rreq)) {
   1176				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
   1177				rv = -EPROTO;
   1178				goto out;
   1179			}
   1180			siw_read_to_orq(rreq, &tx_waiting->sqe);
   1181
   1182			qp->orq_put++;
   1183			qp->tx_ctx.orq_fence = 0;
   1184			resume_tx = 1;
   1185
   1186		} else if (siw_orq_empty(qp)) {
   1187			/*
   1188			 * SQ processing was stopped by fenced work request.
   1189			 * Resume since all previous Read's are now completed.
   1190			 */
   1191			qp->tx_ctx.orq_fence = 0;
   1192			resume_tx = 1;
   1193		}
   1194	}
   1195out:
   1196	spin_unlock_irqrestore(&qp->orq_lock, flags);
   1197
   1198	if (resume_tx)
   1199		rv = siw_sq_start(qp);
   1200
   1201	return rv;
   1202}
   1203
   1204/*
   1205 * siw_rdmap_complete()
   1206 *
    1207 * Complete processing of an RDMA message after receiving all
    1208 * DDP segments, or abort processing after encountering an error case.
    1209 *
    1210 *   o SENDs + RRESPs need completion,
    1211 *   o RREQs need READ RESPONSE initialization,
    1212 *   o WRITEs need memory dereferencing.
   1213 *
   1214 * TODO: Failed WRITEs need local error to be surfaced.
   1215 */
   1216static int siw_rdmap_complete(struct siw_qp *qp, int error)
   1217{
   1218	struct siw_rx_stream *srx = &qp->rx_stream;
   1219	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
   1220	enum siw_wc_status wc_status = wqe->wc_status;
   1221	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
   1222	int rv = 0;
   1223
   1224	switch (opcode) {
   1225	case RDMAP_SEND_SE:
   1226	case RDMAP_SEND_SE_INVAL:
   1227		wqe->rqe.flags |= SIW_WQE_SOLICITED;
   1228		fallthrough;
   1229
   1230	case RDMAP_SEND:
   1231	case RDMAP_SEND_INVAL:
   1232		if (wqe->wr_status == SIW_WR_IDLE)
   1233			break;
   1234
   1235		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
   1236
   1237		if (error != 0 && wc_status == SIW_WC_SUCCESS)
   1238			wc_status = SIW_WC_GENERAL_ERR;
   1239		/*
   1240		 * Handle STag invalidation request
   1241		 */
   1242		if (wc_status == SIW_WC_SUCCESS &&
   1243		    (opcode == RDMAP_SEND_INVAL ||
   1244		     opcode == RDMAP_SEND_SE_INVAL)) {
   1245			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
   1246			if (rv) {
   1247				siw_init_terminate(
   1248					qp, TERM_ERROR_LAYER_RDMAP,
   1249					rv == -EACCES ?
   1250						RDMAP_ETYPE_REMOTE_PROTECTION :
   1251						RDMAP_ETYPE_REMOTE_OPERATION,
   1252					RDMAP_ECODE_CANNOT_INVALIDATE, 0);
   1253
   1254				wc_status = SIW_WC_REM_INV_REQ_ERR;
   1255			}
   1256			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
   1257					      rv ? 0 : srx->inval_stag,
   1258					      wc_status);
   1259		} else {
   1260			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
   1261					      0, wc_status);
   1262		}
   1263		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
   1264		break;
   1265
   1266	case RDMAP_RDMA_READ_RESP:
   1267		if (wqe->wr_status == SIW_WR_IDLE)
   1268			break;
   1269
   1270		if (error != 0) {
   1271			if ((srx->state == SIW_GET_HDR &&
   1272			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
   1273				/* possible RREQ in ORQ left untouched */
   1274				break;
   1275
   1276			if (wc_status == SIW_WC_SUCCESS)
   1277				wc_status = SIW_WC_GENERAL_ERR;
   1278		} else if (rdma_is_kernel_res(&qp->base_qp.res) &&
   1279			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
   1280			/*
   1281			 * Handle any STag invalidation request
   1282			 */
   1283			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
   1284			if (rv) {
   1285				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
   1286						   RDMAP_ETYPE_CATASTROPHIC,
   1287						   RDMAP_ECODE_UNSPECIFIED, 0);
   1288
   1289				if (wc_status == SIW_WC_SUCCESS) {
   1290					wc_status = SIW_WC_GENERAL_ERR;
   1291					error = rv;
   1292				}
   1293			}
   1294		}
   1295		/*
   1296		 * All errors turn the wqe into signalled.
   1297		 */
   1298		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
   1299			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
   1300					      wc_status);
   1301		siw_wqe_put_mem(wqe, SIW_OP_READ);
   1302
   1303		if (!error) {
   1304			rv = siw_check_tx_fence(qp);
   1305		} else {
   1306			/* Disable current ORQ element */
   1307			if (qp->attrs.orq_size)
   1308				WRITE_ONCE(orq_get_current(qp)->flags, 0);
   1309		}
   1310		break;
   1311
   1312	case RDMAP_RDMA_READ_REQ:
   1313		if (!error) {
   1314			rv = siw_init_rresp(qp, srx);
   1315			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
   1316		}
   1317		break;
   1318
   1319	case RDMAP_RDMA_WRITE:
   1320		if (wqe->wr_status == SIW_WR_IDLE)
   1321			break;
   1322
   1323		/*
   1324		 * Free References from memory object if
   1325		 * attached to receive context (inbound WRITE).
   1326		 * While a zero-length WRITE is allowed,
   1327		 * no memory reference got created.
   1328		 */
   1329		if (rx_mem(&qp->rx_tagged)) {
   1330			siw_mem_put(rx_mem(&qp->rx_tagged));
   1331			rx_mem(&qp->rx_tagged) = NULL;
   1332		}
   1333		break;
   1334
   1335	default:
   1336		break;
   1337	}
   1338	wqe->wr_status = SIW_WR_IDLE;
   1339
   1340	return rv;
   1341}
   1342
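/*
 * A minimal sketch of how this entry point is assumed to be driven
 * (the actual hookup lives outside this file, in the QP/LLP socket
 * callbacks): the QP's ->sk_data_ready handler hands the socket to
 * tcp_read_sock() with siw_tcp_rx_data() as the read actor, e.g.:
 *
 *	read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };
 *
 *	tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
 *
 * so each chunk of inbound TCP payload is fed through the FPDU
 * state machine implemented below.
 */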
   1343/*
   1344 * siw_tcp_rx_data()
   1345 *
   1346 * Main routine to consume inbound TCP payload
   1347 *
   1348 * @rd_desc:	read descriptor
   1349 * @skb:	socket buffer
   1350 * @off:	offset in skb
   1351 * @len:	skb->len - offset : payload in skb
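 *
 * Returns the number of payload bytes consumed; the caller treats
 * this as the amount of socket data used.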
   1352 */
   1353int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
   1354		    unsigned int off, size_t len)
   1355{
   1356	struct siw_qp *qp = rd_desc->arg.data;
   1357	struct siw_rx_stream *srx = &qp->rx_stream;
   1358	int rv;
   1359
   1360	srx->skb = skb;
   1361	srx->skb_new = skb->len - off;
   1362	srx->skb_offset = off;
   1363	srx->skb_copied = 0;
   1364
   1365	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
   1366
   1367	while (srx->skb_new) {
   1368		int run_completion = 1;
   1369
   1370		if (unlikely(srx->rx_suspend)) {
   1371			/* Do not process any more data */
   1372			srx->skb_copied += srx->skb_new;
   1373			break;
   1374		}
   1375		switch (srx->state) {
   1376		case SIW_GET_HDR:
   1377			rv = siw_get_hdr(srx);
   1378			if (!rv) {
   1379				srx->fpdu_part_rem =
   1380					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
   1381					srx->fpdu_part_rcvd + MPA_HDR_SIZE;
   1382
   1383				if (srx->fpdu_part_rem)
   1384					srx->pad = -srx->fpdu_part_rem & 0x3;
   1385				else
   1386					srx->pad = 0;
   1387
   1388				srx->state = SIW_GET_DATA_START;
   1389				srx->fpdu_part_rcvd = 0;
   1390			}
   1391			break;
   1392
   1393		case SIW_GET_DATA_MORE:
   1394			/*
   1395			 * Another data fragment of the same DDP segment.
   1396			 * Setting first_ddp_seg = 0 avoids repeating
   1397			 * initializations that shall occur only once per
   1398			 * DDP segment.
   1399			 */
   1400			qp->rx_fpdu->first_ddp_seg = 0;
   1401			fallthrough;
   1402
   1403		case SIW_GET_DATA_START:
   1404			/*
   1405			 * Headers will be checked by the opcode-specific
   1406			 * data receive function below.
   1407			 */
   1408			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
   1409			if (!rv) {
   1410				int mpa_len =
   1411					be16_to_cpu(srx->hdr.ctrl.mpa_len)
   1412					+ MPA_HDR_SIZE;
   1413
   1414				srx->fpdu_part_rem = (-mpa_len & 0x3)
   1415						      + MPA_CRC_SIZE;
   1416				srx->fpdu_part_rcvd = 0;
   1417				srx->state = SIW_GET_TRAILER;
   1418			} else {
   1419				if (unlikely(rv == -ECONNRESET))
   1420					run_completion = 0;
   1421				else
   1422					srx->state = SIW_GET_DATA_MORE;
   1423			}
   1424			break;
   1425
   1426		case SIW_GET_TRAILER:
   1427			/*
   1428			 * read CRC + any padding
   1429			 */
   1430			rv = siw_get_trailer(qp, srx);
   1431			if (likely(!rv)) {
   1432				/*
   1433				 * FPDU completed.
   1434				 * complete RDMAP message if last fragment
   1435				 */
   1436				srx->state = SIW_GET_HDR;
   1437				srx->fpdu_part_rcvd = 0;
   1438
   1439				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
   1440				      DDP_FLAG_LAST))
   1441					/* more frags */
   1442					break;
   1443
   1444				rv = siw_rdmap_complete(qp, 0);
   1445				run_completion = 0;
   1446			}
   1447			break;
   1448
   1449		default:
   1450			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
   1451			rv = -EPROTO;
   1452			run_completion = 0;
   1453		}
   1454		if (unlikely(rv != 0 && rv != -EAGAIN)) {
   1455			if ((srx->state > SIW_GET_HDR ||
   1456			     qp->rx_fpdu->more_ddp_segs) && run_completion)
   1457				siw_rdmap_complete(qp, rv);
   1458
   1459			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
   1460				   srx->state);
   1461
   1462			siw_qp_cm_drop(qp, 1);
   1463
   1464			break;
   1465		}
   1466		if (rv) {
   1467			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
   1468				   srx->state, srx->fpdu_part_rem);
   1469			break;
   1470		}
   1471	}
   1472	return srx->skb_copied;
   1473}