cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

siw_qp_tx.c (31499B)


      1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
      2
      3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
      4/* Copyright (c) 2008-2019, IBM Corporation */
      5
      6#include <linux/errno.h>
      7#include <linux/types.h>
      8#include <linux/net.h>
      9#include <linux/scatterlist.h>
     10#include <linux/highmem.h>
     11#include <net/tcp.h>
     12
     13#include <rdma/iw_cm.h>
     14#include <rdma/ib_verbs.h>
     15#include <rdma/ib_user_verbs.h>
     16
     17#include "siw.h"
     18#include "siw_verbs.h"
     19#include "siw_mem.h"
     20
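        /*
         * Upper bound on the short payload that siw_try_1seg() copies
         * directly behind the just built header into the tx packet
         * buffer; rounded down to a multiple of 8 bytes.
         */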
     21#define MAX_HDR_INLINE					\
     22	(((uint32_t)(sizeof(struct siw_rreq_pkt) -	\
     23		     sizeof(struct iwarp_send))) & 0xF8)
     24
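        /* Resolve the page backing @addr within a PBL-based memory region */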
     25static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
     26{
     27	struct siw_pbl *pbl = mem->pbl;
     28	u64 offset = addr - mem->va;
     29	dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx);
     30
     31	if (paddr)
     32		return virt_to_page(paddr);
     33
     34	return NULL;
     35}
     36
     37/*
      38 * Copy short payload to the provided destination payload address
     39 */
     40static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr)
     41{
     42	struct siw_wqe *wqe = &c_tx->wqe_active;
     43	struct siw_sge *sge = &wqe->sqe.sge[0];
     44	u32 bytes = sge->length;
     45
     46	if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1)
     47		return MAX_HDR_INLINE + 1;
     48
     49	if (!bytes)
     50		return 0;
     51
     52	if (tx_flags(wqe) & SIW_WQE_INLINE) {
     53		memcpy(paddr, &wqe->sqe.sge[1], bytes);
     54	} else {
     55		struct siw_mem *mem = wqe->mem[0];
     56
     57		if (!mem->mem_obj) {
     58			/* Kernel client using kva */
     59			memcpy(paddr,
     60			       (const void *)(uintptr_t)sge->laddr, bytes);
     61		} else if (c_tx->in_syscall) {
     62			if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr),
     63					   bytes))
     64				return -EFAULT;
     65		} else {
     66			unsigned int off = sge->laddr & ~PAGE_MASK;
     67			struct page *p;
     68			char *buffer;
     69			int pbl_idx = 0;
     70
     71			if (!mem->is_pbl)
     72				p = siw_get_upage(mem->umem, sge->laddr);
     73			else
     74				p = siw_get_pblpage(mem, sge->laddr, &pbl_idx);
     75
     76			if (unlikely(!p))
     77				return -EFAULT;
     78
     79			buffer = kmap_local_page(p);
     80
     81			if (likely(PAGE_SIZE - off >= bytes)) {
     82				memcpy(paddr, buffer + off, bytes);
     83			} else {
     84				unsigned long part = bytes - (PAGE_SIZE - off);
     85
     86				memcpy(paddr, buffer + off, part);
     87				kunmap_local(buffer);
     88
     89				if (!mem->is_pbl)
     90					p = siw_get_upage(mem->umem,
     91							  sge->laddr + part);
     92				else
     93					p = siw_get_pblpage(mem,
     94							    sge->laddr + part,
     95							    &pbl_idx);
     96				if (unlikely(!p))
     97					return -EFAULT;
     98
     99				buffer = kmap_local_page(p);
    100				memcpy(paddr + part, buffer, bytes - part);
    101			}
    102			kunmap_local(buffer);
    103		}
    104	}
    105	return (int)bytes;
    106}
    107
    108#define PKT_FRAGMENTED 1
    109#define PKT_COMPLETE 0
    110
    111/*
    112 * siw_qp_prepare_tx()
    113 *
     114 * Prepare tx state for sending out one FPDU. Builds a complete pkt
     115 * if there is no user data or only immediate data to send.
    116 *
    117 * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
    118 */
    119static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
    120{
    121	struct siw_wqe *wqe = &c_tx->wqe_active;
    122	char *crc = NULL;
    123	int data = 0;
    124
    125	switch (tx_type(wqe)) {
    126	case SIW_OP_READ:
    127	case SIW_OP_READ_LOCAL_INV:
    128		memcpy(&c_tx->pkt.ctrl,
    129		       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
    130		       sizeof(struct iwarp_ctrl));
    131
    132		c_tx->pkt.rreq.rsvd = 0;
    133		c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
    134		c_tx->pkt.rreq.ddp_msn =
    135			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
    136		c_tx->pkt.rreq.ddp_mo = 0;
    137		c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey);
    138		c_tx->pkt.rreq.sink_to =
    139			cpu_to_be64(wqe->sqe.sge[0].laddr);
    140		c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey);
    141		c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr);
    142		c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length);
    143
    144		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
    145		crc = (char *)&c_tx->pkt.rreq_pkt.crc;
    146		break;
    147
    148	case SIW_OP_SEND:
    149		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
    150			memcpy(&c_tx->pkt.ctrl,
    151			       &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
    152			       sizeof(struct iwarp_ctrl));
    153		else
    154			memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
    155			       sizeof(struct iwarp_ctrl));
    156
    157		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
    158		c_tx->pkt.send.ddp_msn =
    159			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
    160		c_tx->pkt.send.ddp_mo = 0;
    161
    162		c_tx->pkt.send_inv.inval_stag = 0;
    163
    164		c_tx->ctrl_len = sizeof(struct iwarp_send);
    165
    166		crc = (char *)&c_tx->pkt.send_pkt.crc;
    167		data = siw_try_1seg(c_tx, crc);
    168		break;
    169
    170	case SIW_OP_SEND_REMOTE_INV:
    171		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
    172			memcpy(&c_tx->pkt.ctrl,
    173			       &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl,
    174			       sizeof(struct iwarp_ctrl));
    175		else
    176			memcpy(&c_tx->pkt.ctrl,
    177			       &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl,
    178			       sizeof(struct iwarp_ctrl));
    179
    180		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
    181		c_tx->pkt.send.ddp_msn =
    182			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
    183		c_tx->pkt.send.ddp_mo = 0;
    184
    185		c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey);
    186
    187		c_tx->ctrl_len = sizeof(struct iwarp_send_inv);
    188
    189		crc = (char *)&c_tx->pkt.send_pkt.crc;
    190		data = siw_try_1seg(c_tx, crc);
    191		break;
    192
    193	case SIW_OP_WRITE:
    194		memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
    195		       sizeof(struct iwarp_ctrl));
    196
    197		c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey);
    198		c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr);
    199		c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);
    200
    201		crc = (char *)&c_tx->pkt.write_pkt.crc;
    202		data = siw_try_1seg(c_tx, crc);
    203		break;
    204
    205	case SIW_OP_READ_RESPONSE:
    206		memcpy(&c_tx->pkt.ctrl,
    207		       &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
    208		       sizeof(struct iwarp_ctrl));
    209
    210		/* NBO */
    211		c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey);
    212		c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr);
    213
    214		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);
    215
    216		crc = (char *)&c_tx->pkt.write_pkt.crc;
    217		data = siw_try_1seg(c_tx, crc);
    218		break;
    219
    220	default:
    221		siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe));
    222		return -EOPNOTSUPP;
    223	}
    224	if (unlikely(data < 0))
    225		return data;
    226
    227	c_tx->ctrl_sent = 0;
    228
    229	if (data <= MAX_HDR_INLINE) {
    230		if (data) {
    231			wqe->processed = data;
    232
    233			c_tx->pkt.ctrl.mpa_len =
    234				htons(c_tx->ctrl_len + data - MPA_HDR_SIZE);
    235
    236			/* Add pad, if needed */
    237			data += -(int)data & 0x3;
    238			/* advance CRC location after payload */
    239			crc += data;
    240			c_tx->ctrl_len += data;
    241
    242			if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
    243				c_tx->pkt.c_untagged.ddp_mo = 0;
    244			else
    245				c_tx->pkt.c_tagged.ddp_to =
    246					cpu_to_be64(wqe->sqe.raddr);
    247		}
    248
    249		*(u32 *)crc = 0;
    250		/*
    251		 * Do complete CRC if enabled and short packet
    252		 */
    253		if (c_tx->mpa_crc_hd) {
    254			crypto_shash_init(c_tx->mpa_crc_hd);
    255			if (crypto_shash_update(c_tx->mpa_crc_hd,
    256						(u8 *)&c_tx->pkt,
    257						c_tx->ctrl_len))
    258				return -EINVAL;
    259			crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc);
    260		}
    261		c_tx->ctrl_len += MPA_CRC_SIZE;
    262
    263		return PKT_COMPLETE;
    264	}
    265	c_tx->ctrl_len += MPA_CRC_SIZE;
    266	c_tx->sge_idx = 0;
    267	c_tx->sge_off = 0;
    268	c_tx->pbl_idx = 0;
    269
    270	/*
     271	 * Allow direct sending out of user buffer if WR is non-signalled
    272	 * and payload is over threshold.
    273	 * Per RDMA verbs, the application should not change the send buffer
     274	 * until the work has completed. In iWARP, work completion is only
    275	 * local delivery to TCP. TCP may reuse the buffer for
    276	 * retransmission. Changing unsent data also breaks the CRC,
    277	 * if applied.
    278	 */
    279	if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
    280	    !(tx_flags(wqe) & SIW_WQE_SIGNALLED))
    281		c_tx->use_sendpage = 1;
    282	else
    283		c_tx->use_sendpage = 0;
    284
    285	return PKT_FRAGMENTED;
    286}
    287
    288/*
     289 * Send out one complete control-type FPDU, or the header of an FPDU
     290 * carrying data. Used for fixed-size packets like Read.Requests or
     291 * zero-length SENDs, WRITEs and Read.Responses, or header only.
    292 */
    293static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
    294			      int flags)
    295{
    296	struct msghdr msg = { .msg_flags = flags };
    297	struct kvec iov = { .iov_base =
    298				    (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
    299			    .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };
    300
    301	int rv = kernel_sendmsg(s, &msg, &iov, 1,
    302				c_tx->ctrl_len - c_tx->ctrl_sent);
    303
    304	if (rv >= 0) {
    305		c_tx->ctrl_sent += rv;
    306
    307		if (c_tx->ctrl_sent == c_tx->ctrl_len)
    308			rv = 0;
    309		else
    310			rv = -EAGAIN;
    311	}
    312	return rv;
    313}
    314
    315/*
    316 * 0copy TCP transmit interface: Use do_tcp_sendpages.
    317 *
    318 * Using sendpage to push page by page appears to be less efficient
    319 * than using sendmsg, even if data are copied.
    320 *
     321 * A general performance limitation might be the extra four-byte
     322 * trailer checksum segment to be pushed after the user data.
    323 */
    324static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
    325			     size_t size)
    326{
    327	struct sock *sk = s->sk;
    328	int i = 0, rv = 0, sent = 0,
    329	    flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST;
    330
    331	while (size) {
    332		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
    333
    334		if (size + offset <= PAGE_SIZE)
    335			flags = MSG_MORE | MSG_DONTWAIT;
    336
    337		tcp_rate_check_app_limited(sk);
    338try_page_again:
    339		lock_sock(sk);
    340		rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags);
    341		release_sock(sk);
    342
    343		if (rv > 0) {
    344			size -= rv;
    345			sent += rv;
    346			if (rv != bytes) {
    347				offset += rv;
    348				bytes -= rv;
    349				goto try_page_again;
    350			}
    351			offset = 0;
    352		} else {
    353			if (rv == -EAGAIN || rv == 0)
    354				break;
    355			return rv;
    356		}
    357		i++;
    358	}
    359	return sent;
    360}
    361
    362/*
    363 * siw_0copy_tx()
    364 *
     365 * Pushes a list of pages to the TCP socket. If pages are from multiple
     366 * SGEs, all referenced pages of each SGE are pushed in one
     367 * shot.
    368 */
    369static int siw_0copy_tx(struct socket *s, struct page **page,
    370			struct siw_sge *sge, unsigned int offset,
    371			unsigned int size)
    372{
    373	int i = 0, sent = 0, rv;
    374	int sge_bytes = min(sge->length - offset, size);
    375
    376	offset = (sge->laddr + offset) & ~PAGE_MASK;
    377
    378	while (sent != size) {
    379		rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
    380		if (rv >= 0) {
    381			sent += rv;
    382			if (size == sent || sge_bytes > rv)
    383				break;
    384
    385			i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
    386			sge++;
    387			sge_bytes = min(sge->length, size - sent);
    388			offset = sge->laddr & ~PAGE_MASK;
    389		} else {
    390			sent = rv;
    391			break;
    392		}
    393	}
    394	return sent;
    395}
    396
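        /*
         * FPDU trailer buffer: four pad slots plus the MPA CRC. Only
         * c_tx->pad (0..3) pad bytes plus the CRC are actually sent.
         */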
    397#define MAX_TRAILER (MPA_CRC_SIZE + 4)
    398
    399static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int len)
    400{
    401	int i;
    402
    403	/*
    404	 * Work backwards through the array to honor the kmap_local_page()
    405	 * ordering requirements.
    406	 */
    407	for (i = (len-1); i >= 0; i--) {
    408		if (kmap_mask & BIT(i)) {
    409			unsigned long addr = (unsigned long)iov[i].iov_base;
    410
    411			kunmap_local((void *)(addr & PAGE_MASK));
    412		}
    413	}
    414}
    415
    416/*
    417 * siw_tx_hdt() tries to push a complete packet to TCP where all
    418 * packet fragments are referenced by the elements of one iovec.
    419 * For the data portion, each involved page must be referenced by
     420 * one extra element. All SGE data may be non-aligned to page
     421 * boundaries. Two more elements reference the iWARP header
     422 * and trailer:
     423 * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + 2 * (SIW_MAX_SGE - 1) + HDR + TRL
    424 */
    425#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
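        /* e.g. with 4 KiB pages and SIW_MAX_SGE == 6: 15 + 1 + (10 + 2) = 28 entries */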
    426
    427/*
    428 * Write out iov referencing hdr, data and trailer of current FPDU.
    429 * Update transmit state dependent on write return status
    430 */
    431static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
    432{
    433	struct siw_wqe *wqe = &c_tx->wqe_active;
    434	struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
    435	struct kvec iov[MAX_ARRAY];
    436	struct page *page_array[MAX_ARRAY];
    437	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
    438
    439	int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
    440	unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
    441		     sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
    442		     pbl_idx = c_tx->pbl_idx;
    443	unsigned long kmap_mask = 0L;
    444
    445	if (c_tx->state == SIW_SEND_HDR) {
    446		if (c_tx->use_sendpage) {
    447			rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE);
    448			if (rv)
    449				goto done;
    450
    451			c_tx->state = SIW_SEND_DATA;
    452		} else {
    453			iov[0].iov_base =
    454				(char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
    455			iov[0].iov_len = hdr_len =
    456				c_tx->ctrl_len - c_tx->ctrl_sent;
    457			seg = 1;
    458		}
    459	}
    460
    461	wqe->processed += data_len;
    462
    463	while (data_len) { /* walk the list of SGE's */
    464		unsigned int sge_len = min(sge->length - sge_off, data_len);
    465		unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK;
    466		struct siw_mem *mem;
    467
    468		if (!(tx_flags(wqe) & SIW_WQE_INLINE)) {
    469			mem = wqe->mem[sge_idx];
    470			is_kva = mem->mem_obj == NULL ? 1 : 0;
    471		} else {
    472			is_kva = 1;
    473		}
    474		if (is_kva && !c_tx->use_sendpage) {
    475			/*
    476			 * tx from kernel virtual address: either inline data
    477			 * or memory region with assigned kernel buffer
    478			 */
    479			iov[seg].iov_base =
    480				(void *)(uintptr_t)(sge->laddr + sge_off);
    481			iov[seg].iov_len = sge_len;
    482
    483			if (do_crc)
    484				crypto_shash_update(c_tx->mpa_crc_hd,
    485						    iov[seg].iov_base,
    486						    sge_len);
    487			sge_off += sge_len;
    488			data_len -= sge_len;
    489			seg++;
    490			goto sge_done;
    491		}
    492
    493		while (sge_len) {
    494			size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);
    495			void *kaddr;
    496
    497			if (!is_kva) {
    498				struct page *p;
    499
    500				if (mem->is_pbl)
    501					p = siw_get_pblpage(
    502						mem, sge->laddr + sge_off,
    503						&pbl_idx);
    504				else
    505					p = siw_get_upage(mem->umem,
    506							  sge->laddr + sge_off);
    507				if (unlikely(!p)) {
    508					siw_unmap_pages(iov, kmap_mask, seg);
    509					wqe->processed -= c_tx->bytes_unsent;
    510					rv = -EFAULT;
    511					goto done_crc;
    512				}
    513				page_array[seg] = p;
    514
    515				if (!c_tx->use_sendpage) {
    516					void *kaddr = kmap_local_page(p);
    517
    518					/* Remember for later kunmap() */
    519					kmap_mask |= BIT(seg);
    520					iov[seg].iov_base = kaddr + fp_off;
    521					iov[seg].iov_len = plen;
    522
    523					if (do_crc)
    524						crypto_shash_update(
    525							c_tx->mpa_crc_hd,
    526							iov[seg].iov_base,
    527							plen);
    528				} else if (do_crc) {
    529					kaddr = kmap_local_page(p);
    530					crypto_shash_update(c_tx->mpa_crc_hd,
    531							    kaddr + fp_off,
    532							    plen);
    533					kunmap_local(kaddr);
    534				}
    535			} else {
    536				u64 va = sge->laddr + sge_off;
    537
    538				page_array[seg] = virt_to_page(va & PAGE_MASK);
    539				if (do_crc)
    540					crypto_shash_update(
    541						c_tx->mpa_crc_hd,
    542						(void *)(uintptr_t)va,
    543						plen);
    544			}
    545
    546			sge_len -= plen;
    547			sge_off += plen;
    548			data_len -= plen;
    549			fp_off = 0;
    550
     551			if (++seg >= (int)MAX_ARRAY) {
     552				siw_dbg_qp(tx_qp(c_tx), "too many fragments\n");
    553				siw_unmap_pages(iov, kmap_mask, seg-1);
    554				wqe->processed -= c_tx->bytes_unsent;
    555				rv = -EMSGSIZE;
    556				goto done_crc;
    557			}
    558		}
    559sge_done:
    560		/* Update SGE variables at end of SGE */
    561		if (sge_off == sge->length &&
    562		    (data_len != 0 || wqe->processed < wqe->bytes)) {
    563			sge_idx++;
    564			sge++;
    565			sge_off = 0;
    566		}
    567	}
    568	/* trailer */
    569	if (likely(c_tx->state != SIW_SEND_TRAILER)) {
    570		iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
    571		iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
    572	} else {
    573		iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
    574		iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
    575	}
    576
    577	if (c_tx->pad) {
    578		*(u32 *)c_tx->trailer.pad = 0;
    579		if (do_crc)
    580			crypto_shash_update(c_tx->mpa_crc_hd,
    581				(u8 *)&c_tx->trailer.crc - c_tx->pad,
    582				c_tx->pad);
    583	}
    584	if (!c_tx->mpa_crc_hd)
    585		c_tx->trailer.crc = 0;
    586	else if (do_crc)
    587		crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
    588
    589	data_len = c_tx->bytes_unsent;
    590
    591	if (c_tx->use_sendpage) {
    592		rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
    593				  c_tx->sge_off, data_len);
    594		if (rv == data_len) {
    595			rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
    596			if (rv > 0)
    597				rv += data_len;
    598			else
    599				rv = data_len;
    600		}
    601	} else {
    602		rv = kernel_sendmsg(s, &msg, iov, seg + 1,
    603				    hdr_len + data_len + trl_len);
    604		siw_unmap_pages(iov, kmap_mask, seg);
    605	}
    606	if (rv < (int)hdr_len) {
    607		/* Not even complete hdr pushed or negative rv */
    608		wqe->processed -= data_len;
    609		if (rv >= 0) {
    610			c_tx->ctrl_sent += rv;
    611			rv = -EAGAIN;
    612		}
    613		goto done_crc;
    614	}
    615	rv -= hdr_len;
    616
    617	if (rv >= (int)data_len) {
    618		/* all user data pushed to TCP or no data to push */
    619		if (data_len > 0 && wqe->processed < wqe->bytes) {
    620			/* Save the current state for next tx */
    621			c_tx->sge_idx = sge_idx;
    622			c_tx->sge_off = sge_off;
    623			c_tx->pbl_idx = pbl_idx;
    624		}
    625		rv -= data_len;
    626
    627		if (rv == trl_len) /* all pushed */
    628			rv = 0;
    629		else {
    630			c_tx->state = SIW_SEND_TRAILER;
    631			c_tx->ctrl_len = MAX_TRAILER;
    632			c_tx->ctrl_sent = rv + 4 - c_tx->pad;
    633			c_tx->bytes_unsent = 0;
    634			rv = -EAGAIN;
    635		}
    636
    637	} else if (data_len > 0) {
    638		/* Maybe some user data pushed to TCP */
    639		c_tx->state = SIW_SEND_DATA;
    640		wqe->processed -= data_len - rv;
    641
    642		if (rv) {
    643			/*
    644			 * Some bytes out. Recompute tx state based
    645			 * on old state and bytes pushed
    646			 */
    647			unsigned int sge_unsent;
    648
    649			c_tx->bytes_unsent -= rv;
    650			sge = &wqe->sqe.sge[c_tx->sge_idx];
    651			sge_unsent = sge->length - c_tx->sge_off;
    652
    653			while (sge_unsent <= rv) {
    654				rv -= sge_unsent;
    655				c_tx->sge_idx++;
    656				c_tx->sge_off = 0;
    657				sge++;
    658				sge_unsent = sge->length;
    659			}
    660			c_tx->sge_off += rv;
    661		}
    662		rv = -EAGAIN;
    663	}
    664done_crc:
    665	c_tx->do_crc = 0;
    666done:
    667	return rv;
    668}
    669
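        /*
         * Refresh c_tx->tcp_seglen from the socket's current MSS and GSO
         * segment count (optionally capped by gso_seg_limit), rounded
         * down to a multiple of 8 bytes.
         */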
    670static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
    671				     struct socket *s)
    672{
    673	struct tcp_sock *tp = tcp_sk(s->sk);
    674
    675	if (tp->gso_segs) {
    676		if (c_tx->gso_seg_limit == 0)
    677			c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
    678		else
    679			c_tx->tcp_seglen =
    680				tp->mss_cache *
    681				min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
    682	} else {
    683		c_tx->tcp_seglen = tp->mss_cache;
    684	}
    685	/* Loopback may give odd numbers */
    686	c_tx->tcp_seglen &= 0xfffffff8;
    687}
    688
    689/*
    690 * siw_prepare_fpdu()
    691 *
     692 * Prepares transmit context to send out one FPDU if the FPDU will
     693 * contain user data and the user data are not immediate data.
     694 * Computes maximum FPDU length to fill up the TCP MSS if possible.
    695 *
    696 * @qp:		QP from which to transmit
    697 * @wqe:	Current WQE causing transmission
    698 *
     699 * TODO: Take into account the real available send space on the socket
     700 *       to avoid header misalignment due to send pausing within
     701 *       FPDU transmission
    702 */
    703static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
    704{
    705	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
    706	int data_len;
    707
    708	c_tx->ctrl_len =
    709		iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len;
    710	c_tx->ctrl_sent = 0;
    711
    712	/*
    713	 * Update target buffer offset if any
    714	 */
    715	if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
    716		/* Untagged message */
    717		c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
    718	else /* Tagged message */
    719		c_tx->pkt.c_tagged.ddp_to =
    720			cpu_to_be64(wqe->sqe.raddr + wqe->processed);
    721
    722	data_len = wqe->bytes - wqe->processed;
    723	if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) {
    724		/* Trim DDP payload to fit into current TCP segment */
    725		data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE);
    726		c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST;
    727		c_tx->pad = 0;
    728	} else {
    729		c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST;
    730		c_tx->pad = -data_len & 0x3;
    731	}
    732	c_tx->bytes_unsent = data_len;
    733
    734	c_tx->pkt.ctrl.mpa_len =
    735		htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE);
    736
    737	/*
    738	 * Init MPA CRC computation
    739	 */
    740	if (c_tx->mpa_crc_hd) {
    741		crypto_shash_init(c_tx->mpa_crc_hd);
    742		crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
    743				    c_tx->ctrl_len);
    744		c_tx->do_crc = 1;
    745	}
    746}
    747
    748/*
    749 * siw_check_sgl_tx()
    750 *
    751 * Check permissions for a list of SGE's (SGL).
    752 * A successful check will have all memory referenced
    753 * for transmission resolved and assigned to the WQE.
    754 *
    755 * @pd:		Protection Domain SGL should belong to
    756 * @wqe:	WQE to be checked
    757 * @perms:	requested access permissions
    758 *
    759 */
    760
    761static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
    762			    enum ib_access_flags perms)
    763{
    764	struct siw_sge *sge = &wqe->sqe.sge[0];
    765	int i, len, num_sge = wqe->sqe.num_sge;
    766
    767	if (unlikely(num_sge > SIW_MAX_SGE))
    768		return -EINVAL;
    769
    770	for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) {
    771		/*
    772		 * rdma verbs: do not check stag for a zero length sge
    773		 */
    774		if (sge->length) {
    775			int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
    776					       sge->length);
    777
    778			if (unlikely(rv != E_ACCESS_OK))
    779				return rv;
    780		}
    781		len += sge->length;
    782	}
    783	return len;
    784}
    785
    786/*
    787 * siw_qp_sq_proc_tx()
    788 *
    789 * Process one WQE which needs transmission on the wire.
    790 */
    791static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
    792{
    793	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
    794	struct socket *s = qp->attrs.sk;
    795	int rv = 0, burst_len = qp->tx_ctx.burst;
    796	enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;
    797
    798	if (unlikely(wqe->wr_status == SIW_WR_IDLE))
    799		return 0;
    800
    801	if (!burst_len)
    802		burst_len = SQ_USER_MAXBURST;
    803
    804	if (wqe->wr_status == SIW_WR_QUEUED) {
    805		if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
    806			if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
    807				wqe->sqe.num_sge = 1;
    808
    809			if (tx_type(wqe) != SIW_OP_READ &&
    810			    tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
    811				/*
    812				 * Reference memory to be tx'd w/o checking
    813				 * access for LOCAL_READ permission, since
    814				 * not defined in RDMA core.
    815				 */
    816				rv = siw_check_sgl_tx(qp->pd, wqe, 0);
    817				if (rv < 0) {
    818					if (tx_type(wqe) ==
    819					    SIW_OP_READ_RESPONSE)
    820						ecode = siw_rdmap_error(-rv);
    821					rv = -EINVAL;
    822					goto tx_error;
    823				}
    824				wqe->bytes = rv;
    825			} else {
    826				wqe->bytes = 0;
    827			}
    828		} else {
    829			wqe->bytes = wqe->sqe.sge[0].length;
    830			if (!rdma_is_kernel_res(&qp->base_qp.res)) {
    831				if (wqe->bytes > SIW_MAX_INLINE) {
    832					rv = -EINVAL;
    833					goto tx_error;
    834				}
    835				wqe->sqe.sge[0].laddr =
    836					(u64)(uintptr_t)&wqe->sqe.sge[1];
    837			}
    838		}
    839		wqe->wr_status = SIW_WR_INPROGRESS;
    840		wqe->processed = 0;
    841
    842		siw_update_tcpseg(c_tx, s);
    843
    844		rv = siw_qp_prepare_tx(c_tx);
    845		if (rv == PKT_FRAGMENTED) {
    846			c_tx->state = SIW_SEND_HDR;
    847			siw_prepare_fpdu(qp, wqe);
    848		} else if (rv == PKT_COMPLETE) {
    849			c_tx->state = SIW_SEND_SHORT_FPDU;
    850		} else {
    851			goto tx_error;
    852		}
    853	}
    854
    855next_segment:
    856	siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
    857		   tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
    858		   wqe->sqe.id);
    859
    860	if (--burst_len == 0) {
    861		rv = -EINPROGRESS;
    862		goto tx_done;
    863	}
    864	if (c_tx->state == SIW_SEND_SHORT_FPDU) {
    865		enum siw_opcode tx_type = tx_type(wqe);
    866		unsigned int msg_flags;
    867
    868		if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
    869			/*
    870			 * End current TCP segment, if SQ runs empty,
    871			 * or siw_tcp_nagle is not set, or we bail out
    872			 * soon due to no burst credit left.
    873			 */
    874			msg_flags = MSG_DONTWAIT;
    875		else
    876			msg_flags = MSG_DONTWAIT | MSG_MORE;
    877
    878		rv = siw_tx_ctrl(c_tx, s, msg_flags);
    879
    880		if (!rv && tx_type != SIW_OP_READ &&
    881		    tx_type != SIW_OP_READ_LOCAL_INV)
    882			wqe->processed = wqe->bytes;
    883
    884		goto tx_done;
    885
    886	} else {
    887		rv = siw_tx_hdt(c_tx, s);
    888	}
    889	if (!rv) {
    890		/*
    891		 * One segment sent. Processing completed if last
     892		 * segment; do next segment otherwise.
    893		 */
    894		if (unlikely(c_tx->tx_suspend)) {
    895			/*
     896			 * Verbs, 6.4.: Try to stop sending after a full
     897			 * DDP segment if the connection goes down
     898			 * (== peer half-close)
    899			 */
    900			rv = -ECONNABORTED;
    901			goto tx_done;
    902		}
    903		if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
    904			siw_dbg_qp(qp, "WQE completed\n");
    905			goto tx_done;
    906		}
    907		c_tx->state = SIW_SEND_HDR;
    908
    909		siw_update_tcpseg(c_tx, s);
    910
    911		siw_prepare_fpdu(qp, wqe);
    912		goto next_segment;
    913	}
    914tx_done:
    915	qp->tx_ctx.burst = burst_len;
    916	return rv;
    917
    918tx_error:
    919	if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
    920		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
    921				   RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
    922	else
    923		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
    924				   RDMAP_ETYPE_CATASTROPHIC,
    925				   RDMAP_ECODE_UNSPECIFIED, 1);
    926	return rv;
    927}
    928
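        /*
         * siw_fastreg_mr()
         *
         * Complete a local SIW_OP_REG_MR work request: look up the memory
         * object referenced by the SQE's STag index, check PD ownership
         * and that the STag is not yet valid, then install the new key and
         * access rights and mark the STag valid.
         */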
    929static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
    930{
    931	struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr;
    932	struct siw_device *sdev = to_siw_dev(pd->device);
    933	struct siw_mem *mem;
    934	int rv = 0;
    935
    936	siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);
    937
    938	if (unlikely(!base_mr)) {
    939		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
    940		return -EINVAL;
    941	}
    942
    943	if (unlikely(base_mr->rkey >> 8 != sqe->rkey  >> 8)) {
    944		pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
    945		return -EINVAL;
    946	}
    947
    948	mem = siw_mem_id2obj(sdev, sqe->rkey  >> 8);
    949	if (unlikely(!mem)) {
    950		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
    951		return -EINVAL;
    952	}
    953
    954	if (unlikely(mem->pd != pd)) {
    955		pr_warn("siw: fastreg: PD mismatch\n");
    956		rv = -EINVAL;
    957		goto out;
    958	}
    959	if (unlikely(mem->stag_valid)) {
    960		pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey);
    961		rv = -EINVAL;
    962		goto out;
    963	}
    964	/* Refresh STag since user may have changed key part */
    965	mem->stag = sqe->rkey;
    966	mem->perms = sqe->access;
    967
    968	siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey);
    969	mem->va = base_mr->iova;
    970	mem->stag_valid = 1;
    971out:
    972	siw_mem_put(mem);
    973	return rv;
    974}
    975
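        /*
         * siw_qp_sq_proc_local()
         *
         * Process one WQE which needs no transmission on the wire:
         * fast memory registration or STag invalidation.
         */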
    976static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
    977{
    978	int rv;
    979
    980	switch (tx_type(wqe)) {
    981	case SIW_OP_REG_MR:
    982		rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
    983		break;
    984
    985	case SIW_OP_INVAL_STAG:
    986		rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
    987		break;
    988
    989	default:
    990		rv = -EINVAL;
    991	}
    992	return rv;
    993}
    994
    995/*
    996 * siw_qp_sq_process()
    997 *
    998 * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
    999 * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
   1000 * MPA FPDUs, each containing a DDP segment.
   1001 *
   1002 * SQ processing may occur in user context as a result of posting
   1003 * new WQE's or from siw_sq_work_handler() context. Processing in
   1004 * user context is limited to non-kernel verbs users.
   1005 *
   1006 * SQ processing may get paused anytime, possibly in the middle of a WR
   1007 * or FPDU, if insufficient send space is available. SQ processing
   1008 * gets resumed from siw_sq_work_handler(), if send space becomes
   1009 * available again.
   1010 *
   1011 * Must be called with the QP state read-locked.
   1012 *
   1013 * Note:
   1014 * An outbound RREQ can be satisfied by the corresponding RRESP
   1015 * _before_ it gets assigned to the ORQ. This happens regularly
   1016 * in RDMA READ via loopback case. Since both outbound RREQ and
   1017 * inbound RRESP can be handled by the same CPU, locking the ORQ
   1018 * is dead-lock prone and thus not an option. With that, the
   1019 * RREQ gets assigned to the ORQ _before_ being sent - see
   1020 * siw_activate_tx() - and pulled back in case of send failure.
   1021 */
   1022int siw_qp_sq_process(struct siw_qp *qp)
   1023{
   1024	struct siw_wqe *wqe = tx_wqe(qp);
   1025	enum siw_opcode tx_type;
   1026	unsigned long flags;
   1027	int rv = 0;
   1028
   1029	siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));
   1030
   1031next_wqe:
   1032	/*
   1033	 * Stop QP processing if SQ state changed
   1034	 */
   1035	if (unlikely(qp->tx_ctx.tx_suspend)) {
   1036		siw_dbg_qp(qp, "tx suspended\n");
   1037		goto done;
   1038	}
   1039	tx_type = tx_type(wqe);
   1040
   1041	if (tx_type <= SIW_OP_READ_RESPONSE)
   1042		rv = siw_qp_sq_proc_tx(qp, wqe);
   1043	else
   1044		rv = siw_qp_sq_proc_local(qp, wqe);
   1045
   1046	if (!rv) {
   1047		/*
   1048		 * WQE processing done
   1049		 */
   1050		switch (tx_type) {
   1051		case SIW_OP_SEND:
   1052		case SIW_OP_SEND_REMOTE_INV:
   1053		case SIW_OP_WRITE:
   1054			siw_wqe_put_mem(wqe, tx_type);
   1055			fallthrough;
   1056
   1057		case SIW_OP_INVAL_STAG:
   1058		case SIW_OP_REG_MR:
   1059			if (tx_flags(wqe) & SIW_WQE_SIGNALLED)
   1060				siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
   1061						 SIW_WC_SUCCESS);
   1062			break;
   1063
   1064		case SIW_OP_READ:
   1065		case SIW_OP_READ_LOCAL_INV:
   1066			/*
   1067			 * already enqueued to ORQ queue
   1068			 */
   1069			break;
   1070
   1071		case SIW_OP_READ_RESPONSE:
   1072			siw_wqe_put_mem(wqe, tx_type);
   1073			break;
   1074
   1075		default:
   1076			WARN(1, "undefined WQE type %d\n", tx_type);
   1077			rv = -EINVAL;
   1078			goto done;
   1079		}
   1080
   1081		spin_lock_irqsave(&qp->sq_lock, flags);
   1082		wqe->wr_status = SIW_WR_IDLE;
   1083		rv = siw_activate_tx(qp);
   1084		spin_unlock_irqrestore(&qp->sq_lock, flags);
   1085
   1086		if (rv <= 0)
   1087			goto done;
   1088
   1089		goto next_wqe;
   1090
   1091	} else if (rv == -EAGAIN) {
   1092		siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
   1093			   qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
   1094			   qp->tx_ctx.bytes_unsent);
   1095		rv = 0;
   1096		goto done;
   1097	} else if (rv == -EINPROGRESS) {
   1098		rv = siw_sq_start(qp);
   1099		goto done;
   1100	} else {
   1101		/*
   1102		 * WQE processing failed.
   1103		 * Verbs 8.3.2:
   1104		 * o It turns any WQE into a signalled WQE.
   1105		 * o Local catastrophic error must be surfaced
   1106		 * o QP must be moved into Terminate state: done by code
   1107		 *   doing socket state change processing
   1108		 *
   1109		 * o TODO: Termination message must be sent.
   1110		 * o TODO: Implement more precise work completion errors,
   1111		 *         see enum ib_wc_status in ib_verbs.h
   1112		 */
   1113		siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
   1114			   tx_type(wqe), rv);
   1115
   1116		spin_lock_irqsave(&qp->sq_lock, flags);
   1117		/*
   1118		 * RREQ may have already been completed by inbound RRESP!
   1119		 */
   1120		if ((tx_type == SIW_OP_READ ||
   1121		     tx_type == SIW_OP_READ_LOCAL_INV) && qp->attrs.orq_size) {
   1122			/* Cleanup pending entry in ORQ */
   1123			qp->orq_put--;
   1124			qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
   1125		}
   1126		spin_unlock_irqrestore(&qp->sq_lock, flags);
   1127		/*
   1128		 * immediately suspends further TX processing
   1129		 */
   1130		if (!qp->tx_ctx.tx_suspend)
   1131			siw_qp_cm_drop(qp, 0);
   1132
   1133		switch (tx_type) {
   1134		case SIW_OP_SEND:
   1135		case SIW_OP_SEND_REMOTE_INV:
   1136		case SIW_OP_SEND_WITH_IMM:
   1137		case SIW_OP_WRITE:
   1138		case SIW_OP_READ:
   1139		case SIW_OP_READ_LOCAL_INV:
   1140			siw_wqe_put_mem(wqe, tx_type);
   1141			fallthrough;
   1142
   1143		case SIW_OP_INVAL_STAG:
   1144		case SIW_OP_REG_MR:
   1145			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
   1146					 SIW_WC_LOC_QP_OP_ERR);
   1147
   1148			siw_qp_event(qp, IB_EVENT_QP_FATAL);
   1149
   1150			break;
   1151
   1152		case SIW_OP_READ_RESPONSE:
   1153			siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);
   1154
   1155			siw_qp_event(qp, IB_EVENT_QP_REQ_ERR);
   1156
   1157			siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE);
   1158
   1159			break;
   1160
   1161		default:
   1162			WARN(1, "undefined WQE type %d\n", tx_type);
   1163			rv = -EINVAL;
   1164		}
   1165		wqe->wr_status = SIW_WR_IDLE;
   1166	}
   1167done:
   1168	return rv;
   1169}
   1170
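        /*
         * siw_sq_resume()
         *
         * Resume SQ processing for a QP handed to a TX thread, unless the
         * QP left RTS state or TX got suspended. Drops the QP reference
         * taken in siw_sq_start().
         */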
   1171static void siw_sq_resume(struct siw_qp *qp)
   1172{
   1173	if (down_read_trylock(&qp->state_lock)) {
   1174		if (likely(qp->attrs.state == SIW_QP_STATE_RTS &&
   1175			   !qp->tx_ctx.tx_suspend)) {
   1176			int rv = siw_qp_sq_process(qp);
   1177
   1178			up_read(&qp->state_lock);
   1179
   1180			if (unlikely(rv < 0)) {
   1181				siw_dbg_qp(qp, "SQ task failed: err %d\n", rv);
   1182
   1183				if (!qp->tx_ctx.tx_suspend)
   1184					siw_qp_cm_drop(qp, 0);
   1185			}
   1186		} else {
   1187			up_read(&qp->state_lock);
   1188		}
   1189	} else {
   1190		siw_dbg_qp(qp, "Resume SQ while QP locked\n");
   1191	}
   1192	siw_qp_put(qp);
   1193}
   1194
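        /* Per-CPU TX work: QPs with pending SQ processing plus the TX thread's wait queue */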
   1195struct tx_task_t {
   1196	struct llist_head active;
   1197	wait_queue_head_t waiting;
   1198};
   1199
   1200static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g);
   1201
   1202void siw_stop_tx_thread(int nr_cpu)
   1203{
   1204	kthread_stop(siw_tx_thread[nr_cpu]);
   1205	wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting);
   1206}
   1207
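        /*
         * siw_run_sq()
         *
         * Main loop of a per-CPU TX thread: wait for QPs queued via
         * siw_sq_start(), reverse the newest-first llist into FIFO order
         * and resume SQ processing for each QP.
         */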
   1208int siw_run_sq(void *data)
   1209{
   1210	const int nr_cpu = (unsigned int)(long)data;
   1211	struct llist_node *active;
   1212	struct siw_qp *qp;
   1213	struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);
   1214
   1215	init_llist_head(&tx_task->active);
   1216	init_waitqueue_head(&tx_task->waiting);
   1217
   1218	while (1) {
   1219		struct llist_node *fifo_list = NULL;
   1220
   1221		wait_event_interruptible(tx_task->waiting,
   1222					 !llist_empty(&tx_task->active) ||
   1223						 kthread_should_stop());
   1224
   1225		if (kthread_should_stop())
   1226			break;
   1227
   1228		active = llist_del_all(&tx_task->active);
   1229		/*
   1230		 * llist_del_all returns a list with newest entry first.
   1231		 * Re-order list for fairness among QP's.
   1232		 */
   1233		while (active) {
   1234			struct llist_node *tmp = active;
   1235
   1236			active = llist_next(active);
   1237			tmp->next = fifo_list;
   1238			fifo_list = tmp;
   1239		}
   1240		while (fifo_list) {
   1241			qp = container_of(fifo_list, struct siw_qp, tx_list);
   1242			fifo_list = llist_next(fifo_list);
   1243			qp->tx_list.next = NULL;
   1244
   1245			siw_sq_resume(qp);
   1246		}
   1247	}
   1248	active = llist_del_all(&tx_task->active);
   1249	if (active) {
   1250		llist_for_each_entry(qp, active, tx_list) {
   1251			qp->tx_list.next = NULL;
   1252			siw_sq_resume(qp);
   1253		}
   1254	}
   1255	return 0;
   1256}
   1257
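        /*
         * siw_sq_start()
         *
         * Schedule SQ processing on the QP's assigned TX CPU, re-selecting
         * a CPU if the current one went offline. Takes a QP reference
         * which siw_sq_resume() drops after processing.
         */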
   1258int siw_sq_start(struct siw_qp *qp)
   1259{
   1260	if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
   1261		return 0;
   1262
   1263	if (unlikely(!cpu_online(qp->tx_cpu))) {
   1264		siw_put_tx_cpu(qp->tx_cpu);
   1265		qp->tx_cpu = siw_get_tx_cpu(qp->sdev);
   1266		if (qp->tx_cpu < 0) {
   1267			pr_warn("siw: no tx cpu available\n");
   1268
   1269			return -EIO;
   1270		}
   1271	}
   1272	siw_qp_get(qp);
   1273
   1274	llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active);
   1275
   1276	wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting);
   1277
   1278	return 0;
   1279}