cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

siw_cm.c (46878B)


      1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
      2
      3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
      4/*          Fredy Neeser */
      5/*          Greg Joyce <greg@opengridcomputing.com> */
      6/* Copyright (c) 2008-2019, IBM Corporation */
      7/* Copyright (c) 2017, Open Grid Computing, Inc. */
      8
      9#include <linux/errno.h>
     10#include <linux/types.h>
     11#include <linux/net.h>
     12#include <linux/inetdevice.h>
     13#include <net/addrconf.h>
     14#include <linux/workqueue.h>
     15#include <net/sock.h>
     16#include <net/tcp.h>
     17#include <linux/inet.h>
     18#include <linux/tcp.h>
     19
     20#include <rdma/iw_cm.h>
     21#include <rdma/ib_verbs.h>
     22#include <rdma/ib_user_verbs.h>
     23
     24#include "siw.h"
     25#include "siw_cm.h"
     26
     27/*
     28 * Set to any combination of
     29 * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
     30 */
     31static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
     32static const bool relaxed_ird_negotiation = true;
     33
     34static void siw_cm_llp_state_change(struct sock *s);
     35static void siw_cm_llp_data_ready(struct sock *s);
     36static void siw_cm_llp_write_space(struct sock *s);
     37static void siw_cm_llp_error_report(struct sock *s);
     38static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
     39			 int status);
     40
     41static void siw_sk_assign_cm_upcalls(struct sock *sk)
     42{
     43	write_lock_bh(&sk->sk_callback_lock);
     44	sk->sk_state_change = siw_cm_llp_state_change;
     45	sk->sk_data_ready = siw_cm_llp_data_ready;
     46	sk->sk_write_space = siw_cm_llp_write_space;
     47	sk->sk_error_report = siw_cm_llp_error_report;
     48	write_unlock_bh(&sk->sk_callback_lock);
     49}
     50
     51static void siw_sk_save_upcalls(struct sock *sk)
     52{
     53	struct siw_cep *cep = sk_to_cep(sk);
     54
     55	write_lock_bh(&sk->sk_callback_lock);
     56	cep->sk_state_change = sk->sk_state_change;
     57	cep->sk_data_ready = sk->sk_data_ready;
     58	cep->sk_write_space = sk->sk_write_space;
     59	cep->sk_error_report = sk->sk_error_report;
     60	write_unlock_bh(&sk->sk_callback_lock);
     61}
     62
     63static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
     64{
     65	sk->sk_state_change = cep->sk_state_change;
     66	sk->sk_data_ready = cep->sk_data_ready;
     67	sk->sk_write_space = cep->sk_write_space;
     68	sk->sk_error_report = cep->sk_error_report;
     69	sk->sk_user_data = NULL;
     70}
     71
     72static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
     73{
     74	struct socket *s = cep->sock;
     75	struct sock *sk = s->sk;
     76
     77	write_lock_bh(&sk->sk_callback_lock);
     78
     79	qp->attrs.sk = s;
     80	sk->sk_data_ready = siw_qp_llp_data_ready;
     81	sk->sk_write_space = siw_qp_llp_write_space;
     82
     83	write_unlock_bh(&sk->sk_callback_lock);
     84}
     85
     86static void siw_socket_disassoc(struct socket *s)
     87{
     88	struct sock *sk = s->sk;
     89	struct siw_cep *cep;
     90
     91	if (sk) {
     92		write_lock_bh(&sk->sk_callback_lock);
     93		cep = sk_to_cep(sk);
     94		if (cep) {
     95			siw_sk_restore_upcalls(sk, cep);
     96			siw_cep_put(cep);
     97		} else {
     98			pr_warn("siw: cannot restore sk callbacks: no ep\n");
     99		}
    100		write_unlock_bh(&sk->sk_callback_lock);
    101	} else {
    102		pr_warn("siw: cannot restore sk callbacks: no sk\n");
    103	}
    104}
    105
    106static void siw_rtr_data_ready(struct sock *sk)
    107{
    108	struct siw_cep *cep;
    109	struct siw_qp *qp = NULL;
    110	read_descriptor_t rd_desc;
    111
    112	read_lock(&sk->sk_callback_lock);
    113
    114	cep = sk_to_cep(sk);
    115	if (!cep) {
    116		WARN(1, "No connection endpoint\n");
    117		goto out;
    118	}
    119	qp = sk_to_qp(sk);
    120
    121	memset(&rd_desc, 0, sizeof(rd_desc));
    122	rd_desc.arg.data = qp;
    123	rd_desc.count = 1;
    124
    125	tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
    126	/*
    127	 * Check if first frame was successfully processed.
    128	 * Signal connection full establishment if yes.
    129	 * Failed data processing would have already scheduled
    130	 * connection drop.
    131	 */
    132	if (!qp->rx_stream.rx_suspend)
    133		siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
    134out:
    135	read_unlock(&sk->sk_callback_lock);
    136	if (qp)
    137		siw_qp_socket_assoc(cep, qp);
    138}
    139
    140static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
    141{
    142	struct sock *sk = cep->sock->sk;
    143
    144	write_lock_bh(&sk->sk_callback_lock);
    145	sk->sk_data_ready = siw_rtr_data_ready;
    146	sk->sk_write_space = siw_qp_llp_write_space;
    147	write_unlock_bh(&sk->sk_callback_lock);
    148}
    149
    150static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
    151{
    152	cep->sock = s;
    153	siw_cep_get(cep);
    154	s->sk->sk_user_data = cep;
    155
    156	siw_sk_save_upcalls(s->sk);
    157	siw_sk_assign_cm_upcalls(s->sk);
    158}
    159
    160static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
    161{
    162	struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
    163	unsigned long flags;
    164
    165	if (!cep)
    166		return NULL;
    167
    168	INIT_LIST_HEAD(&cep->listenq);
    169	INIT_LIST_HEAD(&cep->devq);
    170	INIT_LIST_HEAD(&cep->work_freelist);
    171
    172	kref_init(&cep->ref);
    173	cep->state = SIW_EPSTATE_IDLE;
    174	init_waitqueue_head(&cep->waitq);
    175	spin_lock_init(&cep->lock);
    176	cep->sdev = sdev;
    177	cep->enhanced_rdma_conn_est = false;
    178
    179	spin_lock_irqsave(&sdev->lock, flags);
    180	list_add_tail(&cep->devq, &sdev->cep_list);
    181	spin_unlock_irqrestore(&sdev->lock, flags);
    182
    183	siw_dbg_cep(cep, "new endpoint\n");
    184	return cep;
    185}
    186
    187static void siw_cm_free_work(struct siw_cep *cep)
    188{
    189	struct list_head *w, *tmp;
    190	struct siw_cm_work *work;
    191
    192	list_for_each_safe(w, tmp, &cep->work_freelist) {
    193		work = list_entry(w, struct siw_cm_work, list);
    194		list_del(&work->list);
    195		kfree(work);
    196	}
    197}
    198
    199static void siw_cancel_mpatimer(struct siw_cep *cep)
    200{
    201	spin_lock_bh(&cep->lock);
    202	if (cep->mpa_timer) {
    203		if (cancel_delayed_work(&cep->mpa_timer->work)) {
    204			siw_cep_put(cep);
    205			kfree(cep->mpa_timer); /* not needed again */
    206		}
    207		cep->mpa_timer = NULL;
    208	}
    209	spin_unlock_bh(&cep->lock);
    210}
    211
    212static void siw_put_work(struct siw_cm_work *work)
    213{
    214	INIT_LIST_HEAD(&work->list);
    215	spin_lock_bh(&work->cep->lock);
    216	list_add(&work->list, &work->cep->work_freelist);
    217	spin_unlock_bh(&work->cep->lock);
    218}
    219
    220static void siw_cep_set_inuse(struct siw_cep *cep)
    221{
    222	unsigned long flags;
    223retry:
    224	spin_lock_irqsave(&cep->lock, flags);
    225
    226	if (cep->in_use) {
    227		spin_unlock_irqrestore(&cep->lock, flags);
    228		wait_event_interruptible(cep->waitq, !cep->in_use);
    229		if (signal_pending(current))
    230			flush_signals(current);
    231		goto retry;
    232	} else {
    233		cep->in_use = 1;
    234		spin_unlock_irqrestore(&cep->lock, flags);
    235	}
    236}
    237
    238static void siw_cep_set_free(struct siw_cep *cep)
    239{
    240	unsigned long flags;
    241
    242	spin_lock_irqsave(&cep->lock, flags);
    243	cep->in_use = 0;
    244	spin_unlock_irqrestore(&cep->lock, flags);
    245
    246	wake_up(&cep->waitq);
    247}
    248
    249static void __siw_cep_dealloc(struct kref *ref)
    250{
    251	struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
    252	struct siw_device *sdev = cep->sdev;
    253	unsigned long flags;
    254
    255	WARN_ON(cep->listen_cep);
    256
    257	/* kfree(NULL) is safe */
    258	kfree(cep->mpa.pdata);
    259	spin_lock_bh(&cep->lock);
    260	if (!list_empty(&cep->work_freelist))
    261		siw_cm_free_work(cep);
    262	spin_unlock_bh(&cep->lock);
    263
    264	spin_lock_irqsave(&sdev->lock, flags);
    265	list_del(&cep->devq);
    266	spin_unlock_irqrestore(&sdev->lock, flags);
    267
    268	siw_dbg_cep(cep, "free endpoint\n");
    269	kfree(cep);
    270}
    271
    272static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
    273{
    274	struct siw_cm_work *work = NULL;
    275
    276	spin_lock_bh(&cep->lock);
    277	if (!list_empty(&cep->work_freelist)) {
    278		work = list_entry(cep->work_freelist.next, struct siw_cm_work,
    279				  list);
    280		list_del_init(&work->list);
    281	}
    282	spin_unlock_bh(&cep->lock);
    283	return work;
    284}
    285
    286static int siw_cm_alloc_work(struct siw_cep *cep, int num)
    287{
    288	struct siw_cm_work *work;
    289
    290	while (num--) {
    291		work = kmalloc(sizeof(*work), GFP_KERNEL);
    292		if (!work) {
    293			if (!(list_empty(&cep->work_freelist)))
    294				siw_cm_free_work(cep);
    295			return -ENOMEM;
    296		}
    297		work->cep = cep;
    298		INIT_LIST_HEAD(&work->list);
    299		list_add(&work->list, &cep->work_freelist);
    300	}
    301	return 0;
    302}
    303
    304/*
    305 * siw_cm_upcall()
    306 *
    307 * Upcall to IWCM to inform about async connection events
    308 */
    309static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
    310			 int status)
    311{
    312	struct iw_cm_event event;
    313	struct iw_cm_id *id;
    314
    315	memset(&event, 0, sizeof(event));
    316	event.status = status;
    317	event.event = reason;
    318
    319	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
    320		event.provider_data = cep;
    321		id = cep->listen_cep->cm_id;
    322	} else {
    323		id = cep->cm_id;
    324	}
    325	/* Signal IRD and ORD */
    326	if (reason == IW_CM_EVENT_ESTABLISHED ||
    327	    reason == IW_CM_EVENT_CONNECT_REPLY) {
    328		/* Signal negotiated IRD/ORD values we will use */
    329		event.ird = cep->ird;
    330		event.ord = cep->ord;
    331	} else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
    332		event.ird = cep->ord;
    333		event.ord = cep->ird;
    334	}
    335	/* Signal private data and address information */
    336	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
    337	    reason == IW_CM_EVENT_CONNECT_REPLY) {
    338		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
    339
    340		if (pd_len) {
    341			/*
    342			 * hand over MPA private data
    343			 */
    344			event.private_data_len = pd_len;
    345			event.private_data = cep->mpa.pdata;
    346
    347			/* Hide MPA V2 IRD/ORD control */
    348			if (cep->enhanced_rdma_conn_est) {
    349				event.private_data_len -=
    350					sizeof(struct mpa_v2_data);
    351				event.private_data +=
    352					sizeof(struct mpa_v2_data);
    353			}
    354		}
    355		getname_local(cep->sock, &event.local_addr);
    356		getname_peer(cep->sock, &event.remote_addr);
    357	}
    358	siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n",
    359		    cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status);
    360
    361	return id->event_handler(id, &event);
    362}
    363
    364/*
    365 * siw_qp_cm_drop()
    366 *
    367 * Drops established LLP connection if present and not already
    368 * scheduled for dropping. Called from user context, SQ workqueue
    369 * or receive IRQ. Caller signals if socket can be immediately
    370 * closed (basically, if not in IRQ).
    371 */
    372void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
    373{
    374	struct siw_cep *cep = qp->cep;
    375
    376	qp->rx_stream.rx_suspend = 1;
    377	qp->tx_ctx.tx_suspend = 1;
    378
    379	if (!qp->cep)
    380		return;
    381
    382	if (schedule) {
    383		siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
    384	} else {
    385		siw_cep_set_inuse(cep);
    386
    387		if (cep->state == SIW_EPSTATE_CLOSED) {
    388			siw_dbg_cep(cep, "already closed\n");
    389			goto out;
    390		}
    391		siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
    392
    393		if (qp->term_info.valid)
    394			siw_send_terminate(qp);
    395
    396		if (cep->cm_id) {
    397			switch (cep->state) {
    398			case SIW_EPSTATE_AWAIT_MPAREP:
    399				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
    400					      -EINVAL);
    401				break;
    402
    403			case SIW_EPSTATE_RDMA_MODE:
    404				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
    405				break;
    406
    407			case SIW_EPSTATE_IDLE:
    408			case SIW_EPSTATE_LISTENING:
    409			case SIW_EPSTATE_CONNECTING:
    410			case SIW_EPSTATE_AWAIT_MPAREQ:
    411			case SIW_EPSTATE_RECVD_MPAREQ:
    412			case SIW_EPSTATE_CLOSED:
    413			default:
    414				break;
    415			}
    416			cep->cm_id->rem_ref(cep->cm_id);
    417			cep->cm_id = NULL;
    418			siw_cep_put(cep);
    419		}
    420		cep->state = SIW_EPSTATE_CLOSED;
    421
    422		if (cep->sock) {
    423			siw_socket_disassoc(cep->sock);
    424			/*
    425			 * Immediately close socket
    426			 */
    427			sock_release(cep->sock);
    428			cep->sock = NULL;
    429		}
    430		if (cep->qp) {
    431			cep->qp = NULL;
    432			siw_qp_put(qp);
    433		}
    434out:
    435		siw_cep_set_free(cep);
    436	}
    437}
    438
    439void siw_cep_put(struct siw_cep *cep)
    440{
    441	WARN_ON(kref_read(&cep->ref) < 1);
    442	kref_put(&cep->ref, __siw_cep_dealloc);
    443}
    444
    445void siw_cep_get(struct siw_cep *cep)
    446{
    447	kref_get(&cep->ref);
    448}
    449
    450/*
    451 * Expects params->pd_len in host byte order
    452 */
    453static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
    454{
    455	struct socket *s = cep->sock;
    456	struct mpa_rr *rr = &cep->mpa.hdr;
    457	struct kvec iov[3];
    458	struct msghdr msg;
    459	int rv;
    460	int iovec_num = 0;
    461	int mpa_len;
    462
    463	memset(&msg, 0, sizeof(msg));
    464
    465	iov[iovec_num].iov_base = rr;
    466	iov[iovec_num].iov_len = sizeof(*rr);
    467	mpa_len = sizeof(*rr);
    468
    469	if (cep->enhanced_rdma_conn_est) {
    470		iovec_num++;
    471		iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
    472		iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
    473		mpa_len += sizeof(cep->mpa.v2_ctrl);
    474	}
    475	if (pd_len) {
    476		iovec_num++;
    477		iov[iovec_num].iov_base = (char *)pdata;
    478		iov[iovec_num].iov_len = pd_len;
    479		mpa_len += pd_len;
    480	}
    481	if (cep->enhanced_rdma_conn_est)
    482		pd_len += sizeof(cep->mpa.v2_ctrl);
    483
    484	rr->params.pd_len = cpu_to_be16(pd_len);
    485
    486	rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);
    487
    488	return rv < 0 ? rv : 0;
    489}
    490
    491/*
    492 * Receive MPA Request/Reply header.
    493 *
    494 * Returns 0 if complete MPA Request/Reply header including
     495 * any private data was received. Returns -EAGAIN if
    496 * header was partially received or negative error code otherwise.
    497 *
    498 * Context: May be called in process context only
    499 */
    500static int siw_recv_mpa_rr(struct siw_cep *cep)
    501{
    502	struct mpa_rr *hdr = &cep->mpa.hdr;
    503	struct socket *s = cep->sock;
    504	u16 pd_len;
    505	int rcvd, to_rcv;
    506
    507	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
    508		rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
    509				  sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
    510				  0);
    511		if (rcvd <= 0)
    512			return -ECONNABORTED;
    513
    514		cep->mpa.bytes_rcvd += rcvd;
    515
    516		if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
    517			return -EAGAIN;
    518
    519		if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
    520			return -EPROTO;
    521	}
    522	pd_len = be16_to_cpu(hdr->params.pd_len);
    523
    524	/*
    525	 * At least the MPA Request/Reply header (frame not including
    526	 * private data) has been received.
    527	 * Receive (or continue receiving) any private data.
    528	 */
    529	to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
    530
    531	if (!to_rcv) {
    532		/*
    533		 * We must have hdr->params.pd_len == 0 and thus received a
    534		 * complete MPA Request/Reply frame.
    535		 * Check against peer protocol violation.
    536		 */
    537		u32 word;
    538
    539		rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
    540		if (rcvd == -EAGAIN)
    541			return 0;
    542
    543		if (rcvd == 0) {
    544			siw_dbg_cep(cep, "peer EOF\n");
    545			return -EPIPE;
    546		}
    547		if (rcvd < 0) {
    548			siw_dbg_cep(cep, "error: %d\n", rcvd);
    549			return rcvd;
    550		}
    551		siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
    552
    553		return -EPROTO;
    554	}
    555
    556	/*
    557	 * At this point, we must have hdr->params.pd_len != 0.
    558	 * A private data buffer gets allocated if hdr->params.pd_len != 0.
    559	 */
    560	if (!cep->mpa.pdata) {
    561		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
    562		if (!cep->mpa.pdata)
    563			return -ENOMEM;
    564	}
    565	rcvd = ksock_recv(
    566		s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
    567		to_rcv + 4, MSG_DONTWAIT);
    568
    569	if (rcvd < 0)
    570		return rcvd;
    571
    572	if (rcvd > to_rcv)
    573		return -EPROTO;
    574
    575	cep->mpa.bytes_rcvd += rcvd;
    576
    577	if (to_rcv == rcvd) {
    578		siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
    579		return 0;
    580	}
    581	return -EAGAIN;
    582}
    583
    584/*
    585 * siw_proc_mpareq()
    586 *
    587 * Read MPA Request from socket and signal new connection to IWCM
    588 * if success. Caller must hold lock on corresponding listening CEP.
    589 */
    590static int siw_proc_mpareq(struct siw_cep *cep)
    591{
    592	struct mpa_rr *req;
    593	int version, rv;
    594	u16 pd_len;
    595
    596	rv = siw_recv_mpa_rr(cep);
    597	if (rv)
    598		return rv;
    599
    600	req = &cep->mpa.hdr;
    601
    602	version = __mpa_rr_revision(req->params.bits);
    603	pd_len = be16_to_cpu(req->params.pd_len);
    604
    605	if (version > MPA_REVISION_2)
    606		/* allow for 0, 1, and 2 only */
    607		return -EPROTO;
    608
    609	if (memcmp(req->key, MPA_KEY_REQ, 16))
    610		return -EPROTO;
    611
    612	/* Prepare for sending MPA reply */
    613	memcpy(req->key, MPA_KEY_REP, 16);
    614
    615	if (version == MPA_REVISION_2 &&
    616	    (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
    617		/*
    618		 * MPA version 2 must signal IRD/ORD values and P2P mode
    619		 * in private data if header flag MPA_RR_FLAG_ENHANCED
    620		 * is set.
    621		 */
    622		if (pd_len < sizeof(struct mpa_v2_data))
    623			goto reject_conn;
    624
    625		cep->enhanced_rdma_conn_est = true;
    626	}
    627
    628	/* MPA Markers: currently not supported. Marker TX to be added. */
    629	if (req->params.bits & MPA_RR_FLAG_MARKERS)
    630		goto reject_conn;
    631
    632	if (req->params.bits & MPA_RR_FLAG_CRC) {
    633		/*
    634		 * RFC 5044, page 27: CRC MUST be used if peer requests it.
     635		 * siw specific: the 'mpa_crc_strict' module parameter
     636		 * rejects a connection requesting CRC if CRC is locally
     637		 * disabled.
    638		 */
    639		if (!mpa_crc_required && mpa_crc_strict)
    640			goto reject_conn;
    641
    642		/* Enable CRC if requested by module parameter */
    643		if (mpa_crc_required)
    644			req->params.bits |= MPA_RR_FLAG_CRC;
    645	}
    646	if (cep->enhanced_rdma_conn_est) {
    647		struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
    648
    649		/*
    650		 * Peer requested ORD becomes requested local IRD,
    651		 * peer requested IRD becomes requested local ORD.
    652		 * IRD and ORD get limited by global maximum values.
    653		 */
    654		cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
    655		cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
    656		cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
    657		cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
    658
    659		/* May get overwritten by locally negotiated values */
    660		cep->mpa.v2_ctrl.ird = htons(cep->ird);
    661		cep->mpa.v2_ctrl.ord = htons(cep->ord);
    662
    663		/*
    664		 * Support for peer sent zero length Write or Read to
    665		 * let local side enter RTS. Writes are preferred.
    666		 * Sends would require pre-posting a Receive and are
    667		 * not supported.
     669		 * Propose a zero length Write if neither Read nor Write
    669		 * is indicated.
    670		 */
    671		if (v2->ird & MPA_V2_PEER_TO_PEER) {
    672			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
    673
    674			if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
    675				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
    676			else if (v2->ord & MPA_V2_RDMA_READ_RTR)
    677				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
    678			else
    679				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
    680		}
    681	}
    682
    683	cep->state = SIW_EPSTATE_RECVD_MPAREQ;
    684
    685	/* Keep reference until IWCM accepts/rejects */
    686	siw_cep_get(cep);
    687	rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
    688	if (rv)
    689		siw_cep_put(cep);
    690
    691	return rv;
    692
    693reject_conn:
    694	siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
    695		    req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
    696		    mpa_crc_required, mpa_crc_strict,
    697		    req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
    698
    699	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
    700	req->params.bits |= MPA_RR_FLAG_REJECT;
    701
    702	if (!mpa_crc_required && mpa_crc_strict)
    703		req->params.bits &= ~MPA_RR_FLAG_CRC;
    704
    705	if (pd_len)
    706		kfree(cep->mpa.pdata);
    707
    708	cep->mpa.pdata = NULL;
    709
    710	siw_send_mpareqrep(cep, NULL, 0);
    711
    712	return -EOPNOTSUPP;
    713}
    714
    715static int siw_proc_mpareply(struct siw_cep *cep)
    716{
    717	struct siw_qp_attrs qp_attrs;
    718	enum siw_qp_attr_mask qp_attr_mask;
    719	struct siw_qp *qp = cep->qp;
    720	struct mpa_rr *rep;
    721	int rv;
    722	u16 rep_ord;
    723	u16 rep_ird;
    724	bool ird_insufficient = false;
    725	enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
    726
    727	rv = siw_recv_mpa_rr(cep);
    728	if (rv != -EAGAIN)
    729		siw_cancel_mpatimer(cep);
    730	if (rv)
    731		goto out_err;
    732
    733	rep = &cep->mpa.hdr;
    734
    735	if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
     736		/* allow for 0, 1, and 2 only */
    737		rv = -EPROTO;
    738		goto out_err;
    739	}
    740	if (memcmp(rep->key, MPA_KEY_REP, 16)) {
    741		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
    742				   LLP_ECODE_INVALID_REQ_RESP, 0);
    743		siw_send_terminate(qp);
    744		rv = -EPROTO;
    745		goto out_err;
    746	}
    747	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
    748		siw_dbg_cep(cep, "got mpa reject\n");
    749		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
    750
    751		return -ECONNRESET;
    752	}
    753	if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
    754		siw_dbg_cep(cep, "peer allows GSO on TX\n");
    755		qp->tx_ctx.gso_seg_limit = 0;
    756	}
    757	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
    758	    (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
    759	    (mpa_crc_strict && !mpa_crc_required &&
    760	     (rep->params.bits & MPA_RR_FLAG_CRC))) {
    761		siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
    762			    rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
    763			    mpa_crc_required, mpa_crc_strict,
    764			    rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
    765
    766		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
    767
    768		return -EINVAL;
    769	}
    770	if (cep->enhanced_rdma_conn_est) {
    771		struct mpa_v2_data *v2;
    772
    773		if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
    774		    !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
    775			/*
    776			 * Protocol failure: The responder MUST reply with
    777			 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
    778			 */
    779			siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
    780				    __mpa_rr_revision(rep->params.bits),
    781				    rep->params.bits & MPA_RR_FLAG_ENHANCED ?
    782					    1 :
    783					    0);
    784
    785			siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
    786				      -ECONNRESET);
    787			return -EINVAL;
    788		}
    789		v2 = (struct mpa_v2_data *)cep->mpa.pdata;
    790		rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
    791		rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
    792
    793		if (cep->ird < rep_ord &&
    794		    (relaxed_ird_negotiation == false ||
    795		     rep_ord > cep->sdev->attrs.max_ird)) {
    796			siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
    797				    cep->ird, rep_ord,
    798				    cep->sdev->attrs.max_ord);
    799			ird_insufficient = true;
    800		}
    801		if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
    802			siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
    803				    rep_ird);
    804			ird_insufficient = true;
    805		}
    806		/*
    807		 * Always report negotiated peer values to user,
    808		 * even if IRD/ORD negotiation failed
    809		 */
    810		cep->ird = rep_ord;
    811		cep->ord = rep_ird;
    812
    813		if (ird_insufficient) {
    814			/*
     815			 * If the initiator IRD is insufficient for the
    816			 * responder ORD, send a TERM.
    817			 */
    818			siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
    819					   LLP_ETYPE_MPA,
    820					   LLP_ECODE_INSUFFICIENT_IRD, 0);
    821			siw_send_terminate(qp);
    822			rv = -ENOMEM;
    823			goto out_err;
    824		}
    825		if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
    826			mpa_p2p_mode =
    827				cep->mpa.v2_ctrl_req.ord &
    828				(MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
    829
    830		/*
    831		 * Check if we requested P2P mode, and if peer agrees
    832		 */
    833		if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
    834			if ((mpa_p2p_mode & v2->ord) == 0) {
    835				/*
    836				 * We requested RTR mode(s), but the peer
    837				 * did not pick any mode we support.
    838				 */
    839				siw_dbg_cep(cep,
    840					    "rtr mode:  req %2x, got %2x\n",
    841					    mpa_p2p_mode,
    842					    v2->ord & (MPA_V2_RDMA_WRITE_RTR |
    843						       MPA_V2_RDMA_READ_RTR));
    844
    845				siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
    846						   LLP_ETYPE_MPA,
    847						   LLP_ECODE_NO_MATCHING_RTR,
    848						   0);
    849				siw_send_terminate(qp);
    850				rv = -EPROTO;
    851				goto out_err;
    852			}
    853			mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
    854						  MPA_V2_RDMA_READ_RTR);
    855		}
    856	}
    857	memset(&qp_attrs, 0, sizeof(qp_attrs));
    858
    859	if (rep->params.bits & MPA_RR_FLAG_CRC)
    860		qp_attrs.flags = SIW_MPA_CRC;
    861
    862	qp_attrs.irq_size = cep->ird;
    863	qp_attrs.orq_size = cep->ord;
    864	qp_attrs.sk = cep->sock;
    865	qp_attrs.state = SIW_QP_STATE_RTS;
    866
    867	qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
    868		       SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
    869
    870	/* Move socket RX/TX under QP control */
    871	down_write(&qp->state_lock);
    872	if (qp->attrs.state > SIW_QP_STATE_RTR) {
    873		rv = -EINVAL;
    874		up_write(&qp->state_lock);
    875		goto out_err;
    876	}
    877	rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);
    878
    879	siw_qp_socket_assoc(cep, qp);
    880
    881	up_write(&qp->state_lock);
    882
    883	/* Send extra RDMA frame to trigger peer RTS if negotiated */
    884	if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
    885		rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
    886		if (rv)
    887			goto out_err;
    888	}
    889	if (!rv) {
    890		rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
    891		if (!rv)
    892			cep->state = SIW_EPSTATE_RDMA_MODE;
    893
    894		return 0;
    895	}
    896
    897out_err:
    898	siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
    899
    900	return rv;
    901}
    902
    903/*
    904 * siw_accept_newconn - accept an incoming pending connection
    905 *
    906 */
    907static void siw_accept_newconn(struct siw_cep *cep)
    908{
    909	struct socket *s = cep->sock;
    910	struct socket *new_s = NULL;
    911	struct siw_cep *new_cep = NULL;
    912	int rv = 0; /* debug only. should disappear */
    913
    914	if (cep->state != SIW_EPSTATE_LISTENING)
    915		goto error;
    916
    917	new_cep = siw_cep_alloc(cep->sdev);
    918	if (!new_cep)
    919		goto error;
    920
    921	/*
    922	 * 4: Allocate a sufficient number of work elements
    923	 * to allow concurrent handling of local + peer close
    924	 * events, MPA header processing + MPA timeout.
    925	 */
    926	if (siw_cm_alloc_work(new_cep, 4) != 0)
    927		goto error;
    928
    929	/*
    930	 * Copy saved socket callbacks from listening CEP
    931	 * and assign new socket with new CEP
    932	 */
    933	new_cep->sk_state_change = cep->sk_state_change;
    934	new_cep->sk_data_ready = cep->sk_data_ready;
    935	new_cep->sk_write_space = cep->sk_write_space;
    936	new_cep->sk_error_report = cep->sk_error_report;
    937
    938	rv = kernel_accept(s, &new_s, O_NONBLOCK);
    939	if (rv != 0) {
    940		/*
     941		 * Connection already aborted by peer?
    942		 */
    943		siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
    944		goto error;
    945	}
    946	new_cep->sock = new_s;
    947	siw_cep_get(new_cep);
    948	new_s->sk->sk_user_data = new_cep;
    949
    950	if (siw_tcp_nagle == false)
    951		tcp_sock_set_nodelay(new_s->sk);
    952	new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
    953
    954	rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
    955	if (rv)
    956		goto error;
    957	/*
    958	 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
    959	 */
    960	new_cep->listen_cep = cep;
    961	siw_cep_get(cep);
    962
    963	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
    964		/*
    965		 * MPA REQ already queued
    966		 */
    967		siw_dbg_cep(cep, "immediate mpa request\n");
    968
    969		siw_cep_set_inuse(new_cep);
    970		rv = siw_proc_mpareq(new_cep);
    971		if (rv != -EAGAIN) {
    972			siw_cep_put(cep);
    973			new_cep->listen_cep = NULL;
    974			if (rv) {
    975				siw_cep_set_free(new_cep);
    976				goto error;
    977			}
    978		}
    979		siw_cep_set_free(new_cep);
    980	}
    981	return;
    982
    983error:
    984	if (new_cep)
    985		siw_cep_put(new_cep);
    986
    987	if (new_s) {
    988		siw_socket_disassoc(new_s);
    989		sock_release(new_s);
    990		new_cep->sock = NULL;
    991	}
    992	siw_dbg_cep(cep, "error %d\n", rv);
    993}
    994
    995static void siw_cm_work_handler(struct work_struct *w)
    996{
    997	struct siw_cm_work *work;
    998	struct siw_cep *cep;
    999	int release_cep = 0, rv = 0;
   1000
   1001	work = container_of(w, struct siw_cm_work, work.work);
   1002	cep = work->cep;
   1003
   1004	siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
   1005		    cep->qp ? qp_id(cep->qp) : UINT_MAX,
   1006		    work->type, cep->state);
   1007
   1008	siw_cep_set_inuse(cep);
   1009
   1010	switch (work->type) {
   1011	case SIW_CM_WORK_ACCEPT:
   1012		siw_accept_newconn(cep);
   1013		break;
   1014
   1015	case SIW_CM_WORK_READ_MPAHDR:
   1016		if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
   1017			if (cep->listen_cep) {
   1018				siw_cep_set_inuse(cep->listen_cep);
   1019
   1020				if (cep->listen_cep->state ==
   1021				    SIW_EPSTATE_LISTENING)
   1022					rv = siw_proc_mpareq(cep);
   1023				else
   1024					rv = -EFAULT;
   1025
   1026				siw_cep_set_free(cep->listen_cep);
   1027
   1028				if (rv != -EAGAIN) {
   1029					siw_cep_put(cep->listen_cep);
   1030					cep->listen_cep = NULL;
   1031					if (rv)
   1032						siw_cep_put(cep);
   1033				}
   1034			}
   1035		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
   1036			rv = siw_proc_mpareply(cep);
   1037		} else {
   1038			/*
   1039			 * CEP already moved out of MPA handshake.
    1040			 * Any connection management work is already done.
    1041			 * Silently ignore the MPA packet.
   1042			 */
   1043			if (cep->state == SIW_EPSTATE_RDMA_MODE) {
   1044				cep->sock->sk->sk_data_ready(cep->sock->sk);
   1045				siw_dbg_cep(cep, "already in RDMA mode");
   1046			} else {
   1047				siw_dbg_cep(cep, "out of state: %d\n",
   1048					    cep->state);
   1049			}
   1050		}
   1051		if (rv && rv != -EAGAIN)
   1052			release_cep = 1;
   1053		break;
   1054
   1055	case SIW_CM_WORK_CLOSE_LLP:
   1056		/*
   1057		 * QP scheduled LLP close
   1058		 */
   1059		if (cep->qp && cep->qp->term_info.valid)
   1060			siw_send_terminate(cep->qp);
   1061
   1062		if (cep->cm_id)
   1063			siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
   1064
   1065		release_cep = 1;
   1066		break;
   1067
   1068	case SIW_CM_WORK_PEER_CLOSE:
   1069		if (cep->cm_id) {
   1070			if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
   1071				/*
   1072				 * MPA reply not received, but connection drop
   1073				 */
   1074				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
   1075					      -ECONNRESET);
   1076			} else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
   1077				/*
   1078				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
   1079				 *       to transition IWCM into CLOSING.
   1080				 */
   1081				siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
   1082				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
   1083			}
   1084			/*
   1085			 * for other states there is no connection
   1086			 * known to the IWCM.
   1087			 */
   1088		} else {
   1089			if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
   1090				/*
   1091				 * Wait for the ulp/CM to call accept/reject
   1092				 */
   1093				siw_dbg_cep(cep,
   1094					    "mpa req recvd, wait for ULP\n");
   1095			} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
   1096				/*
   1097				 * Socket close before MPA request received.
   1098				 */
   1099				siw_dbg_cep(cep, "no mpareq: drop listener\n");
   1100				siw_cep_put(cep->listen_cep);
   1101				cep->listen_cep = NULL;
   1102			}
   1103		}
   1104		release_cep = 1;
   1105		break;
   1106
   1107	case SIW_CM_WORK_MPATIMEOUT:
   1108		cep->mpa_timer = NULL;
   1109
   1110		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
   1111			/*
   1112			 * MPA request timed out:
   1113			 * Hide any partially received private data and signal
   1114			 * timeout
   1115			 */
   1116			cep->mpa.hdr.params.pd_len = 0;
   1117
   1118			if (cep->cm_id)
   1119				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
   1120					      -ETIMEDOUT);
   1121			release_cep = 1;
   1122
   1123		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
   1124			/*
   1125			 * No MPA request received after peer TCP stream setup.
   1126			 */
   1127			if (cep->listen_cep) {
   1128				siw_cep_put(cep->listen_cep);
   1129				cep->listen_cep = NULL;
   1130			}
   1131			release_cep = 1;
   1132		}
   1133		break;
   1134
   1135	default:
   1136		WARN(1, "Undefined CM work type: %d\n", work->type);
   1137	}
   1138	if (release_cep) {
   1139		siw_dbg_cep(cep,
   1140			    "release: timer=%s, QP[%u]\n",
   1141			    cep->mpa_timer ? "y" : "n",
   1142			    cep->qp ? qp_id(cep->qp) : UINT_MAX);
   1143
   1144		siw_cancel_mpatimer(cep);
   1145
   1146		cep->state = SIW_EPSTATE_CLOSED;
   1147
   1148		if (cep->qp) {
   1149			struct siw_qp *qp = cep->qp;
   1150			/*
   1151			 * Serialize a potential race with application
   1152			 * closing the QP and calling siw_qp_cm_drop()
   1153			 */
   1154			siw_qp_get(qp);
   1155			siw_cep_set_free(cep);
   1156
   1157			siw_qp_llp_close(qp);
   1158			siw_qp_put(qp);
   1159
   1160			siw_cep_set_inuse(cep);
   1161			cep->qp = NULL;
   1162			siw_qp_put(qp);
   1163		}
   1164		if (cep->sock) {
   1165			siw_socket_disassoc(cep->sock);
   1166			sock_release(cep->sock);
   1167			cep->sock = NULL;
   1168		}
   1169		if (cep->cm_id) {
   1170			cep->cm_id->rem_ref(cep->cm_id);
   1171			cep->cm_id = NULL;
   1172			siw_cep_put(cep);
   1173		}
   1174	}
   1175	siw_cep_set_free(cep);
   1176	siw_put_work(work);
   1177	siw_cep_put(cep);
   1178}
   1179
   1180static struct workqueue_struct *siw_cm_wq;
   1181
   1182int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
   1183{
   1184	struct siw_cm_work *work = siw_get_work(cep);
   1185	unsigned long delay = 0;
   1186
   1187	if (!work) {
   1188		siw_dbg_cep(cep, "failed with no work available\n");
   1189		return -ENOMEM;
   1190	}
   1191	work->type = type;
   1192	work->cep = cep;
   1193
   1194	siw_cep_get(cep);
   1195
   1196	INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);
   1197
   1198	if (type == SIW_CM_WORK_MPATIMEOUT) {
   1199		cep->mpa_timer = work;
   1200
   1201		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
   1202			delay = MPAREQ_TIMEOUT;
   1203		else
   1204			delay = MPAREP_TIMEOUT;
   1205	}
   1206	siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n",
   1207		    cep->qp ? qp_id(cep->qp) : -1, type, delay);
   1208
   1209	queue_delayed_work(siw_cm_wq, &work->work, delay);
   1210
   1211	return 0;
   1212}
   1213
   1214static void siw_cm_llp_data_ready(struct sock *sk)
   1215{
   1216	struct siw_cep *cep;
   1217
   1218	read_lock(&sk->sk_callback_lock);
   1219
   1220	cep = sk_to_cep(sk);
   1221	if (!cep)
   1222		goto out;
   1223
   1224	siw_dbg_cep(cep, "state: %d\n", cep->state);
   1225
   1226	switch (cep->state) {
   1227	case SIW_EPSTATE_RDMA_MODE:
   1228	case SIW_EPSTATE_LISTENING:
   1229		break;
   1230
   1231	case SIW_EPSTATE_AWAIT_MPAREQ:
   1232	case SIW_EPSTATE_AWAIT_MPAREP:
   1233		siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
   1234		break;
   1235
   1236	default:
   1237		siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
   1238		break;
   1239	}
   1240out:
   1241	read_unlock(&sk->sk_callback_lock);
   1242}
   1243
   1244static void siw_cm_llp_write_space(struct sock *sk)
   1245{
   1246	struct siw_cep *cep = sk_to_cep(sk);
   1247
   1248	if (cep)
   1249		siw_dbg_cep(cep, "state: %d\n", cep->state);
   1250}
   1251
   1252static void siw_cm_llp_error_report(struct sock *sk)
   1253{
   1254	struct siw_cep *cep = sk_to_cep(sk);
   1255
   1256	if (cep) {
   1257		siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
   1258			    sk->sk_err, sk->sk_state, cep->state);
   1259		cep->sk_error_report(sk);
   1260	}
   1261}
   1262
   1263static void siw_cm_llp_state_change(struct sock *sk)
   1264{
   1265	struct siw_cep *cep;
   1266	void (*orig_state_change)(struct sock *s);
   1267
   1268	read_lock(&sk->sk_callback_lock);
   1269
   1270	cep = sk_to_cep(sk);
   1271	if (!cep) {
   1272		/* endpoint already disassociated */
   1273		read_unlock(&sk->sk_callback_lock);
   1274		return;
   1275	}
   1276	orig_state_change = cep->sk_state_change;
   1277
   1278	siw_dbg_cep(cep, "state: %d\n", cep->state);
   1279
   1280	switch (sk->sk_state) {
   1281	case TCP_ESTABLISHED:
   1282		/*
   1283		 * handle accepting socket as special case where only
   1284		 * new connection is possible
   1285		 */
   1286		siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
   1287		break;
   1288
   1289	case TCP_CLOSE:
   1290	case TCP_CLOSE_WAIT:
   1291		if (cep->qp)
   1292			cep->qp->tx_ctx.tx_suspend = 1;
   1293		siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
   1294		break;
   1295
   1296	default:
   1297		siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
   1298	}
   1299	read_unlock(&sk->sk_callback_lock);
   1300	orig_state_change(sk);
   1301}
   1302
   1303static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
   1304			      struct sockaddr *raddr, bool afonly)
   1305{
   1306	int rv, flags = 0;
   1307	size_t size = laddr->sa_family == AF_INET ?
   1308		sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
   1309
   1310	/*
   1311	 * Make address available again asap.
   1312	 */
   1313	sock_set_reuseaddr(s->sk);
   1314
   1315	if (afonly) {
   1316		rv = ip6_sock_set_v6only(s->sk);
   1317		if (rv)
   1318			return rv;
   1319	}
   1320
   1321	rv = s->ops->bind(s, laddr, size);
   1322	if (rv < 0)
   1323		return rv;
   1324
   1325	rv = s->ops->connect(s, raddr, size, flags);
   1326
   1327	return rv < 0 ? rv : 0;
   1328}
   1329
   1330int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
   1331{
   1332	struct siw_device *sdev = to_siw_dev(id->device);
   1333	struct siw_qp *qp;
   1334	struct siw_cep *cep = NULL;
   1335	struct socket *s = NULL;
   1336	struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
   1337			*raddr = (struct sockaddr *)&id->remote_addr;
   1338	bool p2p_mode = peer_to_peer, v4 = true;
   1339	u16 pd_len = params->private_data_len;
   1340	int version = mpa_version, rv;
   1341
   1342	if (pd_len > MPA_MAX_PRIVDATA)
   1343		return -EINVAL;
   1344
   1345	if (params->ird > sdev->attrs.max_ird ||
   1346	    params->ord > sdev->attrs.max_ord)
   1347		return -ENOMEM;
   1348
   1349	if (laddr->sa_family == AF_INET6)
   1350		v4 = false;
   1351	else if (laddr->sa_family != AF_INET)
   1352		return -EAFNOSUPPORT;
   1353
   1354	/*
   1355	 * Respect any iwarp port mapping: Use mapped remote address
   1356	 * if valid. Local address must not be mapped, since siw
   1357	 * uses kernel TCP stack.
   1358	 */
   1359	if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
   1360	     to_sockaddr_in6(id->remote_addr).sin6_port != 0)
   1361		raddr = (struct sockaddr *)&id->m_remote_addr;
   1362
   1363	qp = siw_qp_id2obj(sdev, params->qpn);
   1364	if (!qp) {
   1365		WARN(1, "[QP %u] does not exist\n", params->qpn);
   1366		rv = -EINVAL;
   1367		goto error;
   1368	}
   1369	siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr,
   1370		   raddr);
   1371
   1372	rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
   1373	if (rv < 0)
   1374		goto error;
   1375
   1376	/*
   1377	 * NOTE: For simplification, connect() is called in blocking
   1378	 * mode. Might be reconsidered for async connection setup at
   1379	 * TCP level.
   1380	 */
   1381	rv = kernel_bindconnect(s, laddr, raddr, id->afonly);
   1382	if (rv != 0) {
   1383		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
   1384		goto error;
   1385	}
   1386	if (siw_tcp_nagle == false)
   1387		tcp_sock_set_nodelay(s->sk);
   1388	cep = siw_cep_alloc(sdev);
   1389	if (!cep) {
   1390		rv = -ENOMEM;
   1391		goto error;
   1392	}
   1393	siw_cep_set_inuse(cep);
   1394
   1395	/* Associate QP with CEP */
   1396	siw_cep_get(cep);
   1397	qp->cep = cep;
   1398
   1399	/* siw_qp_get(qp) already done by QP lookup */
   1400	cep->qp = qp;
   1401
   1402	id->add_ref(id);
   1403	cep->cm_id = id;
   1404
   1405	/*
   1406	 * 4: Allocate a sufficient number of work elements
   1407	 * to allow concurrent handling of local + peer close
   1408	 * events, MPA header processing + MPA timeout.
   1409	 */
   1410	rv = siw_cm_alloc_work(cep, 4);
   1411	if (rv != 0) {
   1412		rv = -ENOMEM;
   1413		goto error;
   1414	}
   1415	cep->ird = params->ird;
   1416	cep->ord = params->ord;
   1417
   1418	if (p2p_mode && cep->ord == 0)
   1419		cep->ord = 1;
   1420
   1421	cep->state = SIW_EPSTATE_CONNECTING;
   1422
   1423	/*
   1424	 * Associate CEP with socket
   1425	 */
   1426	siw_cep_socket_assoc(cep, s);
   1427
   1428	cep->state = SIW_EPSTATE_AWAIT_MPAREP;
   1429
   1430	/*
   1431	 * Set MPA Request bits: CRC if required, no MPA Markers,
   1432	 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
   1433	 */
   1434	cep->mpa.hdr.params.bits = 0;
   1435	if (version > MPA_REVISION_2) {
   1436		pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
   1437		version = MPA_REVISION_2;
   1438		/* Adjust also module parameter */
   1439		mpa_version = MPA_REVISION_2;
   1440	}
   1441	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
   1442
   1443	if (try_gso)
   1444		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
   1445
   1446	if (mpa_crc_required)
   1447		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
   1448
   1449	/*
   1450	 * If MPA version == 2:
   1451	 * o Include ORD and IRD.
   1452	 * o Indicate peer-to-peer mode, if required by module
   1453	 *   parameter 'peer_to_peer'.
   1454	 */
   1455	if (version == MPA_REVISION_2) {
   1456		cep->enhanced_rdma_conn_est = true;
   1457		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
   1458
   1459		cep->mpa.v2_ctrl.ird = htons(cep->ird);
   1460		cep->mpa.v2_ctrl.ord = htons(cep->ord);
   1461
   1462		if (p2p_mode) {
   1463			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
   1464			cep->mpa.v2_ctrl.ord |= rtr_type;
   1465		}
   1466		/* Remember own P2P mode requested */
   1467		cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
   1468		cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
   1469	}
   1470	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
   1471
   1472	rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
   1473	/*
   1474	 * Reset private data.
   1475	 */
   1476	cep->mpa.hdr.params.pd_len = 0;
   1477
   1478	if (rv >= 0) {
   1479		rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
   1480		if (!rv) {
   1481			siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
   1482			siw_cep_set_free(cep);
   1483			return 0;
   1484		}
   1485	}
   1486error:
   1487	siw_dbg(id->device, "failed: %d\n", rv);
   1488
   1489	if (cep) {
   1490		siw_socket_disassoc(s);
   1491		sock_release(s);
   1492		cep->sock = NULL;
   1493
   1494		cep->qp = NULL;
   1495
   1496		cep->cm_id = NULL;
   1497		id->rem_ref(id);
   1498		siw_cep_put(cep);
   1499
   1500		qp->cep = NULL;
   1501		siw_cep_put(cep);
   1502
   1503		cep->state = SIW_EPSTATE_CLOSED;
   1504
   1505		siw_cep_set_free(cep);
   1506
   1507		siw_cep_put(cep);
   1508
   1509	} else if (s) {
   1510		sock_release(s);
   1511	}
   1512	if (qp)
   1513		siw_qp_put(qp);
   1514
   1515	return rv;
   1516}
   1517
   1518/*
   1519 * siw_accept - Let SoftiWARP accept an RDMA connection request
   1520 *
   1521 * @id:		New connection management id to be used for accepted
   1522 *		connection request
   1523 * @params:	Connection parameters provided by ULP for accepting connection
   1524 *
   1525 * Transition QP to RTS state, associate new CM id @id with accepted CEP
   1526 * and get prepared for TCP input by installing socket callbacks.
   1527 * Then send MPA Reply and generate the "connection established" event.
   1528 * Socket callbacks must be installed before sending MPA Reply, because
   1529 * the latter may cause a first RDMA message to arrive from the RDMA Initiator
   1530 * side very quickly, at which time the socket callbacks must be ready.
   1531 */
   1532int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
   1533{
   1534	struct siw_device *sdev = to_siw_dev(id->device);
   1535	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
   1536	struct siw_qp *qp;
   1537	struct siw_qp_attrs qp_attrs;
   1538	int rv, max_priv_data = MPA_MAX_PRIVDATA;
   1539	bool wait_for_peer_rts = false;
   1540
   1541	siw_cep_set_inuse(cep);
   1542	siw_cep_put(cep);
   1543
   1544	/* Free lingering inbound private data */
   1545	if (cep->mpa.hdr.params.pd_len) {
   1546		cep->mpa.hdr.params.pd_len = 0;
   1547		kfree(cep->mpa.pdata);
   1548		cep->mpa.pdata = NULL;
   1549	}
   1550	siw_cancel_mpatimer(cep);
   1551
   1552	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
   1553		siw_dbg_cep(cep, "out of state\n");
   1554
   1555		siw_cep_set_free(cep);
   1556		siw_cep_put(cep);
   1557
   1558		return -ECONNRESET;
   1559	}
   1560	qp = siw_qp_id2obj(sdev, params->qpn);
   1561	if (!qp) {
   1562		WARN(1, "[QP %d] does not exist\n", params->qpn);
   1563		siw_cep_set_free(cep);
   1564		siw_cep_put(cep);
   1565
   1566		return -EINVAL;
   1567	}
   1568	down_write(&qp->state_lock);
   1569	if (qp->attrs.state > SIW_QP_STATE_RTR) {
   1570		rv = -EINVAL;
   1571		up_write(&qp->state_lock);
   1572		goto error;
   1573	}
   1574	siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
   1575
   1576	if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
   1577		siw_dbg_cep(cep, "peer allows GSO on TX\n");
   1578		qp->tx_ctx.gso_seg_limit = 0;
   1579	}
   1580	if (params->ord > sdev->attrs.max_ord ||
   1581	    params->ird > sdev->attrs.max_ird) {
   1582		siw_dbg_cep(
   1583			cep,
   1584			"[QP %u]: ord %d (max %d), ird %d (max %d)\n",
   1585			qp_id(qp), params->ord, sdev->attrs.max_ord,
   1586			params->ird, sdev->attrs.max_ird);
   1587		rv = -EINVAL;
   1588		up_write(&qp->state_lock);
   1589		goto error;
   1590	}
   1591	if (cep->enhanced_rdma_conn_est)
   1592		max_priv_data -= sizeof(struct mpa_v2_data);
   1593
   1594	if (params->private_data_len > max_priv_data) {
   1595		siw_dbg_cep(
   1596			cep,
   1597			"[QP %u]: private data length: %d (max %d)\n",
   1598			qp_id(qp), params->private_data_len, max_priv_data);
   1599		rv = -EINVAL;
   1600		up_write(&qp->state_lock);
   1601		goto error;
   1602	}
   1603	if (cep->enhanced_rdma_conn_est) {
   1604		if (params->ord > cep->ord) {
   1605			if (relaxed_ird_negotiation) {
   1606				params->ord = cep->ord;
   1607			} else {
   1608				cep->ird = params->ird;
   1609				cep->ord = params->ord;
   1610				rv = -EINVAL;
   1611				up_write(&qp->state_lock);
   1612				goto error;
   1613			}
   1614		}
   1615		if (params->ird < cep->ird) {
   1616			if (relaxed_ird_negotiation &&
   1617			    cep->ird <= sdev->attrs.max_ird)
   1618				params->ird = cep->ird;
   1619			else {
   1620				rv = -ENOMEM;
   1621				up_write(&qp->state_lock);
   1622				goto error;
   1623			}
   1624		}
   1625		if (cep->mpa.v2_ctrl.ord &
   1626		    (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
   1627			wait_for_peer_rts = true;
   1628		/*
   1629		 * Signal back negotiated IRD and ORD values
   1630		 */
   1631		cep->mpa.v2_ctrl.ord =
   1632			htons(params->ord & MPA_IRD_ORD_MASK) |
   1633			(cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
   1634		cep->mpa.v2_ctrl.ird =
   1635			htons(params->ird & MPA_IRD_ORD_MASK) |
   1636			(cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
   1637	}
   1638	cep->ird = params->ird;
   1639	cep->ord = params->ord;
   1640
   1641	cep->cm_id = id;
   1642	id->add_ref(id);
   1643
   1644	memset(&qp_attrs, 0, sizeof(qp_attrs));
   1645	qp_attrs.orq_size = cep->ord;
   1646	qp_attrs.irq_size = cep->ird;
   1647	qp_attrs.sk = cep->sock;
   1648	if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
   1649		qp_attrs.flags = SIW_MPA_CRC;
   1650	qp_attrs.state = SIW_QP_STATE_RTS;
   1651
   1652	siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
   1653
   1654	/* Associate QP with CEP */
   1655	siw_cep_get(cep);
   1656	qp->cep = cep;
   1657
   1658	/* siw_qp_get(qp) already done by QP lookup */
   1659	cep->qp = qp;
   1660
   1661	cep->state = SIW_EPSTATE_RDMA_MODE;
   1662
   1663	/* Move socket RX/TX under QP control */
   1664	rv = siw_qp_modify(qp, &qp_attrs,
   1665			   SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
   1666				   SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
   1667				   SIW_QP_ATTR_MPA);
   1668	up_write(&qp->state_lock);
   1669
   1670	if (rv)
   1671		goto error;
   1672
   1673	siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n",
   1674		    qp_id(qp), params->private_data_len);
   1675
   1676	rv = siw_send_mpareqrep(cep, params->private_data,
   1677				params->private_data_len);
   1678	if (rv != 0)
   1679		goto error;
   1680
   1681	if (wait_for_peer_rts) {
   1682		siw_sk_assign_rtr_upcalls(cep);
   1683	} else {
   1684		siw_qp_socket_assoc(cep, qp);
   1685		rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
   1686		if (rv)
   1687			goto error;
   1688	}
   1689	siw_cep_set_free(cep);
   1690
   1691	return 0;
   1692error:
   1693	siw_socket_disassoc(cep->sock);
   1694	sock_release(cep->sock);
   1695	cep->sock = NULL;
   1696
   1697	cep->state = SIW_EPSTATE_CLOSED;
   1698
   1699	if (cep->cm_id) {
   1700		cep->cm_id->rem_ref(id);
   1701		cep->cm_id = NULL;
   1702	}
   1703	if (qp->cep) {
   1704		siw_cep_put(cep);
   1705		qp->cep = NULL;
   1706	}
   1707	cep->qp = NULL;
   1708	siw_qp_put(qp);
   1709
   1710	siw_cep_set_free(cep);
   1711	siw_cep_put(cep);
   1712
   1713	return rv;
   1714}
   1715
   1716/*
   1717 * siw_reject()
   1718 *
   1719 * Local connection reject case. Send private data back to peer,
   1720 * close connection and dereference connection id.
   1721 */
   1722int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
   1723{
   1724	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
   1725
   1726	siw_cep_set_inuse(cep);
   1727	siw_cep_put(cep);
   1728
   1729	siw_cancel_mpatimer(cep);
   1730
   1731	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
   1732		siw_dbg_cep(cep, "out of state\n");
   1733
   1734		siw_cep_set_free(cep);
   1735		siw_cep_put(cep); /* put last reference */
   1736
   1737		return -ECONNRESET;
   1738	}
   1739	siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state,
   1740		    pd_len);
   1741
   1742	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
   1743		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
   1744		siw_send_mpareqrep(cep, pdata, pd_len);
   1745	}
   1746	siw_socket_disassoc(cep->sock);
   1747	sock_release(cep->sock);
   1748	cep->sock = NULL;
   1749
   1750	cep->state = SIW_EPSTATE_CLOSED;
   1751
   1752	siw_cep_set_free(cep);
   1753	siw_cep_put(cep);
   1754
   1755	return 0;
   1756}
   1757
   1758/*
   1759 * siw_create_listen - Create resources for a listener's IWCM ID @id
   1760 *
   1761 * Starts listen on the socket address id->local_addr.
   1762 *
   1763 */
   1764int siw_create_listen(struct iw_cm_id *id, int backlog)
   1765{
   1766	struct socket *s;
   1767	struct siw_cep *cep = NULL;
   1768	struct siw_device *sdev = to_siw_dev(id->device);
   1769	int addr_family = id->local_addr.ss_family;
   1770	int rv = 0;
   1771
   1772	if (addr_family != AF_INET && addr_family != AF_INET6)
   1773		return -EAFNOSUPPORT;
   1774
   1775	rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
   1776	if (rv < 0)
   1777		return rv;
   1778
   1779	/*
   1780	 * Allow binding local port when still in TIME_WAIT from last close.
   1781	 */
   1782	sock_set_reuseaddr(s->sk);
   1783
   1784	if (addr_family == AF_INET) {
   1785		struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
   1786
   1787		/* For wildcard addr, limit binding to current device only */
   1788		if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
   1789			s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
   1790
   1791		rv = s->ops->bind(s, (struct sockaddr *)laddr,
   1792				  sizeof(struct sockaddr_in));
   1793	} else {
   1794		struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
   1795
   1796		if (id->afonly) {
   1797			rv = ip6_sock_set_v6only(s->sk);
   1798			if (rv) {
   1799				siw_dbg(id->device,
   1800					"ip6_sock_set_v6only erro: %d\n", rv);
   1801				goto error;
   1802			}
   1803		}
   1804
   1805		/* For wildcard addr, limit binding to current device only */
   1806		if (ipv6_addr_any(&laddr->sin6_addr))
   1807			s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
   1808
   1809		rv = s->ops->bind(s, (struct sockaddr *)laddr,
   1810				  sizeof(struct sockaddr_in6));
   1811	}
   1812	if (rv) {
   1813		siw_dbg(id->device, "socket bind error: %d\n", rv);
   1814		goto error;
   1815	}
   1816	cep = siw_cep_alloc(sdev);
   1817	if (!cep) {
   1818		rv = -ENOMEM;
   1819		goto error;
   1820	}
   1821	siw_cep_socket_assoc(cep, s);
   1822
   1823	rv = siw_cm_alloc_work(cep, backlog);
   1824	if (rv) {
   1825		siw_dbg(id->device,
   1826			"alloc_work error %d, backlog %d\n",
   1827			rv, backlog);
   1828		goto error;
   1829	}
   1830	rv = s->ops->listen(s, backlog);
   1831	if (rv) {
   1832		siw_dbg(id->device, "listen error %d\n", rv);
   1833		goto error;
   1834	}
   1835	cep->cm_id = id;
   1836	id->add_ref(id);
   1837
   1838	/*
   1839	 * In case of a wildcard rdma_listen on a multi-homed device,
   1840	 * a listener's IWCM id is associated with more than one listening CEP.
   1841	 *
   1842	 * We currently use id->provider_data in three different ways:
   1843	 *
   1844	 * o For a listener's IWCM id, id->provider_data points to
   1845	 *   the list_head of the list of listening CEPs.
   1846	 *   Uses: siw_create_listen(), siw_destroy_listen()
   1847	 *
   1848	 * o For each accepted passive-side IWCM id, id->provider_data
   1849	 *   points to the CEP itself. This is a consequence of
   1850	 *   - siw_cm_upcall() setting event.provider_data = cep and
   1851	 *   - the IWCM's cm_conn_req_handler() setting provider_data of the
   1852	 *     new passive-side IWCM id equal to event.provider_data
   1853	 *   Uses: siw_accept(), siw_reject()
   1854	 *
   1855	 * o For an active-side IWCM id, id->provider_data is not used at all.
   1856	 *
   1857	 */
   1858	if (!id->provider_data) {
   1859		id->provider_data =
   1860			kmalloc(sizeof(struct list_head), GFP_KERNEL);
   1861		if (!id->provider_data) {
   1862			rv = -ENOMEM;
   1863			goto error;
   1864		}
   1865		INIT_LIST_HEAD((struct list_head *)id->provider_data);
   1866	}
   1867	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
   1868	cep->state = SIW_EPSTATE_LISTENING;
   1869
   1870	siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);
   1871
   1872	return 0;
   1873
   1874error:
   1875	siw_dbg(id->device, "failed: %d\n", rv);
   1876
   1877	if (cep) {
   1878		siw_cep_set_inuse(cep);
   1879
   1880		if (cep->cm_id) {
   1881			cep->cm_id->rem_ref(cep->cm_id);
   1882			cep->cm_id = NULL;
   1883		}
   1884		cep->sock = NULL;
   1885		siw_socket_disassoc(s);
   1886		cep->state = SIW_EPSTATE_CLOSED;
   1887
   1888		siw_cep_set_free(cep);
   1889		siw_cep_put(cep);
   1890	}
   1891	sock_release(s);
   1892
   1893	return rv;
   1894}
   1895
   1896static void siw_drop_listeners(struct iw_cm_id *id)
   1897{
   1898	struct list_head *p, *tmp;
   1899
   1900	/*
   1901	 * In case of a wildcard rdma_listen on a multi-homed device,
   1902	 * a listener's IWCM id is associated with more than one listening CEP.
   1903	 */
   1904	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
   1905		struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
   1906
   1907		list_del(p);
   1908
   1909		siw_dbg_cep(cep, "drop cep, state %d\n", cep->state);
   1910
   1911		siw_cep_set_inuse(cep);
   1912
   1913		if (cep->cm_id) {
   1914			cep->cm_id->rem_ref(cep->cm_id);
   1915			cep->cm_id = NULL;
   1916		}
   1917		if (cep->sock) {
   1918			siw_socket_disassoc(cep->sock);
   1919			sock_release(cep->sock);
   1920			cep->sock = NULL;
   1921		}
   1922		cep->state = SIW_EPSTATE_CLOSED;
   1923		siw_cep_set_free(cep);
   1924		siw_cep_put(cep);
   1925	}
   1926}
   1927
   1928int siw_destroy_listen(struct iw_cm_id *id)
   1929{
   1930	if (!id->provider_data) {
   1931		siw_dbg(id->device, "no cep(s)\n");
   1932		return 0;
   1933	}
   1934	siw_drop_listeners(id);
   1935	kfree(id->provider_data);
   1936	id->provider_data = NULL;
   1937
   1938	return 0;
   1939}
   1940
   1941int siw_cm_init(void)
   1942{
   1943	/*
    1944	 * create_singlethread_workqueue for strict ordering
   1945	 */
   1946	siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
   1947	if (!siw_cm_wq)
   1948		return -ENOMEM;
   1949
   1950	return 0;
   1951}
   1952
   1953void siw_cm_exit(void)
   1954{
   1955	if (siw_cm_wq)
   1956		destroy_workqueue(siw_cm_wq);
   1957}