cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ipoib_cm.c (44385B)


      1/*
      2 * Copyright (c) 2006 Mellanox Technologies. All rights reserved
      3 *
      4 * This software is available to you under a choice of one of two
      5 * licenses.  You may choose to be licensed under the terms of the GNU
      6 * General Public License (GPL) Version 2, available from the file
      7 * COPYING in the main directory of this source tree, or the
      8 * OpenIB.org BSD license below:
      9 *
     10 *     Redistribution and use in source and binary forms, with or
     11 *     without modification, are permitted provided that the following
     12 *     conditions are met:
     13 *
     14 *      - Redistributions of source code must retain the above
     15 *        copyright notice, this list of conditions and the following
     16 *        disclaimer.
     17 *
     18 *      - Redistributions in binary form must reproduce the above
     19 *        copyright notice, this list of conditions and the following
     20 *        disclaimer in the documentation and/or other materials
     21 *        provided with the distribution.
     22 *
     23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     30 * SOFTWARE.
     31 */
     32
     33#include <rdma/ib_cm.h>
     34#include <net/dst.h>
     35#include <net/icmp.h>
     36#include <linux/icmpv6.h>
     37#include <linux/delay.h>
     38#include <linux/slab.h>
     39#include <linux/vmalloc.h>
     40#include <linux/moduleparam.h>
     41#include <linux/sched/signal.h>
     42#include <linux/sched/mm.h>
     43
     44#include "ipoib.h"
     45
     46int ipoib_max_conn_qp = 128;
     47
     48module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
     49MODULE_PARM_DESC(max_nonsrq_conn_qp,
     50		 "Max number of connected-mode QPs per interface "
     51		 "(applied only if shared receive queue is not available)");
     52
     53#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
     54static int data_debug_level;
     55
     56module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
     57MODULE_PARM_DESC(cm_data_debug_level,
     58		 "Enable data path debug tracing for connected mode if > 0");
     59#endif
     60
     61#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
     62
     63#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
     64#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
     65#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
     66#define IPOIB_CM_RX_UPDATE_MASK (0x3)
     67
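/*
 * Headroom reserved in each connected-mode RX skb so that, once the
 * IPOIB_ENCAP_LEN-byte encapsulation header is in place, the IP header
 * lands on a 16-byte boundary (see ipoib_cm_alloc_rx_skb() below).
 * Assuming the usual values from ipoib.h (IPOIB_HARD_LEN = 24,
 * IPOIB_ENCAP_LEN = 4), this works out to ALIGN(24, 16) - 4 = 28 bytes.
 */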
     68#define IPOIB_CM_RX_RESERVE     (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
     69
     70static struct ib_qp_attr ipoib_cm_err_attr = {
     71	.qp_state = IB_QPS_ERR
     72};
     73
     74#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
     75
     76static struct ib_send_wr ipoib_cm_rx_drain_wr = {
     77	.opcode = IB_WR_SEND,
     78};
     79
     80static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
     81			       const struct ib_cm_event *event);
     82
     83static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
     84				  u64 mapping[IPOIB_CM_RX_SG])
     85{
     86	int i;
     87
     88	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
     89
     90	for (i = 0; i < frags; ++i)
     91		ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
     92}
     93
     94static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
     95{
     96	struct ipoib_dev_priv *priv = ipoib_priv(dev);
     97	int i, ret;
     98
     99	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
    100
    101	for (i = 0; i < priv->cm.num_frags; ++i)
    102		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
    103
    104	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, NULL);
    105	if (unlikely(ret)) {
    106		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
    107		ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
    108				      priv->cm.srq_ring[id].mapping);
    109		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
    110		priv->cm.srq_ring[id].skb = NULL;
    111	}
    112
    113	return ret;
    114}
    115
    116static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
    117					struct ipoib_cm_rx *rx,
    118					struct ib_recv_wr *wr,
    119					struct ib_sge *sge, int id)
    120{
    121	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    122	int i, ret;
    123
    124	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
    125
    126	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
    127		sge[i].addr = rx->rx_ring[id].mapping[i];
    128
    129	ret = ib_post_recv(rx->qp, wr, NULL);
    130	if (unlikely(ret)) {
    131		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
    132		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
    133				      rx->rx_ring[id].mapping);
    134		dev_kfree_skb_any(rx->rx_ring[id].skb);
    135		rx->rx_ring[id].skb = NULL;
    136	}
    137
    138	return ret;
    139}
    140
    141static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
    142					     struct ipoib_cm_rx_buf *rx_ring,
    143					     int id, int frags,
    144					     u64 mapping[IPOIB_CM_RX_SG],
    145					     gfp_t gfp)
    146{
    147	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    148	struct sk_buff *skb;
    149	int i;
    150
    151	skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
    152	if (unlikely(!skb))
    153		return NULL;
    154
    155	/*
     156	 * IPoIB adds an IPOIB_ENCAP_LEN byte header; this will align the
    157	 * IP header to a multiple of 16.
    158	 */
    159	skb_reserve(skb, IPOIB_CM_RX_RESERVE);
    160
    161	mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
    162				       DMA_FROM_DEVICE);
    163	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
    164		dev_kfree_skb_any(skb);
    165		return NULL;
    166	}
    167
    168	for (i = 0; i < frags; i++) {
    169		struct page *page = alloc_page(gfp);
    170
    171		if (!page)
    172			goto partial_error;
    173		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
    174
    175		mapping[i + 1] = ib_dma_map_page(priv->ca, page,
    176						 0, PAGE_SIZE, DMA_FROM_DEVICE);
    177		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
    178			goto partial_error;
    179	}
    180
    181	rx_ring[id].skb = skb;
    182	return skb;
    183
    184partial_error:
    185
    186	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
    187
    188	for (; i > 0; --i)
    189		ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
    190
    191	dev_kfree_skb_any(skb);
    192	return NULL;
    193}
    194
    195static void ipoib_cm_free_rx_ring(struct net_device *dev,
    196				  struct ipoib_cm_rx_buf *rx_ring)
    197{
    198	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    199	int i;
    200
    201	for (i = 0; i < ipoib_recvq_size; ++i)
    202		if (rx_ring[i].skb) {
    203			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
    204					      rx_ring[i].mapping);
    205			dev_kfree_skb_any(rx_ring[i].skb);
    206		}
    207
    208	vfree(rx_ring);
    209}
    210
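/*
 * RX connection teardown: a passive-side QP is first moved to the error
 * state (rx_error_list) and, once its IB_EVENT_QP_LAST_WQE_REACHED event
 * fires, onto rx_flush_list.  ipoib_cm_start_rx_drain() then posts a
 * single marker send WR (IPOIB_CM_RX_DRAIN_WRID) on one of the flushed
 * QPs; since that QP is in the error state the WR completes immediately
 * as a flush error on priv->recv_cq.  When ipoib_cm_handle_rx_wc() sees
 * the marker it splices rx_drain_list onto rx_reap_list, restarts the
 * drain for any newly flushed QPs, and schedules ipoib_cm_rx_reap() to
 * destroy the drained connections.
 */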
    211static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
    212{
    213	struct ipoib_cm_rx *p;
    214
    215	/* We only reserved 1 extra slot in CQ for drain WRs, so
    216	 * make sure we have at most 1 outstanding WR. */
    217	if (list_empty(&priv->cm.rx_flush_list) ||
    218	    !list_empty(&priv->cm.rx_drain_list))
    219		return;
    220
    221	/*
     222	 * QPs on the flush list are in the error state.  This way, a "flush
    223	 * error" WC will be immediately generated for each WR we post.
    224	 */
    225	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
    226	ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
    227	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, NULL))
    228		ipoib_warn(priv, "failed to post drain wr\n");
    229
    230	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
    231}
    232
    233static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
    234{
    235	struct ipoib_cm_rx *p = ctx;
    236	struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
    237	unsigned long flags;
    238
    239	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
    240		return;
    241
    242	spin_lock_irqsave(&priv->lock, flags);
    243	list_move(&p->list, &priv->cm.rx_flush_list);
    244	p->state = IPOIB_CM_RX_FLUSH;
    245	ipoib_cm_start_rx_drain(priv);
    246	spin_unlock_irqrestore(&priv->lock, flags);
    247}
    248
    249static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
    250					   struct ipoib_cm_rx *p)
    251{
    252	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    253	struct ib_qp_init_attr attr = {
    254		.event_handler = ipoib_cm_rx_event_handler,
    255		.send_cq = priv->recv_cq, /* For drain WR */
    256		.recv_cq = priv->recv_cq,
    257		.srq = priv->cm.srq,
    258		.cap.max_send_wr = 1, /* For drain WR */
    259		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
    260		.sq_sig_type = IB_SIGNAL_ALL_WR,
    261		.qp_type = IB_QPT_RC,
    262		.qp_context = p,
    263	};
    264
    265	if (!ipoib_cm_has_srq(dev)) {
    266		attr.cap.max_recv_wr  = ipoib_recvq_size;
    267		attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
    268	}
    269
    270	return ib_create_qp(priv->pd, &attr);
    271}
    272
    273static int ipoib_cm_modify_rx_qp(struct net_device *dev,
    274				 struct ib_cm_id *cm_id, struct ib_qp *qp,
    275				 unsigned int psn)
    276{
    277	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    278	struct ib_qp_attr qp_attr;
    279	int qp_attr_mask, ret;
    280
    281	qp_attr.qp_state = IB_QPS_INIT;
    282	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
    283	if (ret) {
    284		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
    285		return ret;
    286	}
    287	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
    288	if (ret) {
    289		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
    290		return ret;
    291	}
    292	qp_attr.qp_state = IB_QPS_RTR;
    293	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
    294	if (ret) {
    295		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
    296		return ret;
    297	}
    298	qp_attr.rq_psn = psn;
    299	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
    300	if (ret) {
    301		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
    302		return ret;
    303	}
    304
    305	/*
    306	 * Current Mellanox HCA firmware won't generate completions
    307	 * with error for drain WRs unless the QP has been moved to
    308	 * RTS first. This work-around leaves a window where a QP has
    309	 * moved to error asynchronously, but this will eventually get
    310	 * fixed in firmware, so let's not error out if modify QP
    311	 * fails.
    312	 */
    313	qp_attr.qp_state = IB_QPS_RTS;
    314	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
    315	if (ret) {
    316		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
    317		return 0;
    318	}
    319	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
    320	if (ret) {
    321		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
    322		return 0;
    323	}
    324
    325	return 0;
    326}
    327
    328static void ipoib_cm_init_rx_wr(struct net_device *dev,
    329				struct ib_recv_wr *wr,
    330				struct ib_sge *sge)
    331{
    332	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    333	int i;
    334
    335	for (i = 0; i < priv->cm.num_frags; ++i)
    336		sge[i].lkey = priv->pd->local_dma_lkey;
    337
    338	sge[0].length = IPOIB_CM_HEAD_SIZE;
    339	for (i = 1; i < priv->cm.num_frags; ++i)
    340		sge[i].length = PAGE_SIZE;
    341
    342	wr->next    = NULL;
    343	wr->sg_list = sge;
    344	wr->num_sge = priv->cm.num_frags;
    345}
    346
    347static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
    348				   struct ipoib_cm_rx *rx)
    349{
    350	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    351	struct {
    352		struct ib_recv_wr wr;
    353		struct ib_sge sge[IPOIB_CM_RX_SG];
    354	} *t;
    355	int ret;
    356	int i;
    357
    358	rx->rx_ring = vzalloc(array_size(ipoib_recvq_size,
    359					 sizeof(*rx->rx_ring)));
    360	if (!rx->rx_ring)
    361		return -ENOMEM;
    362
    363	t = kmalloc(sizeof(*t), GFP_KERNEL);
    364	if (!t) {
    365		ret = -ENOMEM;
    366		goto err_free_1;
    367	}
    368
    369	ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
    370
    371	spin_lock_irq(&priv->lock);
    372
    373	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
    374		spin_unlock_irq(&priv->lock);
    375		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
    376		ret = -EINVAL;
    377		goto err_free;
    378	} else
    379		++priv->cm.nonsrq_conn_qp;
    380
    381	spin_unlock_irq(&priv->lock);
    382
    383	for (i = 0; i < ipoib_recvq_size; ++i) {
    384		if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
    385					   rx->rx_ring[i].mapping,
    386					   GFP_KERNEL)) {
    387			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
    388			ret = -ENOMEM;
    389			goto err_count;
    390		}
    391		ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
    392		if (ret) {
    393			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
    394				   "failed for buf %d\n", i);
    395			ret = -EIO;
    396			goto err_count;
    397		}
    398	}
    399
    400	rx->recv_count = ipoib_recvq_size;
    401
    402	kfree(t);
    403
    404	return 0;
    405
    406err_count:
    407	spin_lock_irq(&priv->lock);
    408	--priv->cm.nonsrq_conn_qp;
    409	spin_unlock_irq(&priv->lock);
    410
    411err_free:
    412	kfree(t);
    413
    414err_free_1:
    415	ipoib_cm_free_rx_ring(dev, rx->rx_ring);
    416
    417	return ret;
    418}
    419
    420static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
    421			     struct ib_qp *qp,
    422			     const struct ib_cm_req_event_param *req,
    423			     unsigned int psn)
    424{
    425	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    426	struct ipoib_cm_data data = {};
    427	struct ib_cm_rep_param rep = {};
    428
    429	data.qpn = cpu_to_be32(priv->qp->qp_num);
    430	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
    431
    432	rep.private_data = &data;
    433	rep.private_data_len = sizeof(data);
    434	rep.flow_control = 0;
    435	rep.rnr_retry_count = req->rnr_retry_count;
    436	rep.srq = ipoib_cm_has_srq(dev);
    437	rep.qp_num = qp->qp_num;
    438	rep.starting_psn = psn;
    439	return ib_send_cm_rep(cm_id, &rep);
    440}
    441
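/*
 * Passive-side REQ handling: create an RX QP for the new connection,
 * drive it to RTS, post its private receive ring when no SRQ is
 * available, and answer with a REP advertising our datagram QPN and
 * receive buffer size.
 */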
    442static int ipoib_cm_req_handler(struct ib_cm_id *cm_id,
    443				const struct ib_cm_event *event)
    444{
    445	struct net_device *dev = cm_id->context;
    446	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    447	struct ipoib_cm_rx *p;
    448	unsigned int psn;
    449	int ret;
    450
    451	ipoib_dbg(priv, "REQ arrived\n");
    452	p = kzalloc(sizeof(*p), GFP_KERNEL);
    453	if (!p)
    454		return -ENOMEM;
    455	p->dev = dev;
    456	p->id = cm_id;
    457	cm_id->context = p;
    458	p->state = IPOIB_CM_RX_LIVE;
    459	p->jiffies = jiffies;
    460	INIT_LIST_HEAD(&p->list);
    461
    462	p->qp = ipoib_cm_create_rx_qp(dev, p);
    463	if (IS_ERR(p->qp)) {
    464		ret = PTR_ERR(p->qp);
    465		goto err_qp;
    466	}
    467
    468	psn = prandom_u32() & 0xffffff;
    469	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
    470	if (ret)
    471		goto err_modify;
    472
    473	if (!ipoib_cm_has_srq(dev)) {
    474		ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
    475		if (ret)
    476			goto err_modify;
    477	}
    478
    479	spin_lock_irq(&priv->lock);
    480	queue_delayed_work(priv->wq,
    481			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
    482	/* Add this entry to passive ids list head, but do not re-add it
    483	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
    484	p->jiffies = jiffies;
    485	if (p->state == IPOIB_CM_RX_LIVE)
    486		list_move(&p->list, &priv->cm.passive_ids);
    487	spin_unlock_irq(&priv->lock);
    488
    489	ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
    490	if (ret) {
    491		ipoib_warn(priv, "failed to send REP: %d\n", ret);
    492		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
    493			ipoib_warn(priv, "unable to move qp to error state\n");
    494	}
    495	return 0;
    496
    497err_modify:
    498	ib_destroy_qp(p->qp);
    499err_qp:
    500	kfree(p);
    501	return ret;
    502}
    503
    504static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
    505			       const struct ib_cm_event *event)
    506{
    507	struct ipoib_cm_rx *p;
    508	struct ipoib_dev_priv *priv;
    509
    510	switch (event->event) {
    511	case IB_CM_REQ_RECEIVED:
    512		return ipoib_cm_req_handler(cm_id, event);
    513	case IB_CM_DREQ_RECEIVED:
    514		ib_send_cm_drep(cm_id, NULL, 0);
    515		fallthrough;
    516	case IB_CM_REJ_RECEIVED:
    517		p = cm_id->context;
    518		priv = ipoib_priv(p->dev);
    519		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
    520			ipoib_warn(priv, "unable to move qp to error state\n");
    521		fallthrough;
    522	default:
    523		return 0;
    524	}
    525}
    526/* Adjust length of skb with fragments to match received data */
    527static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
    528			  unsigned int length, struct sk_buff *toskb)
    529{
    530	int i, num_frags;
    531	unsigned int size;
    532
    533	/* put header into skb */
    534	size = min(length, hdr_space);
    535	skb->tail += size;
    536	skb->len += size;
    537	length -= size;
    538
    539	num_frags = skb_shinfo(skb)->nr_frags;
    540	for (i = 0; i < num_frags; i++) {
    541		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
    542
    543		if (length == 0) {
    544			/* don't need this page */
    545			skb_fill_page_desc(toskb, i, skb_frag_page(frag),
    546					   0, PAGE_SIZE);
    547			--skb_shinfo(skb)->nr_frags;
    548		} else {
    549			size = min_t(unsigned int, length, PAGE_SIZE);
    550
    551			skb_frag_size_set(frag, size);
    552			skb->data_len += size;
    553			skb->truesize += size;
    554			skb->len += size;
    555			length -= size;
    556		}
    557	}
    558}
    559
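/*
 * Connected-mode receive completion handler.  The low bits of wc->wr_id
 * (after masking off IPOIB_OP_CM | IPOIB_OP_RECV) index either the SRQ
 * ring or the per-connection ring.  Packets shorter than
 * IPOIB_CM_COPYBREAK are copied into a small freshly allocated skb so
 * the multi-page RX buffer can be reposted untouched; larger packets
 * swap a newly allocated buffer into the ring and pass the filled one
 * up the stack after trimming it with skb_put_frags().
 */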
    560void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
    561{
    562	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    563	struct ipoib_cm_rx_buf *rx_ring;
    564	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
    565	struct sk_buff *skb, *newskb;
    566	struct ipoib_cm_rx *p;
    567	unsigned long flags;
    568	u64 mapping[IPOIB_CM_RX_SG];
    569	int frags;
    570	int has_srq;
    571	struct sk_buff *small_skb;
    572
    573	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
    574		       wr_id, wc->status);
    575
    576	if (unlikely(wr_id >= ipoib_recvq_size)) {
    577		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
    578			spin_lock_irqsave(&priv->lock, flags);
    579			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
    580			ipoib_cm_start_rx_drain(priv);
    581			queue_work(priv->wq, &priv->cm.rx_reap_task);
    582			spin_unlock_irqrestore(&priv->lock, flags);
    583		} else
    584			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
    585				   wr_id, ipoib_recvq_size);
    586		return;
    587	}
    588
    589	p = wc->qp->qp_context;
    590
    591	has_srq = ipoib_cm_has_srq(dev);
    592	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
    593
    594	skb = rx_ring[wr_id].skb;
    595
    596	if (unlikely(wc->status != IB_WC_SUCCESS)) {
    597		ipoib_dbg(priv,
    598			  "cm recv error (status=%d, wrid=%d vend_err %#x)\n",
    599			  wc->status, wr_id, wc->vendor_err);
    600		++dev->stats.rx_dropped;
    601		if (has_srq)
    602			goto repost;
    603		else {
    604			if (!--p->recv_count) {
    605				spin_lock_irqsave(&priv->lock, flags);
    606				list_move(&p->list, &priv->cm.rx_reap_list);
    607				spin_unlock_irqrestore(&priv->lock, flags);
    608				queue_work(priv->wq, &priv->cm.rx_reap_task);
    609			}
    610			return;
    611		}
    612	}
    613
    614	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
    615		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
    616			spin_lock_irqsave(&priv->lock, flags);
    617			p->jiffies = jiffies;
    618			/* Move this entry to list head, but do not re-add it
    619			 * if it has been moved out of list. */
    620			if (p->state == IPOIB_CM_RX_LIVE)
    621				list_move(&p->list, &priv->cm.passive_ids);
    622			spin_unlock_irqrestore(&priv->lock, flags);
    623		}
    624	}
    625
    626	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
    627		int dlen = wc->byte_len;
    628
    629		small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
    630		if (small_skb) {
    631			skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
    632			ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
    633						   dlen, DMA_FROM_DEVICE);
    634			skb_copy_from_linear_data(skb, small_skb->data, dlen);
    635			ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
    636						      dlen, DMA_FROM_DEVICE);
    637			skb_put(small_skb, dlen);
    638			skb = small_skb;
    639			goto copied;
    640		}
    641	}
    642
    643	frags = PAGE_ALIGN(wc->byte_len -
    644			   min_t(u32, wc->byte_len, IPOIB_CM_HEAD_SIZE)) /
    645		PAGE_SIZE;
    646
    647	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags,
    648				       mapping, GFP_ATOMIC);
    649	if (unlikely(!newskb)) {
    650		/*
    651		 * If we can't allocate a new RX buffer, dump
    652		 * this packet and reuse the old buffer.
    653		 */
    654		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
    655		++dev->stats.rx_dropped;
    656		goto repost;
    657	}
    658
    659	ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
    660	memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof(*mapping));
    661
    662	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
    663		       wc->byte_len, wc->slid);
    664
    665	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
    666
    667copied:
    668	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
    669	skb_add_pseudo_hdr(skb);
    670
    671	++dev->stats.rx_packets;
    672	dev->stats.rx_bytes += skb->len;
    673
    674	skb->dev = dev;
    675	/* XXX get correct PACKET_ type here */
    676	skb->pkt_type = PACKET_HOST;
    677	netif_receive_skb(skb);
    678
    679repost:
    680	if (has_srq) {
    681		if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
    682			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
    683				   "for buf %d\n", wr_id);
    684	} else {
    685		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
    686							  &priv->cm.rx_wr,
    687							  priv->cm.rx_sge,
    688							  wr_id))) {
    689			--p->recv_count;
    690			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
    691				   "for buf %d\n", wr_id);
    692		}
    693	}
    694}
    695
    696static inline int post_send(struct ipoib_dev_priv *priv,
    697			    struct ipoib_cm_tx *tx,
    698			    unsigned int wr_id,
    699			    struct ipoib_tx_buf *tx_req)
    700{
    701	ipoib_build_sge(priv, tx_req);
    702
    703	priv->tx_wr.wr.wr_id	= wr_id | IPOIB_OP_CM;
    704
    705	return ib_post_send(tx->qp, &priv->tx_wr.wr, NULL);
    706}
    707
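/*
 * Transmit one skb on a connected-mode QP.  The skb is recorded in
 * tx->tx_ring and DMA-mapped before post_send() is called, because the
 * send completion may run before post_send() even returns.  The
 * device-wide global_tx_head/global_tx_tail counters decide when the
 * netdev queue is stopped here and re-woken from the completion handler.
 */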
    708void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
    709{
    710	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    711	struct ipoib_tx_buf *tx_req;
    712	int rc;
    713	unsigned int usable_sge = tx->max_send_sge - !!skb_headlen(skb);
    714
    715	if (unlikely(skb->len > tx->mtu)) {
    716		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
    717			   skb->len, tx->mtu);
    718		++dev->stats.tx_dropped;
    719		++dev->stats.tx_errors;
    720		ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
    721		return;
    722	}
    723	if (skb_shinfo(skb)->nr_frags > usable_sge) {
    724		if (skb_linearize(skb) < 0) {
    725			ipoib_warn(priv, "skb could not be linearized\n");
    726			++dev->stats.tx_dropped;
    727			++dev->stats.tx_errors;
    728			dev_kfree_skb_any(skb);
    729			return;
    730		}
    731		/* Does skb_linearize return ok without reducing nr_frags? */
    732		if (skb_shinfo(skb)->nr_frags > usable_sge) {
    733			ipoib_warn(priv, "too many frags after skb linearize\n");
    734			++dev->stats.tx_dropped;
    735			++dev->stats.tx_errors;
    736			dev_kfree_skb_any(skb);
    737			return;
    738		}
    739	}
    740	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
    741		       tx->tx_head, skb->len, tx->qp->qp_num);
    742
    743	/*
    744	 * We put the skb into the tx_ring _before_ we call post_send()
    745	 * because it's entirely possible that the completion handler will
    746	 * run before we execute anything after the post_send().  That
    747	 * means we have to make sure everything is properly recorded and
    748	 * our state is consistent before we call post_send().
    749	 */
    750	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
    751	tx_req->skb = skb;
    752
    753	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
    754		++dev->stats.tx_errors;
    755		dev_kfree_skb_any(skb);
    756		return;
    757	}
    758
    759	if ((priv->global_tx_head - priv->global_tx_tail) ==
    760	    ipoib_sendq_size - 1) {
    761		ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
    762			  tx->qp->qp_num);
    763		netif_stop_queue(dev);
    764	}
    765
    766	skb_orphan(skb);
    767	skb_dst_drop(skb);
    768
    769	if (netif_queue_stopped(dev)) {
    770		rc = ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
    771				      IB_CQ_REPORT_MISSED_EVENTS);
    772		if (unlikely(rc < 0))
    773			ipoib_warn(priv, "IPoIB/CM:request notify on send CQ failed\n");
    774		else if (rc)
    775			napi_schedule(&priv->send_napi);
    776	}
    777
    778	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
    779	if (unlikely(rc)) {
    780		ipoib_warn(priv, "IPoIB/CM:post_send failed, error %d\n", rc);
    781		++dev->stats.tx_errors;
    782		ipoib_dma_unmap_tx(priv, tx_req);
    783		dev_kfree_skb_any(skb);
    784
    785		if (netif_queue_stopped(dev))
    786			netif_wake_queue(dev);
    787	} else {
    788		netif_trans_update(dev);
    789		++tx->tx_head;
    790		++priv->global_tx_head;
    791	}
    792}
    793
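/*
 * Connected-mode send completion handler: unmap and free the skb,
 * advance the tail counters, and wake the netdev queue once the ring is
 * half empty.  Any completion error other than a flush tears the
 * connection down by dropping the neigh and queueing the ipoib_cm_tx on
 * reap_list; retry-exceeded errors are part of the normal connection
 * life cycle and are only logged at debug level.
 */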
    794void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
    795{
    796	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    797	struct ipoib_cm_tx *tx = wc->qp->qp_context;
    798	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
    799	struct ipoib_tx_buf *tx_req;
    800	unsigned long flags;
    801
    802	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
    803		       wr_id, wc->status);
    804
    805	if (unlikely(wr_id >= ipoib_sendq_size)) {
    806		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
    807			   wr_id, ipoib_sendq_size);
    808		return;
    809	}
    810
    811	tx_req = &tx->tx_ring[wr_id];
    812
    813	ipoib_dma_unmap_tx(priv, tx_req);
    814
    815	/* FIXME: is this right? Shouldn't we only increment on success? */
    816	++dev->stats.tx_packets;
    817	dev->stats.tx_bytes += tx_req->skb->len;
    818
    819	dev_kfree_skb_any(tx_req->skb);
    820
    821	netif_tx_lock(dev);
    822
    823	++tx->tx_tail;
    824	++priv->global_tx_tail;
    825
    826	if (unlikely(netif_queue_stopped(dev) &&
    827		     ((priv->global_tx_head - priv->global_tx_tail) <=
    828		      ipoib_sendq_size >> 1) &&
    829		     test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
    830		netif_wake_queue(dev);
    831
    832	if (wc->status != IB_WC_SUCCESS &&
    833	    wc->status != IB_WC_WR_FLUSH_ERR) {
    834		struct ipoib_neigh *neigh;
    835
    836		/* IB_WC[_RNR]_RETRY_EXC_ERR error is part of the life cycle,
    837		 * so don't make waves.
    838		 */
    839		if (wc->status == IB_WC_RNR_RETRY_EXC_ERR ||
    840		    wc->status == IB_WC_RETRY_EXC_ERR)
    841			ipoib_dbg(priv,
    842				  "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n",
    843				   __func__, wc->status, wr_id, wc->vendor_err);
    844		else
    845			ipoib_warn(priv,
    846				    "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n",
    847				   __func__, wc->status, wr_id, wc->vendor_err);
    848
    849		spin_lock_irqsave(&priv->lock, flags);
    850		neigh = tx->neigh;
    851
    852		if (neigh) {
    853			neigh->cm = NULL;
    854			ipoib_neigh_free(neigh);
    855
    856			tx->neigh = NULL;
    857		}
    858
    859		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
    860			list_move(&tx->list, &priv->cm.reap_list);
    861			queue_work(priv->wq, &priv->cm.reap_task);
    862		}
    863
    864		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
    865
    866		spin_unlock_irqrestore(&priv->lock, flags);
    867	}
    868
    869	netif_tx_unlock(dev);
    870}
    871
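/*
 * Start listening for connected-mode connections.  The CM service ID is
 * the IPoIB IETF prefix with our datagram QP number in the low bits, so
 * a peer can derive it from the QPN carried in our hardware address.
 */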
    872int ipoib_cm_dev_open(struct net_device *dev)
    873{
    874	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    875	int ret;
    876
    877	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
    878		return 0;
    879
    880	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
    881	if (IS_ERR(priv->cm.id)) {
    882		pr_warn("%s: failed to create CM ID\n", priv->ca->name);
    883		ret = PTR_ERR(priv->cm.id);
    884		goto err_cm;
    885	}
    886
    887	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
    888			   0);
    889	if (ret) {
    890		pr_warn("%s: failed to listen on ID 0x%llx\n", priv->ca->name,
    891			IPOIB_CM_IETF_ID | priv->qp->qp_num);
    892		goto err_listen;
    893	}
    894
    895	return 0;
    896
    897err_listen:
    898	ib_destroy_cm_id(priv->cm.id);
    899err_cm:
    900	priv->cm.id = NULL;
    901	return ret;
    902}
    903
    904static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
    905{
    906	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    907	struct ipoib_cm_rx *rx, *n;
    908	LIST_HEAD(list);
    909
    910	spin_lock_irq(&priv->lock);
    911	list_splice_init(&priv->cm.rx_reap_list, &list);
    912	spin_unlock_irq(&priv->lock);
    913
    914	list_for_each_entry_safe(rx, n, &list, list) {
    915		ib_destroy_cm_id(rx->id);
    916		ib_destroy_qp(rx->qp);
    917		if (!ipoib_cm_has_srq(dev)) {
    918			ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
    919			spin_lock_irq(&priv->lock);
    920			--priv->cm.nonsrq_conn_qp;
    921			spin_unlock_irq(&priv->lock);
    922		}
    923		kfree(rx);
    924	}
    925}
    926
    927void ipoib_cm_dev_stop(struct net_device *dev)
    928{
    929	struct ipoib_dev_priv *priv = ipoib_priv(dev);
    930	struct ipoib_cm_rx *p;
    931	unsigned long begin;
    932	int ret;
    933
    934	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
    935		return;
    936
    937	ib_destroy_cm_id(priv->cm.id);
    938	priv->cm.id = NULL;
    939
    940	spin_lock_irq(&priv->lock);
    941	while (!list_empty(&priv->cm.passive_ids)) {
    942		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
    943		list_move(&p->list, &priv->cm.rx_error_list);
    944		p->state = IPOIB_CM_RX_ERROR;
    945		spin_unlock_irq(&priv->lock);
    946		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
    947		if (ret)
    948			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
    949		spin_lock_irq(&priv->lock);
    950	}
    951
    952	/* Wait for all RX to be drained */
    953	begin = jiffies;
    954
    955	while (!list_empty(&priv->cm.rx_error_list) ||
    956	       !list_empty(&priv->cm.rx_flush_list) ||
    957	       !list_empty(&priv->cm.rx_drain_list)) {
    958		if (time_after(jiffies, begin + 5 * HZ)) {
    959			ipoib_warn(priv, "RX drain timing out\n");
    960
    961			/*
    962			 * assume the HW is wedged and just free up everything.
    963			 */
    964			list_splice_init(&priv->cm.rx_flush_list,
    965					 &priv->cm.rx_reap_list);
    966			list_splice_init(&priv->cm.rx_error_list,
    967					 &priv->cm.rx_reap_list);
    968			list_splice_init(&priv->cm.rx_drain_list,
    969					 &priv->cm.rx_reap_list);
    970			break;
    971		}
    972		spin_unlock_irq(&priv->lock);
    973		usleep_range(1000, 2000);
    974		ipoib_drain_cq(dev);
    975		spin_lock_irq(&priv->lock);
    976	}
    977
    978	spin_unlock_irq(&priv->lock);
    979
    980	ipoib_cm_free_rx_reap_list(dev);
    981
    982	cancel_delayed_work(&priv->cm.stale_task);
    983}
    984
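/*
 * Active-side REP handling: the peer's REP carries its IPoIB CM private
 * data (datagram QPN and receive buffer size).  After validating the
 * advertised MTU, move the local TX QP through RTR and RTS, re-transmit
 * any skbs that were queued on the neigh while the connection was being
 * set up, and acknowledge with an RTU.
 */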
    985static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id,
    986				const struct ib_cm_event *event)
    987{
    988	struct ipoib_cm_tx *p = cm_id->context;
    989	struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
    990	struct ipoib_cm_data *data = event->private_data;
    991	struct sk_buff_head skqueue;
    992	struct ib_qp_attr qp_attr;
    993	int qp_attr_mask, ret;
    994	struct sk_buff *skb;
    995
    996	p->mtu = be32_to_cpu(data->mtu);
    997
    998	if (p->mtu <= IPOIB_ENCAP_LEN) {
    999		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
   1000			   p->mtu, IPOIB_ENCAP_LEN);
   1001		return -EINVAL;
   1002	}
   1003
   1004	qp_attr.qp_state = IB_QPS_RTR;
   1005	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
   1006	if (ret) {
   1007		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
   1008		return ret;
   1009	}
   1010
   1011	qp_attr.rq_psn = 0 /* FIXME */;
   1012	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
   1013	if (ret) {
   1014		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
   1015		return ret;
   1016	}
   1017
   1018	qp_attr.qp_state = IB_QPS_RTS;
   1019	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
   1020	if (ret) {
   1021		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
   1022		return ret;
   1023	}
   1024	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
   1025	if (ret) {
   1026		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
   1027		return ret;
   1028	}
   1029
   1030	skb_queue_head_init(&skqueue);
   1031
   1032	netif_tx_lock_bh(p->dev);
   1033	spin_lock_irq(&priv->lock);
   1034	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
   1035	if (p->neigh)
   1036		while ((skb = __skb_dequeue(&p->neigh->queue)))
   1037			__skb_queue_tail(&skqueue, skb);
   1038	spin_unlock_irq(&priv->lock);
   1039	netif_tx_unlock_bh(p->dev);
   1040
   1041	while ((skb = __skb_dequeue(&skqueue))) {
   1042		skb->dev = p->dev;
   1043		ret = dev_queue_xmit(skb);
   1044		if (ret)
   1045			ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n",
   1046				   __func__, ret);
   1047	}
   1048
   1049	ret = ib_send_cm_rtu(cm_id, NULL, 0);
   1050	if (ret) {
   1051		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
   1052		return ret;
   1053	}
   1054	return 0;
   1055}
   1056
   1057static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
   1058{
   1059	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1060	struct ib_qp_init_attr attr = {
   1061		.send_cq		= priv->send_cq,
   1062		.recv_cq		= priv->recv_cq,
   1063		.srq			= priv->cm.srq,
   1064		.cap.max_send_wr	= ipoib_sendq_size,
   1065		.cap.max_send_sge	= 1,
   1066		.sq_sig_type		= IB_SIGNAL_ALL_WR,
   1067		.qp_type		= IB_QPT_RC,
   1068		.qp_context		= tx,
   1069		.create_flags		= 0
   1070	};
   1071	struct ib_qp *tx_qp;
   1072
   1073	if (dev->features & NETIF_F_SG)
   1074		attr.cap.max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge,
   1075					      MAX_SKB_FRAGS + 1);
   1076
   1077	tx_qp = ib_create_qp(priv->pd, &attr);
   1078	tx->max_send_sge = attr.cap.max_send_sge;
   1079	return tx_qp;
   1080}
   1081
   1082static int ipoib_cm_send_req(struct net_device *dev,
   1083			     struct ib_cm_id *id, struct ib_qp *qp,
   1084			     u32 qpn,
   1085			     struct sa_path_rec *pathrec)
   1086{
   1087	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1088	struct ipoib_cm_data data = {};
   1089	struct ib_cm_req_param req = {};
   1090
   1091	data.qpn = cpu_to_be32(priv->qp->qp_num);
   1092	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
   1093
   1094	req.primary_path		= pathrec;
   1095	req.alternate_path		= NULL;
   1096	req.service_id			= cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
   1097	req.qp_num			= qp->qp_num;
   1098	req.qp_type			= qp->qp_type;
   1099	req.private_data		= &data;
   1100	req.private_data_len		= sizeof(data);
   1101	req.flow_control		= 0;
   1102
   1103	req.starting_psn		= 0; /* FIXME */
   1104
   1105	/*
   1106	 * Pick some arbitrary defaults here; we could make these
   1107	 * module parameters if anyone cared about setting them.
   1108	 */
   1109	req.responder_resources		= 4;
   1110	req.remote_cm_response_timeout	= 20;
   1111	req.local_cm_response_timeout	= 20;
   1112	req.retry_count			= 0; /* RFC draft warns against retries */
   1113	req.rnr_retry_count		= 0; /* RFC draft warns against retries */
   1114	req.max_cm_retries		= 15;
   1115	req.srq				= ipoib_cm_has_srq(dev);
   1116	return ib_send_cm_req(id, &req);
   1117}
   1118
   1119static int ipoib_cm_modify_tx_init(struct net_device *dev,
   1120				  struct ib_cm_id *cm_id, struct ib_qp *qp)
   1121{
   1122	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1123	struct ib_qp_attr qp_attr;
   1124	int qp_attr_mask, ret;
   1125
   1126	qp_attr.pkey_index = priv->pkey_index;
   1127	qp_attr.qp_state = IB_QPS_INIT;
   1128	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
   1129	qp_attr.port_num = priv->port;
   1130	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
   1131
   1132	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
   1133	if (ret) {
   1134		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
   1135		return ret;
   1136	}
   1137	return 0;
   1138}
   1139
   1140static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
   1141			    struct sa_path_rec *pathrec)
   1142{
   1143	struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
   1144	unsigned int noio_flag;
   1145	int ret;
   1146
   1147	noio_flag = memalloc_noio_save();
   1148	p->tx_ring = vzalloc(array_size(ipoib_sendq_size, sizeof(*p->tx_ring)));
   1149	if (!p->tx_ring) {
   1150		memalloc_noio_restore(noio_flag);
   1151		ret = -ENOMEM;
   1152		goto err_tx;
   1153	}
   1154
   1155	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
   1156	memalloc_noio_restore(noio_flag);
   1157	if (IS_ERR(p->qp)) {
   1158		ret = PTR_ERR(p->qp);
   1159		ipoib_warn(priv, "failed to create tx qp: %d\n", ret);
   1160		goto err_qp;
   1161	}
   1162
   1163	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
   1164	if (IS_ERR(p->id)) {
   1165		ret = PTR_ERR(p->id);
   1166		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
   1167		goto err_id;
   1168	}
   1169
   1170	ret = ipoib_cm_modify_tx_init(p->dev, p->id,  p->qp);
   1171	if (ret) {
   1172		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
   1173		goto err_modify_send;
   1174	}
   1175
   1176	ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
   1177	if (ret) {
   1178		ipoib_warn(priv, "failed to send cm req: %d\n", ret);
   1179		goto err_modify_send;
   1180	}
   1181
   1182	ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
   1183		  p->qp->qp_num, pathrec->dgid.raw, qpn);
   1184
   1185	return 0;
   1186
   1187err_modify_send:
   1188	ib_destroy_cm_id(p->id);
   1189err_id:
   1190	p->id = NULL;
   1191	ib_destroy_qp(p->qp);
   1192err_qp:
   1193	p->qp = NULL;
   1194	vfree(p->tx_ring);
   1195err_tx:
   1196	return ret;
   1197}
   1198
   1199static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
   1200{
   1201	struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
   1202	struct ipoib_tx_buf *tx_req;
   1203	unsigned long begin;
   1204
   1205	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
   1206		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
   1207
   1208	if (p->id)
   1209		ib_destroy_cm_id(p->id);
   1210
   1211	if (p->tx_ring) {
   1212		/* Wait for all sends to complete */
   1213		begin = jiffies;
   1214		while ((int) p->tx_tail - (int) p->tx_head < 0) {
   1215			if (time_after(jiffies, begin + 5 * HZ)) {
   1216				ipoib_warn(priv, "timing out; %d sends not completed\n",
   1217					   p->tx_head - p->tx_tail);
   1218				goto timeout;
   1219			}
   1220
   1221			usleep_range(1000, 2000);
   1222		}
   1223	}
   1224
   1225timeout:
   1226
   1227	while ((int) p->tx_tail - (int) p->tx_head < 0) {
   1228		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
   1229		ipoib_dma_unmap_tx(priv, tx_req);
   1230		dev_kfree_skb_any(tx_req->skb);
   1231		netif_tx_lock_bh(p->dev);
   1232		++p->tx_tail;
   1233		++priv->global_tx_tail;
   1234		if (unlikely((priv->global_tx_head - priv->global_tx_tail) <=
   1235			     ipoib_sendq_size >> 1) &&
   1236		    netif_queue_stopped(p->dev) &&
   1237		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
   1238			netif_wake_queue(p->dev);
   1239		netif_tx_unlock_bh(p->dev);
   1240	}
   1241
   1242	if (p->qp)
   1243		ib_destroy_qp(p->qp);
   1244
   1245	vfree(p->tx_ring);
   1246	kfree(p);
   1247}
   1248
   1249static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
   1250			       const struct ib_cm_event *event)
   1251{
   1252	struct ipoib_cm_tx *tx = cm_id->context;
   1253	struct ipoib_dev_priv *priv = ipoib_priv(tx->dev);
   1254	struct net_device *dev = priv->dev;
   1255	struct ipoib_neigh *neigh;
   1256	unsigned long flags;
   1257	int ret;
   1258
   1259	switch (event->event) {
   1260	case IB_CM_DREQ_RECEIVED:
   1261		ipoib_dbg(priv, "DREQ received.\n");
   1262		ib_send_cm_drep(cm_id, NULL, 0);
   1263		break;
   1264	case IB_CM_REP_RECEIVED:
   1265		ipoib_dbg(priv, "REP received.\n");
   1266		ret = ipoib_cm_rep_handler(cm_id, event);
   1267		if (ret)
   1268			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
   1269				       NULL, 0, NULL, 0);
   1270		break;
   1271	case IB_CM_REQ_ERROR:
   1272	case IB_CM_REJ_RECEIVED:
   1273	case IB_CM_TIMEWAIT_EXIT:
   1274		ipoib_dbg(priv, "CM error %d.\n", event->event);
   1275		netif_tx_lock_bh(dev);
   1276		spin_lock_irqsave(&priv->lock, flags);
   1277		neigh = tx->neigh;
   1278
   1279		if (neigh) {
   1280			neigh->cm = NULL;
   1281			ipoib_neigh_free(neigh);
   1282
   1283			tx->neigh = NULL;
   1284		}
   1285
   1286		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
   1287			list_move(&tx->list, &priv->cm.reap_list);
   1288			queue_work(priv->wq, &priv->cm.reap_task);
   1289		}
   1290
   1291		spin_unlock_irqrestore(&priv->lock, flags);
   1292		netif_tx_unlock_bh(dev);
   1293		break;
   1294	default:
   1295		break;
   1296	}
   1297
   1298	return 0;
   1299}
   1300
   1301struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
   1302				       struct ipoib_neigh *neigh)
   1303{
   1304	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1305	struct ipoib_cm_tx *tx;
   1306
   1307	tx = kzalloc(sizeof(*tx), GFP_ATOMIC);
   1308	if (!tx)
   1309		return NULL;
   1310
   1311	neigh->cm = tx;
   1312	tx->neigh = neigh;
   1313	tx->dev = dev;
   1314	list_add(&tx->list, &priv->cm.start_list);
   1315	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
   1316	queue_work(priv->wq, &priv->cm.start_task);
   1317	return tx;
   1318}
   1319
   1320void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
   1321{
   1322	struct ipoib_dev_priv *priv = ipoib_priv(tx->dev);
   1323	unsigned long flags;
   1324	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
   1325		spin_lock_irqsave(&priv->lock, flags);
   1326		list_move(&tx->list, &priv->cm.reap_list);
   1327		queue_work(priv->wq, &priv->cm.reap_task);
   1328		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
   1329			  tx->neigh->daddr + 4);
   1330		tx->neigh = NULL;
   1331		spin_unlock_irqrestore(&priv->lock, flags);
   1332	}
   1333}
   1334
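/*
 * An IPoIB hardware address carries the flags/QPN word in its first
 * QPN_AND_OPTIONS_OFFSET bytes, followed by the 16-byte port GID;
 * neigh->daddr + QPN_AND_OPTIONS_OFFSET below is therefore the GID used
 * for the path record lookup.
 */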
   1335#define QPN_AND_OPTIONS_OFFSET	4
   1336
   1337static void ipoib_cm_tx_start(struct work_struct *work)
   1338{
   1339	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
   1340						   cm.start_task);
   1341	struct net_device *dev = priv->dev;
   1342	struct ipoib_neigh *neigh;
   1343	struct ipoib_cm_tx *p;
   1344	unsigned long flags;
   1345	struct ipoib_path *path;
   1346	int ret;
   1347
   1348	struct sa_path_rec pathrec;
   1349	u32 qpn;
   1350
   1351	netif_tx_lock_bh(dev);
   1352	spin_lock_irqsave(&priv->lock, flags);
   1353
   1354	while (!list_empty(&priv->cm.start_list)) {
   1355		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
   1356		list_del_init(&p->list);
   1357		neigh = p->neigh;
   1358
   1359		qpn = IPOIB_QPN(neigh->daddr);
   1360		/*
   1361		 * As long as the search is with these 2 locks,
   1362		 * path existence indicates its validity.
   1363		 */
   1364		path = __path_find(dev, neigh->daddr + QPN_AND_OPTIONS_OFFSET);
   1365		if (!path) {
   1366			pr_info("%s ignore not valid path %pI6\n",
   1367				__func__,
   1368				neigh->daddr + QPN_AND_OPTIONS_OFFSET);
   1369			goto free_neigh;
   1370		}
   1371		memcpy(&pathrec, &path->pathrec, sizeof(pathrec));
   1372
   1373		spin_unlock_irqrestore(&priv->lock, flags);
   1374		netif_tx_unlock_bh(dev);
   1375
   1376		ret = ipoib_cm_tx_init(p, qpn, &pathrec);
   1377
   1378		netif_tx_lock_bh(dev);
   1379		spin_lock_irqsave(&priv->lock, flags);
   1380
   1381		if (ret) {
   1382free_neigh:
   1383			neigh = p->neigh;
   1384			if (neigh) {
   1385				neigh->cm = NULL;
   1386				ipoib_neigh_free(neigh);
   1387			}
   1388			list_del(&p->list);
   1389			kfree(p);
   1390		}
   1391	}
   1392
   1393	spin_unlock_irqrestore(&priv->lock, flags);
   1394	netif_tx_unlock_bh(dev);
   1395}
   1396
   1397static void ipoib_cm_tx_reap(struct work_struct *work)
   1398{
   1399	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
   1400						   cm.reap_task);
   1401	struct net_device *dev = priv->dev;
   1402	struct ipoib_cm_tx *p;
   1403	unsigned long flags;
   1404
   1405	netif_tx_lock_bh(dev);
   1406	spin_lock_irqsave(&priv->lock, flags);
   1407
   1408	while (!list_empty(&priv->cm.reap_list)) {
   1409		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
   1410		list_del_init(&p->list);
   1411		spin_unlock_irqrestore(&priv->lock, flags);
   1412		netif_tx_unlock_bh(dev);
   1413		ipoib_cm_tx_destroy(p);
   1414		netif_tx_lock_bh(dev);
   1415		spin_lock_irqsave(&priv->lock, flags);
   1416	}
   1417
   1418	spin_unlock_irqrestore(&priv->lock, flags);
   1419	netif_tx_unlock_bh(dev);
   1420}
   1421
   1422static void ipoib_cm_skb_reap(struct work_struct *work)
   1423{
   1424	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
   1425						   cm.skb_task);
   1426	struct net_device *dev = priv->dev;
   1427	struct sk_buff *skb;
   1428	unsigned long flags;
   1429	unsigned int mtu = priv->mcast_mtu;
   1430
   1431	netif_tx_lock_bh(dev);
   1432	spin_lock_irqsave(&priv->lock, flags);
   1433
   1434	while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
   1435		spin_unlock_irqrestore(&priv->lock, flags);
   1436		netif_tx_unlock_bh(dev);
   1437
   1438		if (skb->protocol == htons(ETH_P_IP)) {
   1439			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
   1440			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
   1441		}
   1442#if IS_ENABLED(CONFIG_IPV6)
   1443		else if (skb->protocol == htons(ETH_P_IPV6)) {
   1444			memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
   1445			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
   1446		}
   1447#endif
   1448		dev_kfree_skb_any(skb);
   1449
   1450		netif_tx_lock_bh(dev);
   1451		spin_lock_irqsave(&priv->lock, flags);
   1452	}
   1453
   1454	spin_unlock_irqrestore(&priv->lock, flags);
   1455	netif_tx_unlock_bh(dev);
   1456}
   1457
   1458void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
   1459			   unsigned int mtu)
   1460{
   1461	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1462	int e = skb_queue_empty(&priv->cm.skb_queue);
   1463
   1464	skb_dst_update_pmtu(skb, mtu);
   1465
   1466	skb_queue_tail(&priv->cm.skb_queue, skb);
   1467	if (e)
   1468		queue_work(priv->wq, &priv->cm.skb_task);
   1469}
   1470
   1471static void ipoib_cm_rx_reap(struct work_struct *work)
   1472{
   1473	ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
   1474						cm.rx_reap_task)->dev);
   1475}
   1476
   1477static void ipoib_cm_stale_task(struct work_struct *work)
   1478{
   1479	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
   1480						   cm.stale_task.work);
   1481	struct ipoib_cm_rx *p;
   1482	int ret;
   1483
   1484	spin_lock_irq(&priv->lock);
   1485	while (!list_empty(&priv->cm.passive_ids)) {
   1486		/* List is sorted by LRU, start from tail,
   1487		 * stop when we see a recently used entry */
   1488		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
   1489		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
   1490			break;
   1491		list_move(&p->list, &priv->cm.rx_error_list);
   1492		p->state = IPOIB_CM_RX_ERROR;
   1493		spin_unlock_irq(&priv->lock);
   1494		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
   1495		if (ret)
   1496			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
   1497		spin_lock_irq(&priv->lock);
   1498	}
   1499
   1500	if (!list_empty(&priv->cm.passive_ids))
   1501		queue_delayed_work(priv->wq,
   1502				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
   1503	spin_unlock_irq(&priv->lock);
   1504}
   1505
   1506static ssize_t mode_show(struct device *d, struct device_attribute *attr,
   1507			 char *buf)
   1508{
   1509	struct net_device *dev = to_net_dev(d);
   1510	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1511
   1512	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
   1513		return sysfs_emit(buf, "connected\n");
   1514	else
   1515		return sysfs_emit(buf, "datagram\n");
   1516}
   1517
   1518static ssize_t mode_store(struct device *d, struct device_attribute *attr,
   1519			  const char *buf, size_t count)
   1520{
   1521	struct net_device *dev = to_net_dev(d);
   1522	int ret;
   1523
   1524	if (!rtnl_trylock()) {
   1525		return restart_syscall();
   1526	}
   1527
   1528	if (dev->reg_state != NETREG_REGISTERED) {
   1529		rtnl_unlock();
   1530		return -EPERM;
   1531	}
   1532
   1533	ret = ipoib_set_mode(dev, buf);
   1534
    1535	/* ipoib_set_mode() is assumed to return with the rtnl lock still
    1536	 * held; only when it returns -EBUSY has it already dropped the
    1537	 * lock, so there is no need to call rtnl_unlock() in that case.
    1538	 */
   1539	if (ret != -EBUSY)
   1540		rtnl_unlock();
   1541
   1542	return (!ret || ret == -EBUSY) ? count : ret;
   1543}
   1544
   1545static DEVICE_ATTR_RW(mode);
   1546
   1547int ipoib_cm_add_mode_attr(struct net_device *dev)
   1548{
   1549	return device_create_file(&dev->dev, &dev_attr_mode);
   1550}
   1551
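/*
 * SRQ creation is best effort: if the HCA does not support SRQs (or the
 * ring allocation fails), priv->cm.srq stays NULL and the driver falls
 * back to a per-connection receive ring, capped at max_nonsrq_conn_qp
 * connected QPs (see ipoib_cm_nonsrq_init_rx()).
 */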
   1552static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
   1553{
   1554	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1555	struct ib_srq_init_attr srq_init_attr = {
   1556		.srq_type = IB_SRQT_BASIC,
   1557		.attr = {
   1558			.max_wr  = ipoib_recvq_size,
   1559			.max_sge = max_sge
   1560		}
   1561	};
   1562
   1563	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
   1564	if (IS_ERR(priv->cm.srq)) {
   1565		if (PTR_ERR(priv->cm.srq) != -EOPNOTSUPP)
   1566			pr_warn("%s: failed to allocate SRQ, error %ld\n",
   1567			       priv->ca->name, PTR_ERR(priv->cm.srq));
   1568		priv->cm.srq = NULL;
   1569		return;
   1570	}
   1571
   1572	priv->cm.srq_ring = vzalloc(array_size(ipoib_recvq_size,
   1573					       sizeof(*priv->cm.srq_ring)));
   1574	if (!priv->cm.srq_ring) {
   1575		ib_destroy_srq(priv->cm.srq);
   1576		priv->cm.srq = NULL;
   1577		return;
   1578	}
   1579
   1580}
   1581
   1582int ipoib_cm_dev_init(struct net_device *dev)
   1583{
   1584	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1585	int max_srq_sge, i;
   1586	u8 addr;
   1587
   1588	INIT_LIST_HEAD(&priv->cm.passive_ids);
   1589	INIT_LIST_HEAD(&priv->cm.reap_list);
   1590	INIT_LIST_HEAD(&priv->cm.start_list);
   1591	INIT_LIST_HEAD(&priv->cm.rx_error_list);
   1592	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
   1593	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
   1594	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
   1595	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
   1596	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
   1597	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
   1598	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
   1599	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
   1600
   1601	skb_queue_head_init(&priv->cm.skb_queue);
   1602
   1603	ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge);
   1604
   1605	max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge);
   1606	ipoib_cm_create_srq(dev, max_srq_sge);
   1607	if (ipoib_cm_has_srq(dev)) {
   1608		priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10;
   1609		priv->cm.num_frags  = max_srq_sge;
   1610		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
   1611			  priv->cm.max_cm_mtu, priv->cm.num_frags);
   1612	} else {
   1613		priv->cm.max_cm_mtu = IPOIB_CM_MTU;
   1614		priv->cm.num_frags  = IPOIB_CM_RX_SG;
   1615	}
   1616
   1617	ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
   1618
   1619	if (ipoib_cm_has_srq(dev)) {
   1620		for (i = 0; i < ipoib_recvq_size; ++i) {
   1621			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
   1622						   priv->cm.num_frags - 1,
   1623						   priv->cm.srq_ring[i].mapping,
   1624						   GFP_KERNEL)) {
   1625				ipoib_warn(priv, "failed to allocate "
   1626					   "receive buffer %d\n", i);
   1627				ipoib_cm_dev_cleanup(dev);
   1628				return -ENOMEM;
   1629			}
   1630
   1631			if (ipoib_cm_post_receive_srq(dev, i)) {
   1632				ipoib_warn(priv, "ipoib_cm_post_receive_srq "
   1633					   "failed for buf %d\n", i);
   1634				ipoib_cm_dev_cleanup(dev);
   1635				return -EIO;
   1636			}
   1637		}
   1638	}
   1639
   1640	addr = IPOIB_FLAGS_RC;
   1641	dev_addr_mod(dev, 0, &addr, 1);
   1642	return 0;
   1643}
   1644
   1645void ipoib_cm_dev_cleanup(struct net_device *dev)
   1646{
   1647	struct ipoib_dev_priv *priv = ipoib_priv(dev);
   1648
   1649	if (!priv->cm.srq)
   1650		return;
   1651
   1652	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
   1653
   1654	ib_destroy_srq(priv->cm.srq);
   1655	priv->cm.srq = NULL;
   1656	if (!priv->cm.srq_ring)
   1657		return;
   1658
   1659	ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
   1660	priv->cm.srq_ring = NULL;
   1661}