cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ib_cm.c (36833B)


      1/*
      2 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
      3 *
      4 * This software is available to you under a choice of one of two
      5 * licenses.  You may choose to be licensed under the terms of the GNU
      6 * General Public License (GPL) Version 2, available from the file
      7 * COPYING in the main directory of this source tree, or the
      8 * OpenIB.org BSD license below:
      9 *
     10 *     Redistribution and use in source and binary forms, with or
     11 *     without modification, are permitted provided that the following
     12 *     conditions are met:
     13 *
     14 *      - Redistributions of source code must retain the above
     15 *        copyright notice, this list of conditions and the following
     16 *        disclaimer.
     17 *
     18 *      - Redistributions in binary form must reproduce the above
     19 *        copyright notice, this list of conditions and the following
     20 *        disclaimer in the documentation and/or other materials
     21 *        provided with the distribution.
     22 *
     23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     30 * SOFTWARE.
     31 *
     32 */
     33#include <linux/kernel.h>
     34#include <linux/in.h>
     35#include <linux/slab.h>
     36#include <linux/vmalloc.h>
     37#include <linux/ratelimit.h>
     38#include <net/addrconf.h>
     39#include <rdma/ib_cm.h>
     40
     41#include "rds_single_path.h"
     42#include "rds.h"
     43#include "ib.h"
     44#include "ib_mr.h"
     45
     46/*
     47 * Set the selected protocol version
     48 */
     49static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
     50{
     51	conn->c_version = version;
     52}
     53
     54/*
     55 * Set up flow control
     56 */
     57static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
     58{
     59	struct rds_ib_connection *ic = conn->c_transport_data;
     60
     61	if (rds_ib_sysctl_flow_control && credits != 0) {
     62		/* We're doing flow control */
     63		ic->i_flowctl = 1;
     64		rds_ib_send_add_credits(conn, credits);
     65	} else {
     66		ic->i_flowctl = 0;
     67	}
     68}
     69
     70/*
     71 * Connection established.
     72 * We get here for both outgoing and incoming connection.
     73 */
     74void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
     75{
     76	struct rds_ib_connection *ic = conn->c_transport_data;
     77	const union rds_ib_conn_priv *dp = NULL;
     78	__be64 ack_seq = 0;
     79	__be32 credit = 0;
     80	u8 major = 0;
     81	u8 minor = 0;
     82	int err;
     83
     84	dp = event->param.conn.private_data;
     85	if (conn->c_isv6) {
     86		if (event->param.conn.private_data_len >=
     87		    sizeof(struct rds6_ib_connect_private)) {
     88			major = dp->ricp_v6.dp_protocol_major;
     89			minor = dp->ricp_v6.dp_protocol_minor;
     90			credit = dp->ricp_v6.dp_credit;
      91			/* The start of the dp structure is not guaranteed to be
      92			 * 8-byte aligned.  dp_ack_seq is a 64-bit field, so a plain
      93			 * load could fault on strict-alignment hardware; go through
      94			 * get_unaligned() to avoid unaligned access errors.
      95			 */
     96			ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
     97		}
     98	} else if (event->param.conn.private_data_len >=
     99		   sizeof(struct rds_ib_connect_private)) {
    100		major = dp->ricp_v4.dp_protocol_major;
    101		minor = dp->ricp_v4.dp_protocol_minor;
    102		credit = dp->ricp_v4.dp_credit;
    103		ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
    104	}
    105
    106	/* make sure it isn't empty data */
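        	/* RDS_PROTOCOL() (see rds.h) packs the version as
        	 * (major << 8) | minor, so a zero major byte means the peer
        	 * sent no usable version information.
        	 */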
    107	if (major) {
    108		rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
    109		rds_ib_set_flow_control(conn, be32_to_cpu(credit));
    110	}
    111
    112	if (conn->c_version < RDS_PROTOCOL_VERSION) {
    113		if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
    114			pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
    115				  &conn->c_laddr, &conn->c_faddr,
    116				  RDS_PROTOCOL_MAJOR(conn->c_version),
    117				  RDS_PROTOCOL_MINOR(conn->c_version));
    118			rds_conn_destroy(conn);
    119			return;
    120		}
    121	}
    122
    123	pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
    124		  ic->i_active_side ? "Active" : "Passive",
    125		  &conn->c_laddr, &conn->c_faddr, conn->c_tos,
    126		  RDS_PROTOCOL_MAJOR(conn->c_version),
    127		  RDS_PROTOCOL_MINOR(conn->c_version),
    128		  ic->i_flowctl ? ", flow control" : "");
    129
    130	/* receive sl from the peer */
    131	ic->i_sl = ic->i_cm_id->route.path_rec->sl;
    132
    133	atomic_set(&ic->i_cq_quiesce, 0);
    134
     135	/* Init rings and fill recv. This needs to wait until protocol
    136	 * negotiation is complete, since ring layout is different
    137	 * from 3.1 to 4.1.
    138	 */
    139	rds_ib_send_init_ring(ic);
    140	rds_ib_recv_init_ring(ic);
    141	/* Post receive buffers - as a side effect, this will update
    142	 * the posted credit count. */
    143	rds_ib_recv_refill(conn, 1, GFP_KERNEL);
    144
    145	/* update ib_device with this local ipaddr */
    146	err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
    147	if (err)
    148		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
    149			err);
    150
    151	/* If the peer gave us the last packet it saw, process this as if
    152	 * we had received a regular ACK. */
    153	if (dp) {
    154		if (ack_seq)
    155			rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
    156					    NULL);
    157	}
    158
    159	conn->c_proposed_version = conn->c_version;
    160	rds_connect_complete(conn);
    161}
    162
    163static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
    164				      struct rdma_conn_param *conn_param,
    165				      union rds_ib_conn_priv *dp,
    166				      u32 protocol_version,
    167				      u32 max_responder_resources,
    168				      u32 max_initiator_depth,
    169				      bool isv6)
    170{
    171	struct rds_ib_connection *ic = conn->c_transport_data;
    172	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
    173
    174	memset(conn_param, 0, sizeof(struct rdma_conn_param));
    175
    176	conn_param->responder_resources =
    177		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
    178	conn_param->initiator_depth =
    179		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
    180	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
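        	/* An RNR retry count of 7 is special-cased by the IB spec to
        	 * mean "retry indefinitely" rather than seven attempts.
        	 */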
    181	conn_param->rnr_retry_count = 7;
    182
    183	if (dp) {
    184		memset(dp, 0, sizeof(*dp));
    185		if (isv6) {
    186			dp->ricp_v6.dp_saddr = conn->c_laddr;
    187			dp->ricp_v6.dp_daddr = conn->c_faddr;
    188			dp->ricp_v6.dp_protocol_major =
    189			    RDS_PROTOCOL_MAJOR(protocol_version);
    190			dp->ricp_v6.dp_protocol_minor =
    191			    RDS_PROTOCOL_MINOR(protocol_version);
    192			dp->ricp_v6.dp_protocol_minor_mask =
    193			    cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
    194			dp->ricp_v6.dp_ack_seq =
    195			    cpu_to_be64(rds_ib_piggyb_ack(ic));
    196			dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;
    197
    198			conn_param->private_data = &dp->ricp_v6;
    199			conn_param->private_data_len = sizeof(dp->ricp_v6);
    200		} else {
    201			dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
    202			dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
    203			dp->ricp_v4.dp_protocol_major =
    204			    RDS_PROTOCOL_MAJOR(protocol_version);
    205			dp->ricp_v4.dp_protocol_minor =
    206			    RDS_PROTOCOL_MINOR(protocol_version);
    207			dp->ricp_v4.dp_protocol_minor_mask =
    208			    cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
    209			dp->ricp_v4.dp_ack_seq =
    210			    cpu_to_be64(rds_ib_piggyb_ack(ic));
    211			dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;
    212
    213			conn_param->private_data = &dp->ricp_v4;
    214			conn_param->private_data_len = sizeof(dp->ricp_v4);
    215		}
    216
    217		/* Advertise flow control */
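        		/* ic->i_credits packs send credits in the low 16 bits and
        		 * newly posted receive credits in the high 16 bits (see the
        		 * IB_GET/SET_POST_CREDITS helpers in ib.h).  The posted half
        		 * is advertised to the peer here and subtracted locally so
        		 * each posted buffer is only handed out once.
        		 */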
    218		if (ic->i_flowctl) {
    219			unsigned int credits;
    220
    221			credits = IB_GET_POST_CREDITS
    222				(atomic_read(&ic->i_credits));
    223			if (isv6)
    224				dp->ricp_v6.dp_credit = cpu_to_be32(credits);
    225			else
    226				dp->ricp_v4.dp_credit = cpu_to_be32(credits);
    227			atomic_sub(IB_SET_POST_CREDITS(credits),
    228				   &ic->i_credits);
    229		}
    230	}
    231}
    232
    233static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
    234{
    235	rdsdebug("event %u (%s) data %p\n",
    236		 event->event, ib_event_msg(event->event), data);
    237}
    238
    239/* Plucking the oldest entry from the ring can be done concurrently with
    240 * the thread refilling the ring.  Each ring operation is protected by
    241 * spinlocks and the transient state of refilling doesn't change the
    242 * recording of which entry is oldest.
    243 *
    244 * This relies on IB only calling one cq comp_handler for each cq so that
    245 * there will only be one caller of rds_recv_incoming() per RDS connection.
    246 */
    247static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
    248{
    249	struct rds_connection *conn = context;
    250	struct rds_ib_connection *ic = conn->c_transport_data;
    251
    252	rdsdebug("conn %p cq %p\n", conn, cq);
    253
    254	rds_ib_stats_inc(s_ib_evt_handler_call);
    255
    256	tasklet_schedule(&ic->i_recv_tasklet);
    257}
    258
    259static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
    260		     struct ib_wc *wcs)
    261{
    262	int nr, i;
    263	struct ib_wc *wc;
    264
    265	while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
    266		for (i = 0; i < nr; i++) {
    267			wc = wcs + i;
    268			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
    269				 (unsigned long long)wc->wr_id, wc->status,
    270				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));
    271
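        			/* wr_ids up to the send ring size (plus the sentinel
        			 * RDS_IB_ACK_WR_ID) belong to send/ack completions;
        			 * anything else is a fast-registration (FRWR) request.
        			 */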
    272			if (wc->wr_id <= ic->i_send_ring.w_nr ||
    273			    wc->wr_id == RDS_IB_ACK_WR_ID)
    274				rds_ib_send_cqe_handler(ic, wc);
    275			else
    276				rds_ib_mr_cqe_handler(ic, wc);
    277
    278		}
    279	}
    280}
    281
    282static void rds_ib_tasklet_fn_send(unsigned long data)
    283{
    284	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
    285	struct rds_connection *conn = ic->conn;
    286
    287	rds_ib_stats_inc(s_ib_tasklet_call);
    288
    289	/* if cq has been already reaped, ignore incoming cq event */
    290	if (atomic_read(&ic->i_cq_quiesce))
    291		return;
    292
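        	/* Poll, re-arm the CQ, then poll again so completions that
        	 * raced with the re-arm are not missed.
        	 */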
    293	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
    294	ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
    295	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
    296
    297	if (rds_conn_up(conn) &&
    298	    (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
    299	    test_bit(0, &conn->c_map_queued)))
    300		rds_send_xmit(&ic->conn->c_path[0]);
    301}
    302
    303static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
    304		     struct ib_wc *wcs,
    305		     struct rds_ib_ack_state *ack_state)
    306{
    307	int nr, i;
    308	struct ib_wc *wc;
    309
    310	while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
    311		for (i = 0; i < nr; i++) {
    312			wc = wcs + i;
    313			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
    314				 (unsigned long long)wc->wr_id, wc->status,
    315				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));
    316
    317			rds_ib_recv_cqe_handler(ic, wc, ack_state);
    318		}
    319	}
    320}
    321
    322static void rds_ib_tasklet_fn_recv(unsigned long data)
    323{
    324	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
    325	struct rds_connection *conn = ic->conn;
    326	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
    327	struct rds_ib_ack_state state;
    328
    329	if (!rds_ibdev)
    330		rds_conn_drop(conn);
    331
    332	rds_ib_stats_inc(s_ib_tasklet_call);
    333
    334	/* if cq has been already reaped, ignore incoming cq event */
    335	if (atomic_read(&ic->i_cq_quiesce))
    336		return;
    337
    338	memset(&state, 0, sizeof(state));
    339	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
    340	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
    341	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
    342
    343	if (state.ack_next_valid)
    344		rds_ib_set_ack(ic, state.ack_next, state.ack_required);
    345	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
    346		rds_send_drop_acked(conn, state.ack_recv, NULL);
    347		ic->i_ack_recv = state.ack_recv;
    348	}
    349
    350	if (rds_conn_up(conn))
    351		rds_ib_attempt_ack(ic);
    352}
    353
    354static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
    355{
    356	struct rds_connection *conn = data;
    357	struct rds_ib_connection *ic = conn->c_transport_data;
    358
    359	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
    360		 ib_event_msg(event->event));
    361
    362	switch (event->event) {
    363	case IB_EVENT_COMM_EST:
    364		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
    365		break;
    366	default:
    367		rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
    368			 event->event, ib_event_msg(event->event),
    369			 &conn->c_laddr, &conn->c_faddr);
    370		rds_conn_drop(conn);
    371		break;
    372	}
    373}
    374
    375static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
    376{
    377	struct rds_connection *conn = context;
    378	struct rds_ib_connection *ic = conn->c_transport_data;
    379
    380	rdsdebug("conn %p cq %p\n", conn, cq);
    381
    382	rds_ib_stats_inc(s_ib_evt_handler_call);
    383
    384	tasklet_schedule(&ic->i_send_tasklet);
    385}
    386
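        /* Spread CQ interrupt work across the device's completion vectors by
         * handing out the least-loaded vector; ibdev_put_vector() drops the
         * load count again when the CQ is destroyed.
         */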
    387static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
    388{
    389	int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
    390	int index = rds_ibdev->dev->num_comp_vectors - 1;
    391	int i;
    392
    393	for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
    394		if (rds_ibdev->vector_load[i] < min) {
    395			index = i;
    396			min = rds_ibdev->vector_load[i];
    397		}
    398	}
    399
    400	rds_ibdev->vector_load[index]++;
    401	return index;
    402}
    403
    404static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
    405{
    406	rds_ibdev->vector_load[index]--;
    407}
    408
    409static void rds_dma_hdr_free(struct ib_device *dev, struct rds_header *hdr,
    410		dma_addr_t dma_addr, enum dma_data_direction dir)
    411{
    412	ib_dma_unmap_single(dev, dma_addr, sizeof(*hdr), dir);
    413	kfree(hdr);
    414}
    415
    416static struct rds_header *rds_dma_hdr_alloc(struct ib_device *dev,
    417		dma_addr_t *dma_addr, enum dma_data_direction dir)
    418{
    419	struct rds_header *hdr;
    420
    421	hdr = kzalloc_node(sizeof(*hdr), GFP_KERNEL, ibdev_to_node(dev));
    422	if (!hdr)
    423		return NULL;
    424
    425	*dma_addr = ib_dma_map_single(dev, hdr, sizeof(*hdr),
    426				      DMA_BIDIRECTIONAL);
    427	if (ib_dma_mapping_error(dev, *dma_addr)) {
    428		kfree(hdr);
    429		return NULL;
    430	}
    431
    432	return hdr;
    433}
    434
    435/* Free the DMA memory used to store struct rds_header.
    436 *
    437 * @dev: the RDS IB device
    438 * @hdrs: pointer to the array storing DMA memory pointers
    439 * @dma_addrs: pointer to the array storing DMA addresses
     440 * @num_hdrs: number of headers to free.
    441 */
    442static void rds_dma_hdrs_free(struct rds_ib_device *dev,
    443		struct rds_header **hdrs, dma_addr_t *dma_addrs, u32 num_hdrs,
    444		enum dma_data_direction dir)
    445{
    446	u32 i;
    447
    448	for (i = 0; i < num_hdrs; i++)
    449		rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir);
    450	kvfree(hdrs);
    451	kvfree(dma_addrs);
    452}
    453
    454
     455/* Allocate and DMA-map memory used to store struct rds_header for
    456 * sending/receiving packets.  The pointers to the DMA memory and the
    457 * associated DMA addresses are stored in two arrays.
    458 *
    459 * @dev: the RDS IB device
    460 * @dma_addrs: pointer to the array for storing DMA addresses
    461 * @num_hdrs: number of headers to allocate
    462 *
    463 * It returns the pointer to the array storing the DMA memory pointers.  On
     464 * error, a NULL pointer is returned.
    465 */
    466static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev,
    467		dma_addr_t **dma_addrs, u32 num_hdrs,
    468		enum dma_data_direction dir)
    469{
    470	struct rds_header **hdrs;
    471	dma_addr_t *hdr_daddrs;
    472	u32 i;
    473
    474	hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
    475			     ibdev_to_node(dev->dev));
    476	if (!hdrs)
    477		return NULL;
    478
    479	hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
    480				   ibdev_to_node(dev->dev));
    481	if (!hdr_daddrs) {
    482		kvfree(hdrs);
    483		return NULL;
    484	}
    485
    486	for (i = 0; i < num_hdrs; i++) {
    487		hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir);
    488		if (!hdrs[i]) {
    489			rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir);
    490			return NULL;
    491		}
    492	}
    493
    494	*dma_addrs = hdr_daddrs;
    495	return hdrs;
    496}
    497
    498/*
    499 * This needs to be very careful to not leave IS_ERR pointers around for
    500 * cleanup to trip over.
    501 */
    502static int rds_ib_setup_qp(struct rds_connection *conn)
    503{
    504	struct rds_ib_connection *ic = conn->c_transport_data;
    505	struct ib_device *dev = ic->i_cm_id->device;
    506	struct ib_qp_init_attr attr;
    507	struct ib_cq_init_attr cq_attr = {};
    508	struct rds_ib_device *rds_ibdev;
    509	unsigned long max_wrs;
    510	int ret, fr_queue_space;
    511
    512	/*
    513	 * It's normal to see a null device if an incoming connection races
    514	 * with device removal, so we don't print a warning.
    515	 */
    516	rds_ibdev = rds_ib_get_client_data(dev);
    517	if (!rds_ibdev)
    518		return -EOPNOTSUPP;
    519
    520	/* The fr_queue_space is currently set to 512, to add extra space on
    521	 * completion queue and send queue. This extra space is used for FRWR
    522	 * registration and invalidation work requests
    523	 */
    524	fr_queue_space = RDS_IB_DEFAULT_FR_WR;
    525
    526	/* add the conn now so that connection establishment has the dev */
    527	rds_ib_add_conn(rds_ibdev, conn);
    528
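        	/* Clamp the sysctl-requested ring sizes to one below the
        	 * device's WR limit and resize the rings if the resulting size
        	 * differs from the current one.
        	 */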
    529	max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ?
    530		rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr;
    531	if (ic->i_send_ring.w_nr != max_wrs)
    532		rds_ib_ring_resize(&ic->i_send_ring, max_wrs);
    533
    534	max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ?
    535		rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr;
    536	if (ic->i_recv_ring.w_nr != max_wrs)
    537		rds_ib_ring_resize(&ic->i_recv_ring, max_wrs);
    538
    539	/* Protection domain and memory range */
    540	ic->i_pd = rds_ibdev->pd;
    541
    542	ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
    543	cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
    544	cq_attr.comp_vector = ic->i_scq_vector;
    545	ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
    546				     rds_ib_cq_event_handler, conn,
    547				     &cq_attr);
    548	if (IS_ERR(ic->i_send_cq)) {
    549		ret = PTR_ERR(ic->i_send_cq);
    550		ic->i_send_cq = NULL;
    551		ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
    552		rdsdebug("ib_create_cq send failed: %d\n", ret);
    553		goto rds_ibdev_out;
    554	}
    555
    556	ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
    557	cq_attr.cqe = ic->i_recv_ring.w_nr;
    558	cq_attr.comp_vector = ic->i_rcq_vector;
    559	ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
    560				     rds_ib_cq_event_handler, conn,
    561				     &cq_attr);
    562	if (IS_ERR(ic->i_recv_cq)) {
    563		ret = PTR_ERR(ic->i_recv_cq);
    564		ic->i_recv_cq = NULL;
    565		ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
    566		rdsdebug("ib_create_cq recv failed: %d\n", ret);
    567		goto send_cq_out;
    568	}
    569
    570	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
    571	if (ret) {
    572		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
    573		goto recv_cq_out;
    574	}
    575
    576	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
    577	if (ret) {
    578		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
    579		goto recv_cq_out;
    580	}
    581
    582	/* XXX negotiate max send/recv with remote? */
    583	memset(&attr, 0, sizeof(attr));
    584	attr.event_handler = rds_ib_qp_event_handler;
    585	attr.qp_context = conn;
    586	/* + 1 to allow for the single ack message */
    587	attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
    588	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
    589	attr.cap.max_send_sge = rds_ibdev->max_sge;
    590	attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
    591	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
    592	attr.qp_type = IB_QPT_RC;
    593	attr.send_cq = ic->i_send_cq;
    594	attr.recv_cq = ic->i_recv_cq;
    595
    596	/*
    597	 * XXX this can fail if max_*_wr is too large?  Are we supposed
    598	 * to back off until we get a value that the hardware can support?
    599	 */
    600	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
    601	if (ret) {
    602		rdsdebug("rdma_create_qp failed: %d\n", ret);
    603		goto recv_cq_out;
    604	}
    605
    606	ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma,
    607					     ic->i_send_ring.w_nr,
    608					     DMA_TO_DEVICE);
    609	if (!ic->i_send_hdrs) {
    610		ret = -ENOMEM;
    611		rdsdebug("DMA send hdrs alloc failed\n");
    612		goto qp_out;
    613	}
    614
    615	ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma,
    616					     ic->i_recv_ring.w_nr,
    617					     DMA_FROM_DEVICE);
    618	if (!ic->i_recv_hdrs) {
    619		ret = -ENOMEM;
    620		rdsdebug("DMA recv hdrs alloc failed\n");
    621		goto send_hdrs_dma_out;
    622	}
    623
    624	ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma,
    625				      DMA_TO_DEVICE);
    626	if (!ic->i_ack) {
    627		ret = -ENOMEM;
    628		rdsdebug("DMA ack header alloc failed\n");
    629		goto recv_hdrs_dma_out;
    630	}
    631
    632	ic->i_sends = vzalloc_node(array_size(sizeof(struct rds_ib_send_work),
    633					      ic->i_send_ring.w_nr),
    634				   ibdev_to_node(dev));
    635	if (!ic->i_sends) {
    636		ret = -ENOMEM;
    637		rdsdebug("send allocation failed\n");
    638		goto ack_dma_out;
    639	}
    640
    641	ic->i_recvs = vzalloc_node(array_size(sizeof(struct rds_ib_recv_work),
    642					      ic->i_recv_ring.w_nr),
    643				   ibdev_to_node(dev));
    644	if (!ic->i_recvs) {
    645		ret = -ENOMEM;
    646		rdsdebug("recv allocation failed\n");
    647		goto sends_out;
    648	}
    649
    650	rds_ib_recv_init_ack(ic);
    651
    652	rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
    653		 ic->i_send_cq, ic->i_recv_cq);
    654
    655	goto out;
    656
    657sends_out:
    658	vfree(ic->i_sends);
    659
    660ack_dma_out:
    661	rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
    662			 DMA_TO_DEVICE);
    663	ic->i_ack = NULL;
    664
    665recv_hdrs_dma_out:
    666	rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
    667			  ic->i_recv_ring.w_nr, DMA_FROM_DEVICE);
    668	ic->i_recv_hdrs = NULL;
    669	ic->i_recv_hdrs_dma = NULL;
    670
    671send_hdrs_dma_out:
    672	rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma,
    673			  ic->i_send_ring.w_nr, DMA_TO_DEVICE);
    674	ic->i_send_hdrs = NULL;
    675	ic->i_send_hdrs_dma = NULL;
    676
    677qp_out:
    678	rdma_destroy_qp(ic->i_cm_id);
    679recv_cq_out:
    680	ib_destroy_cq(ic->i_recv_cq);
    681	ic->i_recv_cq = NULL;
    682send_cq_out:
    683	ib_destroy_cq(ic->i_send_cq);
    684	ic->i_send_cq = NULL;
    685rds_ibdev_out:
    686	rds_ib_remove_conn(rds_ibdev, conn);
    687out:
    688	rds_ib_dev_put(rds_ibdev);
    689
    690	return ret;
    691}
    692
    693static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
    694{
    695	const union rds_ib_conn_priv *dp = event->param.conn.private_data;
    696	u8 data_len, major, minor;
    697	u32 version = 0;
    698	__be16 mask;
    699	u16 common;
    700
    701	/*
    702	 * rdma_cm private data is odd - when there is any private data in the
     703 * request, we will be given a pretty large buffer without being told the
    704	 * original size. The only way to tell the difference is by looking at
    705	 * the contents, which are initialized to zero.
    706	 * If the protocol version fields aren't set, this is a connection attempt
    707	 * from an older version. This could be 3.0 or 2.0 - we can't tell.
    708	 * We really should have changed this for OFED 1.3 :-(
    709	 */
    710
    711	/* Be paranoid. RDS always has privdata */
    712	if (!event->param.conn.private_data_len) {
    713		printk(KERN_NOTICE "RDS incoming connection has no private data, "
    714			"rejecting\n");
    715		return 0;
    716	}
    717
    718	if (isv6) {
    719		data_len = sizeof(struct rds6_ib_connect_private);
    720		major = dp->ricp_v6.dp_protocol_major;
    721		minor = dp->ricp_v6.dp_protocol_minor;
    722		mask = dp->ricp_v6.dp_protocol_minor_mask;
    723	} else {
    724		data_len = sizeof(struct rds_ib_connect_private);
    725		major = dp->ricp_v4.dp_protocol_major;
    726		minor = dp->ricp_v4.dp_protocol_minor;
    727		mask = dp->ricp_v4.dp_protocol_minor_mask;
    728	}
    729
    730	/* Even if len is crap *now* I still want to check it. -ASG */
    731	if (event->param.conn.private_data_len < data_len || major == 0)
    732		return RDS_PROTOCOL_4_0;
    733
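        	/* Each bit in the minor-version mask advertises one supported
        	 * minor version; the highest bit common to both sides selects
        	 * the minor version used for this connection.
        	 */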
    734	common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
    735	if (major == 4 && common) {
    736		version = RDS_PROTOCOL_4_0;
    737		while ((common >>= 1) != 0)
    738			version++;
    739	} else if (RDS_PROTOCOL_COMPAT_VERSION ==
    740		   RDS_PROTOCOL(major, minor)) {
    741		version = RDS_PROTOCOL_COMPAT_VERSION;
    742	} else {
    743		if (isv6)
    744			printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
    745					   &dp->ricp_v6.dp_saddr, major, minor);
    746		else
    747			printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
    748					   &dp->ricp_v4.dp_saddr, major, minor);
    749	}
    750	return version;
    751}
    752
    753#if IS_ENABLED(CONFIG_IPV6)
    754/* Given an IPv6 address, find the net_device which hosts that address and
    755 * return its index.  This is used by the rds_ib_cm_handle_connect() code to
    756 * find the interface index of where an incoming request comes from when
    757 * the request is using a link local address.
    758 *
    759 * Note one problem in this search.  It is possible that two interfaces have
    760 * the same link local address.  Unfortunately, this cannot be solved unless
    761 * the underlying layer gives us the interface which an incoming RDMA connect
    762 * request comes from.
    763 */
    764static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
    765{
    766	struct net_device *dev;
    767	int idx = 0;
    768
    769	rcu_read_lock();
    770	for_each_netdev_rcu(net, dev) {
    771		if (ipv6_chk_addr(net, addr, dev, 1)) {
    772			idx = dev->ifindex;
    773			break;
    774		}
    775	}
    776	rcu_read_unlock();
    777
    778	return idx;
    779}
    780#endif
    781
    782int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
    783			     struct rdma_cm_event *event, bool isv6)
    784{
    785	__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
    786	__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
    787	const struct rds_ib_conn_priv_cmn *dp_cmn;
    788	struct rds_connection *conn = NULL;
    789	struct rds_ib_connection *ic = NULL;
    790	struct rdma_conn_param conn_param;
    791	const union rds_ib_conn_priv *dp;
    792	union rds_ib_conn_priv dp_rep;
    793	struct in6_addr s_mapped_addr;
    794	struct in6_addr d_mapped_addr;
    795	const struct in6_addr *saddr6;
    796	const struct in6_addr *daddr6;
    797	int destroy = 1;
    798	u32 ifindex = 0;
    799	u32 version;
    800	int err = 1;
    801
    802	/* Check whether the remote protocol version matches ours. */
    803	version = rds_ib_protocol_compatible(event, isv6);
    804	if (!version) {
    805		err = RDS_RDMA_REJ_INCOMPAT;
    806		goto out;
    807	}
    808
    809	dp = event->param.conn.private_data;
    810	if (isv6) {
    811#if IS_ENABLED(CONFIG_IPV6)
    812		dp_cmn = &dp->ricp_v6.dp_cmn;
    813		saddr6 = &dp->ricp_v6.dp_saddr;
    814		daddr6 = &dp->ricp_v6.dp_daddr;
    815		/* If either address is link local, need to find the
    816		 * interface index in order to create a proper RDS
    817		 * connection.
    818		 */
    819		if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
    820			/* Using init_net for now ..  */
    821			ifindex = __rds_find_ifindex(&init_net, daddr6);
    822			/* No index found...  Need to bail out. */
    823			if (ifindex == 0) {
    824				err = -EOPNOTSUPP;
    825				goto out;
    826			}
    827		} else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
    828			/* Use our address to find the correct index. */
    829			ifindex = __rds_find_ifindex(&init_net, daddr6);
    830			/* No index found...  Need to bail out. */
    831			if (ifindex == 0) {
    832				err = -EOPNOTSUPP;
    833				goto out;
    834			}
    835		}
    836#else
    837		err = -EOPNOTSUPP;
    838		goto out;
    839#endif
    840	} else {
    841		dp_cmn = &dp->ricp_v4.dp_cmn;
    842		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
    843		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
    844		saddr6 = &s_mapped_addr;
    845		daddr6 = &d_mapped_addr;
    846	}
    847
    848	rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
    849		 saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
    850		 RDS_PROTOCOL_MINOR(version),
    851		 (unsigned long long)be64_to_cpu(lguid),
    852		 (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);
    853
    854	/* RDS/IB is not currently netns aware, thus init_net */
    855	conn = rds_conn_create(&init_net, daddr6, saddr6,
    856			       &rds_ib_transport, dp_cmn->ricpc_dp_toss,
    857			       GFP_KERNEL, ifindex);
    858	if (IS_ERR(conn)) {
    859		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
    860		conn = NULL;
    861		goto out;
    862	}
    863
    864	/*
    865	 * The connection request may occur while the
     866	 * previous connection still exists, e.g. in the case of failover.
    867	 * But as connections may be initiated simultaneously
    868	 * by both hosts, we have a random backoff mechanism -
    869	 * see the comment above rds_queue_reconnect()
    870	 */
    871	mutex_lock(&conn->c_cm_lock);
    872	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
    873		if (rds_conn_state(conn) == RDS_CONN_UP) {
    874			rdsdebug("incoming connect while connecting\n");
    875			rds_conn_drop(conn);
    876			rds_ib_stats_inc(s_ib_listen_closed_stale);
    877		} else
    878		if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
    879			/* Wait and see - our connect may still be succeeding */
    880			rds_ib_stats_inc(s_ib_connect_raced);
    881		}
    882		goto out;
    883	}
    884
    885	ic = conn->c_transport_data;
    886
    887	rds_ib_set_protocol(conn, version);
    888	rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
    889
    890	/* If the peer gave us the last packet it saw, process this as if
    891	 * we had received a regular ACK. */
    892	if (dp_cmn->ricpc_ack_seq)
    893		rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
    894				    NULL);
    895
    896	BUG_ON(cm_id->context);
    897	BUG_ON(ic->i_cm_id);
    898
    899	ic->i_cm_id = cm_id;
    900	cm_id->context = conn;
    901
    902	/* We got halfway through setting up the ib_connection, if we
    903	 * fail now, we have to take the long route out of this mess. */
    904	destroy = 0;
    905
    906	err = rds_ib_setup_qp(conn);
    907	if (err) {
    908		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
    909		goto out;
    910	}
    911
    912	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
    913				  event->param.conn.responder_resources,
    914				  event->param.conn.initiator_depth, isv6);
    915
    916	rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
    917	/* rdma_accept() calls rdma_reject() internally if it fails */
    918	if (rdma_accept(cm_id, &conn_param))
    919		rds_ib_conn_error(conn, "rdma_accept failed\n");
    920
    921out:
    922	if (conn)
    923		mutex_unlock(&conn->c_cm_lock);
    924	if (err)
    925		rdma_reject(cm_id, &err, sizeof(int),
    926			    IB_CM_REJ_CONSUMER_DEFINED);
    927	return destroy;
    928}
    929
    930
    931int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
    932{
    933	struct rds_connection *conn = cm_id->context;
    934	struct rds_ib_connection *ic = conn->c_transport_data;
    935	struct rdma_conn_param conn_param;
    936	union rds_ib_conn_priv dp;
    937	int ret;
    938
     939	/* Propose the highest protocol version we support; if the peer
     940	 * doesn't do protocol negotiation, this is the version we end up using. */
    941	rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
    942	ic->i_flowctl = rds_ib_sysctl_flow_control;	/* advertise flow control */
    943
    944	ret = rds_ib_setup_qp(conn);
    945	if (ret) {
    946		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
    947		goto out;
    948	}
    949
    950	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
    951				  conn->c_proposed_version,
    952				  UINT_MAX, UINT_MAX, isv6);
    953	ret = rdma_connect_locked(cm_id, &conn_param);
    954	if (ret)
    955		rds_ib_conn_error(conn, "rdma_connect_locked failed (%d)\n",
    956				  ret);
    957
    958out:
    959	/* Beware - returning non-zero tells the rdma_cm to destroy
    960	 * the cm_id. We should certainly not do it as long as we still
    961	 * "own" the cm_id. */
    962	if (ret) {
    963		if (ic->i_cm_id == cm_id)
    964			ret = 0;
    965	}
    966	ic->i_active_side = true;
    967	return ret;
    968}
    969
    970int rds_ib_conn_path_connect(struct rds_conn_path *cp)
    971{
    972	struct rds_connection *conn = cp->cp_conn;
    973	struct sockaddr_storage src, dest;
    974	rdma_cm_event_handler handler;
    975	struct rds_ib_connection *ic;
    976	int ret;
    977
    978	ic = conn->c_transport_data;
    979
     980	/* XXX I wonder what effect the port space has */
    981	/* delegate cm event handler to rdma_transport */
    982#if IS_ENABLED(CONFIG_IPV6)
    983	if (conn->c_isv6)
    984		handler = rds6_rdma_cm_event_handler;
    985	else
    986#endif
    987		handler = rds_rdma_cm_event_handler;
    988	ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
    989				     RDMA_PS_TCP, IB_QPT_RC);
    990	if (IS_ERR(ic->i_cm_id)) {
    991		ret = PTR_ERR(ic->i_cm_id);
    992		ic->i_cm_id = NULL;
    993		rdsdebug("rdma_create_id() failed: %d\n", ret);
    994		goto out;
    995	}
    996
    997	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
    998
    999	if (ipv6_addr_v4mapped(&conn->c_faddr)) {
   1000		struct sockaddr_in *sin;
   1001
   1002		sin = (struct sockaddr_in *)&src;
   1003		sin->sin_family = AF_INET;
   1004		sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
   1005		sin->sin_port = 0;
   1006
   1007		sin = (struct sockaddr_in *)&dest;
   1008		sin->sin_family = AF_INET;
   1009		sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
   1010		sin->sin_port = htons(RDS_PORT);
   1011	} else {
   1012		struct sockaddr_in6 *sin6;
   1013
   1014		sin6 = (struct sockaddr_in6 *)&src;
   1015		sin6->sin6_family = AF_INET6;
   1016		sin6->sin6_addr = conn->c_laddr;
   1017		sin6->sin6_port = 0;
   1018		sin6->sin6_scope_id = conn->c_dev_if;
   1019
   1020		sin6 = (struct sockaddr_in6 *)&dest;
   1021		sin6->sin6_family = AF_INET6;
   1022		sin6->sin6_addr = conn->c_faddr;
   1023		sin6->sin6_port = htons(RDS_CM_PORT);
   1024		sin6->sin6_scope_id = conn->c_dev_if;
   1025	}
   1026
   1027	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
   1028				(struct sockaddr *)&dest,
   1029				RDS_RDMA_RESOLVE_TIMEOUT_MS);
   1030	if (ret) {
   1031		rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
   1032			 ret);
   1033		rdma_destroy_id(ic->i_cm_id);
   1034		ic->i_cm_id = NULL;
   1035	}
   1036
   1037out:
   1038	return ret;
   1039}
   1040
   1041/*
   1042 * This is so careful about only cleaning up resources that were built up
   1043 * so that it can be called at any point during startup.  In fact it
   1044 * can be called multiple times for a given connection.
   1045 */
   1046void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
   1047{
   1048	struct rds_connection *conn = cp->cp_conn;
   1049	struct rds_ib_connection *ic = conn->c_transport_data;
   1050	int err = 0;
   1051
   1052	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
   1053		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
   1054		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
   1055
   1056	if (ic->i_cm_id) {
   1057		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
   1058		err = rdma_disconnect(ic->i_cm_id);
   1059		if (err) {
   1060			/* Actually this may happen quite frequently, when
   1061			 * an outgoing connect raced with an incoming connect.
   1062			 */
   1063			rdsdebug("failed to disconnect, cm: %p err %d\n",
   1064				ic->i_cm_id, err);
   1065		}
   1066
   1067		/* kick off "flush_worker" for all pools in order to reap
   1068		 * all FRMR registrations that are still marked "FRMR_IS_INUSE"
   1069		 */
   1070		rds_ib_flush_mrs();
   1071
   1072		/*
   1073		 * We want to wait for tx and rx completion to finish
   1074		 * before we tear down the connection, but we have to be
   1075		 * careful not to get stuck waiting on a send ring that
   1076		 * only has unsignaled sends in it.  We've shutdown new
   1077		 * sends before getting here so by waiting for signaled
   1078		 * sends to complete we're ensured that there will be no
   1079		 * more tx processing.
   1080		 */
   1081		wait_event(rds_ib_ring_empty_wait,
   1082			   rds_ib_ring_empty(&ic->i_recv_ring) &&
   1083			   (atomic_read(&ic->i_signaled_sends) == 0) &&
   1084			   (atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
   1085			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
   1086		tasklet_kill(&ic->i_send_tasklet);
   1087		tasklet_kill(&ic->i_recv_tasklet);
   1088
   1089		atomic_set(&ic->i_cq_quiesce, 1);
   1090
   1091		/* first destroy the ib state that generates callbacks */
   1092		if (ic->i_cm_id->qp)
   1093			rdma_destroy_qp(ic->i_cm_id);
   1094		if (ic->i_send_cq) {
   1095			if (ic->rds_ibdev)
   1096				ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
   1097			ib_destroy_cq(ic->i_send_cq);
   1098		}
   1099
   1100		if (ic->i_recv_cq) {
   1101			if (ic->rds_ibdev)
   1102				ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
   1103			ib_destroy_cq(ic->i_recv_cq);
   1104		}
   1105
   1106		if (ic->rds_ibdev) {
   1107			/* then free the resources that ib callbacks use */
   1108			if (ic->i_send_hdrs) {
   1109				rds_dma_hdrs_free(ic->rds_ibdev,
   1110						  ic->i_send_hdrs,
   1111						  ic->i_send_hdrs_dma,
   1112						  ic->i_send_ring.w_nr,
   1113						  DMA_TO_DEVICE);
   1114				ic->i_send_hdrs = NULL;
   1115				ic->i_send_hdrs_dma = NULL;
   1116			}
   1117
   1118			if (ic->i_recv_hdrs) {
   1119				rds_dma_hdrs_free(ic->rds_ibdev,
   1120						  ic->i_recv_hdrs,
   1121						  ic->i_recv_hdrs_dma,
   1122						  ic->i_recv_ring.w_nr,
   1123						  DMA_FROM_DEVICE);
   1124				ic->i_recv_hdrs = NULL;
   1125				ic->i_recv_hdrs_dma = NULL;
   1126			}
   1127
   1128			if (ic->i_ack) {
   1129				rds_dma_hdr_free(ic->rds_ibdev->dev, ic->i_ack,
   1130						 ic->i_ack_dma, DMA_TO_DEVICE);
   1131				ic->i_ack = NULL;
   1132			}
   1133		} else {
   1134			WARN_ON(ic->i_send_hdrs);
   1135			WARN_ON(ic->i_send_hdrs_dma);
   1136			WARN_ON(ic->i_recv_hdrs);
   1137			WARN_ON(ic->i_recv_hdrs_dma);
   1138			WARN_ON(ic->i_ack);
   1139		}
   1140
   1141		if (ic->i_sends)
   1142			rds_ib_send_clear_ring(ic);
   1143		if (ic->i_recvs)
   1144			rds_ib_recv_clear_ring(ic);
   1145
   1146		rdma_destroy_id(ic->i_cm_id);
   1147
   1148		/*
   1149		 * Move connection back to the nodev list.
   1150		 */
   1151		if (ic->rds_ibdev)
   1152			rds_ib_remove_conn(ic->rds_ibdev, conn);
   1153
   1154		ic->i_cm_id = NULL;
   1155		ic->i_pd = NULL;
   1156		ic->i_send_cq = NULL;
   1157		ic->i_recv_cq = NULL;
   1158	}
   1159	BUG_ON(ic->rds_ibdev);
   1160
   1161	/* Clear pending transmit */
   1162	if (ic->i_data_op) {
   1163		struct rds_message *rm;
   1164
   1165		rm = container_of(ic->i_data_op, struct rds_message, data);
   1166		rds_message_put(rm);
   1167		ic->i_data_op = NULL;
   1168	}
   1169
   1170	/* Clear the ACK state */
   1171	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
   1172#ifdef KERNEL_HAS_ATOMIC64
   1173	atomic64_set(&ic->i_ack_next, 0);
   1174#else
   1175	ic->i_ack_next = 0;
   1176#endif
   1177	ic->i_ack_recv = 0;
   1178
   1179	/* Clear flow control state */
   1180	ic->i_flowctl = 0;
   1181	atomic_set(&ic->i_credits, 0);
   1182
   1183	/* Re-init rings, but retain sizes. */
   1184	rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr);
   1185	rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr);
   1186
   1187	if (ic->i_ibinc) {
   1188		rds_inc_put(&ic->i_ibinc->ii_inc);
   1189		ic->i_ibinc = NULL;
   1190	}
   1191
   1192	vfree(ic->i_sends);
   1193	ic->i_sends = NULL;
   1194	vfree(ic->i_recvs);
   1195	ic->i_recvs = NULL;
   1196	ic->i_active_side = false;
   1197}
   1198
   1199int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
   1200{
   1201	struct rds_ib_connection *ic;
   1202	unsigned long flags;
   1203	int ret;
   1204
   1205	/* XXX too lazy? */
   1206	ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
   1207	if (!ic)
   1208		return -ENOMEM;
   1209
   1210	ret = rds_ib_recv_alloc_caches(ic, gfp);
   1211	if (ret) {
   1212		kfree(ic);
   1213		return ret;
   1214	}
   1215
   1216	INIT_LIST_HEAD(&ic->ib_node);
   1217	tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send,
   1218		     (unsigned long)ic);
   1219	tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv,
   1220		     (unsigned long)ic);
   1221	mutex_init(&ic->i_recv_mutex);
   1222#ifndef KERNEL_HAS_ATOMIC64
   1223	spin_lock_init(&ic->i_ack_lock);
   1224#endif
   1225	atomic_set(&ic->i_signaled_sends, 0);
   1226	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
   1227
   1228	/*
   1229	 * rds_ib_conn_shutdown() waits for these to be emptied so they
   1230	 * must be initialized before it can be called.
   1231	 */
   1232	rds_ib_ring_init(&ic->i_send_ring, 0);
   1233	rds_ib_ring_init(&ic->i_recv_ring, 0);
   1234
   1235	ic->conn = conn;
   1236	conn->c_transport_data = ic;
   1237
   1238	spin_lock_irqsave(&ib_nodev_conns_lock, flags);
   1239	list_add_tail(&ic->ib_node, &ib_nodev_conns);
   1240	spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
   1241
   1242
   1243	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
   1244	return 0;
   1245}
   1246
   1247/*
   1248 * Free a connection. Connection must be shut down and not set for reconnect.
   1249 */
   1250void rds_ib_conn_free(void *arg)
   1251{
   1252	struct rds_ib_connection *ic = arg;
   1253	spinlock_t	*lock_ptr;
   1254
   1255	rdsdebug("ic %p\n", ic);
   1256
   1257	/*
   1258	 * Conn is either on a dev's list or on the nodev list.
   1259	 * A race with shutdown() or connect() would cause problems
   1260	 * (since rds_ibdev would change) but that should never happen.
   1261	 */
   1262	lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
   1263
   1264	spin_lock_irq(lock_ptr);
   1265	list_del(&ic->ib_node);
   1266	spin_unlock_irq(lock_ptr);
   1267
   1268	rds_ib_recv_free_caches(ic);
   1269
   1270	kfree(ic);
   1271}
   1272
   1273
   1274/*
   1275 * An error occurred on the connection
   1276 */
   1277void
   1278__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
   1279{
   1280	va_list ap;
   1281
   1282	rds_conn_drop(conn);
   1283
   1284	va_start(ap, fmt);
   1285	vprintk(fmt, ap);
   1286	va_end(ap);
   1287}