cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

rdma.c (67052B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * NVMe over Fabrics RDMA host code.
      4 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
      5 */
      6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      7#include <linux/module.h>
      8#include <linux/init.h>
      9#include <linux/slab.h>
     10#include <rdma/mr_pool.h>
     11#include <linux/err.h>
     12#include <linux/string.h>
     13#include <linux/atomic.h>
     14#include <linux/blk-mq.h>
     15#include <linux/blk-mq-rdma.h>
     16#include <linux/blk-integrity.h>
     17#include <linux/types.h>
     18#include <linux/list.h>
     19#include <linux/mutex.h>
     20#include <linux/scatterlist.h>
     21#include <linux/nvme.h>
     22#include <asm/unaligned.h>
     23
     24#include <rdma/ib_verbs.h>
     25#include <rdma/rdma_cm.h>
     26#include <linux/nvme-rdma.h>
     27
     28#include "nvme.h"
     29#include "fabrics.h"
     30
     31
      32#define NVME_RDMA_CONNECT_TIMEOUT_MS	3000		/* 3 seconds */
     33
     34#define NVME_RDMA_MAX_SEGMENTS		256
     35
     36#define NVME_RDMA_MAX_INLINE_SEGMENTS	4
     37
     38#define NVME_RDMA_DATA_SGL_SIZE \
     39	(sizeof(struct scatterlist) * NVME_INLINE_SG_CNT)
     40#define NVME_RDMA_METADATA_SGL_SIZE \
     41	(sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT)
     42
     43struct nvme_rdma_device {
     44	struct ib_device	*dev;
     45	struct ib_pd		*pd;
     46	struct kref		ref;
     47	struct list_head	entry;
     48	unsigned int		num_inline_segments;
     49};
     50
     51struct nvme_rdma_qe {
     52	struct ib_cqe		cqe;
     53	void			*data;
     54	u64			dma;
     55};
     56
     57struct nvme_rdma_sgl {
     58	int			nents;
     59	struct sg_table		sg_table;
     60};
     61
     62struct nvme_rdma_queue;
     63struct nvme_rdma_request {
     64	struct nvme_request	req;
     65	struct ib_mr		*mr;
     66	struct nvme_rdma_qe	sqe;
     67	union nvme_result	result;
     68	__le16			status;
     69	refcount_t		ref;
     70	struct ib_sge		sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
     71	u32			num_sge;
     72	struct ib_reg_wr	reg_wr;
     73	struct ib_cqe		reg_cqe;
     74	struct nvme_rdma_queue  *queue;
     75	struct nvme_rdma_sgl	data_sgl;
     76	struct nvme_rdma_sgl	*metadata_sgl;
     77	bool			use_sig_mr;
     78};
     79
     80enum nvme_rdma_queue_flags {
     81	NVME_RDMA_Q_ALLOCATED		= 0,
     82	NVME_RDMA_Q_LIVE		= 1,
     83	NVME_RDMA_Q_TR_READY		= 2,
     84};
     85
     86struct nvme_rdma_queue {
     87	struct nvme_rdma_qe	*rsp_ring;
     88	int			queue_size;
     89	size_t			cmnd_capsule_len;
     90	struct nvme_rdma_ctrl	*ctrl;
     91	struct nvme_rdma_device	*device;
     92	struct ib_cq		*ib_cq;
     93	struct ib_qp		*qp;
     94
     95	unsigned long		flags;
     96	struct rdma_cm_id	*cm_id;
     97	int			cm_error;
     98	struct completion	cm_done;
     99	bool			pi_support;
    100	int			cq_size;
    101	struct mutex		queue_lock;
    102};
    103
    104struct nvme_rdma_ctrl {
    105	/* read only in the hot path */
    106	struct nvme_rdma_queue	*queues;
    107
    108	/* other member variables */
    109	struct blk_mq_tag_set	tag_set;
    110	struct work_struct	err_work;
    111
    112	struct nvme_rdma_qe	async_event_sqe;
    113
    114	struct delayed_work	reconnect_work;
    115
    116	struct list_head	list;
    117
    118	struct blk_mq_tag_set	admin_tag_set;
    119	struct nvme_rdma_device	*device;
    120
    121	u32			max_fr_pages;
    122
    123	struct sockaddr_storage addr;
    124	struct sockaddr_storage src_addr;
    125
    126	struct nvme_ctrl	ctrl;
    127	bool			use_inline_data;
    128	u32			io_queues[HCTX_MAX_TYPES];
    129};
    130
    131static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
    132{
    133	return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
    134}
    135
    136static LIST_HEAD(device_list);
    137static DEFINE_MUTEX(device_list_mutex);
    138
    139static LIST_HEAD(nvme_rdma_ctrl_list);
    140static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
    141
    142/*
     143 * Disabling this option makes small I/O go faster, but is fundamentally
    144 * unsafe.  With it turned off we will have to register a global rkey that
    145 * allows read and write access to all physical memory.
    146 */
    147static bool register_always = true;
    148module_param(register_always, bool, 0444);
    149MODULE_PARM_DESC(register_always,
    150	 "Use memory registration even for contiguous memory regions");
    151
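/*
 * For context: when register_always is turned off, the protection domain is
 * allocated below with IB_PD_UNSAFE_GLOBAL_RKEY, and requests that fit in a
 * single SGE may be mapped through nvme_rdma_map_sg_single() using that
 * global rkey instead of a per-I/O memory registration. That skips the
 * REG_MR/LOCAL_INV round trips for small I/O, but it hands the target an
 * rkey granting read/write access to all host memory, which is why the
 * default stays true. (Sketch only: "modprobe nvme-rdma register_always=N"
 * would flip it at load time, assuming the module is not built in.)
 */
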
    152static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
    153		struct rdma_cm_event *event);
    154static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
    155static void nvme_rdma_complete_rq(struct request *rq);
    156
    157static const struct blk_mq_ops nvme_rdma_mq_ops;
    158static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
    159
    160static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
    161{
    162	return queue - queue->ctrl->queues;
    163}
    164
    165static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
    166{
    167	return nvme_rdma_queue_idx(queue) >
    168		queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
    169		queue->ctrl->io_queues[HCTX_TYPE_READ];
    170}
    171
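/*
 * Queue index layout, as used above: index 0 is the admin queue and the I/O
 * queues are laid out so that the polled queues come last; a queue whose
 * index exceeds io_queues[HCTX_TYPE_DEFAULT] + io_queues[HCTX_TYPE_READ] is
 * treated as a poll queue. For example, assuming DEFAULT = 4, READ = 4 and
 * POLL = 2, queues 1-8 are interrupt driven and queues 9-10 are polled.
 */
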
    172static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
    173{
    174	return queue->cmnd_capsule_len - sizeof(struct nvme_command);
    175}
    176
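/*
 * Worked example for the helper above (numbers are illustrative): on I/O
 * queues cmnd_capsule_len is set to ioccsz * 16 in nvme_rdma_alloc_queue(),
 * so a target advertising IOCCSZ = 516 yields a 8256-byte capsule and
 * nvme_rdma_inline_data_size() = 8256 - 64 = 8192 bytes of in-capsule data
 * after the 64-byte SQE. The admin queue uses a bare nvme_command, so its
 * inline size is 0.
 */
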
    177static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
    178		size_t capsule_size, enum dma_data_direction dir)
    179{
    180	ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
    181	kfree(qe->data);
    182}
    183
    184static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
    185		size_t capsule_size, enum dma_data_direction dir)
    186{
    187	qe->data = kzalloc(capsule_size, GFP_KERNEL);
    188	if (!qe->data)
    189		return -ENOMEM;
    190
    191	qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
    192	if (ib_dma_mapping_error(ibdev, qe->dma)) {
    193		kfree(qe->data);
    194		qe->data = NULL;
    195		return -ENOMEM;
    196	}
    197
    198	return 0;
    199}
    200
    201static void nvme_rdma_free_ring(struct ib_device *ibdev,
    202		struct nvme_rdma_qe *ring, size_t ib_queue_size,
    203		size_t capsule_size, enum dma_data_direction dir)
    204{
    205	int i;
    206
    207	for (i = 0; i < ib_queue_size; i++)
    208		nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
    209	kfree(ring);
    210}
    211
    212static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
    213		size_t ib_queue_size, size_t capsule_size,
    214		enum dma_data_direction dir)
    215{
    216	struct nvme_rdma_qe *ring;
    217	int i;
    218
    219	ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
    220	if (!ring)
    221		return NULL;
    222
    223	/*
    224	 * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
     225	 * lifetime. It's safe, since any change in the underlying RDMA device
    226	 * will issue error recovery and queue re-creation.
    227	 */
    228	for (i = 0; i < ib_queue_size; i++) {
    229		if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
    230			goto out_free_ring;
    231	}
    232
    233	return ring;
    234
    235out_free_ring:
    236	nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
    237	return NULL;
    238}
    239
    240static void nvme_rdma_qp_event(struct ib_event *event, void *context)
    241{
    242	pr_debug("QP event %s (%d)\n",
    243		 ib_event_msg(event->event), event->event);
    244
    245}
    246
    247static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
    248{
    249	int ret;
    250
    251	ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
    252			msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
    253	if (ret < 0)
    254		return ret;
    255	if (ret == 0)
    256		return -ETIMEDOUT;
    257	WARN_ON_ONCE(queue->cm_error > 0);
    258	return queue->cm_error;
    259}
    260
    261static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
    262{
    263	struct nvme_rdma_device *dev = queue->device;
    264	struct ib_qp_init_attr init_attr;
    265	int ret;
    266
    267	memset(&init_attr, 0, sizeof(init_attr));
    268	init_attr.event_handler = nvme_rdma_qp_event;
    269	/* +1 for drain */
    270	init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
    271	/* +1 for drain */
    272	init_attr.cap.max_recv_wr = queue->queue_size + 1;
    273	init_attr.cap.max_recv_sge = 1;
    274	init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
    275	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
    276	init_attr.qp_type = IB_QPT_RC;
    277	init_attr.send_cq = queue->ib_cq;
    278	init_attr.recv_cq = queue->ib_cq;
    279	if (queue->pi_support)
    280		init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
    281	init_attr.qp_context = queue;
    282
    283	ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
    284
    285	queue->qp = queue->cm_id->qp;
    286	return ret;
    287}
    288
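/*
 * Note on the sizing above: the caller passes factor = 3 (send_wr_factor in
 * nvme_rdma_create_queue_ib()) because a single request may post up to three
 * send-queue WRs: the MR registration, the SEND carrying the command capsule,
 * and a LOCAL_INV if the target did not invalidate the rkey remotely. The
 * "+1" slots on both queues leave room for the drain WR used by
 * ib_drain_qp() during teardown.
 */
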
    289static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
    290		struct request *rq, unsigned int hctx_idx)
    291{
    292	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
    293
    294	kfree(req->sqe.data);
    295}
    296
    297static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
    298		struct request *rq, unsigned int hctx_idx,
    299		unsigned int numa_node)
    300{
    301	struct nvme_rdma_ctrl *ctrl = set->driver_data;
    302	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
    303	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
    304	struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
    305
    306	nvme_req(rq)->ctrl = &ctrl->ctrl;
    307	req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
    308	if (!req->sqe.data)
    309		return -ENOMEM;
    310
    311	/* metadata nvme_rdma_sgl struct is located after command's data SGL */
    312	if (queue->pi_support)
    313		req->metadata_sgl = (void *)nvme_req(rq) +
    314			sizeof(struct nvme_rdma_request) +
    315			NVME_RDMA_DATA_SGL_SIZE;
    316
    317	req->queue = queue;
    318	nvme_req(rq)->cmd = req->sqe.data;
    319
    320	return 0;
    321}
    322
    323static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
    324		unsigned int hctx_idx)
    325{
    326	struct nvme_rdma_ctrl *ctrl = data;
    327	struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
    328
    329	BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);
    330
    331	hctx->driver_data = queue;
    332	return 0;
    333}
    334
    335static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
    336		unsigned int hctx_idx)
    337{
    338	struct nvme_rdma_ctrl *ctrl = data;
    339	struct nvme_rdma_queue *queue = &ctrl->queues[0];
    340
    341	BUG_ON(hctx_idx != 0);
    342
    343	hctx->driver_data = queue;
    344	return 0;
    345}
    346
    347static void nvme_rdma_free_dev(struct kref *ref)
    348{
    349	struct nvme_rdma_device *ndev =
    350		container_of(ref, struct nvme_rdma_device, ref);
    351
    352	mutex_lock(&device_list_mutex);
    353	list_del(&ndev->entry);
    354	mutex_unlock(&device_list_mutex);
    355
    356	ib_dealloc_pd(ndev->pd);
    357	kfree(ndev);
    358}
    359
    360static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
    361{
    362	kref_put(&dev->ref, nvme_rdma_free_dev);
    363}
    364
    365static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
    366{
    367	return kref_get_unless_zero(&dev->ref);
    368}
    369
    370static struct nvme_rdma_device *
    371nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
    372{
    373	struct nvme_rdma_device *ndev;
    374
    375	mutex_lock(&device_list_mutex);
    376	list_for_each_entry(ndev, &device_list, entry) {
    377		if (ndev->dev->node_guid == cm_id->device->node_guid &&
    378		    nvme_rdma_dev_get(ndev))
    379			goto out_unlock;
    380	}
    381
    382	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
    383	if (!ndev)
    384		goto out_err;
    385
    386	ndev->dev = cm_id->device;
    387	kref_init(&ndev->ref);
    388
    389	ndev->pd = ib_alloc_pd(ndev->dev,
    390		register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
    391	if (IS_ERR(ndev->pd))
    392		goto out_free_dev;
    393
    394	if (!(ndev->dev->attrs.device_cap_flags &
    395	      IB_DEVICE_MEM_MGT_EXTENSIONS)) {
    396		dev_err(&ndev->dev->dev,
    397			"Memory registrations not supported.\n");
    398		goto out_free_pd;
    399	}
    400
    401	ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
    402					ndev->dev->attrs.max_send_sge - 1);
    403	list_add(&ndev->entry, &device_list);
    404out_unlock:
    405	mutex_unlock(&device_list_mutex);
    406	return ndev;
    407
    408out_free_pd:
    409	ib_dealloc_pd(ndev->pd);
    410out_free_dev:
    411	kfree(ndev);
    412out_err:
    413	mutex_unlock(&device_list_mutex);
    414	return NULL;
    415}
    416
    417static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
    418{
    419	if (nvme_rdma_poll_queue(queue))
    420		ib_free_cq(queue->ib_cq);
    421	else
    422		ib_cq_pool_put(queue->ib_cq, queue->cq_size);
    423}
    424
    425static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
    426{
    427	struct nvme_rdma_device *dev;
    428	struct ib_device *ibdev;
    429
    430	if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags))
    431		return;
    432
    433	dev = queue->device;
    434	ibdev = dev->dev;
    435
    436	if (queue->pi_support)
    437		ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs);
    438	ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
    439
    440	/*
    441	 * The cm_id object might have been destroyed during RDMA connection
    442	 * establishment error flow to avoid getting other cma events, thus
    443	 * the destruction of the QP shouldn't use rdma_cm API.
    444	 */
    445	ib_destroy_qp(queue->qp);
    446	nvme_rdma_free_cq(queue);
    447
    448	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
    449			sizeof(struct nvme_completion), DMA_FROM_DEVICE);
    450
    451	nvme_rdma_dev_put(dev);
    452}
    453
    454static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
    455{
    456	u32 max_page_list_len;
    457
    458	if (pi_support)
    459		max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len;
    460	else
    461		max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len;
    462
    463	return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
    464}
    465
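/*
 * Rough illustration of the cap above (device numbers assumed): with a
 * device reporting max_fast_reg_page_list_len = 256 and no PI, max_fr_pages
 * becomes min(NVME_RDMA_MAX_SEGMENTS, 256 - 1) = 255. The controller limits
 * derived from it in nvme_rdma_configure_admin_queue() are then
 * max_segments = 255 and max_hw_sectors = 255 << (12 - 9) = 2040 sectors,
 * i.e. roughly 1020 KiB per request. The "- 1" leaves one page-list entry
 * spare; see the pages_per_mr comment in nvme_rdma_create_queue_ib().
 */
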
    466static int nvme_rdma_create_cq(struct ib_device *ibdev,
    467		struct nvme_rdma_queue *queue)
    468{
    469	int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
    470	enum ib_poll_context poll_ctx;
    471
    472	/*
     473	 * Spread I/O queue completion vectors according to their queue index.
    474	 * Admin queues can always go on completion vector 0.
    475	 */
    476	comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
    477
    478	/* Polling queues need direct cq polling context */
    479	if (nvme_rdma_poll_queue(queue)) {
    480		poll_ctx = IB_POLL_DIRECT;
    481		queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
    482					   comp_vector, poll_ctx);
    483	} else {
    484		poll_ctx = IB_POLL_SOFTIRQ;
    485		queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
    486					      comp_vector, poll_ctx);
    487	}
    488
    489	if (IS_ERR(queue->ib_cq)) {
    490		ret = PTR_ERR(queue->ib_cq);
    491		return ret;
    492	}
    493
    494	return 0;
    495}
    496
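/*
 * CQ selection summary: polled queues get their own IB_POLL_DIRECT CQ,
 * intended to be drained from the block layer's poll path, while
 * interrupt-driven queues share per-vector IB_POLL_SOFTIRQ CQs taken from
 * the device-wide pool via ib_cq_pool_get(). nvme_rdma_free_cq() mirrors
 * this split when the queue is torn down.
 */
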
    497static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
    498{
    499	struct ib_device *ibdev;
    500	const int send_wr_factor = 3;			/* MR, SEND, INV */
    501	const int cq_factor = send_wr_factor + 1;	/* + RECV */
    502	int ret, pages_per_mr;
    503
    504	queue->device = nvme_rdma_find_get_device(queue->cm_id);
    505	if (!queue->device) {
    506		dev_err(queue->cm_id->device->dev.parent,
    507			"no client data found!\n");
    508		return -ECONNREFUSED;
    509	}
    510	ibdev = queue->device->dev;
    511
    512	/* +1 for ib_stop_cq */
    513	queue->cq_size = cq_factor * queue->queue_size + 1;
    514
    515	ret = nvme_rdma_create_cq(ibdev, queue);
    516	if (ret)
    517		goto out_put_dev;
    518
    519	ret = nvme_rdma_create_qp(queue, send_wr_factor);
    520	if (ret)
    521		goto out_destroy_ib_cq;
    522
    523	queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
    524			sizeof(struct nvme_completion), DMA_FROM_DEVICE);
    525	if (!queue->rsp_ring) {
    526		ret = -ENOMEM;
    527		goto out_destroy_qp;
    528	}
    529
    530	/*
     531	 * Currently we don't use SG_GAPS MRs, so if the first entry is
    532	 * misaligned we'll end up using two entries for a single data page,
    533	 * so one additional entry is required.
    534	 */
    535	pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1;
    536	ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
    537			      queue->queue_size,
    538			      IB_MR_TYPE_MEM_REG,
    539			      pages_per_mr, 0);
    540	if (ret) {
    541		dev_err(queue->ctrl->ctrl.device,
    542			"failed to initialize MR pool sized %d for QID %d\n",
    543			queue->queue_size, nvme_rdma_queue_idx(queue));
    544		goto out_destroy_ring;
    545	}
    546
    547	if (queue->pi_support) {
    548		ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs,
    549				      queue->queue_size, IB_MR_TYPE_INTEGRITY,
    550				      pages_per_mr, pages_per_mr);
    551		if (ret) {
    552			dev_err(queue->ctrl->ctrl.device,
    553				"failed to initialize PI MR pool sized %d for QID %d\n",
    554				queue->queue_size, nvme_rdma_queue_idx(queue));
    555			goto out_destroy_mr_pool;
    556		}
    557	}
    558
    559	set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
    560
    561	return 0;
    562
    563out_destroy_mr_pool:
    564	ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
    565out_destroy_ring:
    566	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
    567			    sizeof(struct nvme_completion), DMA_FROM_DEVICE);
    568out_destroy_qp:
    569	rdma_destroy_qp(queue->cm_id);
    570out_destroy_ib_cq:
    571	nvme_rdma_free_cq(queue);
    572out_put_dev:
    573	nvme_rdma_dev_put(queue->device);
    574	return ret;
    575}
    576
    577static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
    578		int idx, size_t queue_size)
    579{
    580	struct nvme_rdma_queue *queue;
    581	struct sockaddr *src_addr = NULL;
    582	int ret;
    583
    584	queue = &ctrl->queues[idx];
    585	mutex_init(&queue->queue_lock);
    586	queue->ctrl = ctrl;
    587	if (idx && ctrl->ctrl.max_integrity_segments)
    588		queue->pi_support = true;
    589	else
    590		queue->pi_support = false;
    591	init_completion(&queue->cm_done);
    592
    593	if (idx > 0)
    594		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
    595	else
    596		queue->cmnd_capsule_len = sizeof(struct nvme_command);
    597
    598	queue->queue_size = queue_size;
    599
    600	queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
    601			RDMA_PS_TCP, IB_QPT_RC);
    602	if (IS_ERR(queue->cm_id)) {
    603		dev_info(ctrl->ctrl.device,
    604			"failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
    605		ret = PTR_ERR(queue->cm_id);
    606		goto out_destroy_mutex;
    607	}
    608
    609	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
    610		src_addr = (struct sockaddr *)&ctrl->src_addr;
    611
    612	queue->cm_error = -ETIMEDOUT;
    613	ret = rdma_resolve_addr(queue->cm_id, src_addr,
    614			(struct sockaddr *)&ctrl->addr,
    615			NVME_RDMA_CONNECT_TIMEOUT_MS);
    616	if (ret) {
    617		dev_info(ctrl->ctrl.device,
    618			"rdma_resolve_addr failed (%d).\n", ret);
    619		goto out_destroy_cm_id;
    620	}
    621
    622	ret = nvme_rdma_wait_for_cm(queue);
    623	if (ret) {
    624		dev_info(ctrl->ctrl.device,
    625			"rdma connection establishment failed (%d)\n", ret);
    626		goto out_destroy_cm_id;
    627	}
    628
    629	set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags);
    630
    631	return 0;
    632
    633out_destroy_cm_id:
    634	rdma_destroy_id(queue->cm_id);
    635	nvme_rdma_destroy_queue_ib(queue);
    636out_destroy_mutex:
    637	mutex_destroy(&queue->queue_lock);
    638	return ret;
    639}
    640
    641static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
    642{
    643	rdma_disconnect(queue->cm_id);
    644	ib_drain_qp(queue->qp);
    645}
    646
    647static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
    648{
    649	mutex_lock(&queue->queue_lock);
    650	if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
    651		__nvme_rdma_stop_queue(queue);
    652	mutex_unlock(&queue->queue_lock);
    653}
    654
    655static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
    656{
    657	if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
    658		return;
    659
    660	rdma_destroy_id(queue->cm_id);
    661	nvme_rdma_destroy_queue_ib(queue);
    662	mutex_destroy(&queue->queue_lock);
    663}
    664
    665static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
    666{
    667	int i;
    668
    669	for (i = 1; i < ctrl->ctrl.queue_count; i++)
    670		nvme_rdma_free_queue(&ctrl->queues[i]);
    671}
    672
    673static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
    674{
    675	int i;
    676
    677	for (i = 1; i < ctrl->ctrl.queue_count; i++)
    678		nvme_rdma_stop_queue(&ctrl->queues[i]);
    679}
    680
    681static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
    682{
    683	struct nvme_rdma_queue *queue = &ctrl->queues[idx];
    684	int ret;
    685
    686	if (idx)
    687		ret = nvmf_connect_io_queue(&ctrl->ctrl, idx);
    688	else
    689		ret = nvmf_connect_admin_queue(&ctrl->ctrl);
    690
    691	if (!ret) {
    692		set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
    693	} else {
    694		if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
    695			__nvme_rdma_stop_queue(queue);
    696		dev_info(ctrl->ctrl.device,
    697			"failed to connect queue: %d ret=%d\n", idx, ret);
    698	}
    699	return ret;
    700}
    701
    702static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl)
    703{
    704	int i, ret = 0;
    705
    706	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
    707		ret = nvme_rdma_start_queue(ctrl, i);
    708		if (ret)
    709			goto out_stop_queues;
    710	}
    711
    712	return 0;
    713
    714out_stop_queues:
    715	for (i--; i >= 1; i--)
    716		nvme_rdma_stop_queue(&ctrl->queues[i]);
    717	return ret;
    718}
    719
    720static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
    721{
    722	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
    723	struct ib_device *ibdev = ctrl->device->dev;
    724	unsigned int nr_io_queues, nr_default_queues;
    725	unsigned int nr_read_queues, nr_poll_queues;
    726	int i, ret;
    727
    728	nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
    729				min(opts->nr_io_queues, num_online_cpus()));
    730	nr_default_queues =  min_t(unsigned int, ibdev->num_comp_vectors,
    731				min(opts->nr_write_queues, num_online_cpus()));
    732	nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
    733	nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
    734
    735	ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
    736	if (ret)
    737		return ret;
    738
    739	if (nr_io_queues == 0) {
    740		dev_err(ctrl->ctrl.device,
    741			"unable to set any I/O queues\n");
    742		return -ENOMEM;
    743	}
    744
    745	ctrl->ctrl.queue_count = nr_io_queues + 1;
    746	dev_info(ctrl->ctrl.device,
    747		"creating %d I/O queues.\n", nr_io_queues);
    748
    749	if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
    750		/*
    751		 * separate read/write queues
    752		 * hand out dedicated default queues only after we have
    753		 * sufficient read queues.
    754		 */
    755		ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
    756		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
    757		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
    758			min(nr_default_queues, nr_io_queues);
    759		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
    760	} else {
    761		/*
    762		 * shared read/write queues
    763		 * either no write queues were requested, or we don't have
    764		 * sufficient queue count to have dedicated default queues.
    765		 */
    766		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
    767			min(nr_read_queues, nr_io_queues);
    768		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
    769	}
    770
    771	if (opts->nr_poll_queues && nr_io_queues) {
    772		/* map dedicated poll queues only if we have queues left */
    773		ctrl->io_queues[HCTX_TYPE_POLL] =
    774			min(nr_poll_queues, nr_io_queues);
    775	}
    776
    777	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
    778		ret = nvme_rdma_alloc_queue(ctrl, i,
    779				ctrl->ctrl.sqsize + 1);
    780		if (ret)
    781			goto out_free_queues;
    782	}
    783
    784	return 0;
    785
    786out_free_queues:
    787	for (i--; i >= 1; i--)
    788		nvme_rdma_free_queue(&ctrl->queues[i]);
    789
    790	return ret;
    791}
    792
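/*
 * Worked example of the split above (all numbers are assumptions): a user
 * asking for nr_io_queues=8, nr_write_queues=4 and nr_poll_queues=2 on a
 * host with enough CPUs and completion vectors requests 8 + 4 + 2 = 14
 * queues. If the controller only grants 10, the separate read/write branch
 * yields HCTX_TYPE_READ = 8, HCTX_TYPE_DEFAULT = min(4, 2) = 2 and
 * HCTX_TYPE_POLL = 0, since nothing is left over for dedicated poll queues.
 */
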
    793static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
    794		bool admin)
    795{
    796	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
    797	struct blk_mq_tag_set *set;
    798	int ret;
    799
    800	if (admin) {
    801		set = &ctrl->admin_tag_set;
    802		memset(set, 0, sizeof(*set));
    803		set->ops = &nvme_rdma_admin_mq_ops;
    804		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
    805		set->reserved_tags = NVMF_RESERVED_TAGS;
    806		set->numa_node = nctrl->numa_node;
    807		set->cmd_size = sizeof(struct nvme_rdma_request) +
    808				NVME_RDMA_DATA_SGL_SIZE;
    809		set->driver_data = ctrl;
    810		set->nr_hw_queues = 1;
    811		set->timeout = NVME_ADMIN_TIMEOUT;
    812		set->flags = BLK_MQ_F_NO_SCHED;
    813	} else {
    814		set = &ctrl->tag_set;
    815		memset(set, 0, sizeof(*set));
    816		set->ops = &nvme_rdma_mq_ops;
    817		set->queue_depth = nctrl->sqsize + 1;
    818		set->reserved_tags = NVMF_RESERVED_TAGS;
    819		set->numa_node = nctrl->numa_node;
    820		set->flags = BLK_MQ_F_SHOULD_MERGE;
    821		set->cmd_size = sizeof(struct nvme_rdma_request) +
    822				NVME_RDMA_DATA_SGL_SIZE;
    823		if (nctrl->max_integrity_segments)
    824			set->cmd_size += sizeof(struct nvme_rdma_sgl) +
    825					 NVME_RDMA_METADATA_SGL_SIZE;
    826		set->driver_data = ctrl;
    827		set->nr_hw_queues = nctrl->queue_count - 1;
    828		set->timeout = NVME_IO_TIMEOUT;
    829		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
    830	}
    831
    832	ret = blk_mq_alloc_tag_set(set);
    833	if (ret)
    834		return ERR_PTR(ret);
    835
    836	return set;
    837}
    838
    839static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
    840		bool remove)
    841{
    842	if (remove) {
    843		blk_cleanup_queue(ctrl->ctrl.admin_q);
    844		blk_cleanup_queue(ctrl->ctrl.fabrics_q);
    845		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
    846	}
    847	if (ctrl->async_event_sqe.data) {
    848		cancel_work_sync(&ctrl->ctrl.async_event_work);
    849		nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
    850				sizeof(struct nvme_command), DMA_TO_DEVICE);
    851		ctrl->async_event_sqe.data = NULL;
    852	}
    853	nvme_rdma_free_queue(&ctrl->queues[0]);
    854}
    855
    856static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
    857		bool new)
    858{
    859	bool pi_capable = false;
    860	int error;
    861
    862	error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
    863	if (error)
    864		return error;
    865
    866	ctrl->device = ctrl->queues[0].device;
    867	ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev);
    868
    869	/* T10-PI support */
    870	if (ctrl->device->dev->attrs.kernel_cap_flags &
    871	    IBK_INTEGRITY_HANDOVER)
    872		pi_capable = true;
    873
    874	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
    875							pi_capable);
    876
    877	/*
    878	 * Bind the async event SQE DMA mapping to the admin queue lifetime.
     879	 * It's safe, since any change in the underlying RDMA device will issue
    880	 * error recovery and queue re-creation.
    881	 */
    882	error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
    883			sizeof(struct nvme_command), DMA_TO_DEVICE);
    884	if (error)
    885		goto out_free_queue;
    886
    887	if (new) {
    888		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
    889		if (IS_ERR(ctrl->ctrl.admin_tagset)) {
    890			error = PTR_ERR(ctrl->ctrl.admin_tagset);
    891			goto out_free_async_qe;
    892		}
    893
    894		ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
    895		if (IS_ERR(ctrl->ctrl.fabrics_q)) {
    896			error = PTR_ERR(ctrl->ctrl.fabrics_q);
    897			goto out_free_tagset;
    898		}
    899
    900		ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
    901		if (IS_ERR(ctrl->ctrl.admin_q)) {
    902			error = PTR_ERR(ctrl->ctrl.admin_q);
    903			goto out_cleanup_fabrics_q;
    904		}
    905	}
    906
    907	error = nvme_rdma_start_queue(ctrl, 0);
    908	if (error)
    909		goto out_cleanup_queue;
    910
    911	error = nvme_enable_ctrl(&ctrl->ctrl);
    912	if (error)
    913		goto out_stop_queue;
    914
    915	ctrl->ctrl.max_segments = ctrl->max_fr_pages;
    916	ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
    917	if (pi_capable)
    918		ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
    919	else
    920		ctrl->ctrl.max_integrity_segments = 0;
    921
    922	nvme_start_admin_queue(&ctrl->ctrl);
    923
    924	error = nvme_init_ctrl_finish(&ctrl->ctrl);
    925	if (error)
    926		goto out_quiesce_queue;
    927
    928	return 0;
    929
    930out_quiesce_queue:
    931	nvme_stop_admin_queue(&ctrl->ctrl);
    932	blk_sync_queue(ctrl->ctrl.admin_q);
    933out_stop_queue:
    934	nvme_rdma_stop_queue(&ctrl->queues[0]);
    935	nvme_cancel_admin_tagset(&ctrl->ctrl);
    936out_cleanup_queue:
    937	if (new)
    938		blk_cleanup_queue(ctrl->ctrl.admin_q);
    939out_cleanup_fabrics_q:
    940	if (new)
    941		blk_cleanup_queue(ctrl->ctrl.fabrics_q);
    942out_free_tagset:
    943	if (new)
    944		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
    945out_free_async_qe:
    946	if (ctrl->async_event_sqe.data) {
    947		nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
    948			sizeof(struct nvme_command), DMA_TO_DEVICE);
    949		ctrl->async_event_sqe.data = NULL;
    950	}
    951out_free_queue:
    952	nvme_rdma_free_queue(&ctrl->queues[0]);
    953	return error;
    954}
    955
    956static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
    957		bool remove)
    958{
    959	if (remove) {
    960		blk_cleanup_queue(ctrl->ctrl.connect_q);
    961		blk_mq_free_tag_set(ctrl->ctrl.tagset);
    962	}
    963	nvme_rdma_free_io_queues(ctrl);
    964}
    965
    966static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
    967{
    968	int ret;
    969
    970	ret = nvme_rdma_alloc_io_queues(ctrl);
    971	if (ret)
    972		return ret;
    973
    974	if (new) {
    975		ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
    976		if (IS_ERR(ctrl->ctrl.tagset)) {
    977			ret = PTR_ERR(ctrl->ctrl.tagset);
    978			goto out_free_io_queues;
    979		}
    980
    981		ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl));
    982		if (ret)
    983			goto out_free_tag_set;
    984	}
    985
    986	ret = nvme_rdma_start_io_queues(ctrl);
    987	if (ret)
    988		goto out_cleanup_connect_q;
    989
    990	if (!new) {
    991		nvme_start_queues(&ctrl->ctrl);
    992		if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
    993			/*
    994			 * If we timed out waiting for freeze we are likely to
    995			 * be stuck.  Fail the controller initialization just
    996			 * to be safe.
    997			 */
    998			ret = -ENODEV;
    999			goto out_wait_freeze_timed_out;
   1000		}
   1001		blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
   1002			ctrl->ctrl.queue_count - 1);
   1003		nvme_unfreeze(&ctrl->ctrl);
   1004	}
   1005
   1006	return 0;
   1007
   1008out_wait_freeze_timed_out:
   1009	nvme_stop_queues(&ctrl->ctrl);
   1010	nvme_sync_io_queues(&ctrl->ctrl);
   1011	nvme_rdma_stop_io_queues(ctrl);
   1012out_cleanup_connect_q:
   1013	nvme_cancel_tagset(&ctrl->ctrl);
   1014	if (new)
   1015		blk_cleanup_queue(ctrl->ctrl.connect_q);
   1016out_free_tag_set:
   1017	if (new)
   1018		blk_mq_free_tag_set(ctrl->ctrl.tagset);
   1019out_free_io_queues:
   1020	nvme_rdma_free_io_queues(ctrl);
   1021	return ret;
   1022}
   1023
   1024static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
   1025		bool remove)
   1026{
   1027	nvme_stop_admin_queue(&ctrl->ctrl);
   1028	blk_sync_queue(ctrl->ctrl.admin_q);
   1029	nvme_rdma_stop_queue(&ctrl->queues[0]);
   1030	nvme_cancel_admin_tagset(&ctrl->ctrl);
   1031	if (remove)
   1032		nvme_start_admin_queue(&ctrl->ctrl);
   1033	nvme_rdma_destroy_admin_queue(ctrl, remove);
   1034}
   1035
   1036static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
   1037		bool remove)
   1038{
   1039	if (ctrl->ctrl.queue_count > 1) {
   1040		nvme_start_freeze(&ctrl->ctrl);
   1041		nvme_stop_queues(&ctrl->ctrl);
   1042		nvme_sync_io_queues(&ctrl->ctrl);
   1043		nvme_rdma_stop_io_queues(ctrl);
   1044		nvme_cancel_tagset(&ctrl->ctrl);
   1045		if (remove)
   1046			nvme_start_queues(&ctrl->ctrl);
   1047		nvme_rdma_destroy_io_queues(ctrl, remove);
   1048	}
   1049}
   1050
   1051static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
   1052{
   1053	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
   1054
   1055	cancel_work_sync(&ctrl->err_work);
   1056	cancel_delayed_work_sync(&ctrl->reconnect_work);
   1057}
   1058
   1059static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
   1060{
   1061	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
   1062
   1063	if (list_empty(&ctrl->list))
   1064		goto free_ctrl;
   1065
   1066	mutex_lock(&nvme_rdma_ctrl_mutex);
   1067	list_del(&ctrl->list);
   1068	mutex_unlock(&nvme_rdma_ctrl_mutex);
   1069
   1070	nvmf_free_options(nctrl->opts);
   1071free_ctrl:
   1072	kfree(ctrl->queues);
   1073	kfree(ctrl);
   1074}
   1075
   1076static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
   1077{
   1078	/* If we are resetting/deleting then do nothing */
   1079	if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
   1080		WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
   1081			ctrl->ctrl.state == NVME_CTRL_LIVE);
   1082		return;
   1083	}
   1084
   1085	if (nvmf_should_reconnect(&ctrl->ctrl)) {
   1086		dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
   1087			ctrl->ctrl.opts->reconnect_delay);
   1088		queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
   1089				ctrl->ctrl.opts->reconnect_delay * HZ);
   1090	} else {
   1091		nvme_delete_ctrl(&ctrl->ctrl);
   1092	}
   1093}
   1094
   1095static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
   1096{
   1097	int ret;
   1098	bool changed;
   1099
   1100	ret = nvme_rdma_configure_admin_queue(ctrl, new);
   1101	if (ret)
   1102		return ret;
   1103
   1104	if (ctrl->ctrl.icdoff) {
   1105		ret = -EOPNOTSUPP;
   1106		dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
   1107		goto destroy_admin;
   1108	}
   1109
   1110	if (!(ctrl->ctrl.sgls & (1 << 2))) {
   1111		ret = -EOPNOTSUPP;
   1112		dev_err(ctrl->ctrl.device,
   1113			"Mandatory keyed sgls are not supported!\n");
   1114		goto destroy_admin;
   1115	}
   1116
   1117	if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
   1118		dev_warn(ctrl->ctrl.device,
   1119			"queue_size %zu > ctrl sqsize %u, clamping down\n",
   1120			ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
   1121	}
   1122
   1123	if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
   1124		dev_warn(ctrl->ctrl.device,
   1125			"ctrl sqsize %u > max queue size %u, clamping down\n",
   1126			ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
   1127		ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
   1128	}
   1129
   1130	if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
   1131		dev_warn(ctrl->ctrl.device,
   1132			"sqsize %u > ctrl maxcmd %u, clamping down\n",
   1133			ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
   1134		ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
   1135	}
   1136
   1137	if (ctrl->ctrl.sgls & (1 << 20))
   1138		ctrl->use_inline_data = true;
   1139
   1140	if (ctrl->ctrl.queue_count > 1) {
   1141		ret = nvme_rdma_configure_io_queues(ctrl, new);
   1142		if (ret)
   1143			goto destroy_admin;
   1144	}
   1145
   1146	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
   1147	if (!changed) {
   1148		/*
   1149		 * state change failure is ok if we started ctrl delete,
    1150		 * unless we're in the middle of creating a new controller,
    1151		 * to avoid races with the teardown flow.
   1152		 */
   1153		WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
   1154			     ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
   1155		WARN_ON_ONCE(new);
   1156		ret = -EINVAL;
   1157		goto destroy_io;
   1158	}
   1159
   1160	nvme_start_ctrl(&ctrl->ctrl);
   1161	return 0;
   1162
   1163destroy_io:
   1164	if (ctrl->ctrl.queue_count > 1) {
   1165		nvme_stop_queues(&ctrl->ctrl);
   1166		nvme_sync_io_queues(&ctrl->ctrl);
   1167		nvme_rdma_stop_io_queues(ctrl);
   1168		nvme_cancel_tagset(&ctrl->ctrl);
   1169		nvme_rdma_destroy_io_queues(ctrl, new);
   1170	}
   1171destroy_admin:
   1172	nvme_stop_admin_queue(&ctrl->ctrl);
   1173	blk_sync_queue(ctrl->ctrl.admin_q);
   1174	nvme_rdma_stop_queue(&ctrl->queues[0]);
   1175	nvme_cancel_admin_tagset(&ctrl->ctrl);
   1176	nvme_rdma_destroy_admin_queue(ctrl, new);
   1177	return ret;
   1178}
   1179
   1180static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
   1181{
   1182	struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
   1183			struct nvme_rdma_ctrl, reconnect_work);
   1184
   1185	++ctrl->ctrl.nr_reconnects;
   1186
   1187	if (nvme_rdma_setup_ctrl(ctrl, false))
   1188		goto requeue;
   1189
   1190	dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
   1191			ctrl->ctrl.nr_reconnects);
   1192
   1193	ctrl->ctrl.nr_reconnects = 0;
   1194
   1195	return;
   1196
   1197requeue:
   1198	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
   1199			ctrl->ctrl.nr_reconnects);
   1200	nvme_rdma_reconnect_or_remove(ctrl);
   1201}
   1202
   1203static void nvme_rdma_error_recovery_work(struct work_struct *work)
   1204{
   1205	struct nvme_rdma_ctrl *ctrl = container_of(work,
   1206			struct nvme_rdma_ctrl, err_work);
   1207
   1208	nvme_stop_keep_alive(&ctrl->ctrl);
   1209	flush_work(&ctrl->ctrl.async_event_work);
   1210	nvme_rdma_teardown_io_queues(ctrl, false);
   1211	nvme_start_queues(&ctrl->ctrl);
   1212	nvme_rdma_teardown_admin_queue(ctrl, false);
   1213	nvme_start_admin_queue(&ctrl->ctrl);
   1214
   1215	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
   1216		/* state change failure is ok if we started ctrl delete */
   1217		WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
   1218			     ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
   1219		return;
   1220	}
   1221
   1222	nvme_rdma_reconnect_or_remove(ctrl);
   1223}
   1224
   1225static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
   1226{
   1227	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
   1228		return;
   1229
   1230	dev_warn(ctrl->ctrl.device, "starting error recovery\n");
   1231	queue_work(nvme_reset_wq, &ctrl->err_work);
   1232}
   1233
   1234static void nvme_rdma_end_request(struct nvme_rdma_request *req)
   1235{
   1236	struct request *rq = blk_mq_rq_from_pdu(req);
   1237
   1238	if (!refcount_dec_and_test(&req->ref))
   1239		return;
   1240	if (!nvme_try_complete_req(rq, req->status, req->result))
   1241		nvme_rdma_complete_rq(rq);
   1242}
   1243
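/*
 * Completion model: nvme_rdma_map_data() sets req->ref to 2, one reference
 * for the SEND completion and one for the response side (either the received
 * NVMe completion after remote invalidation, or the LOCAL_INV completion
 * queued by nvme_rdma_inv_rkey()). The request is handed back to the block
 * layer only once both references have been dropped through this helper.
 */
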
   1244static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
   1245		const char *op)
   1246{
   1247	struct nvme_rdma_queue *queue = wc->qp->qp_context;
   1248	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
   1249
   1250	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
   1251		dev_info(ctrl->ctrl.device,
   1252			     "%s for CQE 0x%p failed with status %s (%d)\n",
   1253			     op, wc->wr_cqe,
   1254			     ib_wc_status_msg(wc->status), wc->status);
   1255	nvme_rdma_error_recovery(ctrl);
   1256}
   1257
   1258static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
   1259{
   1260	if (unlikely(wc->status != IB_WC_SUCCESS))
   1261		nvme_rdma_wr_error(cq, wc, "MEMREG");
   1262}
   1263
   1264static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
   1265{
   1266	struct nvme_rdma_request *req =
   1267		container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
   1268
   1269	if (unlikely(wc->status != IB_WC_SUCCESS))
   1270		nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
   1271	else
   1272		nvme_rdma_end_request(req);
   1273}
   1274
   1275static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
   1276		struct nvme_rdma_request *req)
   1277{
   1278	struct ib_send_wr wr = {
   1279		.opcode		    = IB_WR_LOCAL_INV,
   1280		.next		    = NULL,
   1281		.num_sge	    = 0,
   1282		.send_flags	    = IB_SEND_SIGNALED,
   1283		.ex.invalidate_rkey = req->mr->rkey,
   1284	};
   1285
   1286	req->reg_cqe.done = nvme_rdma_inv_rkey_done;
   1287	wr.wr_cqe = &req->reg_cqe;
   1288
   1289	return ib_post_send(queue->qp, &wr, NULL);
   1290}
   1291
   1292static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq)
   1293{
   1294	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
   1295
   1296	if (blk_integrity_rq(rq)) {
   1297		ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
   1298				req->metadata_sgl->nents, rq_dma_dir(rq));
   1299		sg_free_table_chained(&req->metadata_sgl->sg_table,
   1300				      NVME_INLINE_METADATA_SG_CNT);
   1301	}
   1302
   1303	ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
   1304			rq_dma_dir(rq));
   1305	sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
   1306}
   1307
   1308static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
   1309		struct request *rq)
   1310{
   1311	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
   1312	struct nvme_rdma_device *dev = queue->device;
   1313	struct ib_device *ibdev = dev->dev;
   1314	struct list_head *pool = &queue->qp->rdma_mrs;
   1315
   1316	if (!blk_rq_nr_phys_segments(rq))
   1317		return;
   1318
   1319	if (req->use_sig_mr)
   1320		pool = &queue->qp->sig_mrs;
   1321
   1322	if (req->mr) {
   1323		ib_mr_pool_put(queue->qp, pool, req->mr);
   1324		req->mr = NULL;
   1325	}
   1326
   1327	nvme_rdma_dma_unmap_req(ibdev, rq);
   1328}
   1329
   1330static int nvme_rdma_set_sg_null(struct nvme_command *c)
   1331{
   1332	struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
   1333
   1334	sg->addr = 0;
   1335	put_unaligned_le24(0, sg->length);
   1336	put_unaligned_le32(0, sg->key);
   1337	sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
   1338	return 0;
   1339}
   1340
   1341static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
   1342		struct nvme_rdma_request *req, struct nvme_command *c,
   1343		int count)
   1344{
   1345	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
   1346	struct ib_sge *sge = &req->sge[1];
   1347	struct scatterlist *sgl;
   1348	u32 len = 0;
   1349	int i;
   1350
   1351	for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) {
   1352		sge->addr = sg_dma_address(sgl);
   1353		sge->length = sg_dma_len(sgl);
   1354		sge->lkey = queue->device->pd->local_dma_lkey;
   1355		len += sge->length;
   1356		sge++;
   1357	}
   1358
   1359	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
   1360	sg->length = cpu_to_le32(len);
   1361	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
   1362
   1363	req->num_sge += count;
   1364	return 0;
   1365}
   1366
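/*
 * In the inline case above, the first SGE (filled in by
 * nvme_rdma_post_send()) carries the 64-byte command itself and
 * sge[1..count] point straight at the payload pages, so the data travels
 * inside the command capsule and no rkey is exposed. The SGL data block
 * descriptor's address field holds the controller's ICDOFF;
 * nvme_rdma_setup_ctrl() rejects controllers with a non-zero icdoff, so in
 * practice this is 0.
 */
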
   1367static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
   1368		struct nvme_rdma_request *req, struct nvme_command *c)
   1369{
   1370	struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
   1371
   1372	sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl));
   1373	put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length);
   1374	put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
   1375	sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
   1376	return 0;
   1377}
   1378
   1379static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
   1380		struct nvme_rdma_request *req, struct nvme_command *c,
   1381		int count)
   1382{
   1383	struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
   1384	int nr;
   1385
   1386	req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
   1387	if (WARN_ON_ONCE(!req->mr))
   1388		return -EAGAIN;
   1389
   1390	/*
   1391	 * Align the MR to a 4K page size to match the ctrl page size and
   1392	 * the block virtual boundary.
   1393	 */
   1394	nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL,
   1395			  SZ_4K);
   1396	if (unlikely(nr < count)) {
   1397		ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
   1398		req->mr = NULL;
   1399		if (nr < 0)
   1400			return nr;
   1401		return -EINVAL;
   1402	}
   1403
   1404	ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
   1405
   1406	req->reg_cqe.done = nvme_rdma_memreg_done;
   1407	memset(&req->reg_wr, 0, sizeof(req->reg_wr));
   1408	req->reg_wr.wr.opcode = IB_WR_REG_MR;
   1409	req->reg_wr.wr.wr_cqe = &req->reg_cqe;
   1410	req->reg_wr.wr.num_sge = 0;
   1411	req->reg_wr.mr = req->mr;
   1412	req->reg_wr.key = req->mr->rkey;
   1413	req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
   1414			     IB_ACCESS_REMOTE_READ |
   1415			     IB_ACCESS_REMOTE_WRITE;
   1416
   1417	sg->addr = cpu_to_le64(req->mr->iova);
   1418	put_unaligned_le24(req->mr->length, sg->length);
   1419	put_unaligned_le32(req->mr->rkey, sg->key);
   1420	sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
   1421			NVME_SGL_FMT_INVALIDATE;
   1422
   1423	return 0;
   1424}
   1425
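/*
 * The registration prepared above is not posted here: the submission path is
 * expected to chain req->reg_wr ahead of the command SEND (the "first"
 * argument of nvme_rdma_post_send()), so REG_MR and SEND go out as one
 * chain. NVME_SGL_FMT_INVALIDATE in the descriptor type invites the target
 * to respond with SEND_WITH_INVALIDATE; nvme_rdma_process_nvme_rsp() checks
 * for that and falls back to a LOCAL_INV work request when the rkey was not
 * invalidated remotely.
 */
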
   1426static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
   1427		struct nvme_command *cmd, struct ib_sig_domain *domain,
   1428		u16 control, u8 pi_type)
   1429{
   1430	domain->sig_type = IB_SIG_TYPE_T10_DIF;
   1431	domain->sig.dif.bg_type = IB_T10DIF_CRC;
   1432	domain->sig.dif.pi_interval = 1 << bi->interval_exp;
   1433	domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
   1434	if (control & NVME_RW_PRINFO_PRCHK_REF)
   1435		domain->sig.dif.ref_remap = true;
   1436
   1437	domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
   1438	domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
   1439	domain->sig.dif.app_escape = true;
   1440	if (pi_type == NVME_NS_DPS_PI_TYPE3)
   1441		domain->sig.dif.ref_escape = true;
   1442}
   1443
   1444static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
   1445		struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
   1446		u8 pi_type)
   1447{
   1448	u16 control = le16_to_cpu(cmd->rw.control);
   1449
   1450	memset(sig_attrs, 0, sizeof(*sig_attrs));
   1451	if (control & NVME_RW_PRINFO_PRACT) {
   1452		/* for WRITE_INSERT/READ_STRIP no memory domain */
   1453		sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
   1454		nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
   1455					 pi_type);
   1456		/* Clear the PRACT bit since HCA will generate/verify the PI */
   1457		control &= ~NVME_RW_PRINFO_PRACT;
   1458		cmd->rw.control = cpu_to_le16(control);
   1459	} else {
   1460		/* for WRITE_PASS/READ_PASS both wire/memory domains exist */
   1461		nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
   1462					 pi_type);
   1463		nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
   1464					 pi_type);
   1465	}
   1466}
   1467
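/*
 * In short: with PRACT set the HCA inserts or strips the protection
 * information itself, so only the wire domain is described and the memory
 * domain is IB_SIG_TYPE_NONE; the PRACT bit is then cleared because, from
 * the target's point of view, the command now carries PI on the wire.
 * Without PRACT the data already holds PI in host memory, so both wire and
 * memory domains are programmed identically.
 */
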
   1468static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask)
   1469{
   1470	*mask = 0;
   1471	if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
   1472		*mask |= IB_SIG_CHECK_REFTAG;
   1473	if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
   1474		*mask |= IB_SIG_CHECK_GUARD;
   1475}
   1476
   1477static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc)
   1478{
   1479	if (unlikely(wc->status != IB_WC_SUCCESS))
   1480		nvme_rdma_wr_error(cq, wc, "SIG");
   1481}
   1482
   1483static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
   1484		struct nvme_rdma_request *req, struct nvme_command *c,
   1485		int count, int pi_count)
   1486{
   1487	struct nvme_rdma_sgl *sgl = &req->data_sgl;
   1488	struct ib_reg_wr *wr = &req->reg_wr;
   1489	struct request *rq = blk_mq_rq_from_pdu(req);
   1490	struct nvme_ns *ns = rq->q->queuedata;
   1491	struct bio *bio = rq->bio;
   1492	struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
   1493	int nr;
   1494
   1495	req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
   1496	if (WARN_ON_ONCE(!req->mr))
   1497		return -EAGAIN;
   1498
   1499	nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL,
   1500			     req->metadata_sgl->sg_table.sgl, pi_count, NULL,
   1501			     SZ_4K);
   1502	if (unlikely(nr))
   1503		goto mr_put;
   1504
   1505	nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
   1506				req->mr->sig_attrs, ns->pi_type);
   1507	nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
   1508
   1509	ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
   1510
   1511	req->reg_cqe.done = nvme_rdma_sig_done;
   1512	memset(wr, 0, sizeof(*wr));
   1513	wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
   1514	wr->wr.wr_cqe = &req->reg_cqe;
   1515	wr->wr.num_sge = 0;
   1516	wr->wr.send_flags = 0;
   1517	wr->mr = req->mr;
   1518	wr->key = req->mr->rkey;
   1519	wr->access = IB_ACCESS_LOCAL_WRITE |
   1520		     IB_ACCESS_REMOTE_READ |
   1521		     IB_ACCESS_REMOTE_WRITE;
   1522
   1523	sg->addr = cpu_to_le64(req->mr->iova);
   1524	put_unaligned_le24(req->mr->length, sg->length);
   1525	put_unaligned_le32(req->mr->rkey, sg->key);
   1526	sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
   1527
   1528	return 0;
   1529
   1530mr_put:
   1531	ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
   1532	req->mr = NULL;
   1533	if (nr < 0)
   1534		return nr;
   1535	return -EINVAL;
   1536}
   1537
   1538static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq,
   1539		int *count, int *pi_count)
   1540{
   1541	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
   1542	int ret;
   1543
   1544	req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1);
   1545	ret = sg_alloc_table_chained(&req->data_sgl.sg_table,
   1546			blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl,
   1547			NVME_INLINE_SG_CNT);
   1548	if (ret)
   1549		return -ENOMEM;
   1550
   1551	req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
   1552					    req->data_sgl.sg_table.sgl);
   1553
   1554	*count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
   1555			       req->data_sgl.nents, rq_dma_dir(rq));
   1556	if (unlikely(*count <= 0)) {
   1557		ret = -EIO;
   1558		goto out_free_table;
   1559	}
   1560
   1561	if (blk_integrity_rq(rq)) {
   1562		req->metadata_sgl->sg_table.sgl =
   1563			(struct scatterlist *)(req->metadata_sgl + 1);
   1564		ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table,
   1565				blk_rq_count_integrity_sg(rq->q, rq->bio),
   1566				req->metadata_sgl->sg_table.sgl,
   1567				NVME_INLINE_METADATA_SG_CNT);
   1568		if (unlikely(ret)) {
   1569			ret = -ENOMEM;
   1570			goto out_unmap_sg;
   1571		}
   1572
   1573		req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q,
   1574				rq->bio, req->metadata_sgl->sg_table.sgl);
   1575		*pi_count = ib_dma_map_sg(ibdev,
   1576					  req->metadata_sgl->sg_table.sgl,
   1577					  req->metadata_sgl->nents,
   1578					  rq_dma_dir(rq));
   1579		if (unlikely(*pi_count <= 0)) {
   1580			ret = -EIO;
   1581			goto out_free_pi_table;
   1582		}
   1583	}
   1584
   1585	return 0;
   1586
   1587out_free_pi_table:
   1588	sg_free_table_chained(&req->metadata_sgl->sg_table,
   1589			      NVME_INLINE_METADATA_SG_CNT);
   1590out_unmap_sg:
   1591	ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
   1592			rq_dma_dir(rq));
   1593out_free_table:
   1594	sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
   1595	return ret;
   1596}
   1597
   1598static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
   1599		struct request *rq, struct nvme_command *c)
   1600{
   1601	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
   1602	struct nvme_rdma_device *dev = queue->device;
   1603	struct ib_device *ibdev = dev->dev;
   1604	int pi_count = 0;
   1605	int count, ret;
   1606
   1607	req->num_sge = 1;
   1608	refcount_set(&req->ref, 2); /* send and recv completions */
   1609
   1610	c->common.flags |= NVME_CMD_SGL_METABUF;
   1611
   1612	if (!blk_rq_nr_phys_segments(rq))
   1613		return nvme_rdma_set_sg_null(c);
   1614
   1615	ret = nvme_rdma_dma_map_req(ibdev, rq, &count, &pi_count);
   1616	if (unlikely(ret))
   1617		return ret;
   1618
   1619	if (req->use_sig_mr) {
   1620		ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count);
   1621		goto out;
   1622	}
   1623
   1624	if (count <= dev->num_inline_segments) {
   1625		if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
   1626		    queue->ctrl->use_inline_data &&
   1627		    blk_rq_payload_bytes(rq) <=
   1628				nvme_rdma_inline_data_size(queue)) {
   1629			ret = nvme_rdma_map_sg_inline(queue, req, c, count);
   1630			goto out;
   1631		}
   1632
   1633		if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
   1634			ret = nvme_rdma_map_sg_single(queue, req, c);
   1635			goto out;
   1636		}
   1637	}
   1638
   1639	ret = nvme_rdma_map_sg_fr(queue, req, c, count);
   1640out:
   1641	if (unlikely(ret))
   1642		goto out_dma_unmap_req;
   1643
   1644	return 0;
   1645
   1646out_dma_unmap_req:
   1647	nvme_rdma_dma_unmap_req(ibdev, rq);
   1648	return ret;
   1649}
   1650
   1651static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
   1652{
   1653	struct nvme_rdma_qe *qe =
   1654		container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
   1655	struct nvme_rdma_request *req =
   1656		container_of(qe, struct nvme_rdma_request, sqe);
   1657
   1658	if (unlikely(wc->status != IB_WC_SUCCESS))
   1659		nvme_rdma_wr_error(cq, wc, "SEND");
   1660	else
   1661		nvme_rdma_end_request(req);
   1662}
   1663
   1664static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
   1665		struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
   1666		struct ib_send_wr *first)
   1667{
   1668	struct ib_send_wr wr;
   1669	int ret;
   1670
   1671	sge->addr   = qe->dma;
   1672	sge->length = sizeof(struct nvme_command);
   1673	sge->lkey   = queue->device->pd->local_dma_lkey;
   1674
   1675	wr.next       = NULL;
   1676	wr.wr_cqe     = &qe->cqe;
   1677	wr.sg_list    = sge;
   1678	wr.num_sge    = num_sge;
   1679	wr.opcode     = IB_WR_SEND;
   1680	wr.send_flags = IB_SEND_SIGNALED;
   1681
   1682	if (first)
   1683		first->next = &wr;
   1684	else
   1685		first = &wr;
   1686
   1687	ret = ib_post_send(queue->qp, first, NULL);
   1688	if (unlikely(ret)) {
   1689		dev_err(queue->ctrl->ctrl.device,
   1690			     "%s failed with error code %d\n", __func__, ret);
   1691	}
   1692	return ret;
   1693}
   1694
   1695static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
   1696		struct nvme_rdma_qe *qe)
   1697{
   1698	struct ib_recv_wr wr;
   1699	struct ib_sge list;
   1700	int ret;
   1701
   1702	list.addr   = qe->dma;
   1703	list.length = sizeof(struct nvme_completion);
   1704	list.lkey   = queue->device->pd->local_dma_lkey;
   1705
   1706	qe->cqe.done = nvme_rdma_recv_done;
   1707
   1708	wr.next     = NULL;
   1709	wr.wr_cqe   = &qe->cqe;
   1710	wr.sg_list  = &list;
   1711	wr.num_sge  = 1;
   1712
   1713	ret = ib_post_recv(queue->qp, &wr, NULL);
   1714	if (unlikely(ret)) {
   1715		dev_err(queue->ctrl->ctrl.device,
   1716			"%s failed with error code %d\n", __func__, ret);
   1717	}
   1718	return ret;
   1719}
   1720
   1721static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
   1722{
   1723	u32 queue_idx = nvme_rdma_queue_idx(queue);
   1724
   1725	if (queue_idx == 0)
   1726		return queue->ctrl->admin_tag_set.tags[queue_idx];
   1727	return queue->ctrl->tag_set.tags[queue_idx - 1];
   1728}
   1729
   1730static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
   1731{
   1732	if (unlikely(wc->status != IB_WC_SUCCESS))
   1733		nvme_rdma_wr_error(cq, wc, "ASYNC");
   1734}
   1735
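/*
 * AER commands have no struct request; build the command directly in
 * the pre-allocated async_event_sqe and post it on the admin queue.
 */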
   1736static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
   1737{
   1738	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
   1739	struct nvme_rdma_queue *queue = &ctrl->queues[0];
   1740	struct ib_device *dev = queue->device->dev;
   1741	struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
   1742	struct nvme_command *cmd = sqe->data;
   1743	struct ib_sge sge;
   1744	int ret;
   1745
   1746	ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
   1747
   1748	memset(cmd, 0, sizeof(*cmd));
   1749	cmd->common.opcode = nvme_admin_async_event;
   1750	cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
   1751	cmd->common.flags |= NVME_CMD_SGL_METABUF;
   1752	nvme_rdma_set_sg_null(cmd);
   1753
   1754	sqe->cqe.done = nvme_rdma_async_done;
   1755
   1756	ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
   1757			DMA_TO_DEVICE);
   1758
   1759	ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
   1760	WARN_ON_ONCE(ret);
   1761}
   1762
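/*
 * Handle an NVMe completion: look up the request by command_id, record
 * its status/result, and make sure the MR rkey gets invalidated (either
 * validated as remotely invalidated, or by queueing a local
 * invalidation) before the request is completed.
 */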
   1763static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
   1764		struct nvme_completion *cqe, struct ib_wc *wc)
   1765{
   1766	struct request *rq;
   1767	struct nvme_rdma_request *req;
   1768
   1769	rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id);
   1770	if (!rq) {
   1771		dev_err(queue->ctrl->ctrl.device,
   1772			"got bad command_id %#x on QP %#x\n",
   1773			cqe->command_id, queue->qp->qp_num);
   1774		nvme_rdma_error_recovery(queue->ctrl);
   1775		return;
   1776	}
   1777	req = blk_mq_rq_to_pdu(rq);
   1778
   1779	req->status = cqe->status;
   1780	req->result = cqe->result;
   1781
   1782	if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
   1783		if (unlikely(!req->mr ||
   1784			     wc->ex.invalidate_rkey != req->mr->rkey)) {
   1785			dev_err(queue->ctrl->ctrl.device,
   1786				"Bogus remote invalidation for rkey %#x\n",
   1787				req->mr ? req->mr->rkey : 0);
   1788			nvme_rdma_error_recovery(queue->ctrl);
   1789		}
   1790	} else if (req->mr) {
   1791		int ret;
   1792
   1793		ret = nvme_rdma_inv_rkey(queue, req);
   1794		if (unlikely(ret < 0)) {
   1795			dev_err(queue->ctrl->ctrl.device,
   1796				"Queueing INV WR for rkey %#x failed (%d)\n",
   1797				req->mr->rkey, ret);
   1798			nvme_rdma_error_recovery(queue->ctrl);
   1799		}
   1800		/* the local invalidation completion will end the request */
   1801		return;
   1802	}
   1803
   1804	nvme_rdma_end_request(req);
   1805}
   1806
   1807static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
   1808{
   1809	struct nvme_rdma_qe *qe =
   1810		container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
   1811	struct nvme_rdma_queue *queue = wc->qp->qp_context;
   1812	struct ib_device *ibdev = queue->device->dev;
   1813	struct nvme_completion *cqe = qe->data;
   1814	const size_t len = sizeof(struct nvme_completion);
   1815
   1816	if (unlikely(wc->status != IB_WC_SUCCESS)) {
   1817		nvme_rdma_wr_error(cq, wc, "RECV");
   1818		return;
   1819	}
   1820
   1821	/* sanity checking for received data length */
   1822	if (unlikely(wc->byte_len < len)) {
   1823		dev_err(queue->ctrl->ctrl.device,
			"Unexpected nvme completion length (%d)\n", wc->byte_len);
   1825		nvme_rdma_error_recovery(queue->ctrl);
   1826		return;
   1827	}
   1828
   1829	ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
   1830	/*
   1831	 * AEN requests are special as they don't time out and can
   1832	 * survive any kind of queue freeze and often don't respond to
   1833	 * aborts.  We don't even bother to allocate a struct request
   1834	 * for them but rather special case them here.
   1835	 */
   1836	if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue),
   1837				     cqe->command_id)))
   1838		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
   1839				&cqe->result);
   1840	else
   1841		nvme_rdma_process_nvme_rsp(queue, cqe, wc);
   1842	ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
   1843
   1844	nvme_rdma_post_recv(queue, qe);
   1845}
   1846
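/* Fill the receive queue once the RDMA connection is established. */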
   1847static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
   1848{
   1849	int ret, i;
   1850
   1851	for (i = 0; i < queue->queue_size; i++) {
   1852		ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
   1853		if (ret)
   1854			return ret;
   1855	}
   1856
   1857	return 0;
   1858}
   1859
   1860static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
   1861		struct rdma_cm_event *ev)
   1862{
   1863	struct rdma_cm_id *cm_id = queue->cm_id;
   1864	int status = ev->status;
   1865	const char *rej_msg;
   1866	const struct nvme_rdma_cm_rej *rej_data;
   1867	u8 rej_data_len;
   1868
   1869	rej_msg = rdma_reject_msg(cm_id, status);
   1870	rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len);
   1871
   1872	if (rej_data && rej_data_len >= sizeof(u16)) {
   1873		u16 sts = le16_to_cpu(rej_data->sts);
   1874
   1875		dev_err(queue->ctrl->ctrl.device,
   1876		      "Connect rejected: status %d (%s) nvme status %d (%s).\n",
   1877		      status, rej_msg, sts, nvme_rdma_cm_msg(sts));
   1878	} else {
   1879		dev_err(queue->ctrl->ctrl.device,
   1880			"Connect rejected: status %d (%s).\n", status, rej_msg);
   1881	}
   1882
   1883	return -ECONNRESET;
   1884}
   1885
   1886static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
   1887{
   1888	struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
   1889	int ret;
   1890
   1891	ret = nvme_rdma_create_queue_ib(queue);
   1892	if (ret)
   1893		return ret;
   1894
   1895	if (ctrl->opts->tos >= 0)
   1896		rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
   1897	ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
   1898	if (ret) {
   1899		dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
    1900			ret);
   1901		goto out_destroy_queue;
   1902	}
   1903
   1904	return 0;
   1905
   1906out_destroy_queue:
   1907	nvme_rdma_destroy_queue_ib(queue);
   1908	return ret;
   1909}
   1910
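/*
 * Route is resolved: build the NVMe/RDMA CM request private data
 * (queue id and host send/receive queue sizes) and issue the connect.
 */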
   1911static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
   1912{
   1913	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
   1914	struct rdma_conn_param param = { };
   1915	struct nvme_rdma_cm_req priv = { };
   1916	int ret;
   1917
   1918	param.qp_num = queue->qp->qp_num;
   1919	param.flow_control = 1;
   1920
   1921	param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
   1922	/* maximum retry count */
   1923	param.retry_count = 7;
   1924	param.rnr_retry_count = 7;
   1925	param.private_data = &priv;
   1926	param.private_data_len = sizeof(priv);
   1927
   1928	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
   1929	priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
   1930	/*
   1931	 * set the admin queue depth to the minimum size
   1932	 * specified by the Fabrics standard.
   1933	 */
   1934	if (priv.qid == 0) {
   1935		priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH);
   1936		priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1);
   1937	} else {
   1938		/*
   1939		 * current interpretation of the fabrics spec
    1940		 * The current interpretation of the Fabrics spec is
    1941		 * that, at minimum, hrqsize is sqsize + 1, i.e. the
    1942		 * 1's-based representation of sqsize.
   1943		priv.hrqsize = cpu_to_le16(queue->queue_size);
   1944		priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
   1945	}
   1946
   1947	ret = rdma_connect_locked(queue->cm_id, &param);
   1948	if (ret) {
   1949		dev_err(ctrl->ctrl.device,
   1950			"rdma_connect_locked failed (%d).\n", ret);
   1951		return ret;
   1952	}
   1953
   1954	return 0;
   1955}
   1956
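/*
 * RDMA CM event handler.  Address/route resolution and rejection errors
 * are reported back through queue->cm_error and cm_done; an established
 * connection completes cm_done directly, and disconnect-type events
 * trigger controller error recovery.
 */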
   1957static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
   1958		struct rdma_cm_event *ev)
   1959{
   1960	struct nvme_rdma_queue *queue = cm_id->context;
   1961	int cm_error = 0;
   1962
   1963	dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
   1964		rdma_event_msg(ev->event), ev->event,
   1965		ev->status, cm_id);
   1966
   1967	switch (ev->event) {
   1968	case RDMA_CM_EVENT_ADDR_RESOLVED:
   1969		cm_error = nvme_rdma_addr_resolved(queue);
   1970		break;
   1971	case RDMA_CM_EVENT_ROUTE_RESOLVED:
   1972		cm_error = nvme_rdma_route_resolved(queue);
   1973		break;
   1974	case RDMA_CM_EVENT_ESTABLISHED:
   1975		queue->cm_error = nvme_rdma_conn_established(queue);
   1976		/* complete cm_done regardless of success/failure */
   1977		complete(&queue->cm_done);
   1978		return 0;
   1979	case RDMA_CM_EVENT_REJECTED:
   1980		cm_error = nvme_rdma_conn_rejected(queue, ev);
   1981		break;
   1982	case RDMA_CM_EVENT_ROUTE_ERROR:
   1983	case RDMA_CM_EVENT_CONNECT_ERROR:
   1984	case RDMA_CM_EVENT_UNREACHABLE:
   1985	case RDMA_CM_EVENT_ADDR_ERROR:
   1986		dev_dbg(queue->ctrl->ctrl.device,
   1987			"CM error event %d\n", ev->event);
   1988		cm_error = -ECONNRESET;
   1989		break;
   1990	case RDMA_CM_EVENT_DISCONNECTED:
   1991	case RDMA_CM_EVENT_ADDR_CHANGE:
   1992	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
   1993		dev_dbg(queue->ctrl->ctrl.device,
   1994			"disconnect received - connection closed\n");
   1995		nvme_rdma_error_recovery(queue->ctrl);
   1996		break;
   1997	case RDMA_CM_EVENT_DEVICE_REMOVAL:
   1998		/* device removal is handled via the ib_client API */
   1999		break;
   2000	default:
   2001		dev_err(queue->ctrl->ctrl.device,
   2002			"Unexpected RDMA CM event (%d)\n", ev->event);
   2003		nvme_rdma_error_recovery(queue->ctrl);
   2004		break;
   2005	}
   2006
   2007	if (cm_error) {
   2008		queue->cm_error = cm_error;
   2009		complete(&queue->cm_done);
   2010	}
   2011
   2012	return 0;
   2013}
   2014
   2015static void nvme_rdma_complete_timed_out(struct request *rq)
   2016{
   2017	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
   2018	struct nvme_rdma_queue *queue = req->queue;
   2019
   2020	nvme_rdma_stop_queue(queue);
   2021	nvmf_complete_timed_out_request(rq);
   2022}
   2023
   2024static enum blk_eh_timer_return
   2025nvme_rdma_timeout(struct request *rq, bool reserved)
   2026{
   2027	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
   2028	struct nvme_rdma_queue *queue = req->queue;
   2029	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
   2030
   2031	dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
   2032		 rq->tag, nvme_rdma_queue_idx(queue));
   2033
   2034	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
   2035		/*
   2036		 * If we are resetting, connecting or deleting we should
   2037		 * complete immediately because we may block controller
   2038		 * teardown or setup sequence
   2039		 * - ctrl disable/shutdown fabrics requests
   2040		 * - connect requests
   2041		 * - initialization admin requests
   2042		 * - I/O requests that entered after unquiescing and
   2043		 *   the controller stopped responding
   2044		 *
   2045		 * All other requests should be cancelled by the error
   2046		 * recovery work, so it's fine that we fail it here.
   2047		 */
   2048		nvme_rdma_complete_timed_out(rq);
   2049		return BLK_EH_DONE;
   2050	}
   2051
   2052	/*
   2053	 * LIVE state should trigger the normal error recovery which will
   2054	 * handle completing this request.
   2055	 */
   2056	nvme_rdma_error_recovery(ctrl);
   2057	return BLK_EH_RESET_TIMER;
   2058}
   2059
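/*
 * blk-mq .queue_rq handler: map the command capsule and data for DMA,
 * decide whether the PI/signature MR path is needed, and post the send
 * WR (chained after the MR registration WR when one was built).
 */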
   2060static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
   2061		const struct blk_mq_queue_data *bd)
   2062{
   2063	struct nvme_ns *ns = hctx->queue->queuedata;
   2064	struct nvme_rdma_queue *queue = hctx->driver_data;
   2065	struct request *rq = bd->rq;
   2066	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
   2067	struct nvme_rdma_qe *sqe = &req->sqe;
   2068	struct nvme_command *c = nvme_req(rq)->cmd;
   2069	struct ib_device *dev;
   2070	bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags);
   2071	blk_status_t ret;
   2072	int err;
   2073
   2074	WARN_ON_ONCE(rq->tag < 0);
   2075
   2076	if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
   2077		return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
   2078
   2079	dev = queue->device->dev;
   2080
   2081	req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
   2082					 sizeof(struct nvme_command),
   2083					 DMA_TO_DEVICE);
   2084	err = ib_dma_mapping_error(dev, req->sqe.dma);
   2085	if (unlikely(err))
   2086		return BLK_STS_RESOURCE;
   2087
   2088	ib_dma_sync_single_for_cpu(dev, sqe->dma,
   2089			sizeof(struct nvme_command), DMA_TO_DEVICE);
   2090
   2091	ret = nvme_setup_cmd(ns, rq);
   2092	if (ret)
   2093		goto unmap_qe;
   2094
   2095	blk_mq_start_request(rq);
   2096
   2097	if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
   2098	    queue->pi_support &&
   2099	    (c->common.opcode == nvme_cmd_write ||
   2100	     c->common.opcode == nvme_cmd_read) &&
   2101	    nvme_ns_has_pi(ns))
   2102		req->use_sig_mr = true;
   2103	else
   2104		req->use_sig_mr = false;
   2105
   2106	err = nvme_rdma_map_data(queue, rq, c);
   2107	if (unlikely(err < 0)) {
   2108		dev_err(queue->ctrl->ctrl.device,
   2109			     "Failed to map data (%d)\n", err);
   2110		goto err;
   2111	}
   2112
   2113	sqe->cqe.done = nvme_rdma_send_done;
   2114
   2115	ib_dma_sync_single_for_device(dev, sqe->dma,
   2116			sizeof(struct nvme_command), DMA_TO_DEVICE);
   2117
   2118	err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
   2119			req->mr ? &req->reg_wr.wr : NULL);
   2120	if (unlikely(err))
   2121		goto err_unmap;
   2122
   2123	return BLK_STS_OK;
   2124
   2125err_unmap:
   2126	nvme_rdma_unmap_data(queue, rq);
   2127err:
   2128	if (err == -EIO)
   2129		ret = nvme_host_path_error(rq);
   2130	else if (err == -ENOMEM || err == -EAGAIN)
   2131		ret = BLK_STS_RESOURCE;
   2132	else
   2133		ret = BLK_STS_IOERR;
   2134	nvme_cleanup_cmd(rq);
   2135unmap_qe:
   2136	ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
   2137			    DMA_TO_DEVICE);
   2138	return ret;
   2139}
   2140
   2141static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
   2142{
   2143	struct nvme_rdma_queue *queue = hctx->driver_data;
   2144
   2145	return ib_process_cq_direct(queue->ib_cq, -1);
   2146}
   2147
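/*
 * Translate signature MR check failures (guard/reftag/apptag) into the
 * corresponding NVMe protection information status codes.
 */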
   2148static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req)
   2149{
   2150	struct request *rq = blk_mq_rq_from_pdu(req);
   2151	struct ib_mr_status mr_status;
   2152	int ret;
   2153
   2154	ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
   2155	if (ret) {
   2156		pr_err("ib_check_mr_status failed, ret %d\n", ret);
   2157		nvme_req(rq)->status = NVME_SC_INVALID_PI;
   2158		return;
   2159	}
   2160
   2161	if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
   2162		switch (mr_status.sig_err.err_type) {
   2163		case IB_SIG_BAD_GUARD:
   2164			nvme_req(rq)->status = NVME_SC_GUARD_CHECK;
   2165			break;
   2166		case IB_SIG_BAD_REFTAG:
   2167			nvme_req(rq)->status = NVME_SC_REFTAG_CHECK;
   2168			break;
   2169		case IB_SIG_BAD_APPTAG:
   2170			nvme_req(rq)->status = NVME_SC_APPTAG_CHECK;
   2171			break;
   2172		}
   2173		pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
   2174		       mr_status.sig_err.err_type, mr_status.sig_err.expected,
   2175		       mr_status.sig_err.actual);
   2176	}
   2177}
   2178
   2179static void nvme_rdma_complete_rq(struct request *rq)
   2180{
   2181	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
   2182	struct nvme_rdma_queue *queue = req->queue;
   2183	struct ib_device *ibdev = queue->device->dev;
   2184
   2185	if (req->use_sig_mr)
   2186		nvme_rdma_check_pi_status(req);
   2187
   2188	nvme_rdma_unmap_data(queue, rq);
   2189	ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
   2190			    DMA_TO_DEVICE);
   2191	nvme_complete_rq(rq);
   2192}
   2193
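/*
 * Spread blk-mq hardware contexts over the default/read/poll queue
 * sets.  Read queues are only mapped separately when dedicated write
 * queues were requested; poll queues use a plain CPU mapping after the
 * default and read sets.
 */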
   2194static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
   2195{
   2196	struct nvme_rdma_ctrl *ctrl = set->driver_data;
   2197	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
   2198
   2199	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
   2200		/* separate read/write queues */
   2201		set->map[HCTX_TYPE_DEFAULT].nr_queues =
   2202			ctrl->io_queues[HCTX_TYPE_DEFAULT];
   2203		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
   2204		set->map[HCTX_TYPE_READ].nr_queues =
   2205			ctrl->io_queues[HCTX_TYPE_READ];
   2206		set->map[HCTX_TYPE_READ].queue_offset =
   2207			ctrl->io_queues[HCTX_TYPE_DEFAULT];
   2208	} else {
   2209		/* shared read/write queues */
   2210		set->map[HCTX_TYPE_DEFAULT].nr_queues =
   2211			ctrl->io_queues[HCTX_TYPE_DEFAULT];
   2212		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
   2213		set->map[HCTX_TYPE_READ].nr_queues =
   2214			ctrl->io_queues[HCTX_TYPE_DEFAULT];
   2215		set->map[HCTX_TYPE_READ].queue_offset = 0;
   2216	}
   2217	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
   2218			ctrl->device->dev, 0);
   2219	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
   2220			ctrl->device->dev, 0);
   2221
   2222	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
   2223		/* map dedicated poll queues only if we have queues left */
   2224		set->map[HCTX_TYPE_POLL].nr_queues =
   2225				ctrl->io_queues[HCTX_TYPE_POLL];
   2226		set->map[HCTX_TYPE_POLL].queue_offset =
   2227			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
   2228			ctrl->io_queues[HCTX_TYPE_READ];
   2229		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
   2230	}
   2231
   2232	dev_info(ctrl->ctrl.device,
   2233		"mapped %d/%d/%d default/read/poll queues.\n",
   2234		ctrl->io_queues[HCTX_TYPE_DEFAULT],
   2235		ctrl->io_queues[HCTX_TYPE_READ],
   2236		ctrl->io_queues[HCTX_TYPE_POLL]);
   2237
   2238	return 0;
   2239}
   2240
   2241static const struct blk_mq_ops nvme_rdma_mq_ops = {
   2242	.queue_rq	= nvme_rdma_queue_rq,
   2243	.complete	= nvme_rdma_complete_rq,
   2244	.init_request	= nvme_rdma_init_request,
   2245	.exit_request	= nvme_rdma_exit_request,
   2246	.init_hctx	= nvme_rdma_init_hctx,
   2247	.timeout	= nvme_rdma_timeout,
   2248	.map_queues	= nvme_rdma_map_queues,
   2249	.poll		= nvme_rdma_poll,
   2250};
   2251
   2252static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
   2253	.queue_rq	= nvme_rdma_queue_rq,
   2254	.complete	= nvme_rdma_complete_rq,
   2255	.init_request	= nvme_rdma_init_request,
   2256	.exit_request	= nvme_rdma_exit_request,
   2257	.init_hctx	= nvme_rdma_init_admin_hctx,
   2258	.timeout	= nvme_rdma_timeout,
   2259};
   2260
   2261static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
   2262{
   2263	nvme_rdma_teardown_io_queues(ctrl, shutdown);
   2264	nvme_stop_admin_queue(&ctrl->ctrl);
   2265	if (shutdown)
   2266		nvme_shutdown_ctrl(&ctrl->ctrl);
   2267	else
   2268		nvme_disable_ctrl(&ctrl->ctrl);
   2269	nvme_rdma_teardown_admin_queue(ctrl, shutdown);
   2270}
   2271
   2272static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
   2273{
   2274	nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
   2275}
   2276
   2277static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
   2278{
   2279	struct nvme_rdma_ctrl *ctrl =
   2280		container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
   2281
   2282	nvme_stop_ctrl(&ctrl->ctrl);
   2283	nvme_rdma_shutdown_ctrl(ctrl, false);
   2284
   2285	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
   2286		/* state change failure should never happen */
   2287		WARN_ON_ONCE(1);
   2288		return;
   2289	}
   2290
   2291	if (nvme_rdma_setup_ctrl(ctrl, false))
   2292		goto out_fail;
   2293
   2294	return;
   2295
   2296out_fail:
   2297	++ctrl->ctrl.nr_reconnects;
   2298	nvme_rdma_reconnect_or_remove(ctrl);
   2299}
   2300
   2301static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
   2302	.name			= "rdma",
   2303	.module			= THIS_MODULE,
   2304	.flags			= NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED,
   2305	.reg_read32		= nvmf_reg_read32,
   2306	.reg_read64		= nvmf_reg_read64,
   2307	.reg_write32		= nvmf_reg_write32,
   2308	.free_ctrl		= nvme_rdma_free_ctrl,
   2309	.submit_async_event	= nvme_rdma_submit_async_event,
   2310	.delete_ctrl		= nvme_rdma_delete_ctrl,
   2311	.get_address		= nvmf_get_address,
   2312	.stop_ctrl		= nvme_rdma_stop_ctrl,
   2313};
   2314
   2315/*
   2316 * Fails a connection request if it matches an existing controller
   2317 * (association) with the same tuple:
   2318 * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN>
   2319 *
    2320 * If a local address is not specified in the request, it will match an
    2321 * existing controller that has all the other parameters the same and
    2322 * likewise no local address specified.
   2323 *
   2324 * The ports don't need to be compared as they are intrinsically
   2325 * already matched by the port pointers supplied.
   2326 */
   2327static bool
   2328nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
   2329{
   2330	struct nvme_rdma_ctrl *ctrl;
   2331	bool found = false;
   2332
   2333	mutex_lock(&nvme_rdma_ctrl_mutex);
   2334	list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
   2335		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
   2336		if (found)
   2337			break;
   2338	}
   2339	mutex_unlock(&nvme_rdma_ctrl_mutex);
   2340
   2341	return found;
   2342}
   2343
   2344static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
   2345		struct nvmf_ctrl_options *opts)
   2346{
   2347	struct nvme_rdma_ctrl *ctrl;
   2348	int ret;
   2349	bool changed;
   2350
   2351	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
   2352	if (!ctrl)
   2353		return ERR_PTR(-ENOMEM);
   2354	ctrl->ctrl.opts = opts;
   2355	INIT_LIST_HEAD(&ctrl->list);
   2356
   2357	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
   2358		opts->trsvcid =
   2359			kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
   2360		if (!opts->trsvcid) {
   2361			ret = -ENOMEM;
   2362			goto out_free_ctrl;
   2363		}
   2364		opts->mask |= NVMF_OPT_TRSVCID;
   2365	}
   2366
   2367	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
   2368			opts->traddr, opts->trsvcid, &ctrl->addr);
   2369	if (ret) {
   2370		pr_err("malformed address passed: %s:%s\n",
   2371			opts->traddr, opts->trsvcid);
   2372		goto out_free_ctrl;
   2373	}
   2374
   2375	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
   2376		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
   2377			opts->host_traddr, NULL, &ctrl->src_addr);
   2378		if (ret) {
   2379			pr_err("malformed src address passed: %s\n",
   2380			       opts->host_traddr);
   2381			goto out_free_ctrl;
   2382		}
   2383	}
   2384
   2385	if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
   2386		ret = -EALREADY;
   2387		goto out_free_ctrl;
   2388	}
   2389
   2390	INIT_DELAYED_WORK(&ctrl->reconnect_work,
   2391			nvme_rdma_reconnect_ctrl_work);
   2392	INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
   2393	INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
   2394
   2395	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
   2396				opts->nr_poll_queues + 1;
   2397	ctrl->ctrl.sqsize = opts->queue_size - 1;
   2398	ctrl->ctrl.kato = opts->kato;
   2399
   2400	ret = -ENOMEM;
   2401	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
   2402				GFP_KERNEL);
   2403	if (!ctrl->queues)
   2404		goto out_free_ctrl;
   2405
   2406	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
   2407				0 /* no quirks, we're perfect! */);
   2408	if (ret)
   2409		goto out_kfree_queues;
   2410
   2411	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
   2412	WARN_ON_ONCE(!changed);
   2413
   2414	ret = nvme_rdma_setup_ctrl(ctrl, true);
   2415	if (ret)
   2416		goto out_uninit_ctrl;
   2417
   2418	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
   2419		nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
   2420
   2421	mutex_lock(&nvme_rdma_ctrl_mutex);
   2422	list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
   2423	mutex_unlock(&nvme_rdma_ctrl_mutex);
   2424
   2425	return &ctrl->ctrl;
   2426
   2427out_uninit_ctrl:
   2428	nvme_uninit_ctrl(&ctrl->ctrl);
   2429	nvme_put_ctrl(&ctrl->ctrl);
   2430	if (ret > 0)
   2431		ret = -EIO;
   2432	return ERR_PTR(ret);
   2433out_kfree_queues:
   2434	kfree(ctrl->queues);
   2435out_free_ctrl:
   2436	kfree(ctrl);
   2437	return ERR_PTR(ret);
   2438}
   2439
   2440static struct nvmf_transport_ops nvme_rdma_transport = {
   2441	.name		= "rdma",
   2442	.module		= THIS_MODULE,
   2443	.required_opts	= NVMF_OPT_TRADDR,
   2444	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
   2445			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
   2446			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
   2447			  NVMF_OPT_TOS,
   2448	.create_ctrl	= nvme_rdma_create_ctrl,
   2449};
   2450
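/*
 * ib_client remove callback: delete every controller that uses the
 * departing device and wait for the deletions to finish.
 */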
   2451static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
   2452{
   2453	struct nvme_rdma_ctrl *ctrl;
   2454	struct nvme_rdma_device *ndev;
   2455	bool found = false;
   2456
   2457	mutex_lock(&device_list_mutex);
   2458	list_for_each_entry(ndev, &device_list, entry) {
   2459		if (ndev->dev == ib_device) {
   2460			found = true;
   2461			break;
   2462		}
   2463	}
   2464	mutex_unlock(&device_list_mutex);
   2465
   2466	if (!found)
   2467		return;
   2468
   2469	/* Delete all controllers using this device */
   2470	mutex_lock(&nvme_rdma_ctrl_mutex);
   2471	list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
   2472		if (ctrl->device->dev != ib_device)
   2473			continue;
   2474		nvme_delete_ctrl(&ctrl->ctrl);
   2475	}
   2476	mutex_unlock(&nvme_rdma_ctrl_mutex);
   2477
   2478	flush_workqueue(nvme_delete_wq);
   2479}
   2480
   2481static struct ib_client nvme_rdma_ib_client = {
   2482	.name   = "nvme_rdma",
   2483	.remove = nvme_rdma_remove_one
   2484};
   2485
   2486static int __init nvme_rdma_init_module(void)
   2487{
   2488	int ret;
   2489
   2490	ret = ib_register_client(&nvme_rdma_ib_client);
   2491	if (ret)
   2492		return ret;
   2493
   2494	ret = nvmf_register_transport(&nvme_rdma_transport);
   2495	if (ret)
   2496		goto err_unreg_client;
   2497
   2498	return 0;
   2499
   2500err_unreg_client:
   2501	ib_unregister_client(&nvme_rdma_ib_client);
   2502	return ret;
   2503}
   2504
   2505static void __exit nvme_rdma_cleanup_module(void)
   2506{
   2507	struct nvme_rdma_ctrl *ctrl;
   2508
   2509	nvmf_unregister_transport(&nvme_rdma_transport);
   2510	ib_unregister_client(&nvme_rdma_ib_client);
   2511
   2512	mutex_lock(&nvme_rdma_ctrl_mutex);
   2513	list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
   2514		nvme_delete_ctrl(&ctrl->ctrl);
   2515	mutex_unlock(&nvme_rdma_ctrl_mutex);
   2516	flush_workqueue(nvme_delete_wq);
   2517}
   2518
   2519module_init(nvme_rdma_init_module);
   2520module_exit(nvme_rdma_cleanup_module);
   2521
   2522MODULE_LICENSE("GPL v2");