cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

nbd.c (65893B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * Network block device - make block devices work over TCP
      4 *
      5 * Note that you can not swap over this thing, yet. Seems to work but
      6 * deadlocks sometimes - you can not swap over TCP in general.
      7 * 
      8 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
      9 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
     10 *
     11 * (part of code stolen from loop.c)
     12 */
     13
     14#include <linux/major.h>
     15
     16#include <linux/blkdev.h>
     17#include <linux/module.h>
     18#include <linux/init.h>
     19#include <linux/sched.h>
     20#include <linux/sched/mm.h>
     21#include <linux/fs.h>
     22#include <linux/bio.h>
     23#include <linux/stat.h>
     24#include <linux/errno.h>
     25#include <linux/file.h>
     26#include <linux/ioctl.h>
     27#include <linux/mutex.h>
     28#include <linux/compiler.h>
     29#include <linux/completion.h>
     30#include <linux/err.h>
     31#include <linux/kernel.h>
     32#include <linux/slab.h>
     33#include <net/sock.h>
     34#include <linux/net.h>
     35#include <linux/kthread.h>
     36#include <linux/types.h>
     37#include <linux/debugfs.h>
     38#include <linux/blk-mq.h>
     39
     40#include <linux/uaccess.h>
     41#include <asm/types.h>
     42
     43#include <linux/nbd.h>
     44#include <linux/nbd-netlink.h>
     45#include <net/genetlink.h>
     46
     47#define CREATE_TRACE_POINTS
     48#include <trace/events/nbd.h>
     49
     50static DEFINE_IDR(nbd_index_idr);
     51static DEFINE_MUTEX(nbd_index_mutex);
     52static struct workqueue_struct *nbd_del_wq;
     53static int nbd_total_devices = 0;
     54
     55struct nbd_sock {
     56	struct socket *sock;
     57	struct mutex tx_lock;
     58	struct request *pending;
     59	int sent;
     60	bool dead;
     61	int fallback_index;
     62	int cookie;
     63};
     64
     65struct recv_thread_args {
     66	struct work_struct work;
     67	struct nbd_device *nbd;
     68	int index;
     69};
     70
     71struct link_dead_args {
     72	struct work_struct work;
     73	int index;
     74};
     75
     76#define NBD_RT_TIMEDOUT			0
     77#define NBD_RT_DISCONNECT_REQUESTED	1
     78#define NBD_RT_DISCONNECTED		2
     79#define NBD_RT_HAS_PID_FILE		3
     80#define NBD_RT_HAS_CONFIG_REF		4
     81#define NBD_RT_BOUND			5
     82#define NBD_RT_DISCONNECT_ON_CLOSE	6
     83#define NBD_RT_HAS_BACKEND_FILE		7
     84
     85#define NBD_DESTROY_ON_DISCONNECT	0
     86#define NBD_DISCONNECT_REQUESTED	1
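
        /*
         * The NBD_RT_* bits live in nbd_config->runtime_flags and are reset
         * whenever a new configuration is allocated; NBD_DESTROY_ON_DISCONNECT
         * and NBD_DISCONNECT_REQUESTED live in nbd_device->flags and persist
         * for the lifetime of the device.
         */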
     87
     88struct nbd_config {
     89	u32 flags;
     90	unsigned long runtime_flags;
     91	u64 dead_conn_timeout;
     92
     93	struct nbd_sock **socks;
     94	int num_connections;
     95	atomic_t live_connections;
     96	wait_queue_head_t conn_wait;
     97
     98	atomic_t recv_threads;
     99	wait_queue_head_t recv_wq;
    100	unsigned int blksize_bits;
    101	loff_t bytesize;
    102#if IS_ENABLED(CONFIG_DEBUG_FS)
    103	struct dentry *dbg_dir;
    104#endif
    105};
    106
    107static inline unsigned int nbd_blksize(struct nbd_config *config)
    108{
    109	return 1u << config->blksize_bits;
    110}
    111
    112struct nbd_device {
    113	struct blk_mq_tag_set tag_set;
    114
    115	int index;
    116	refcount_t config_refs;
    117	refcount_t refs;
    118	struct nbd_config *config;
    119	struct mutex config_lock;
    120	struct gendisk *disk;
    121	struct workqueue_struct *recv_workq;
    122	struct work_struct remove_work;
    123
    124	struct list_head list;
    125	struct task_struct *task_setup;
    126
    127	unsigned long flags;
    128	pid_t pid; /* pid of nbd-client, if attached */
    129
    130	char *backend;
    131};
    132
    133#define NBD_CMD_REQUEUED	1
    134/*
     135 * This flag will be set if nbd_queue_rq() succeeds, and will be checked and
    136 * cleared in completion. Both setting and clearing of the flag are protected
    137 * by cmd->lock.
    138 */
    139#define NBD_CMD_INFLIGHT	2
    140
    141struct nbd_cmd {
    142	struct nbd_device *nbd;
    143	struct mutex lock;
    144	int index;
    145	int cookie;
    146	int retries;
    147	blk_status_t status;
    148	unsigned long flags;
    149	u32 cmd_cookie;
    150};
    151
    152#if IS_ENABLED(CONFIG_DEBUG_FS)
    153static struct dentry *nbd_dbg_dir;
    154#endif
    155
    156#define nbd_name(nbd) ((nbd)->disk->disk_name)
    157
    158#define NBD_MAGIC 0x68797548
    159
    160#define NBD_DEF_BLKSIZE_BITS 10
    161
    162static unsigned int nbds_max = 16;
    163static int max_part = 16;
    164static int part_shift;
    165
    166static int nbd_dev_dbg_init(struct nbd_device *nbd);
    167static void nbd_dev_dbg_close(struct nbd_device *nbd);
    168static void nbd_config_put(struct nbd_device *nbd);
    169static void nbd_connect_reply(struct genl_info *info, int index);
    170static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
    171static void nbd_dead_link_work(struct work_struct *work);
    172static void nbd_disconnect_and_put(struct nbd_device *nbd);
    173
    174static inline struct device *nbd_to_dev(struct nbd_device *nbd)
    175{
    176	return disk_to_dev(nbd->disk);
    177}
    178
    179static void nbd_requeue_cmd(struct nbd_cmd *cmd)
    180{
    181	struct request *req = blk_mq_rq_from_pdu(cmd);
    182
    183	if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
    184		blk_mq_requeue_request(req, true);
    185}
    186
    187#define NBD_COOKIE_BITS 32
    188
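        /*
         * On-the-wire request handles pack the per-command cookie into the
         * upper 32 bits and the blk-mq unique tag into the lower 32 bits:
         *
         *   handle = (cmd->cmd_cookie << NBD_COOKIE_BITS) | blk_mq_unique_tag(req)
         *
         * The cookie is incremented each time a command is freshly sent (see
         * nbd_send_cmd()), so a reply carrying a stale cookie is rejected in
         * nbd_handle_reply().
         */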
    189static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
    190{
    191	struct request *req = blk_mq_rq_from_pdu(cmd);
    192	u32 tag = blk_mq_unique_tag(req);
    193	u64 cookie = cmd->cmd_cookie;
    194
    195	return (cookie << NBD_COOKIE_BITS) | tag;
    196}
    197
    198static u32 nbd_handle_to_tag(u64 handle)
    199{
    200	return (u32)handle;
    201}
    202
    203static u32 nbd_handle_to_cookie(u64 handle)
    204{
    205	return (u32)(handle >> NBD_COOKIE_BITS);
    206}
    207
    208static const char *nbdcmd_to_ascii(int cmd)
    209{
    210	switch (cmd) {
    211	case  NBD_CMD_READ: return "read";
    212	case NBD_CMD_WRITE: return "write";
    213	case  NBD_CMD_DISC: return "disconnect";
    214	case NBD_CMD_FLUSH: return "flush";
    215	case  NBD_CMD_TRIM: return "trim/discard";
    216	}
    217	return "invalid";
    218}
    219
    220static ssize_t pid_show(struct device *dev,
    221			struct device_attribute *attr, char *buf)
    222{
    223	struct gendisk *disk = dev_to_disk(dev);
    224	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
    225
    226	return sprintf(buf, "%d\n", nbd->pid);
    227}
    228
    229static const struct device_attribute pid_attr = {
    230	.attr = { .name = "pid", .mode = 0444},
    231	.show = pid_show,
    232};
    233
    234static ssize_t backend_show(struct device *dev,
    235		struct device_attribute *attr, char *buf)
    236{
    237	struct gendisk *disk = dev_to_disk(dev);
    238	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
    239
    240	return sprintf(buf, "%s\n", nbd->backend ?: "");
    241}
    242
    243static const struct device_attribute backend_attr = {
    244	.attr = { .name = "backend", .mode = 0444},
    245	.show = backend_show,
    246};
    247
    248static void nbd_dev_remove(struct nbd_device *nbd)
    249{
    250	struct gendisk *disk = nbd->disk;
    251
    252	del_gendisk(disk);
    253	blk_cleanup_disk(disk);
    254	blk_mq_free_tag_set(&nbd->tag_set);
    255
    256	/*
    257	 * Remove from idr after del_gendisk() completes, so if the same ID is
    258	 * reused, the following add_disk() will succeed.
    259	 */
    260	mutex_lock(&nbd_index_mutex);
    261	idr_remove(&nbd_index_idr, nbd->index);
    262	mutex_unlock(&nbd_index_mutex);
    263	destroy_workqueue(nbd->recv_workq);
    264	kfree(nbd);
    265}
    266
    267static void nbd_dev_remove_work(struct work_struct *work)
    268{
    269	nbd_dev_remove(container_of(work, struct nbd_device, remove_work));
    270}
    271
    272static void nbd_put(struct nbd_device *nbd)
    273{
    274	if (!refcount_dec_and_test(&nbd->refs))
    275		return;
    276
     277	/* Call del_gendisk() asynchronously to prevent deadlock */
    278	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
    279		queue_work(nbd_del_wq, &nbd->remove_work);
    280	else
    281		nbd_dev_remove(nbd);
    282}
    283
    284static int nbd_disconnected(struct nbd_config *config)
    285{
    286	return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
    287		test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
    288}
    289
    290static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
    291				int notify)
    292{
    293	if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
    294		struct link_dead_args *args;
    295		args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
    296		if (args) {
    297			INIT_WORK(&args->work, nbd_dead_link_work);
    298			args->index = nbd->index;
    299			queue_work(system_wq, &args->work);
    300		}
    301	}
    302	if (!nsock->dead) {
    303		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
    304		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
    305			if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
    306					       &nbd->config->runtime_flags)) {
    307				set_bit(NBD_RT_DISCONNECTED,
    308					&nbd->config->runtime_flags);
    309				dev_info(nbd_to_dev(nbd),
    310					"Disconnected due to user request.\n");
    311			}
    312		}
    313	}
    314	nsock->dead = true;
    315	nsock->pending = NULL;
    316	nsock->sent = 0;
    317}
    318
    319static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
    320		loff_t blksize)
    321{
    322	if (!blksize)
    323		blksize = 1u << NBD_DEF_BLKSIZE_BITS;
    324
    325	if (blk_validate_block_size(blksize))
    326		return -EINVAL;
    327
    328	nbd->config->bytesize = bytesize;
    329	nbd->config->blksize_bits = __ffs(blksize);
    330
    331	if (!nbd->pid)
    332		return 0;
    333
    334	if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
    335		nbd->disk->queue->limits.discard_granularity = blksize;
    336		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
    337	}
    338	blk_queue_logical_block_size(nbd->disk->queue, blksize);
    339	blk_queue_physical_block_size(nbd->disk->queue, blksize);
    340
    341	if (max_part)
    342		set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
    343	if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
    344		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
    345	return 0;
    346}
    347
    348static void nbd_complete_rq(struct request *req)
    349{
    350	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
    351
    352	dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
    353		cmd->status ? "failed" : "done");
    354
    355	blk_mq_end_request(req, cmd->status);
    356}
    357
    358/*
     359 * Forcibly shut down the socket, causing all listeners to error out
    360 */
    361static void sock_shutdown(struct nbd_device *nbd)
    362{
    363	struct nbd_config *config = nbd->config;
    364	int i;
    365
    366	if (config->num_connections == 0)
    367		return;
    368	if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
    369		return;
    370
    371	for (i = 0; i < config->num_connections; i++) {
    372		struct nbd_sock *nsock = config->socks[i];
    373		mutex_lock(&nsock->tx_lock);
    374		nbd_mark_nsock_dead(nbd, nsock, 0);
    375		mutex_unlock(&nsock->tx_lock);
    376	}
    377	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
    378}
    379
    380static u32 req_to_nbd_cmd_type(struct request *req)
    381{
    382	switch (req_op(req)) {
    383	case REQ_OP_DISCARD:
    384		return NBD_CMD_TRIM;
    385	case REQ_OP_FLUSH:
    386		return NBD_CMD_FLUSH;
    387	case REQ_OP_WRITE:
    388		return NBD_CMD_WRITE;
    389	case REQ_OP_READ:
    390		return NBD_CMD_READ;
    391	default:
    392		return U32_MAX;
    393	}
    394}
    395
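        /*
         * blk-mq timeout handler.  With multiple connections (or a single
         * connection and an explicit timeout) the request is requeued so the
         * submit path can put it on a live socket.  With timeout=0 the stuck
         * request is only logged and the timer reset.  Otherwise the sockets
         * are shut down and the request is failed with BLK_STS_IOERR.
         */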
    396static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
    397						 bool reserved)
    398{
    399	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
    400	struct nbd_device *nbd = cmd->nbd;
    401	struct nbd_config *config;
    402
    403	if (!mutex_trylock(&cmd->lock))
    404		return BLK_EH_RESET_TIMER;
    405
    406	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
    407		mutex_unlock(&cmd->lock);
    408		return BLK_EH_DONE;
    409	}
    410
    411	if (!refcount_inc_not_zero(&nbd->config_refs)) {
    412		cmd->status = BLK_STS_TIMEOUT;
    413		__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
    414		mutex_unlock(&cmd->lock);
    415		goto done;
    416	}
    417	config = nbd->config;
    418
    419	if (config->num_connections > 1 ||
    420	    (config->num_connections == 1 && nbd->tag_set.timeout)) {
    421		dev_err_ratelimited(nbd_to_dev(nbd),
    422				    "Connection timed out, retrying (%d/%d alive)\n",
    423				    atomic_read(&config->live_connections),
    424				    config->num_connections);
    425		/*
     426		 * Hooray, we have more connections: requeue this IO and the
     427		 * submit path will put it on a real connection. Or, if only one
     428		 * connection is configured, the submit path will wait until
     429		 * a new connection is reconfigured or until the dead timeout.
    430		 */
    431		if (config->socks) {
    432			if (cmd->index < config->num_connections) {
    433				struct nbd_sock *nsock =
    434					config->socks[cmd->index];
    435				mutex_lock(&nsock->tx_lock);
    436				/* We can have multiple outstanding requests, so
    437				 * we don't want to mark the nsock dead if we've
    438				 * already reconnected with a new socket, so
     439				 * only mark it dead if it's the same socket we
    440				 * were sent out on.
    441				 */
    442				if (cmd->cookie == nsock->cookie)
    443					nbd_mark_nsock_dead(nbd, nsock, 1);
    444				mutex_unlock(&nsock->tx_lock);
    445			}
    446			mutex_unlock(&cmd->lock);
    447			nbd_requeue_cmd(cmd);
    448			nbd_config_put(nbd);
    449			return BLK_EH_DONE;
    450		}
    451	}
    452
    453	if (!nbd->tag_set.timeout) {
    454		/*
    455		 * Userspace sets timeout=0 to disable socket disconnection,
    456		 * so just warn and reset the timer.
    457		 */
    458		struct nbd_sock *nsock = config->socks[cmd->index];
    459		cmd->retries++;
    460		dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
    461			req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
    462			(unsigned long long)blk_rq_pos(req) << 9,
    463			blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
    464
    465		mutex_lock(&nsock->tx_lock);
    466		if (cmd->cookie != nsock->cookie) {
    467			nbd_requeue_cmd(cmd);
    468			mutex_unlock(&nsock->tx_lock);
    469			mutex_unlock(&cmd->lock);
    470			nbd_config_put(nbd);
    471			return BLK_EH_DONE;
    472		}
    473		mutex_unlock(&nsock->tx_lock);
    474		mutex_unlock(&cmd->lock);
    475		nbd_config_put(nbd);
    476		return BLK_EH_RESET_TIMER;
    477	}
    478
    479	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
    480	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
    481	cmd->status = BLK_STS_IOERR;
    482	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
    483	mutex_unlock(&cmd->lock);
    484	sock_shutdown(nbd);
    485	nbd_config_put(nbd);
    486done:
    487	blk_mq_complete_request(req);
    488	return BLK_EH_DONE;
    489}
    490
    491/*
     492 *  Send or receive a packet. Return a positive value on success and a
     493 *  negative value on failure; never return 0.
    494 */
    495static int sock_xmit(struct nbd_device *nbd, int index, int send,
    496		     struct iov_iter *iter, int msg_flags, int *sent)
    497{
    498	struct nbd_config *config = nbd->config;
    499	struct socket *sock = config->socks[index]->sock;
    500	int result;
    501	struct msghdr msg;
    502	unsigned int noreclaim_flag;
    503
    504	if (unlikely(!sock)) {
    505		dev_err_ratelimited(disk_to_dev(nbd->disk),
    506			"Attempted %s on closed socket in sock_xmit\n",
    507			(send ? "send" : "recv"));
    508		return -EINVAL;
    509	}
    510
    511	msg.msg_iter = *iter;
    512
    513	noreclaim_flag = memalloc_noreclaim_save();
    514	do {
    515		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
    516		msg.msg_name = NULL;
    517		msg.msg_namelen = 0;
    518		msg.msg_control = NULL;
    519		msg.msg_controllen = 0;
    520		msg.msg_flags = msg_flags | MSG_NOSIGNAL;
    521
    522		if (send)
    523			result = sock_sendmsg(sock, &msg);
    524		else
    525			result = sock_recvmsg(sock, &msg, msg.msg_flags);
    526
    527		if (result <= 0) {
    528			if (result == 0)
    529				result = -EPIPE; /* short read */
    530			break;
    531		}
    532		if (sent)
    533			*sent += result;
    534	} while (msg_data_left(&msg));
    535
    536	memalloc_noreclaim_restore(noreclaim_flag);
    537
    538	return result;
    539}
    540
    541/*
    542 * Different settings for sk->sk_sndtimeo can result in different return values
    543 * if there is a signal pending when we enter sendmsg, because reasons?
    544 */
    545static inline int was_interrupted(int result)
    546{
    547	return result == -ERESTARTSYS || result == -EINTR;
    548}
    549
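        /*
         * nbd_send_cmd() may be interrupted by a pending signal part way
         * through a send.  In that case nsock->pending records the request and
         * nsock->sent how many bytes already went out, so that a later requeue
         * resumes from the same offset (skipping already-sent bio data)
         * instead of resending the whole request.
         */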
    550/* always call with the tx_lock held */
    551static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
    552{
    553	struct request *req = blk_mq_rq_from_pdu(cmd);
    554	struct nbd_config *config = nbd->config;
    555	struct nbd_sock *nsock = config->socks[index];
    556	int result;
    557	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
    558	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
    559	struct iov_iter from;
    560	unsigned long size = blk_rq_bytes(req);
    561	struct bio *bio;
    562	u64 handle;
    563	u32 type;
    564	u32 nbd_cmd_flags = 0;
    565	int sent = nsock->sent, skip = 0;
    566
    567	iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
    568
    569	type = req_to_nbd_cmd_type(req);
    570	if (type == U32_MAX)
    571		return -EIO;
    572
    573	if (rq_data_dir(req) == WRITE &&
    574	    (config->flags & NBD_FLAG_READ_ONLY)) {
    575		dev_err_ratelimited(disk_to_dev(nbd->disk),
    576				    "Write on read-only\n");
    577		return -EIO;
    578	}
    579
    580	if (req->cmd_flags & REQ_FUA)
    581		nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
    582
    583	/* We did a partial send previously, and we at least sent the whole
    584	 * request struct, so just go and send the rest of the pages in the
    585	 * request.
    586	 */
    587	if (sent) {
    588		if (sent >= sizeof(request)) {
    589			skip = sent - sizeof(request);
    590
    591			/* initialize handle for tracing purposes */
    592			handle = nbd_cmd_handle(cmd);
    593
    594			goto send_pages;
    595		}
    596		iov_iter_advance(&from, sent);
    597	} else {
    598		cmd->cmd_cookie++;
    599	}
    600	cmd->index = index;
    601	cmd->cookie = nsock->cookie;
    602	cmd->retries = 0;
    603	request.type = htonl(type | nbd_cmd_flags);
    604	if (type != NBD_CMD_FLUSH) {
    605		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
    606		request.len = htonl(size);
    607	}
    608	handle = nbd_cmd_handle(cmd);
    609	memcpy(request.handle, &handle, sizeof(handle));
    610
    611	trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
    612
    613	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
    614		req, nbdcmd_to_ascii(type),
    615		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
    616	result = sock_xmit(nbd, index, 1, &from,
    617			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
    618	trace_nbd_header_sent(req, handle);
    619	if (result < 0) {
    620		if (was_interrupted(result)) {
     621			/* If we haven't sent anything we can just return BUSY;
     622			 * however, if we have sent something we need to make
    623			 * sure we only allow this req to be sent until we are
    624			 * completely done.
    625			 */
    626			if (sent) {
    627				nsock->pending = req;
    628				nsock->sent = sent;
    629			}
    630			set_bit(NBD_CMD_REQUEUED, &cmd->flags);
    631			return BLK_STS_RESOURCE;
    632		}
    633		dev_err_ratelimited(disk_to_dev(nbd->disk),
    634			"Send control failed (result %d)\n", result);
    635		return -EAGAIN;
    636	}
    637send_pages:
    638	if (type != NBD_CMD_WRITE)
    639		goto out;
    640
    641	bio = req->bio;
    642	while (bio) {
    643		struct bio *next = bio->bi_next;
    644		struct bvec_iter iter;
    645		struct bio_vec bvec;
    646
    647		bio_for_each_segment(bvec, bio, iter) {
    648			bool is_last = !next && bio_iter_last(bvec, iter);
    649			int flags = is_last ? 0 : MSG_MORE;
    650
    651			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
    652				req, bvec.bv_len);
    653			iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
    654			if (skip) {
    655				if (skip >= iov_iter_count(&from)) {
    656					skip -= iov_iter_count(&from);
    657					continue;
    658				}
    659				iov_iter_advance(&from, skip);
    660				skip = 0;
    661			}
    662			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
    663			if (result < 0) {
    664				if (was_interrupted(result)) {
    665					/* We've already sent the header, we
    666					 * have no choice but to set pending and
    667					 * return BUSY.
    668					 */
    669					nsock->pending = req;
    670					nsock->sent = sent;
    671					set_bit(NBD_CMD_REQUEUED, &cmd->flags);
    672					return BLK_STS_RESOURCE;
    673				}
    674				dev_err(disk_to_dev(nbd->disk),
    675					"Send data failed (result %d)\n",
    676					result);
    677				return -EAGAIN;
    678			}
    679			/*
    680			 * The completion might already have come in,
    681			 * so break for the last one instead of letting
    682			 * the iterator do it. This prevents use-after-free
    683			 * of the bio.
    684			 */
    685			if (is_last)
    686				break;
    687		}
    688		bio = next;
    689	}
    690out:
    691	trace_nbd_payload_sent(req, handle);
    692	nsock->pending = NULL;
    693	nsock->sent = 0;
    694	return 0;
    695}
    696
    697static int nbd_read_reply(struct nbd_device *nbd, int index,
    698			  struct nbd_reply *reply)
    699{
    700	struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)};
    701	struct iov_iter to;
    702	int result;
    703
    704	reply->magic = 0;
    705	iov_iter_kvec(&to, READ, &iov, 1, sizeof(*reply));
    706	result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
    707	if (result < 0) {
    708		if (!nbd_disconnected(nbd->config))
    709			dev_err(disk_to_dev(nbd->disk),
    710				"Receive control failed (result %d)\n", result);
    711		return result;
    712	}
    713
    714	if (ntohl(reply->magic) != NBD_REPLY_MAGIC) {
    715		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
    716				(unsigned long)ntohl(reply->magic));
    717		return -EPROTO;
    718	}
    719
    720	return 0;
    721}
    722
    723/* NULL returned = something went wrong, inform userspace */
    724static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
    725					struct nbd_reply *reply)
    726{
    727	int result;
    728	struct nbd_cmd *cmd;
    729	struct request *req = NULL;
    730	u64 handle;
    731	u16 hwq;
    732	u32 tag;
    733	int ret = 0;
    734
    735	memcpy(&handle, reply->handle, sizeof(handle));
    736	tag = nbd_handle_to_tag(handle);
    737	hwq = blk_mq_unique_tag_to_hwq(tag);
    738	if (hwq < nbd->tag_set.nr_hw_queues)
    739		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
    740				       blk_mq_unique_tag_to_tag(tag));
    741	if (!req || !blk_mq_request_started(req)) {
    742		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
    743			tag, req);
    744		return ERR_PTR(-ENOENT);
    745	}
    746	trace_nbd_header_received(req, handle);
    747	cmd = blk_mq_rq_to_pdu(req);
    748
    749	mutex_lock(&cmd->lock);
    750	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
    751		dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
    752			tag, cmd->status, cmd->flags);
    753		ret = -ENOENT;
    754		goto out;
    755	}
    756	if (cmd->index != index) {
    757		dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)",
    758			tag, index, cmd->index);
    759		ret = -ENOENT;
    760		goto out;
    761	}
    762	if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
    763		dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
    764			req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
    765		ret = -ENOENT;
    766		goto out;
    767	}
    768	if (cmd->status != BLK_STS_OK) {
    769		dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
    770			req);
    771		ret = -ENOENT;
    772		goto out;
    773	}
    774	if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
    775		dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
    776			req);
    777		ret = -ENOENT;
    778		goto out;
    779	}
    780	if (ntohl(reply->error)) {
    781		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
    782			ntohl(reply->error));
    783		cmd->status = BLK_STS_IOERR;
    784		goto out;
    785	}
    786
    787	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
    788	if (rq_data_dir(req) != WRITE) {
    789		struct req_iterator iter;
    790		struct bio_vec bvec;
    791		struct iov_iter to;
    792
    793		rq_for_each_segment(bvec, req, iter) {
    794			iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
    795			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
    796			if (result < 0) {
    797				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
    798					result);
    799				/*
    800				 * If we've disconnected, we need to make sure we
    801				 * complete this request, otherwise error out
    802				 * and let the timeout stuff handle resubmitting
    803				 * this request onto another connection.
    804				 */
    805				if (nbd_disconnected(nbd->config)) {
    806					cmd->status = BLK_STS_IOERR;
    807					goto out;
    808				}
    809				ret = -EIO;
    810				goto out;
    811			}
    812			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
    813				req, bvec.bv_len);
    814		}
    815	}
    816out:
    817	trace_nbd_payload_received(req, handle);
    818	mutex_unlock(&cmd->lock);
    819	return ret ? ERR_PTR(ret) : cmd;
    820}
    821
    822static void recv_work(struct work_struct *work)
    823{
    824	struct recv_thread_args *args = container_of(work,
    825						     struct recv_thread_args,
    826						     work);
    827	struct nbd_device *nbd = args->nbd;
    828	struct nbd_config *config = nbd->config;
    829	struct request_queue *q = nbd->disk->queue;
    830	struct nbd_sock *nsock;
    831	struct nbd_cmd *cmd;
    832	struct request *rq;
    833
    834	while (1) {
    835		struct nbd_reply reply;
    836
    837		if (nbd_read_reply(nbd, args->index, &reply))
    838			break;
    839
    840		/*
     841		 * Grab .q_usage_counter so the request pool won't go away; then no
     842		 * request use-after-free is possible during nbd_handle_reply().
     843		 * If the queue is frozen, there won't be any inflight requests, so
     844		 * we don't need to handle the incoming garbage message.
    845		 */
    846		if (!percpu_ref_tryget(&q->q_usage_counter)) {
    847			dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n",
    848				__func__);
    849			break;
    850		}
    851
    852		cmd = nbd_handle_reply(nbd, args->index, &reply);
    853		if (IS_ERR(cmd)) {
    854			percpu_ref_put(&q->q_usage_counter);
    855			break;
    856		}
    857
    858		rq = blk_mq_rq_from_pdu(cmd);
    859		if (likely(!blk_should_fake_timeout(rq->q))) {
    860			bool complete;
    861
    862			mutex_lock(&cmd->lock);
    863			complete = __test_and_clear_bit(NBD_CMD_INFLIGHT,
    864							&cmd->flags);
    865			mutex_unlock(&cmd->lock);
    866			if (complete)
    867				blk_mq_complete_request(rq);
    868		}
    869		percpu_ref_put(&q->q_usage_counter);
    870	}
    871
    872	nsock = config->socks[args->index];
    873	mutex_lock(&nsock->tx_lock);
    874	nbd_mark_nsock_dead(nbd, nsock, 1);
    875	mutex_unlock(&nsock->tx_lock);
    876
    877	nbd_config_put(nbd);
    878	atomic_dec(&config->recv_threads);
    879	wake_up(&config->recv_wq);
    880	kfree(args);
    881}
    882
    883static bool nbd_clear_req(struct request *req, void *data, bool reserved)
    884{
    885	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
    886
    887	/* don't abort one completed request */
    888	if (blk_mq_request_completed(req))
    889		return true;
    890
    891	mutex_lock(&cmd->lock);
    892	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
    893		mutex_unlock(&cmd->lock);
    894		return true;
    895	}
    896	cmd->status = BLK_STS_IOERR;
    897	mutex_unlock(&cmd->lock);
    898
    899	blk_mq_complete_request(req);
    900	return true;
    901}
    902
    903static void nbd_clear_que(struct nbd_device *nbd)
    904{
    905	blk_mq_quiesce_queue(nbd->disk->queue);
    906	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
    907	blk_mq_unquiesce_queue(nbd->disk->queue);
    908	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
    909}
    910
    911static int find_fallback(struct nbd_device *nbd, int index)
    912{
    913	struct nbd_config *config = nbd->config;
    914	int new_index = -1;
    915	struct nbd_sock *nsock = config->socks[index];
    916	int fallback = nsock->fallback_index;
    917
    918	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
    919		return new_index;
    920
    921	if (config->num_connections <= 1) {
    922		dev_err_ratelimited(disk_to_dev(nbd->disk),
    923				    "Dead connection, failed to find a fallback\n");
    924		return new_index;
    925	}
    926
    927	if (fallback >= 0 && fallback < config->num_connections &&
    928	    !config->socks[fallback]->dead)
    929		return fallback;
    930
    931	if (nsock->fallback_index < 0 ||
    932	    nsock->fallback_index >= config->num_connections ||
    933	    config->socks[nsock->fallback_index]->dead) {
    934		int i;
    935		for (i = 0; i < config->num_connections; i++) {
    936			if (i == index)
    937				continue;
    938			if (!config->socks[i]->dead) {
    939				new_index = i;
    940				break;
    941			}
    942		}
    943		nsock->fallback_index = new_index;
    944		if (new_index < 0) {
    945			dev_err_ratelimited(disk_to_dev(nbd->disk),
    946					    "Dead connection, failed to find a fallback\n");
    947			return new_index;
    948		}
    949	}
    950	new_index = nsock->fallback_index;
    951	return new_index;
    952}
    953
    954static int wait_for_reconnect(struct nbd_device *nbd)
    955{
    956	struct nbd_config *config = nbd->config;
    957	if (!config->dead_conn_timeout)
    958		return 0;
    959
    960	if (!wait_event_timeout(config->conn_wait,
    961				test_bit(NBD_RT_DISCONNECTED,
    962					 &config->runtime_flags) ||
    963				atomic_read(&config->live_connections) > 0,
    964				config->dead_conn_timeout))
    965		return 0;
    966
    967	return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
    968}
    969
    970static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
    971{
    972	struct request *req = blk_mq_rq_from_pdu(cmd);
    973	struct nbd_device *nbd = cmd->nbd;
    974	struct nbd_config *config;
    975	struct nbd_sock *nsock;
    976	int ret;
    977
    978	if (!refcount_inc_not_zero(&nbd->config_refs)) {
    979		dev_err_ratelimited(disk_to_dev(nbd->disk),
    980				    "Socks array is empty\n");
    981		return -EINVAL;
    982	}
    983	config = nbd->config;
    984
    985	if (index >= config->num_connections) {
    986		dev_err_ratelimited(disk_to_dev(nbd->disk),
    987				    "Attempted send on invalid socket\n");
    988		nbd_config_put(nbd);
    989		return -EINVAL;
    990	}
    991	cmd->status = BLK_STS_OK;
    992again:
    993	nsock = config->socks[index];
    994	mutex_lock(&nsock->tx_lock);
    995	if (nsock->dead) {
    996		int old_index = index;
    997		index = find_fallback(nbd, index);
    998		mutex_unlock(&nsock->tx_lock);
    999		if (index < 0) {
   1000			if (wait_for_reconnect(nbd)) {
   1001				index = old_index;
   1002				goto again;
   1003			}
    1004			/* All the sockets should already be down at this point;
    1005			 * we just want to make sure that DISCONNECTED is set so
    1006			 * any requests that come in that were queued waiting
   1007			 * for the reconnect timer don't trigger the timer again
   1008			 * and instead just error out.
   1009			 */
   1010			sock_shutdown(nbd);
   1011			nbd_config_put(nbd);
   1012			return -EIO;
   1013		}
   1014		goto again;
   1015	}
   1016
   1017	/* Handle the case that we have a pending request that was partially
   1018	 * transmitted that _has_ to be serviced first.  We need to call requeue
   1019	 * here so that it gets put _after_ the request that is already on the
   1020	 * dispatch list.
   1021	 */
   1022	blk_mq_start_request(req);
   1023	if (unlikely(nsock->pending && nsock->pending != req)) {
   1024		nbd_requeue_cmd(cmd);
   1025		ret = 0;
   1026		goto out;
   1027	}
   1028	/*
   1029	 * Some failures are related to the link going down, so anything that
   1030	 * returns EAGAIN can be retried on a different socket.
   1031	 */
   1032	ret = nbd_send_cmd(nbd, cmd, index);
   1033	/*
   1034	 * Access to this flag is protected by cmd->lock, thus it's safe to set
    1035	 * the flag after nbd_send_cmd() succeeds in sending the request to the server.
   1036	 */
   1037	if (!ret)
   1038		__set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
   1039	else if (ret == -EAGAIN) {
   1040		dev_err_ratelimited(disk_to_dev(nbd->disk),
   1041				    "Request send failed, requeueing\n");
   1042		nbd_mark_nsock_dead(nbd, nsock, 1);
   1043		nbd_requeue_cmd(cmd);
   1044		ret = 0;
   1045	}
   1046out:
   1047	mutex_unlock(&nsock->tx_lock);
   1048	nbd_config_put(nbd);
   1049	return ret;
   1050}
   1051
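        /*
         * Note that nbd_handle_cmd() returns 0 or a negative errno for most
         * paths, but nbd_send_cmd() can also return BLK_STS_RESOURCE for an
         * interrupted send; that positive value is passed back to blk-mq
         * unchanged below, while negative errors map to BLK_STS_IOERR.
         */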
   1052static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
   1053			const struct blk_mq_queue_data *bd)
   1054{
   1055	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
   1056	int ret;
   1057
   1058	/*
    1059	 * Since we look at the bios to send the request over the network we
    1060	 * need to make sure the completion work doesn't mark this request done
    1061	 * before we are done doing our send.  This keeps us from dereferencing
    1062	 * freed data if we have particularly fast completions (i.e. we get the
   1063	 * completion before we exit sock_xmit on the last bvec) or in the case
   1064	 * that the server is misbehaving (or there was an error) before we're
   1065	 * done sending everything over the wire.
   1066	 */
   1067	mutex_lock(&cmd->lock);
   1068	clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
   1069
   1070	/* We can be called directly from the user space process, which means we
   1071	 * could possibly have signals pending so our sendmsg will fail.  In
   1072	 * this case we need to return that we are busy, otherwise error out as
   1073	 * appropriate.
   1074	 */
   1075	ret = nbd_handle_cmd(cmd, hctx->queue_num);
   1076	if (ret < 0)
   1077		ret = BLK_STS_IOERR;
   1078	else if (!ret)
   1079		ret = BLK_STS_OK;
   1080	mutex_unlock(&cmd->lock);
   1081
   1082	return ret;
   1083}
   1084
   1085static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
   1086				     int *err)
   1087{
   1088	struct socket *sock;
   1089
   1090	*err = 0;
   1091	sock = sockfd_lookup(fd, err);
   1092	if (!sock)
   1093		return NULL;
   1094
   1095	if (sock->ops->shutdown == sock_no_shutdown) {
   1096		dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
   1097		*err = -EINVAL;
   1098		sockfd_put(sock);
   1099		return NULL;
   1100	}
   1101
   1102	return sock;
   1103}
   1104
   1105static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
   1106			  bool netlink)
   1107{
   1108	struct nbd_config *config = nbd->config;
   1109	struct socket *sock;
   1110	struct nbd_sock **socks;
   1111	struct nbd_sock *nsock;
   1112	int err;
   1113
   1114	sock = nbd_get_socket(nbd, arg, &err);
   1115	if (!sock)
   1116		return err;
   1117
   1118	/*
   1119	 * We need to make sure we don't get any errant requests while we're
   1120	 * reallocating the ->socks array.
   1121	 */
   1122	blk_mq_freeze_queue(nbd->disk->queue);
   1123
   1124	if (!netlink && !nbd->task_setup &&
   1125	    !test_bit(NBD_RT_BOUND, &config->runtime_flags))
   1126		nbd->task_setup = current;
   1127
   1128	if (!netlink &&
   1129	    (nbd->task_setup != current ||
   1130	     test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
   1131		dev_err(disk_to_dev(nbd->disk),
   1132			"Device being setup by another task");
   1133		err = -EBUSY;
   1134		goto put_socket;
   1135	}
   1136
   1137	nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
   1138	if (!nsock) {
   1139		err = -ENOMEM;
   1140		goto put_socket;
   1141	}
   1142
   1143	socks = krealloc(config->socks, (config->num_connections + 1) *
   1144			 sizeof(struct nbd_sock *), GFP_KERNEL);
   1145	if (!socks) {
   1146		kfree(nsock);
   1147		err = -ENOMEM;
   1148		goto put_socket;
   1149	}
   1150
   1151	config->socks = socks;
   1152
   1153	nsock->fallback_index = -1;
   1154	nsock->dead = false;
   1155	mutex_init(&nsock->tx_lock);
   1156	nsock->sock = sock;
   1157	nsock->pending = NULL;
   1158	nsock->sent = 0;
   1159	nsock->cookie = 0;
   1160	socks[config->num_connections++] = nsock;
   1161	atomic_inc(&config->live_connections);
   1162	blk_mq_unfreeze_queue(nbd->disk->queue);
   1163
   1164	return 0;
   1165
   1166put_socket:
   1167	blk_mq_unfreeze_queue(nbd->disk->queue);
   1168	sockfd_put(sock);
   1169	return err;
   1170}
   1171
   1172static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
   1173{
   1174	struct nbd_config *config = nbd->config;
   1175	struct socket *sock, *old;
   1176	struct recv_thread_args *args;
   1177	int i;
   1178	int err;
   1179
   1180	sock = nbd_get_socket(nbd, arg, &err);
   1181	if (!sock)
   1182		return err;
   1183
   1184	args = kzalloc(sizeof(*args), GFP_KERNEL);
   1185	if (!args) {
   1186		sockfd_put(sock);
   1187		return -ENOMEM;
   1188	}
   1189
   1190	for (i = 0; i < config->num_connections; i++) {
   1191		struct nbd_sock *nsock = config->socks[i];
   1192
   1193		if (!nsock->dead)
   1194			continue;
   1195
   1196		mutex_lock(&nsock->tx_lock);
   1197		if (!nsock->dead) {
   1198			mutex_unlock(&nsock->tx_lock);
   1199			continue;
   1200		}
   1201		sk_set_memalloc(sock->sk);
   1202		if (nbd->tag_set.timeout)
   1203			sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
   1204		atomic_inc(&config->recv_threads);
   1205		refcount_inc(&nbd->config_refs);
   1206		old = nsock->sock;
   1207		nsock->fallback_index = -1;
   1208		nsock->sock = sock;
   1209		nsock->dead = false;
   1210		INIT_WORK(&args->work, recv_work);
   1211		args->index = i;
   1212		args->nbd = nbd;
   1213		nsock->cookie++;
   1214		mutex_unlock(&nsock->tx_lock);
   1215		sockfd_put(old);
   1216
   1217		clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
   1218
    1219		/* We take the tx_lock in an error path in recv_work, so we
    1220		 * need to queue_work outside of the tx_lock.
   1221		 */
   1222		queue_work(nbd->recv_workq, &args->work);
   1223
   1224		atomic_inc(&config->live_connections);
   1225		wake_up(&config->conn_wait);
   1226		return 0;
   1227	}
   1228	sockfd_put(sock);
   1229	kfree(args);
   1230	return -ENOSPC;
   1231}
   1232
   1233static void nbd_bdev_reset(struct nbd_device *nbd)
   1234{
   1235	if (disk_openers(nbd->disk) > 1)
   1236		return;
   1237	set_capacity(nbd->disk, 0);
   1238}
   1239
   1240static void nbd_parse_flags(struct nbd_device *nbd)
   1241{
   1242	struct nbd_config *config = nbd->config;
   1243	if (config->flags & NBD_FLAG_READ_ONLY)
   1244		set_disk_ro(nbd->disk, true);
   1245	else
   1246		set_disk_ro(nbd->disk, false);
   1247	if (config->flags & NBD_FLAG_SEND_FLUSH) {
   1248		if (config->flags & NBD_FLAG_SEND_FUA)
   1249			blk_queue_write_cache(nbd->disk->queue, true, true);
   1250		else
   1251			blk_queue_write_cache(nbd->disk->queue, true, false);
   1252	}
   1253	else
   1254		blk_queue_write_cache(nbd->disk->queue, false, false);
   1255}
   1256
   1257static void send_disconnects(struct nbd_device *nbd)
   1258{
   1259	struct nbd_config *config = nbd->config;
   1260	struct nbd_request request = {
   1261		.magic = htonl(NBD_REQUEST_MAGIC),
   1262		.type = htonl(NBD_CMD_DISC),
   1263	};
   1264	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
   1265	struct iov_iter from;
   1266	int i, ret;
   1267
   1268	for (i = 0; i < config->num_connections; i++) {
   1269		struct nbd_sock *nsock = config->socks[i];
   1270
   1271		iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
   1272		mutex_lock(&nsock->tx_lock);
   1273		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
   1274		if (ret < 0)
   1275			dev_err(disk_to_dev(nbd->disk),
   1276				"Send disconnect failed %d\n", ret);
   1277		mutex_unlock(&nsock->tx_lock);
   1278	}
   1279}
   1280
   1281static int nbd_disconnect(struct nbd_device *nbd)
   1282{
   1283	struct nbd_config *config = nbd->config;
   1284
   1285	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
   1286	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
   1287	set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
   1288	send_disconnects(nbd);
   1289	return 0;
   1290}
   1291
   1292static void nbd_clear_sock(struct nbd_device *nbd)
   1293{
   1294	sock_shutdown(nbd);
   1295	nbd_clear_que(nbd);
   1296	nbd->task_setup = NULL;
   1297}
   1298
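        /*
         * Drop a reference on nbd->config.  When the last reference goes away
         * the sockets are shut down and freed, the sysfs and debugfs entries
         * are removed, and the config itself is released, all under
         * config_lock.
         */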
   1299static void nbd_config_put(struct nbd_device *nbd)
   1300{
   1301	if (refcount_dec_and_mutex_lock(&nbd->config_refs,
   1302					&nbd->config_lock)) {
   1303		struct nbd_config *config = nbd->config;
   1304		nbd_dev_dbg_close(nbd);
   1305		invalidate_disk(nbd->disk);
   1306		if (nbd->config->bytesize)
   1307			kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
   1308		if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
   1309				       &config->runtime_flags))
   1310			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
   1311		nbd->pid = 0;
   1312		if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE,
   1313				       &config->runtime_flags)) {
   1314			device_remove_file(disk_to_dev(nbd->disk), &backend_attr);
   1315			kfree(nbd->backend);
   1316			nbd->backend = NULL;
   1317		}
   1318		nbd_clear_sock(nbd);
   1319		if (config->num_connections) {
   1320			int i;
   1321			for (i = 0; i < config->num_connections; i++) {
   1322				sockfd_put(config->socks[i]->sock);
   1323				kfree(config->socks[i]);
   1324			}
   1325			kfree(config->socks);
   1326		}
   1327		kfree(nbd->config);
   1328		nbd->config = NULL;
   1329
   1330		nbd->tag_set.timeout = 0;
   1331		nbd->disk->queue->limits.discard_granularity = 0;
   1332		blk_queue_max_discard_sectors(nbd->disk->queue, 0);
   1333
   1334		mutex_unlock(&nbd->config_lock);
   1335		nbd_put(nbd);
   1336		module_put(THIS_MODULE);
   1337	}
   1338}
   1339
   1340static int nbd_start_device(struct nbd_device *nbd)
   1341{
   1342	struct nbd_config *config = nbd->config;
   1343	int num_connections = config->num_connections;
   1344	int error = 0, i;
   1345
   1346	if (nbd->pid)
   1347		return -EBUSY;
   1348	if (!config->socks)
   1349		return -EINVAL;
   1350	if (num_connections > 1 &&
   1351	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
   1352		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
   1353		return -EINVAL;
   1354	}
   1355
   1356	blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
   1357	nbd->pid = task_pid_nr(current);
   1358
   1359	nbd_parse_flags(nbd);
   1360
   1361	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
   1362	if (error) {
   1363		dev_err(disk_to_dev(nbd->disk), "device_create_file failed for pid!\n");
   1364		return error;
   1365	}
   1366	set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
   1367
   1368	nbd_dev_dbg_init(nbd);
   1369	for (i = 0; i < num_connections; i++) {
   1370		struct recv_thread_args *args;
   1371
   1372		args = kzalloc(sizeof(*args), GFP_KERNEL);
   1373		if (!args) {
   1374			sock_shutdown(nbd);
   1375			/*
    1376			 * If num_connections is m (m > 2) and kzallocs
    1377			 * No. 1 through No. n (1 < n < m) succeeded but
    1378			 * No. (n + 1) failed, we still have n recv threads running.
    1379			 * Flush the workqueue here to prevent those recv threads
    1380			 * from dropping the last config_refs and trying to destroy
    1381			 * the workqueue from inside the workqueue.
   1382			 */
   1383			if (i)
   1384				flush_workqueue(nbd->recv_workq);
   1385			return -ENOMEM;
   1386		}
   1387		sk_set_memalloc(config->socks[i]->sock->sk);
   1388		if (nbd->tag_set.timeout)
   1389			config->socks[i]->sock->sk->sk_sndtimeo =
   1390				nbd->tag_set.timeout;
   1391		atomic_inc(&config->recv_threads);
   1392		refcount_inc(&nbd->config_refs);
   1393		INIT_WORK(&args->work, recv_work);
   1394		args->nbd = nbd;
   1395		args->index = i;
   1396		queue_work(nbd->recv_workq, &args->work);
   1397	}
   1398	return nbd_set_size(nbd, config->bytesize, nbd_blksize(config));
   1399}
   1400
   1401static int nbd_start_device_ioctl(struct nbd_device *nbd)
   1402{
   1403	struct nbd_config *config = nbd->config;
   1404	int ret;
   1405
   1406	ret = nbd_start_device(nbd);
   1407	if (ret)
   1408		return ret;
   1409
   1410	if (max_part)
   1411		set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
   1412	mutex_unlock(&nbd->config_lock);
   1413	ret = wait_event_interruptible(config->recv_wq,
   1414					 atomic_read(&config->recv_threads) == 0);
   1415	if (ret)
   1416		sock_shutdown(nbd);
   1417	flush_workqueue(nbd->recv_workq);
   1418
   1419	mutex_lock(&nbd->config_lock);
   1420	nbd_bdev_reset(nbd);
   1421	/* user requested, ignore socket errors */
   1422	if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
   1423		ret = 0;
   1424	if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
   1425		ret = -ETIMEDOUT;
   1426	return ret;
   1427}
   1428
   1429static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
   1430				 struct block_device *bdev)
   1431{
   1432	nbd_clear_sock(nbd);
   1433	__invalidate_device(bdev, true);
   1434	nbd_bdev_reset(nbd);
   1435	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
   1436			       &nbd->config->runtime_flags))
   1437		nbd_config_put(nbd);
   1438}
   1439
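        /*
         * The timeout is supplied in seconds and stored in jiffies.  A timeout
         * of 0 keeps the default 30 second request-queue timeout but disables
         * the socket-killing behaviour in nbd_xmit_timeout().
         */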
   1440static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
   1441{
   1442	nbd->tag_set.timeout = timeout * HZ;
   1443	if (timeout)
   1444		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
   1445	else
   1446		blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
   1447}
   1448
   1449/* Must be called with config_lock held */
   1450static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
   1451		       unsigned int cmd, unsigned long arg)
   1452{
   1453	struct nbd_config *config = nbd->config;
   1454	loff_t bytesize;
   1455
   1456	switch (cmd) {
   1457	case NBD_DISCONNECT:
   1458		return nbd_disconnect(nbd);
   1459	case NBD_CLEAR_SOCK:
   1460		nbd_clear_sock_ioctl(nbd, bdev);
   1461		return 0;
   1462	case NBD_SET_SOCK:
   1463		return nbd_add_socket(nbd, arg, false);
   1464	case NBD_SET_BLKSIZE:
   1465		return nbd_set_size(nbd, config->bytesize, arg);
   1466	case NBD_SET_SIZE:
   1467		return nbd_set_size(nbd, arg, nbd_blksize(config));
   1468	case NBD_SET_SIZE_BLOCKS:
   1469		if (check_shl_overflow(arg, config->blksize_bits, &bytesize))
   1470			return -EINVAL;
   1471		return nbd_set_size(nbd, bytesize, nbd_blksize(config));
   1472	case NBD_SET_TIMEOUT:
   1473		nbd_set_cmd_timeout(nbd, arg);
   1474		return 0;
   1475
   1476	case NBD_SET_FLAGS:
   1477		config->flags = arg;
   1478		return 0;
   1479	case NBD_DO_IT:
   1480		return nbd_start_device_ioctl(nbd);
   1481	case NBD_CLEAR_QUE:
   1482		/*
   1483		 * This is for compatibility only.  The queue is always cleared
   1484		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
   1485		 */
   1486		return 0;
   1487	case NBD_PRINT_DEBUG:
   1488		/*
   1489		 * For compatibility only, we no longer keep a list of
   1490		 * outstanding requests.
   1491		 */
   1492		return 0;
   1493	}
   1494	return -ENOTTY;
   1495}
   1496
   1497static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
   1498		     unsigned int cmd, unsigned long arg)
   1499{
   1500	struct nbd_device *nbd = bdev->bd_disk->private_data;
   1501	struct nbd_config *config = nbd->config;
   1502	int error = -EINVAL;
   1503
   1504	if (!capable(CAP_SYS_ADMIN))
   1505		return -EPERM;
   1506
   1507	/* The block layer will pass back some non-nbd ioctls in case we have
    1508	 * special handling for them, but we don't, so just return an error.
   1509	 */
   1510	if (_IOC_TYPE(cmd) != 0xab)
   1511		return -EINVAL;
   1512
   1513	mutex_lock(&nbd->config_lock);
   1514
    1515	/* Don't allow ioctl operations on an nbd device that was created with
   1516	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
   1517	 */
   1518	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
   1519	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
   1520		error = __nbd_ioctl(bdev, nbd, cmd, arg);
   1521	else
   1522		dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
   1523	mutex_unlock(&nbd->config_lock);
   1524	return error;
   1525}
   1526
   1527static struct nbd_config *nbd_alloc_config(void)
   1528{
   1529	struct nbd_config *config;
   1530
   1531	if (!try_module_get(THIS_MODULE))
   1532		return ERR_PTR(-ENODEV);
   1533
   1534	config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
   1535	if (!config) {
   1536		module_put(THIS_MODULE);
   1537		return ERR_PTR(-ENOMEM);
   1538	}
   1539
   1540	atomic_set(&config->recv_threads, 0);
   1541	init_waitqueue_head(&config->recv_wq);
   1542	init_waitqueue_head(&config->conn_wait);
   1543	config->blksize_bits = NBD_DEF_BLKSIZE_BITS;
   1544	atomic_set(&config->live_connections, 0);
   1545	return config;
   1546}
   1547
   1548static int nbd_open(struct block_device *bdev, fmode_t mode)
   1549{
   1550	struct nbd_device *nbd;
   1551	int ret = 0;
   1552
   1553	mutex_lock(&nbd_index_mutex);
   1554	nbd = bdev->bd_disk->private_data;
   1555	if (!nbd) {
   1556		ret = -ENXIO;
   1557		goto out;
   1558	}
   1559	if (!refcount_inc_not_zero(&nbd->refs)) {
   1560		ret = -ENXIO;
   1561		goto out;
   1562	}
   1563	if (!refcount_inc_not_zero(&nbd->config_refs)) {
   1564		struct nbd_config *config;
   1565
   1566		mutex_lock(&nbd->config_lock);
   1567		if (refcount_inc_not_zero(&nbd->config_refs)) {
   1568			mutex_unlock(&nbd->config_lock);
   1569			goto out;
   1570		}
   1571		config = nbd_alloc_config();
   1572		if (IS_ERR(config)) {
   1573			ret = PTR_ERR(config);
   1574			mutex_unlock(&nbd->config_lock);
   1575			goto out;
   1576		}
   1577		nbd->config = config;
   1578		refcount_set(&nbd->config_refs, 1);
   1579		refcount_inc(&nbd->refs);
   1580		mutex_unlock(&nbd->config_lock);
   1581		if (max_part)
   1582			set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
   1583	} else if (nbd_disconnected(nbd->config)) {
   1584		if (max_part)
   1585			set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
   1586	}
   1587out:
   1588	mutex_unlock(&nbd_index_mutex);
   1589	return ret;
   1590}
   1591
   1592static void nbd_release(struct gendisk *disk, fmode_t mode)
   1593{
   1594	struct nbd_device *nbd = disk->private_data;
   1595
   1596	if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
   1597			disk_openers(disk) == 0)
   1598		nbd_disconnect_and_put(nbd);
   1599
   1600	nbd_config_put(nbd);
   1601	nbd_put(nbd);
   1602}
   1603
   1604static const struct block_device_operations nbd_fops =
   1605{
   1606	.owner =	THIS_MODULE,
   1607	.open =		nbd_open,
   1608	.release =	nbd_release,
   1609	.ioctl =	nbd_ioctl,
   1610	.compat_ioctl =	nbd_ioctl,
   1611};
   1612
   1613#if IS_ENABLED(CONFIG_DEBUG_FS)
   1614
   1615static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
   1616{
   1617	struct nbd_device *nbd = s->private;
   1618
   1619	if (nbd->pid)
   1620		seq_printf(s, "recv: %d\n", nbd->pid);
   1621
   1622	return 0;
   1623}
   1624
   1625DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
   1626
   1627static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
   1628{
   1629	struct nbd_device *nbd = s->private;
   1630	u32 flags = nbd->config->flags;
   1631
   1632	seq_printf(s, "Hex: 0x%08x\n\n", flags);
   1633
   1634	seq_puts(s, "Known flags:\n");
   1635
   1636	if (flags & NBD_FLAG_HAS_FLAGS)
   1637		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
   1638	if (flags & NBD_FLAG_READ_ONLY)
   1639		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
   1640	if (flags & NBD_FLAG_SEND_FLUSH)
   1641		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
   1642	if (flags & NBD_FLAG_SEND_FUA)
   1643		seq_puts(s, "NBD_FLAG_SEND_FUA\n");
   1644	if (flags & NBD_FLAG_SEND_TRIM)
   1645		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
   1646
   1647	return 0;
   1648}
   1649
   1650DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
   1651
   1652static int nbd_dev_dbg_init(struct nbd_device *nbd)
   1653{
   1654	struct dentry *dir;
   1655	struct nbd_config *config = nbd->config;
   1656
   1657	if (!nbd_dbg_dir)
   1658		return -EIO;
   1659
   1660	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
   1661	if (!dir) {
   1662		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
   1663			nbd_name(nbd));
   1664		return -EIO;
   1665	}
   1666	config->dbg_dir = dir;
   1667
   1668	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
   1669	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
   1670	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
   1671	debugfs_create_u32("blocksize_bits", 0444, dir, &config->blksize_bits);
   1672	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
   1673
   1674	return 0;
   1675}
   1676
   1677static void nbd_dev_dbg_close(struct nbd_device *nbd)
   1678{
   1679	debugfs_remove_recursive(nbd->config->dbg_dir);
   1680}
   1681
   1682static int nbd_dbg_init(void)
   1683{
   1684	struct dentry *dbg_dir;
   1685
   1686	dbg_dir = debugfs_create_dir("nbd", NULL);
   1687	if (!dbg_dir)
   1688		return -EIO;
   1689
   1690	nbd_dbg_dir = dbg_dir;
   1691
   1692	return 0;
   1693}
   1694
   1695static void nbd_dbg_close(void)
   1696{
   1697	debugfs_remove_recursive(nbd_dbg_dir);
   1698}
   1699
   1700#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
   1701
   1702static int nbd_dev_dbg_init(struct nbd_device *nbd)
   1703{
   1704	return 0;
   1705}
   1706
   1707static void nbd_dev_dbg_close(struct nbd_device *nbd)
   1708{
   1709}
   1710
   1711static int nbd_dbg_init(void)
   1712{
   1713	return 0;
   1714}
   1715
   1716static void nbd_dbg_close(void)
   1717{
   1718}
   1719
   1720#endif
   1721
   1722static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
   1723			    unsigned int hctx_idx, unsigned int numa_node)
   1724{
   1725	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
   1726	cmd->nbd = set->driver_data;
   1727	cmd->flags = 0;
   1728	mutex_init(&cmd->lock);
   1729	return 0;
   1730}
   1731
   1732static const struct blk_mq_ops nbd_mq_ops = {
   1733	.queue_rq	= nbd_queue_rq,
   1734	.complete	= nbd_complete_rq,
   1735	.init_request	= nbd_init_request,
   1736	.timeout	= nbd_xmit_timeout,
   1737};
   1738
   1739static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
   1740{
   1741	struct nbd_device *nbd;
   1742	struct gendisk *disk;
   1743	int err = -ENOMEM;
   1744
   1745	nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
   1746	if (!nbd)
   1747		goto out;
   1748
   1749	nbd->tag_set.ops = &nbd_mq_ops;
   1750	nbd->tag_set.nr_hw_queues = 1;
   1751	nbd->tag_set.queue_depth = 128;
   1752	nbd->tag_set.numa_node = NUMA_NO_NODE;
   1753	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
   1754	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
   1755		BLK_MQ_F_BLOCKING;
   1756	nbd->tag_set.driver_data = nbd;
   1757	INIT_WORK(&nbd->remove_work, nbd_dev_remove_work);
   1758	nbd->backend = NULL;
   1759
   1760	err = blk_mq_alloc_tag_set(&nbd->tag_set);
   1761	if (err)
   1762		goto out_free_nbd;
   1763
   1764	mutex_lock(&nbd_index_mutex);
   1765	if (index >= 0) {
   1766		err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
   1767				GFP_KERNEL);
   1768		if (err == -ENOSPC)
   1769			err = -EEXIST;
   1770	} else {
   1771		err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
   1772		if (err >= 0)
   1773			index = err;
   1774	}
   1775	nbd->index = index;
   1776	mutex_unlock(&nbd_index_mutex);
   1777	if (err < 0)
   1778		goto out_free_tags;
   1779
   1780	disk = blk_mq_alloc_disk(&nbd->tag_set, NULL);
   1781	if (IS_ERR(disk)) {
   1782		err = PTR_ERR(disk);
   1783		goto out_free_idr;
   1784	}
   1785	nbd->disk = disk;
   1786
   1787	nbd->recv_workq = alloc_workqueue("nbd%d-recv",
   1788					  WQ_MEM_RECLAIM | WQ_HIGHPRI |
   1789					  WQ_UNBOUND, 0, nbd->index);
   1790	if (!nbd->recv_workq) {
   1791		dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
   1792		err = -ENOMEM;
   1793		goto out_err_disk;
   1794	}
   1795
   1796	/*
   1797	 * Tell the block layer that we are not a rotational device
   1798	 */
   1799	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
   1800	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
   1801	disk->queue->limits.discard_granularity = 0;
   1802	blk_queue_max_discard_sectors(disk->queue, 0);
   1803	blk_queue_max_segment_size(disk->queue, UINT_MAX);
   1804	blk_queue_max_segments(disk->queue, USHRT_MAX);
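        	/*
        	 * In 512-byte sectors: 65536 caps a hardware request at 32 MiB,
        	 * while limits.max_sectors = 256 keeps the default soft limit
        	 * at 128 KiB.
        	 */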
   1805	blk_queue_max_hw_sectors(disk->queue, 65536);
   1806	disk->queue->limits.max_sectors = 256;
   1807
   1808	mutex_init(&nbd->config_lock);
   1809	refcount_set(&nbd->config_refs, 0);
   1810	/*
    1811	 * Start out with zero references to keep other threads from using
   1812	 * this device until it is fully initialized.
   1813	 */
   1814	refcount_set(&nbd->refs, 0);
   1815	INIT_LIST_HEAD(&nbd->list);
   1816	disk->major = NBD_MAJOR;
   1817	disk->first_minor = index << part_shift;
   1818	disk->minors = 1 << part_shift;
   1819	disk->fops = &nbd_fops;
   1820	disk->private_data = nbd;
   1821	sprintf(disk->disk_name, "nbd%d", index);
   1822	err = add_disk(disk);
   1823	if (err)
   1824		goto out_free_work;
   1825
   1826	/*
   1827	 * Now publish the device.
   1828	 */
   1829	refcount_set(&nbd->refs, refs);
   1830	nbd_total_devices++;
   1831	return nbd;
   1832
   1833out_free_work:
   1834	destroy_workqueue(nbd->recv_workq);
   1835out_err_disk:
   1836	blk_cleanup_disk(disk);
   1837out_free_idr:
   1838	mutex_lock(&nbd_index_mutex);
   1839	idr_remove(&nbd_index_idr, index);
   1840	mutex_unlock(&nbd_index_mutex);
   1841out_free_tags:
   1842	blk_mq_free_tag_set(&nbd->tag_set);
   1843out_free_nbd:
   1844	kfree(nbd);
   1845out:
   1846	return ERR_PTR(err);
   1847}
   1848
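        /*
         * Pick an idle device for an index-less connect request: skip devices
         * that still hold a config reference or are marked
         * DESTROY_ON_DISCONNECT, and take a device reference on the one
         * returned.  Must be called with nbd_index_mutex held.
         */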
   1849static struct nbd_device *nbd_find_get_unused(void)
   1850{
   1851	struct nbd_device *nbd;
   1852	int id;
   1853
   1854	lockdep_assert_held(&nbd_index_mutex);
   1855
   1856	idr_for_each_entry(&nbd_index_idr, nbd, id) {
   1857		if (refcount_read(&nbd->config_refs) ||
   1858		    test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
   1859			continue;
   1860		if (refcount_inc_not_zero(&nbd->refs))
   1861			return nbd;
   1862	}
   1863
   1864	return NULL;
   1865}
   1866
   1867/* Netlink interface. */
   1868static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
   1869	[NBD_ATTR_INDEX]		=	{ .type = NLA_U32 },
   1870	[NBD_ATTR_SIZE_BYTES]		=	{ .type = NLA_U64 },
   1871	[NBD_ATTR_BLOCK_SIZE_BYTES]	=	{ .type = NLA_U64 },
   1872	[NBD_ATTR_TIMEOUT]		=	{ .type = NLA_U64 },
   1873	[NBD_ATTR_SERVER_FLAGS]		=	{ .type = NLA_U64 },
   1874	[NBD_ATTR_CLIENT_FLAGS]		=	{ .type = NLA_U64 },
   1875	[NBD_ATTR_SOCKETS]		=	{ .type = NLA_NESTED},
   1876	[NBD_ATTR_DEAD_CONN_TIMEOUT]	=	{ .type = NLA_U64 },
   1877	[NBD_ATTR_DEVICE_LIST]		=	{ .type = NLA_NESTED},
   1878	[NBD_ATTR_BACKEND_IDENTIFIER]	=	{ .type = NLA_STRING},
   1879};
   1880
   1881static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
   1882	[NBD_SOCK_FD]			=	{ .type = NLA_U32 },
   1883};
   1884
   1885/* We don't use this right now since we don't parse the incoming list, but we
   1886 * still want it here so userspace knows what to expect.
   1887 */
   1888static const struct nla_policy __attribute__((unused))
   1889nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
   1890	[NBD_DEVICE_INDEX]		=	{ .type = NLA_U32 },
   1891	[NBD_DEVICE_CONNECTED]		=	{ .type = NLA_U8 },
   1892};
   1893
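        /*
         * Apply NBD_ATTR_SIZE_BYTES / NBD_ATTR_BLOCK_SIZE_BYTES from a netlink
         * request, defaulting to the current config values, and resize only if
         * something actually changed.
         */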
   1894static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
   1895{
   1896	struct nbd_config *config = nbd->config;
   1897	u64 bsize = nbd_blksize(config);
   1898	u64 bytes = config->bytesize;
   1899
   1900	if (info->attrs[NBD_ATTR_SIZE_BYTES])
   1901		bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
   1902
   1903	if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES])
   1904		bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
   1905
   1906	if (bytes != config->bytesize || bsize != nbd_blksize(config))
   1907		return nbd_set_size(nbd, bytes, bsize);
   1908	return 0;
   1909}
   1910
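        /*
         * NBD_CMD_CONNECT.  The request must carry NBD_ATTR_SIZE_BYTES and an
         * NBD_ATTR_SOCKETS nest with one NBD_SOCK_ITEM per connection, each
         * holding an NBD_SOCK_FD of an already-connected socket.  A rough
         * sketch of the attribute layout (not a wire dump):
         *
         *   NBD_ATTR_INDEX        u32, optional: pick any free device if absent
         *   NBD_ATTR_SIZE_BYTES   u64
         *   NBD_ATTR_SOCKETS
         *     NBD_SOCK_ITEM
         *       NBD_SOCK_FD       u32
         *
         * Userspace (e.g. nbd-client) builds such a message; on success the
         * reply carries NBD_ATTR_INDEX of the device that was set up.
         */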
   1911static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
   1912{
   1913	struct nbd_device *nbd;
   1914	struct nbd_config *config;
   1915	int index = -1;
   1916	int ret;
   1917	bool put_dev = false;
   1918
   1919	if (!netlink_capable(skb, CAP_SYS_ADMIN))
   1920		return -EPERM;
   1921
   1922	if (info->attrs[NBD_ATTR_INDEX]) {
   1923		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
   1924
   1925		/*
    1926		 * A too-large first_minor can cause duplicate creation of
    1927		 * sysfs files/links, since index << part_shift might overflow,
    1928		 * and MKDEV() expects first_minor to fit in at most 20 bits.
   1929		 */
   1930		if (index < 0 || index > MINORMASK >> part_shift) {
   1931			pr_err("illegal input index %d\n", index);
   1932			return -EINVAL;
   1933		}
   1934	}
   1935	if (!info->attrs[NBD_ATTR_SOCKETS]) {
   1936		pr_err("must specify at least one socket\n");
   1937		return -EINVAL;
   1938	}
   1939	if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
   1940		pr_err("must specify a size in bytes for the device\n");
   1941		return -EINVAL;
   1942	}
   1943again:
   1944	mutex_lock(&nbd_index_mutex);
   1945	if (index == -1) {
   1946		nbd = nbd_find_get_unused();
   1947	} else {
   1948		nbd = idr_find(&nbd_index_idr, index);
   1949		if (nbd) {
   1950			if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
   1951			     test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
   1952			    !refcount_inc_not_zero(&nbd->refs)) {
   1953				mutex_unlock(&nbd_index_mutex);
   1954				pr_err("nbd: device at index %d is going down\n",
   1955					index);
   1956				return -EINVAL;
   1957			}
   1958		}
   1959	}
   1960	mutex_unlock(&nbd_index_mutex);
   1961
   1962	if (!nbd) {
   1963		nbd = nbd_dev_add(index, 2);
   1964		if (IS_ERR(nbd)) {
   1965			pr_err("nbd: failed to add new device\n");
   1966			return PTR_ERR(nbd);
   1967		}
   1968	}
   1969
   1970	mutex_lock(&nbd->config_lock);
   1971	if (refcount_read(&nbd->config_refs)) {
   1972		mutex_unlock(&nbd->config_lock);
   1973		nbd_put(nbd);
   1974		if (index == -1)
   1975			goto again;
   1976		pr_err("nbd%d already in use\n", index);
   1977		return -EBUSY;
   1978	}
   1979	if (WARN_ON(nbd->config)) {
   1980		mutex_unlock(&nbd->config_lock);
   1981		nbd_put(nbd);
   1982		return -EINVAL;
   1983	}
   1984	config = nbd_alloc_config();
   1985	if (IS_ERR(config)) {
   1986		mutex_unlock(&nbd->config_lock);
   1987		nbd_put(nbd);
   1988		pr_err("couldn't allocate config\n");
   1989		return PTR_ERR(config);
   1990	}
   1991	nbd->config = config;
   1992	refcount_set(&nbd->config_refs, 1);
   1993	set_bit(NBD_RT_BOUND, &config->runtime_flags);
   1994
   1995	ret = nbd_genl_size_set(info, nbd);
   1996	if (ret)
   1997		goto out;
   1998
   1999	if (info->attrs[NBD_ATTR_TIMEOUT])
   2000		nbd_set_cmd_timeout(nbd,
   2001				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
   2002	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
   2003		config->dead_conn_timeout =
   2004			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
   2005		config->dead_conn_timeout *= HZ;
   2006	}
   2007	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
   2008		config->flags =
   2009			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
   2010	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
   2011		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
   2012		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
   2013			/*
   2014			 * We have 1 ref to keep the device around, and then 1
   2015			 * ref for our current operation here, which will be
    2016			 * inherited by the config.  If DESTROY_ON_DISCONNECT was
    2017			 * already set, the extra ref was already dropped when the
    2018			 * flag was first set, so we don't need to drop it again
    2019			 * via put_dev.
   2020			 */
   2021			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
   2022					      &nbd->flags))
   2023				put_dev = true;
   2024		} else {
   2025			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
   2026					       &nbd->flags))
   2027				refcount_inc(&nbd->refs);
   2028		}
   2029		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
   2030			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
   2031				&config->runtime_flags);
   2032		}
   2033	}
   2034
   2035	if (info->attrs[NBD_ATTR_SOCKETS]) {
   2036		struct nlattr *attr;
   2037		int rem, fd;
   2038
   2039		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
   2040				    rem) {
   2041			struct nlattr *socks[NBD_SOCK_MAX+1];
   2042
   2043			if (nla_type(attr) != NBD_SOCK_ITEM) {
   2044				pr_err("socks must be embedded in a SOCK_ITEM attr\n");
   2045				ret = -EINVAL;
   2046				goto out;
   2047			}
   2048			ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
   2049							  attr,
   2050							  nbd_sock_policy,
   2051							  info->extack);
   2052			if (ret != 0) {
   2053				pr_err("error processing sock list\n");
   2054				ret = -EINVAL;
   2055				goto out;
   2056			}
   2057			if (!socks[NBD_SOCK_FD])
   2058				continue;
   2059			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
   2060			ret = nbd_add_socket(nbd, fd, true);
   2061			if (ret)
   2062				goto out;
   2063		}
   2064	}
   2065	ret = nbd_start_device(nbd);
   2066	if (ret)
   2067		goto out;
   2068	if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
   2069		nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
   2070					  GFP_KERNEL);
   2071		if (!nbd->backend) {
   2072			ret = -ENOMEM;
   2073			goto out;
   2074		}
   2075	}
   2076	ret = device_create_file(disk_to_dev(nbd->disk), &backend_attr);
   2077	if (ret) {
   2078		dev_err(disk_to_dev(nbd->disk),
   2079			"device_create_file failed for backend!\n");
   2080		goto out;
   2081	}
   2082	set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags);
   2083out:
   2084	mutex_unlock(&nbd->config_lock);
   2085	if (!ret) {
   2086		set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
   2087		refcount_inc(&nbd->config_refs);
   2088		nbd_connect_reply(info, nbd->index);
   2089	}
   2090	nbd_config_put(nbd);
   2091	if (put_dev)
   2092		nbd_put(nbd);
   2093	return ret;
   2094}
   2095
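        /*
         * Tear down a running device on behalf of the netlink interface: send
         * the disconnect request, shut the sockets down, flush the receive
         * workqueue so nbd_clear_que() can safely fail whatever is still in
         * flight, and drop the config reference installed by NBD_CMD_CONNECT
         * if it is still held.
         */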
   2096static void nbd_disconnect_and_put(struct nbd_device *nbd)
   2097{
   2098	mutex_lock(&nbd->config_lock);
   2099	nbd_disconnect(nbd);
   2100	sock_shutdown(nbd);
   2101	wake_up(&nbd->config->conn_wait);
   2102	/*
    2103	 * Make sure the recv thread has finished so that we can safely call
    2104	 * nbd_clear_que() to cancel the in-flight I/Os.
   2105	 */
   2106	flush_workqueue(nbd->recv_workq);
   2107	nbd_clear_que(nbd);
   2108	nbd->task_setup = NULL;
   2109	mutex_unlock(&nbd->config_lock);
   2110
   2111	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
   2112			       &nbd->config->runtime_flags))
   2113		nbd_config_put(nbd);
   2114}
   2115
   2116static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
   2117{
   2118	struct nbd_device *nbd;
   2119	int index;
   2120
   2121	if (!netlink_capable(skb, CAP_SYS_ADMIN))
   2122		return -EPERM;
   2123
   2124	if (!info->attrs[NBD_ATTR_INDEX]) {
   2125		pr_err("must specify an index to disconnect\n");
   2126		return -EINVAL;
   2127	}
   2128	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
   2129	mutex_lock(&nbd_index_mutex);
   2130	nbd = idr_find(&nbd_index_idr, index);
   2131	if (!nbd) {
   2132		mutex_unlock(&nbd_index_mutex);
   2133		pr_err("couldn't find device at index %d\n", index);
   2134		return -EINVAL;
   2135	}
   2136	if (!refcount_inc_not_zero(&nbd->refs)) {
   2137		mutex_unlock(&nbd_index_mutex);
   2138		pr_err("device at index %d is going down\n", index);
   2139		return -EINVAL;
   2140	}
   2141	mutex_unlock(&nbd_index_mutex);
   2142	if (!refcount_inc_not_zero(&nbd->config_refs))
   2143		goto put_nbd;
   2144	nbd_disconnect_and_put(nbd);
   2145	nbd_config_put(nbd);
   2146put_nbd:
   2147	nbd_put(nbd);
   2148	return 0;
   2149}
   2150
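        /*
         * NBD_CMD_RECONFIGURE: adjust size, timeouts and client flags on a
         * device that is already running and feed it replacement sockets via
         * nbd_reconnect_socket().  The device must be bound and have a live
         * receiver (nbd->pid), and if a backend identifier was recorded at
         * connect time the caller has to present a matching one.
         */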
   2151static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
   2152{
   2153	struct nbd_device *nbd = NULL;
   2154	struct nbd_config *config;
   2155	int index;
   2156	int ret = 0;
   2157	bool put_dev = false;
   2158
   2159	if (!netlink_capable(skb, CAP_SYS_ADMIN))
   2160		return -EPERM;
   2161
   2162	if (!info->attrs[NBD_ATTR_INDEX]) {
   2163		pr_err("must specify a device to reconfigure\n");
   2164		return -EINVAL;
   2165	}
   2166	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
   2167	mutex_lock(&nbd_index_mutex);
   2168	nbd = idr_find(&nbd_index_idr, index);
   2169	if (!nbd) {
   2170		mutex_unlock(&nbd_index_mutex);
   2171		pr_err("couldn't find a device at index %d\n", index);
   2172		return -EINVAL;
   2173	}
   2174	if (nbd->backend) {
   2175		if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
   2176			if (nla_strcmp(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
   2177				       nbd->backend)) {
   2178				mutex_unlock(&nbd_index_mutex);
   2179				dev_err(nbd_to_dev(nbd),
   2180					"backend image doesn't match with %s\n",
   2181					nbd->backend);
   2182				return -EINVAL;
   2183			}
   2184		} else {
   2185			mutex_unlock(&nbd_index_mutex);
   2186			dev_err(nbd_to_dev(nbd), "must specify backend\n");
   2187			return -EINVAL;
   2188		}
   2189	}
   2190	if (!refcount_inc_not_zero(&nbd->refs)) {
   2191		mutex_unlock(&nbd_index_mutex);
   2192		pr_err("device at index %d is going down\n", index);
   2193		return -EINVAL;
   2194	}
   2195	mutex_unlock(&nbd_index_mutex);
   2196
   2197	if (!refcount_inc_not_zero(&nbd->config_refs)) {
   2198		dev_err(nbd_to_dev(nbd),
   2199			"not configured, cannot reconfigure\n");
   2200		nbd_put(nbd);
   2201		return -EINVAL;
   2202	}
   2203
   2204	mutex_lock(&nbd->config_lock);
   2205	config = nbd->config;
   2206	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
   2207	    !nbd->pid) {
   2208		dev_err(nbd_to_dev(nbd),
   2209			"not configured, cannot reconfigure\n");
   2210		ret = -EINVAL;
   2211		goto out;
   2212	}
   2213
   2214	ret = nbd_genl_size_set(info, nbd);
   2215	if (ret)
   2216		goto out;
   2217
   2218	if (info->attrs[NBD_ATTR_TIMEOUT])
   2219		nbd_set_cmd_timeout(nbd,
   2220				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
   2221	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
   2222		config->dead_conn_timeout =
   2223			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
   2224		config->dead_conn_timeout *= HZ;
   2225	}
   2226	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
   2227		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
   2228		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
   2229			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
   2230					      &nbd->flags))
   2231				put_dev = true;
   2232		} else {
   2233			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
   2234					       &nbd->flags))
   2235				refcount_inc(&nbd->refs);
   2236		}
   2237
   2238		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
   2239			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
   2240					&config->runtime_flags);
   2241		} else {
   2242			clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
   2243					&config->runtime_flags);
   2244		}
   2245	}
   2246
   2247	if (info->attrs[NBD_ATTR_SOCKETS]) {
   2248		struct nlattr *attr;
   2249		int rem, fd;
   2250
   2251		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
   2252				    rem) {
   2253			struct nlattr *socks[NBD_SOCK_MAX+1];
   2254
   2255			if (nla_type(attr) != NBD_SOCK_ITEM) {
   2256				pr_err("socks must be embedded in a SOCK_ITEM attr\n");
   2257				ret = -EINVAL;
   2258				goto out;
   2259			}
   2260			ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
   2261							  attr,
   2262							  nbd_sock_policy,
   2263							  info->extack);
   2264			if (ret != 0) {
   2265				pr_err("error processing sock list\n");
   2266				ret = -EINVAL;
   2267				goto out;
   2268			}
   2269			if (!socks[NBD_SOCK_FD])
   2270				continue;
   2271			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
   2272			ret = nbd_reconnect_socket(nbd, fd);
   2273			if (ret) {
   2274				if (ret == -ENOSPC)
   2275					ret = 0;
   2276				goto out;
   2277			}
   2278			dev_info(nbd_to_dev(nbd), "reconnected socket\n");
   2279		}
   2280	}
   2281out:
   2282	mutex_unlock(&nbd->config_lock);
   2283	nbd_config_put(nbd);
   2284	nbd_put(nbd);
   2285	if (put_dev)
   2286		nbd_put(nbd);
   2287	return ret;
   2288}
   2289
   2290static const struct genl_small_ops nbd_connect_genl_ops[] = {
   2291	{
   2292		.cmd	= NBD_CMD_CONNECT,
   2293		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
   2294		.doit	= nbd_genl_connect,
   2295	},
   2296	{
   2297		.cmd	= NBD_CMD_DISCONNECT,
   2298		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
   2299		.doit	= nbd_genl_disconnect,
   2300	},
   2301	{
   2302		.cmd	= NBD_CMD_RECONFIGURE,
   2303		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
   2304		.doit	= nbd_genl_reconfigure,
   2305	},
   2306	{
   2307		.cmd	= NBD_CMD_STATUS,
   2308		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
   2309		.doit	= nbd_genl_status,
   2310	},
   2311};
   2312
   2313static const struct genl_multicast_group nbd_mcast_grps[] = {
   2314	{ .name = NBD_GENL_MCAST_GROUP_NAME, },
   2315};
   2316
   2317static struct genl_family nbd_genl_family __ro_after_init = {
   2318	.hdrsize	= 0,
   2319	.name		= NBD_GENL_FAMILY_NAME,
   2320	.version	= NBD_GENL_VERSION,
   2321	.module		= THIS_MODULE,
   2322	.small_ops	= nbd_connect_genl_ops,
   2323	.n_small_ops	= ARRAY_SIZE(nbd_connect_genl_ops),
   2324	.maxattr	= NBD_ATTR_MAX,
   2325	.policy = nbd_attr_policy,
   2326	.mcgrps		= nbd_mcast_grps,
   2327	.n_mcgrps	= ARRAY_SIZE(nbd_mcast_grps),
   2328};
   2329
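        /*
         * Append one NBD_DEVICE_ITEM nest (device index plus a "connected"
         * byte) to an NBD_CMD_STATUS reply.
         */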
   2330static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
   2331{
   2332	struct nlattr *dev_opt;
   2333	u8 connected = 0;
   2334	int ret;
   2335
    2336	/* This is a little racy, but for status it's ok.  The
    2337	 * reason we don't take a ref here is because we can't
    2338	 * take a ref in the index == -1 case, as we would then need
    2339	 * to put it under the nbd_index_mutex, which could
   2340	 * deadlock if we are configured to remove ourselves
   2341	 * once we're disconnected.
   2342	 */
   2343	if (refcount_read(&nbd->config_refs))
   2344		connected = 1;
   2345	dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
   2346	if (!dev_opt)
   2347		return -EMSGSIZE;
   2348	ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
   2349	if (ret)
   2350		return -EMSGSIZE;
   2351	ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
   2352			 connected);
   2353	if (ret)
   2354		return -EMSGSIZE;
   2355	nla_nest_end(reply, dev_opt);
   2356	return 0;
   2357}
   2358
   2359static int status_cb(int id, void *ptr, void *data)
   2360{
   2361	struct nbd_device *nbd = ptr;
   2362	return populate_nbd_status(nbd, (struct sk_buff *)data);
   2363}
   2364
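        /*
         * NBD_CMD_STATUS: reply with an NBD_ATTR_DEVICE_LIST nest describing
         * either the single requested index or, when NBD_ATTR_INDEX is absent,
         * every known device.
         */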
   2365static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
   2366{
   2367	struct nlattr *dev_list;
   2368	struct sk_buff *reply;
   2369	void *reply_head;
   2370	size_t msg_size;
   2371	int index = -1;
   2372	int ret = -ENOMEM;
   2373
   2374	if (info->attrs[NBD_ATTR_INDEX])
   2375		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
   2376
   2377	mutex_lock(&nbd_index_mutex);
   2378
   2379	msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
   2380				  nla_attr_size(sizeof(u8)));
   2381	msg_size *= (index == -1) ? nbd_total_devices : 1;
   2382
   2383	reply = genlmsg_new(msg_size, GFP_KERNEL);
   2384	if (!reply)
   2385		goto out;
   2386	reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
   2387				       NBD_CMD_STATUS);
   2388	if (!reply_head) {
   2389		nlmsg_free(reply);
   2390		goto out;
   2391	}
   2392
   2393	dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
   2394	if (index == -1) {
   2395		ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
   2396		if (ret) {
   2397			nlmsg_free(reply);
   2398			goto out;
   2399		}
   2400	} else {
   2401		struct nbd_device *nbd;
   2402		nbd = idr_find(&nbd_index_idr, index);
   2403		if (nbd) {
   2404			ret = populate_nbd_status(nbd, reply);
   2405			if (ret) {
   2406				nlmsg_free(reply);
   2407				goto out;
   2408			}
   2409		}
   2410	}
   2411	nla_nest_end(reply, dev_list);
   2412	genlmsg_end(reply, reply_head);
   2413	ret = genlmsg_reply(reply, info);
   2414out:
   2415	mutex_unlock(&nbd_index_mutex);
   2416	return ret;
   2417}
   2418
   2419static void nbd_connect_reply(struct genl_info *info, int index)
   2420{
   2421	struct sk_buff *skb;
   2422	void *msg_head;
   2423	int ret;
   2424
   2425	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
   2426	if (!skb)
   2427		return;
   2428	msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
   2429				     NBD_CMD_CONNECT);
   2430	if (!msg_head) {
   2431		nlmsg_free(skb);
   2432		return;
   2433	}
   2434	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
   2435	if (ret) {
   2436		nlmsg_free(skb);
   2437		return;
   2438	}
   2439	genlmsg_end(skb, msg_head);
   2440	genlmsg_reply(skb, info);
   2441}
   2442
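        /*
         * Multicast NBD_CMD_LINK_DEAD with the device index so that userspace
         * can notice a dead connection and, for example, supply a fresh socket
         * via NBD_CMD_RECONFIGURE.
         */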
   2443static void nbd_mcast_index(int index)
   2444{
   2445	struct sk_buff *skb;
   2446	void *msg_head;
   2447	int ret;
   2448
   2449	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
   2450	if (!skb)
   2451		return;
   2452	msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
   2453				     NBD_CMD_LINK_DEAD);
   2454	if (!msg_head) {
   2455		nlmsg_free(skb);
   2456		return;
   2457	}
   2458	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
   2459	if (ret) {
   2460		nlmsg_free(skb);
   2461		return;
   2462	}
   2463	genlmsg_end(skb, msg_head);
   2464	genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
   2465}
   2466
   2467static void nbd_dead_link_work(struct work_struct *work)
   2468{
   2469	struct link_dead_args *args = container_of(work, struct link_dead_args,
   2470						   work);
   2471	nbd_mcast_index(args->index);
   2472	kfree(args);
   2473}
   2474
   2475static int __init nbd_init(void)
   2476{
   2477	int i;
   2478
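        	/* struct nbd_request is the on-the-wire request header; it must stay 28 bytes. */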
   2479	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
   2480
   2481	if (max_part < 0) {
   2482		pr_err("max_part must be >= 0\n");
   2483		return -EINVAL;
   2484	}
   2485
   2486	part_shift = 0;
   2487	if (max_part > 0) {
   2488		part_shift = fls(max_part);
   2489
   2490		/*
   2491		 * Adjust max_part according to part_shift as it is exported
    2492		 * to user space so that the user can know the max number of
    2493		 * partitions the kernel should be able to manage.
   2494		 *
   2495		 * Note that -1 is required because partition 0 is reserved
   2496		 * for the whole disk.
   2497		 */
   2498		max_part = (1UL << part_shift) - 1;
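        		/*
        		 * For example, max_part = 16 gives part_shift = fls(16) = 5,
        		 * so max_part is reported as 31 and each device spans
        		 * 1 << 5 = 32 minors: the whole disk plus up to 31 partitions.
        		 */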
   2499	}
   2500
   2501	if ((1UL << part_shift) > DISK_MAX_PARTS)
   2502		return -EINVAL;
   2503
   2504	if (nbds_max > 1UL << (MINORBITS - part_shift))
   2505		return -EINVAL;
   2506
   2507	if (register_blkdev(NBD_MAJOR, "nbd"))
   2508		return -EIO;
   2509
   2510	nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0);
   2511	if (!nbd_del_wq) {
   2512		unregister_blkdev(NBD_MAJOR, "nbd");
   2513		return -ENOMEM;
   2514	}
   2515
   2516	if (genl_register_family(&nbd_genl_family)) {
   2517		destroy_workqueue(nbd_del_wq);
   2518		unregister_blkdev(NBD_MAJOR, "nbd");
   2519		return -EINVAL;
   2520	}
   2521	nbd_dbg_init();
   2522
   2523	for (i = 0; i < nbds_max; i++)
   2524		nbd_dev_add(i, 1);
   2525	return 0;
   2526}
   2527
   2528static int nbd_exit_cb(int id, void *ptr, void *data)
   2529{
   2530	struct list_head *list = (struct list_head *)data;
   2531	struct nbd_device *nbd = ptr;
   2532
   2533	/* Skip nbd that is being removed asynchronously */
   2534	if (refcount_read(&nbd->refs))
   2535		list_add_tail(&nbd->list, list);
   2536
   2537	return 0;
   2538}
   2539
   2540static void __exit nbd_cleanup(void)
   2541{
   2542	struct nbd_device *nbd;
   2543	LIST_HEAD(del_list);
   2544
   2545	/*
   2546	 * Unregister netlink interface prior to waiting
   2547	 * for the completion of netlink commands.
   2548	 */
   2549	genl_unregister_family(&nbd_genl_family);
   2550
   2551	nbd_dbg_close();
   2552
   2553	mutex_lock(&nbd_index_mutex);
   2554	idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
   2555	mutex_unlock(&nbd_index_mutex);
   2556
   2557	while (!list_empty(&del_list)) {
   2558		nbd = list_first_entry(&del_list, struct nbd_device, list);
   2559		list_del_init(&nbd->list);
   2560		if (refcount_read(&nbd->config_refs))
   2561			pr_err("possibly leaking nbd_config (ref %d)\n",
   2562					refcount_read(&nbd->config_refs));
   2563		if (refcount_read(&nbd->refs) != 1)
   2564			pr_err("possibly leaking a device\n");
   2565		nbd_put(nbd);
   2566	}
   2567
    2568	/* Also wait for nbd_dev_remove_work() to complete */
   2569	destroy_workqueue(nbd_del_wq);
   2570
   2571	idr_destroy(&nbd_index_idr);
   2572	unregister_blkdev(NBD_MAJOR, "nbd");
   2573}
   2574
   2575module_init(nbd_init);
   2576module_exit(nbd_cleanup);
   2577
   2578MODULE_DESCRIPTION("Network Block Device");
   2579MODULE_LICENSE("GPL");
   2580
   2581module_param(nbds_max, int, 0444);
   2582MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
   2583module_param(max_part, int, 0444);
   2584MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");