cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

virtio_blk.c (31256B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2//#define DEBUG
      3#include <linux/spinlock.h>
      4#include <linux/slab.h>
      5#include <linux/blkdev.h>
      6#include <linux/hdreg.h>
      7#include <linux/module.h>
      8#include <linux/mutex.h>
      9#include <linux/interrupt.h>
     10#include <linux/virtio.h>
     11#include <linux/virtio_blk.h>
     12#include <linux/scatterlist.h>
     13#include <linux/string_helpers.h>
     14#include <linux/idr.h>
     15#include <linux/blk-mq.h>
     16#include <linux/blk-mq-virtio.h>
     17#include <linux/numa.h>
     18#include <uapi/linux/virtio_ring.h>
     19
     20#define PART_BITS 4
     21#define VQ_NAME_LEN 16
     22#define MAX_DISCARD_SEGMENTS 256u
     23
     24/* The maximum number of sg elements that fit into a virtqueue */
     25#define VIRTIO_BLK_MAX_SG_ELEMS 32768
     26
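        /*
         * Number of scatterlist entries preallocated inline with each request.
         * Without SG chaining the inline entries cannot be chained to an
         * overflow allocation, so none are reserved in that case.
         */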
     27#ifdef CONFIG_ARCH_NO_SG_CHAIN
     28#define VIRTIO_BLK_INLINE_SG_CNT	0
     29#else
     30#define VIRTIO_BLK_INLINE_SG_CNT	2
     31#endif
     32
     33static unsigned int num_request_queues;
     34module_param(num_request_queues, uint, 0644);
     35MODULE_PARM_DESC(num_request_queues,
     36		 "Limit the number of request queues to use for blk device. "
     37		 "0 for no limit. "
     38		 "Values > nr_cpu_ids truncated to nr_cpu_ids.");
     39
     40static unsigned int poll_queues;
     41module_param(poll_queues, uint, 0644);
     42MODULE_PARM_DESC(poll_queues, "The number of dedicated virtqueues for polling I/O");
     43
     44static int major;
     45static DEFINE_IDA(vd_index_ida);
     46
     47static struct workqueue_struct *virtblk_wq;
     48
     49struct virtio_blk_vq {
     50	struct virtqueue *vq;
     51	spinlock_t lock;
     52	char name[VQ_NAME_LEN];
     53} ____cacheline_aligned_in_smp;
     54
     55struct virtio_blk {
     56	/*
     57	 * This mutex must be held by anything that may run after
     58	 * virtblk_remove() sets vblk->vdev to NULL.
     59	 *
     60	 * blk-mq, virtqueue processing, and sysfs attribute code paths are
     61	 * shut down before vblk->vdev is set to NULL and therefore do not need
     62	 * to hold this mutex.
     63	 */
     64	struct mutex vdev_mutex;
     65	struct virtio_device *vdev;
     66
     67	/* The disk structure for the kernel. */
     68	struct gendisk *disk;
     69
     70	/* Block layer tags. */
     71	struct blk_mq_tag_set tag_set;
     72
     73	/* Process context for config space updates */
     74	struct work_struct config_work;
     75
     76	/* Ida index - used to track minor number allocations. */
     77	int index;
     78
     79	/* num of vqs */
     80	int num_vqs;
     81	int io_queues[HCTX_MAX_TYPES];
     82	struct virtio_blk_vq *vqs;
     83};
     84
     85struct virtblk_req {
     86	struct virtio_blk_outhdr out_hdr;
     87	u8 status;
     88	struct sg_table sg_table;
     89	struct scatterlist sg[];
     90};
     91
     92static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
     93{
     94	switch (vbr->status) {
     95	case VIRTIO_BLK_S_OK:
     96		return BLK_STS_OK;
     97	case VIRTIO_BLK_S_UNSUPP:
     98		return BLK_STS_NOTSUPP;
     99	default:
    100		return BLK_STS_IOERR;
    101	}
    102}
    103
    104static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr)
    105{
    106	struct scatterlist hdr, status, *sgs[3];
    107	unsigned int num_out = 0, num_in = 0;
    108
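        	/*
        	 * Request layout on the virtqueue: device-readable header first,
        	 * then any data buffers, then a single device-writable status
        	 * byte, which must come last.
        	 */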
    109	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
    110	sgs[num_out++] = &hdr;
    111
    112	if (vbr->sg_table.nents) {
    113		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
    114			sgs[num_out++] = vbr->sg_table.sgl;
    115		else
    116			sgs[num_out + num_in++] = vbr->sg_table.sgl;
    117	}
    118
    119	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
    120	sgs[num_out + num_in++] = &status;
    121
    122	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
    123}
    124
    125static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
    126{
    127	unsigned short segments = blk_rq_nr_discard_segments(req);
    128	unsigned short n = 0;
    129	struct virtio_blk_discard_write_zeroes *range;
    130	struct bio *bio;
    131	u32 flags = 0;
    132
    133	if (unmap)
    134		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;
    135
    136	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
    137	if (!range)
    138		return -ENOMEM;
    139
    140	/*
     141	 * A single max discard segment means that multi-range discard
     142	 * isn't supported, and the block layer only merges contiguous
     143	 * ranges as for a normal RW request. So we can't rely on the
     144	 * bio for retrieving each range's info.
    145	 */
    146	if (queue_max_discard_segments(req->q) == 1) {
    147		range[0].flags = cpu_to_le32(flags);
    148		range[0].num_sectors = cpu_to_le32(blk_rq_sectors(req));
    149		range[0].sector = cpu_to_le64(blk_rq_pos(req));
    150		n = 1;
    151	} else {
    152		__rq_for_each_bio(bio, req) {
    153			u64 sector = bio->bi_iter.bi_sector;
    154			u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;
    155
    156			range[n].flags = cpu_to_le32(flags);
    157			range[n].num_sectors = cpu_to_le32(num_sectors);
    158			range[n].sector = cpu_to_le64(sector);
    159			n++;
    160		}
    161	}
    162
    163	WARN_ON_ONCE(n != segments);
    164
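        	/*
        	 * Attach the range descriptors as the request's special payload
        	 * so they are mapped and transferred like ordinary data.
        	 */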
    165	req->special_vec.bv_page = virt_to_page(range);
    166	req->special_vec.bv_offset = offset_in_page(range);
    167	req->special_vec.bv_len = sizeof(*range) * segments;
    168	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
    169
    170	return 0;
    171}
    172
    173static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr)
    174{
    175	if (blk_rq_nr_phys_segments(req))
    176		sg_free_table_chained(&vbr->sg_table,
    177				      VIRTIO_BLK_INLINE_SG_CNT);
    178}
    179
    180static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req,
    181		struct virtblk_req *vbr)
    182{
    183	int err;
    184
    185	if (!blk_rq_nr_phys_segments(req))
    186		return 0;
    187
    188	vbr->sg_table.sgl = vbr->sg;
    189	err = sg_alloc_table_chained(&vbr->sg_table,
    190				     blk_rq_nr_phys_segments(req),
    191				     vbr->sg_table.sgl,
    192				     VIRTIO_BLK_INLINE_SG_CNT);
    193	if (unlikely(err))
    194		return -ENOMEM;
    195
    196	return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl);
    197}
    198
    199static void virtblk_cleanup_cmd(struct request *req)
    200{
    201	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
    202		kfree(bvec_virt(&req->special_vec));
    203}
    204
    205static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
    206				      struct request *req,
    207				      struct virtblk_req *vbr)
    208{
    209	bool unmap = false;
    210	u32 type;
    211
    212	vbr->out_hdr.sector = 0;
    213
    214	switch (req_op(req)) {
    215	case REQ_OP_READ:
    216		type = VIRTIO_BLK_T_IN;
    217		vbr->out_hdr.sector = cpu_to_virtio64(vdev,
    218						      blk_rq_pos(req));
    219		break;
    220	case REQ_OP_WRITE:
    221		type = VIRTIO_BLK_T_OUT;
    222		vbr->out_hdr.sector = cpu_to_virtio64(vdev,
    223						      blk_rq_pos(req));
    224		break;
    225	case REQ_OP_FLUSH:
    226		type = VIRTIO_BLK_T_FLUSH;
    227		break;
    228	case REQ_OP_DISCARD:
    229		type = VIRTIO_BLK_T_DISCARD;
    230		break;
    231	case REQ_OP_WRITE_ZEROES:
    232		type = VIRTIO_BLK_T_WRITE_ZEROES;
    233		unmap = !(req->cmd_flags & REQ_NOUNMAP);
    234		break;
    235	case REQ_OP_DRV_IN:
    236		type = VIRTIO_BLK_T_GET_ID;
    237		break;
    238	default:
    239		WARN_ON_ONCE(1);
    240		return BLK_STS_IOERR;
    241	}
    242
    243	vbr->out_hdr.type = cpu_to_virtio32(vdev, type);
    244	vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
    245
    246	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
    247		if (virtblk_setup_discard_write_zeroes(req, unmap))
    248			return BLK_STS_RESOURCE;
    249	}
    250
    251	return 0;
    252}
    253
    254static inline void virtblk_request_done(struct request *req)
    255{
    256	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
    257
    258	virtblk_unmap_data(req, vbr);
    259	virtblk_cleanup_cmd(req);
    260	blk_mq_end_request(req, virtblk_result(vbr));
    261}
    262
    263static void virtblk_done(struct virtqueue *vq)
    264{
    265	struct virtio_blk *vblk = vq->vdev->priv;
    266	bool req_done = false;
    267	int qid = vq->index;
    268	struct virtblk_req *vbr;
    269	unsigned long flags;
    270	unsigned int len;
    271
    272	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
    273	do {
    274		virtqueue_disable_cb(vq);
    275		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
    276			struct request *req = blk_mq_rq_from_pdu(vbr);
    277
    278			if (likely(!blk_should_fake_timeout(req->q)))
    279				blk_mq_complete_request(req);
    280			req_done = true;
    281		}
    282		if (unlikely(virtqueue_is_broken(vq)))
    283			break;
    284	} while (!virtqueue_enable_cb(vq));
    285
    286	/* In case queue is stopped waiting for more buffers. */
    287	if (req_done)
    288		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
    289	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
    290}
    291
    292static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
    293{
    294	struct virtio_blk *vblk = hctx->queue->queuedata;
    295	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
    296	bool kick;
    297
    298	spin_lock_irq(&vq->lock);
    299	kick = virtqueue_kick_prepare(vq->vq);
    300	spin_unlock_irq(&vq->lock);
    301
    302	if (kick)
    303		virtqueue_notify(vq->vq);
    304}
    305
    306static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx,
    307					struct virtio_blk *vblk,
    308					struct request *req,
    309					struct virtblk_req *vbr)
    310{
    311	blk_status_t status;
    312
    313	status = virtblk_setup_cmd(vblk->vdev, req, vbr);
    314	if (unlikely(status))
    315		return status;
    316
    317	blk_mq_start_request(req);
    318
    319	vbr->sg_table.nents = virtblk_map_data(hctx, req, vbr);
    320	if (unlikely(vbr->sg_table.nents < 0)) {
    321		virtblk_cleanup_cmd(req);
    322		return BLK_STS_RESOURCE;
    323	}
    324
    325	return BLK_STS_OK;
    326}
    327
    328static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
    329			   const struct blk_mq_queue_data *bd)
    330{
    331	struct virtio_blk *vblk = hctx->queue->queuedata;
    332	struct request *req = bd->rq;
    333	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
    334	unsigned long flags;
    335	int qid = hctx->queue_num;
    336	bool notify = false;
    337	blk_status_t status;
    338	int err;
    339
    340	status = virtblk_prep_rq(hctx, vblk, req, vbr);
    341	if (unlikely(status))
    342		return status;
    343
    344	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
    345	err = virtblk_add_req(vblk->vqs[qid].vq, vbr);
    346	if (err) {
    347		virtqueue_kick(vblk->vqs[qid].vq);
    348		/* Don't stop the queue if -ENOMEM: we may have failed to
    349		 * bounce the buffer due to global resource outage.
    350		 */
    351		if (err == -ENOSPC)
    352			blk_mq_stop_hw_queue(hctx);
    353		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
    354		virtblk_unmap_data(req, vbr);
    355		virtblk_cleanup_cmd(req);
    356		switch (err) {
    357		case -ENOSPC:
    358			return BLK_STS_DEV_RESOURCE;
    359		case -ENOMEM:
    360			return BLK_STS_RESOURCE;
    361		default:
    362			return BLK_STS_IOERR;
    363		}
    364	}
    365
    366	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
    367		notify = true;
    368	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
    369
    370	if (notify)
    371		virtqueue_notify(vblk->vqs[qid].vq);
    372	return BLK_STS_OK;
    373}
    374
    375static bool virtblk_prep_rq_batch(struct request *req)
    376{
    377	struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
    378	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
    379
    380	req->mq_hctx->tags->rqs[req->tag] = req;
    381
    382	return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK;
    383}
    384
    385static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
    386					struct request **rqlist,
    387					struct request **requeue_list)
    388{
    389	unsigned long flags;
    390	int err;
    391	bool kick;
    392
    393	spin_lock_irqsave(&vq->lock, flags);
    394
    395	while (!rq_list_empty(*rqlist)) {
    396		struct request *req = rq_list_pop(rqlist);
    397		struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
    398
    399		err = virtblk_add_req(vq->vq, vbr);
    400		if (err) {
    401			virtblk_unmap_data(req, vbr);
    402			virtblk_cleanup_cmd(req);
    403			rq_list_add(requeue_list, req);
    404		}
    405	}
    406
    407	kick = virtqueue_kick_prepare(vq->vq);
    408	spin_unlock_irqrestore(&vq->lock, flags);
    409
    410	return kick;
    411}
    412
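        /*
         * Submit a list of prepared requests, batching requests that share a
         * hardware queue so each batch needs only one virtqueue kick. Requests
         * that cannot be queued are collected on requeue_list and handed back
         * to the block layer.
         */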
    413static void virtio_queue_rqs(struct request **rqlist)
    414{
    415	struct request *req, *next, *prev = NULL;
    416	struct request *requeue_list = NULL;
    417
    418	rq_list_for_each_safe(rqlist, req, next) {
    419		struct virtio_blk_vq *vq = req->mq_hctx->driver_data;
    420		bool kick;
    421
    422		if (!virtblk_prep_rq_batch(req)) {
    423			rq_list_move(rqlist, &requeue_list, req, prev);
    424			req = prev;
    425			if (!req)
    426				continue;
    427		}
    428
    429		if (!next || req->mq_hctx != next->mq_hctx) {
    430			req->rq_next = NULL;
    431			kick = virtblk_add_req_batch(vq, rqlist, &requeue_list);
    432			if (kick)
    433				virtqueue_notify(vq->vq);
    434
    435			*rqlist = next;
    436			prev = NULL;
    437		} else
    438			prev = req;
    439	}
    440
    441	*rqlist = requeue_list;
    442}
    443
     444/* Return the ID (serial number) string for *disk in *id_str.
     445 */
    446static int virtblk_get_id(struct gendisk *disk, char *id_str)
    447{
    448	struct virtio_blk *vblk = disk->private_data;
    449	struct request_queue *q = vblk->disk->queue;
    450	struct request *req;
    451	int err;
    452
    453	req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
    454	if (IS_ERR(req))
    455		return PTR_ERR(req);
    456
    457	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
    458	if (err)
    459		goto out;
    460
    461	blk_execute_rq(req, false);
    462	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
    463out:
    464	blk_mq_free_request(req);
    465	return err;
    466}
    467
    468/* We provide getgeo only to please some old bootloader/partitioning tools */
    469static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
    470{
    471	struct virtio_blk *vblk = bd->bd_disk->private_data;
    472	int ret = 0;
    473
    474	mutex_lock(&vblk->vdev_mutex);
    475
    476	if (!vblk->vdev) {
    477		ret = -ENXIO;
    478		goto out;
    479	}
    480
    481	/* see if the host passed in geometry config */
    482	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
    483		virtio_cread(vblk->vdev, struct virtio_blk_config,
    484			     geometry.cylinders, &geo->cylinders);
    485		virtio_cread(vblk->vdev, struct virtio_blk_config,
    486			     geometry.heads, &geo->heads);
    487		virtio_cread(vblk->vdev, struct virtio_blk_config,
    488			     geometry.sectors, &geo->sectors);
    489	} else {
    490		/* some standard values, similar to sd */
    491		geo->heads = 1 << 6;
    492		geo->sectors = 1 << 5;
    493		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
    494	}
    495out:
    496	mutex_unlock(&vblk->vdev_mutex);
    497	return ret;
    498}
    499
    500static void virtblk_free_disk(struct gendisk *disk)
    501{
    502	struct virtio_blk *vblk = disk->private_data;
    503
    504	ida_simple_remove(&vd_index_ida, vblk->index);
    505	mutex_destroy(&vblk->vdev_mutex);
    506	kfree(vblk);
    507}
    508
    509static const struct block_device_operations virtblk_fops = {
    510	.owner  	= THIS_MODULE,
    511	.getgeo		= virtblk_getgeo,
    512	.free_disk	= virtblk_free_disk,
    513};
    514
    515static int index_to_minor(int index)
    516{
    517	return index << PART_BITS;
    518}
    519
    520static int minor_to_index(int minor)
    521{
    522	return minor >> PART_BITS;
    523}
    524
    525static ssize_t serial_show(struct device *dev,
    526			   struct device_attribute *attr, char *buf)
    527{
    528	struct gendisk *disk = dev_to_disk(dev);
    529	int err;
    530
    531	/* sysfs gives us a PAGE_SIZE buffer */
    532	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);
    533
    534	buf[VIRTIO_BLK_ID_BYTES] = '\0';
    535	err = virtblk_get_id(disk, buf);
    536	if (!err)
    537		return strlen(buf);
    538
    539	if (err == -EIO) /* Unsupported? Make it empty. */
    540		return 0;
    541
    542	return err;
    543}
    544
    545static DEVICE_ATTR_RO(serial);
    546
    547/* The queue's logical block size must be set before calling this */
    548static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
    549{
    550	struct virtio_device *vdev = vblk->vdev;
    551	struct request_queue *q = vblk->disk->queue;
    552	char cap_str_2[10], cap_str_10[10];
    553	unsigned long long nblocks;
    554	u64 capacity;
    555
    556	/* Host must always specify the capacity. */
    557	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
    558
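        	/* The config capacity is in 512-byte sectors; convert to logical blocks. */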
    559	nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);
    560
    561	string_get_size(nblocks, queue_logical_block_size(q),
    562			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
    563	string_get_size(nblocks, queue_logical_block_size(q),
    564			STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
    565
    566	dev_notice(&vdev->dev,
    567		   "[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
    568		   vblk->disk->disk_name,
    569		   resize ? "new size: " : "",
    570		   nblocks,
    571		   queue_logical_block_size(q),
    572		   cap_str_10,
    573		   cap_str_2);
    574
    575	set_capacity_and_notify(vblk->disk, capacity);
    576}
    577
    578static void virtblk_config_changed_work(struct work_struct *work)
    579{
    580	struct virtio_blk *vblk =
    581		container_of(work, struct virtio_blk, config_work);
    582
    583	virtblk_update_capacity(vblk, true);
    584}
    585
    586static void virtblk_config_changed(struct virtio_device *vdev)
    587{
    588	struct virtio_blk *vblk = vdev->priv;
    589
    590	queue_work(virtblk_wq, &vblk->config_work);
    591}
    592
    593static int init_vq(struct virtio_blk *vblk)
    594{
    595	int err;
    596	int i;
    597	vq_callback_t **callbacks;
    598	const char **names;
    599	struct virtqueue **vqs;
    600	unsigned short num_vqs;
    601	unsigned int num_poll_vqs;
    602	struct virtio_device *vdev = vblk->vdev;
    603	struct irq_affinity desc = { 0, };
    604
    605	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
    606				   struct virtio_blk_config, num_queues,
    607				   &num_vqs);
    608	if (err)
    609		num_vqs = 1;
    610
    611	if (!err && !num_vqs) {
    612		dev_err(&vdev->dev, "MQ advertised but zero queues reported\n");
    613		return -EINVAL;
    614	}
    615
    616	num_vqs = min_t(unsigned int,
    617			min_not_zero(num_request_queues, nr_cpu_ids),
    618			num_vqs);
    619
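        	/* Reserve at least one virtqueue for regular, interrupt-driven I/O. */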
    620	num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1);
    621
    622	vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs;
    623	vblk->io_queues[HCTX_TYPE_READ] = 0;
    624	vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs;
    625
    626	dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n",
    627				vblk->io_queues[HCTX_TYPE_DEFAULT],
    628				vblk->io_queues[HCTX_TYPE_READ],
    629				vblk->io_queues[HCTX_TYPE_POLL]);
    630
    631	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
    632	if (!vblk->vqs)
    633		return -ENOMEM;
    634
    635	names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
    636	callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
    637	vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
    638	if (!names || !callbacks || !vqs) {
    639		err = -ENOMEM;
    640		goto out;
    641	}
    642
    643	for (i = 0; i < num_vqs - num_poll_vqs; i++) {
    644		callbacks[i] = virtblk_done;
    645		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
    646		names[i] = vblk->vqs[i].name;
    647	}
    648
    649	for (; i < num_vqs; i++) {
    650		callbacks[i] = NULL;
    651		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
    652		names[i] = vblk->vqs[i].name;
    653	}
    654
     655	/* Discover the virtqueues and set up their callbacks and names. */
    656	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
    657	if (err)
    658		goto out;
    659
    660	for (i = 0; i < num_vqs; i++) {
    661		spin_lock_init(&vblk->vqs[i].lock);
    662		vblk->vqs[i].vq = vqs[i];
    663	}
    664	vblk->num_vqs = num_vqs;
    665
    666out:
    667	kfree(vqs);
    668	kfree(callbacks);
    669	kfree(names);
    670	if (err)
    671		kfree(vblk->vqs);
    672	return err;
    673}
    674
    675/*
    676 * Legacy naming scheme used for virtio devices.  We are stuck with it for
    677 * virtio blk but don't ever use it for any new driver.
    678 */
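        /* e.g. index 0 -> "vda", 25 -> "vdz", 26 -> "vdaa", 27 -> "vdab", ... */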
    679static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
    680{
    681	const int base = 'z' - 'a' + 1;
    682	char *begin = buf + strlen(prefix);
    683	char *end = buf + buflen;
    684	char *p;
    685	int unit;
    686
    687	p = end - 1;
    688	*p = '\0';
    689	unit = base;
    690	do {
    691		if (p == begin)
    692			return -EINVAL;
    693		*--p = 'a' + (index % unit);
    694		index = (index / unit) - 1;
    695	} while (index >= 0);
    696
    697	memmove(begin, p, end - p);
    698	memcpy(buf, prefix, strlen(prefix));
    699
    700	return 0;
    701}
    702
    703static int virtblk_get_cache_mode(struct virtio_device *vdev)
    704{
    705	u8 writeback;
    706	int err;
    707
    708	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
    709				   struct virtio_blk_config, wce,
    710				   &writeback);
    711
    712	/*
    713	 * If WCE is not configurable and flush is not available,
    714	 * assume no writeback cache is in use.
    715	 */
    716	if (err)
    717		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);
    718
    719	return writeback;
    720}
    721
    722static void virtblk_update_cache_mode(struct virtio_device *vdev)
    723{
    724	u8 writeback = virtblk_get_cache_mode(vdev);
    725	struct virtio_blk *vblk = vdev->priv;
    726
    727	blk_queue_write_cache(vblk->disk->queue, writeback, false);
    728}
    729
    730static const char *const virtblk_cache_types[] = {
    731	"write through", "write back"
    732};
    733
    734static ssize_t
    735cache_type_store(struct device *dev, struct device_attribute *attr,
    736		 const char *buf, size_t count)
    737{
    738	struct gendisk *disk = dev_to_disk(dev);
    739	struct virtio_blk *vblk = disk->private_data;
    740	struct virtio_device *vdev = vblk->vdev;
    741	int i;
    742
    743	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
    744	i = sysfs_match_string(virtblk_cache_types, buf);
    745	if (i < 0)
    746		return i;
    747
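        	/* The matched index is also the config encoding: 0 = writethrough, 1 = writeback. */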
    748	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
    749	virtblk_update_cache_mode(vdev);
    750	return count;
    751}
    752
    753static ssize_t
    754cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
    755{
    756	struct gendisk *disk = dev_to_disk(dev);
    757	struct virtio_blk *vblk = disk->private_data;
    758	u8 writeback = virtblk_get_cache_mode(vblk->vdev);
    759
    760	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
    761	return sysfs_emit(buf, "%s\n", virtblk_cache_types[writeback]);
    762}
    763
    764static DEVICE_ATTR_RW(cache_type);
    765
    766static struct attribute *virtblk_attrs[] = {
    767	&dev_attr_serial.attr,
    768	&dev_attr_cache_type.attr,
    769	NULL,
    770};
    771
    772static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
    773		struct attribute *a, int n)
    774{
    775	struct device *dev = kobj_to_dev(kobj);
    776	struct gendisk *disk = dev_to_disk(dev);
    777	struct virtio_blk *vblk = disk->private_data;
    778	struct virtio_device *vdev = vblk->vdev;
    779
    780	if (a == &dev_attr_cache_type.attr &&
    781	    !virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
    782		return S_IRUGO;
    783
    784	return a->mode;
    785}
    786
    787static const struct attribute_group virtblk_attr_group = {
    788	.attrs = virtblk_attrs,
    789	.is_visible = virtblk_attrs_are_visible,
    790};
    791
    792static const struct attribute_group *virtblk_attr_groups[] = {
    793	&virtblk_attr_group,
    794	NULL,
    795};
    796
    797static int virtblk_map_queues(struct blk_mq_tag_set *set)
    798{
    799	struct virtio_blk *vblk = set->driver_data;
    800	int i, qoff;
    801
    802	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
    803		struct blk_mq_queue_map *map = &set->map[i];
    804
    805		map->nr_queues = vblk->io_queues[i];
    806		map->queue_offset = qoff;
    807		qoff += map->nr_queues;
    808
    809		if (map->nr_queues == 0)
    810			continue;
    811
    812		/*
    813		 * Regular queues have interrupts and hence CPU affinity is
    814		 * defined by the core virtio code, but polling queues have
    815		 * no interrupts so we let the block layer assign CPU affinity.
    816		 */
    817		if (i == HCTX_TYPE_POLL)
    818			blk_mq_map_queues(&set->map[i]);
    819		else
    820			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
    821	}
    822
    823	return 0;
    824}
    825
    826static void virtblk_complete_batch(struct io_comp_batch *iob)
    827{
    828	struct request *req;
    829
    830	rq_list_for_each(&iob->req_list, req) {
    831		virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
    832		virtblk_cleanup_cmd(req);
    833	}
    834	blk_mq_end_request_batch(iob);
    835}
    836
    837static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
    838{
    839	struct virtio_blk *vblk = hctx->queue->queuedata;
    840	struct virtio_blk_vq *vq = hctx->driver_data;
    841	struct virtblk_req *vbr;
    842	unsigned long flags;
    843	unsigned int len;
    844	int found = 0;
    845
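        	/* Reap completed requests from the poll virtqueue, batching completions where possible. */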
    846	spin_lock_irqsave(&vq->lock, flags);
    847
    848	while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
    849		struct request *req = blk_mq_rq_from_pdu(vbr);
    850
    851		found++;
    852		if (!blk_mq_add_to_batch(req, iob, vbr->status,
    853						virtblk_complete_batch))
    854			blk_mq_complete_request(req);
    855	}
    856
    857	if (found)
    858		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
    859
    860	spin_unlock_irqrestore(&vq->lock, flags);
    861
    862	return found;
    863}
    864
    865static int virtblk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
    866			  unsigned int hctx_idx)
    867{
    868	struct virtio_blk *vblk = data;
    869	struct virtio_blk_vq *vq = &vblk->vqs[hctx_idx];
    870
    871	WARN_ON(vblk->tag_set.tags[hctx_idx] != hctx->tags);
    872	hctx->driver_data = vq;
    873	return 0;
    874}
    875
    876static const struct blk_mq_ops virtio_mq_ops = {
    877	.queue_rq	= virtio_queue_rq,
    878	.queue_rqs	= virtio_queue_rqs,
    879	.commit_rqs	= virtio_commit_rqs,
    880	.init_hctx	= virtblk_init_hctx,
    881	.complete	= virtblk_request_done,
    882	.map_queues	= virtblk_map_queues,
    883	.poll		= virtblk_poll,
    884};
    885
    886static unsigned int virtblk_queue_depth;
    887module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
    888
    889static int virtblk_probe(struct virtio_device *vdev)
    890{
    891	struct virtio_blk *vblk;
    892	struct request_queue *q;
    893	int err, index;
    894
    895	u32 v, blk_size, max_size, sg_elems, opt_io_size;
    896	u16 min_io_size;
    897	u8 physical_block_exp, alignment_offset;
    898	unsigned int queue_depth;
    899
    900	if (!vdev->config->get) {
    901		dev_err(&vdev->dev, "%s failure: config access disabled\n",
    902			__func__);
    903		return -EINVAL;
    904	}
    905
    906	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
    907			     GFP_KERNEL);
    908	if (err < 0)
    909		goto out;
    910	index = err;
    911
    912	/* We need to know how many segments before we allocate. */
    913	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
    914				   struct virtio_blk_config, seg_max,
    915				   &sg_elems);
    916
    917	/* We need at least one SG element, whatever they say. */
    918	if (err || !sg_elems)
    919		sg_elems = 1;
    920
    921	/* Prevent integer overflows and honor max vq size */
    922	sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2);
    923
    924	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
    925	if (!vblk) {
    926		err = -ENOMEM;
    927		goto out_free_index;
    928	}
    929
    930	mutex_init(&vblk->vdev_mutex);
    931
    932	vblk->vdev = vdev;
    933
    934	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
    935
    936	err = init_vq(vblk);
    937	if (err)
    938		goto out_free_vblk;
    939
    940	/* Default queue sizing is to fill the ring. */
    941	if (!virtblk_queue_depth) {
    942		queue_depth = vblk->vqs[0].vq->num_free;
    943		/* ... but without indirect descs, we use 2 descs per req */
    944		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
    945			queue_depth /= 2;
    946	} else {
    947		queue_depth = virtblk_queue_depth;
    948	}
    949
    950	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
    951	vblk->tag_set.ops = &virtio_mq_ops;
    952	vblk->tag_set.queue_depth = queue_depth;
    953	vblk->tag_set.numa_node = NUMA_NO_NODE;
    954	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
    955	vblk->tag_set.cmd_size =
    956		sizeof(struct virtblk_req) +
    957		sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
    958	vblk->tag_set.driver_data = vblk;
    959	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
    960	vblk->tag_set.nr_maps = 1;
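        	/* Use separate default, read and poll queue maps when poll queues are configured. */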
    961	if (vblk->io_queues[HCTX_TYPE_POLL])
    962		vblk->tag_set.nr_maps = 3;
    963
    964	err = blk_mq_alloc_tag_set(&vblk->tag_set);
    965	if (err)
    966		goto out_free_vq;
    967
    968	vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk);
    969	if (IS_ERR(vblk->disk)) {
    970		err = PTR_ERR(vblk->disk);
    971		goto out_free_tags;
    972	}
    973	q = vblk->disk->queue;
    974
    975	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
    976
    977	vblk->disk->major = major;
    978	vblk->disk->first_minor = index_to_minor(index);
    979	vblk->disk->minors = 1 << PART_BITS;
    980	vblk->disk->private_data = vblk;
    981	vblk->disk->fops = &virtblk_fops;
    982	vblk->index = index;
    983
    984	/* configure queue flush support */
    985	virtblk_update_cache_mode(vdev);
    986
    987	/* If disk is read-only in the host, the guest should obey */
    988	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
    989		set_disk_ro(vblk->disk, 1);
    990
    991	/* We can handle whatever the host told us to handle. */
    992	blk_queue_max_segments(q, sg_elems);
    993
    994	/* No real sector limit. */
    995	blk_queue_max_hw_sectors(q, -1U);
    996
    997	max_size = virtio_max_dma_size(vdev);
    998
    999	/* Host can optionally specify maximum segment size and number of
   1000	 * segments. */
   1001	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
   1002				   struct virtio_blk_config, size_max, &v);
   1003	if (!err)
   1004		max_size = min(max_size, v);
   1005
   1006	blk_queue_max_segment_size(q, max_size);
   1007
   1008	/* Host can optionally specify the block size of the device */
   1009	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
   1010				   struct virtio_blk_config, blk_size,
   1011				   &blk_size);
   1012	if (!err) {
   1013		err = blk_validate_block_size(blk_size);
   1014		if (err) {
   1015			dev_err(&vdev->dev,
   1016				"virtio_blk: invalid block size: 0x%x\n",
   1017				blk_size);
   1018			goto out_cleanup_disk;
   1019		}
   1020
   1021		blk_queue_logical_block_size(q, blk_size);
   1022	} else
   1023		blk_size = queue_logical_block_size(q);
   1024
   1025	/* Use topology information if available */
   1026	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
   1027				   struct virtio_blk_config, physical_block_exp,
   1028				   &physical_block_exp);
   1029	if (!err && physical_block_exp)
   1030		blk_queue_physical_block_size(q,
   1031				blk_size * (1 << physical_block_exp));
   1032
   1033	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
   1034				   struct virtio_blk_config, alignment_offset,
   1035				   &alignment_offset);
   1036	if (!err && alignment_offset)
   1037		blk_queue_alignment_offset(q, blk_size * alignment_offset);
   1038
   1039	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
   1040				   struct virtio_blk_config, min_io_size,
   1041				   &min_io_size);
   1042	if (!err && min_io_size)
   1043		blk_queue_io_min(q, blk_size * min_io_size);
   1044
   1045	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
   1046				   struct virtio_blk_config, opt_io_size,
   1047				   &opt_io_size);
   1048	if (!err && opt_io_size)
   1049		blk_queue_io_opt(q, blk_size * opt_io_size);
   1050
   1051	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
   1052		virtio_cread(vdev, struct virtio_blk_config,
   1053			     discard_sector_alignment, &v);
   1054		if (v)
   1055			q->limits.discard_granularity = v << SECTOR_SHIFT;
   1056		else
   1057			q->limits.discard_granularity = blk_size;
   1058
   1059		virtio_cread(vdev, struct virtio_blk_config,
   1060			     max_discard_sectors, &v);
   1061		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);
   1062
   1063		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
   1064			     &v);
   1065
   1066		/*
   1067		 * max_discard_seg == 0 is out of spec but we always
   1068		 * handled it.
   1069		 */
   1070		if (!v)
   1071			v = sg_elems;
   1072		blk_queue_max_discard_segments(q,
   1073					       min(v, MAX_DISCARD_SEGMENTS));
   1074	}
   1075
   1076	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
   1077		virtio_cread(vdev, struct virtio_blk_config,
   1078			     max_write_zeroes_sectors, &v);
   1079		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
   1080	}
   1081
   1082	virtblk_update_capacity(vblk, false);
   1083	virtio_device_ready(vdev);
   1084
   1085	err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
   1086	if (err)
   1087		goto out_cleanup_disk;
   1088
   1089	return 0;
   1090
   1091out_cleanup_disk:
   1092	blk_cleanup_disk(vblk->disk);
   1093out_free_tags:
   1094	blk_mq_free_tag_set(&vblk->tag_set);
   1095out_free_vq:
   1096	vdev->config->del_vqs(vdev);
   1097	kfree(vblk->vqs);
   1098out_free_vblk:
   1099	kfree(vblk);
   1100out_free_index:
   1101	ida_simple_remove(&vd_index_ida, index);
   1102out:
   1103	return err;
   1104}
   1105
   1106static void virtblk_remove(struct virtio_device *vdev)
   1107{
   1108	struct virtio_blk *vblk = vdev->priv;
   1109
   1110	/* Make sure no work handler is accessing the device. */
   1111	flush_work(&vblk->config_work);
   1112
   1113	del_gendisk(vblk->disk);
   1114	blk_cleanup_queue(vblk->disk->queue);
   1115	blk_mq_free_tag_set(&vblk->tag_set);
   1116
   1117	mutex_lock(&vblk->vdev_mutex);
   1118
   1119	/* Stop all the virtqueues. */
   1120	virtio_reset_device(vdev);
   1121
   1122	/* Virtqueues are stopped, nothing can use vblk->vdev anymore. */
   1123	vblk->vdev = NULL;
   1124
   1125	vdev->config->del_vqs(vdev);
   1126	kfree(vblk->vqs);
   1127
   1128	mutex_unlock(&vblk->vdev_mutex);
   1129
   1130	put_disk(vblk->disk);
   1131}
   1132
   1133#ifdef CONFIG_PM_SLEEP
   1134static int virtblk_freeze(struct virtio_device *vdev)
   1135{
   1136	struct virtio_blk *vblk = vdev->priv;
   1137
   1138	/* Ensure we don't receive any more interrupts */
   1139	virtio_reset_device(vdev);
   1140
   1141	/* Make sure no work handler is accessing the device. */
   1142	flush_work(&vblk->config_work);
   1143
   1144	blk_mq_quiesce_queue(vblk->disk->queue);
   1145
   1146	vdev->config->del_vqs(vdev);
   1147	kfree(vblk->vqs);
   1148
   1149	return 0;
   1150}
   1151
   1152static int virtblk_restore(struct virtio_device *vdev)
   1153{
   1154	struct virtio_blk *vblk = vdev->priv;
   1155	int ret;
   1156
   1157	ret = init_vq(vdev->priv);
   1158	if (ret)
   1159		return ret;
   1160
   1161	virtio_device_ready(vdev);
   1162
   1163	blk_mq_unquiesce_queue(vblk->disk->queue);
   1164	return 0;
   1165}
   1166#endif
   1167
   1168static const struct virtio_device_id id_table[] = {
   1169	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
   1170	{ 0 },
   1171};
   1172
   1173static unsigned int features_legacy[] = {
   1174	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
   1175	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
   1176	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
   1177	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
    1178};
    1179
   1180static unsigned int features[] = {
   1181	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
   1182	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
   1183	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
   1184	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
   1185};
   1186
   1187static struct virtio_driver virtio_blk = {
   1188	.feature_table			= features,
   1189	.feature_table_size		= ARRAY_SIZE(features),
   1190	.feature_table_legacy		= features_legacy,
   1191	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
   1192	.driver.name			= KBUILD_MODNAME,
   1193	.driver.owner			= THIS_MODULE,
   1194	.id_table			= id_table,
   1195	.probe				= virtblk_probe,
   1196	.remove				= virtblk_remove,
   1197	.config_changed			= virtblk_config_changed,
   1198#ifdef CONFIG_PM_SLEEP
   1199	.freeze				= virtblk_freeze,
   1200	.restore			= virtblk_restore,
   1201#endif
   1202};
   1203
   1204static int __init virtio_blk_init(void)
   1205{
   1206	int error;
   1207
   1208	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
   1209	if (!virtblk_wq)
   1210		return -ENOMEM;
   1211
   1212	major = register_blkdev(0, "virtblk");
   1213	if (major < 0) {
   1214		error = major;
   1215		goto out_destroy_workqueue;
   1216	}
   1217
   1218	error = register_virtio_driver(&virtio_blk);
   1219	if (error)
   1220		goto out_unregister_blkdev;
   1221	return 0;
   1222
   1223out_unregister_blkdev:
   1224	unregister_blkdev(major, "virtblk");
   1225out_destroy_workqueue:
   1226	destroy_workqueue(virtblk_wq);
   1227	return error;
   1228}
   1229
   1230static void __exit virtio_blk_fini(void)
   1231{
   1232	unregister_virtio_driver(&virtio_blk);
   1233	unregister_blkdev(major, "virtblk");
   1234	destroy_workqueue(virtblk_wq);
   1235}
   1236module_init(virtio_blk_init);
   1237module_exit(virtio_blk_fini);
   1238
   1239MODULE_DEVICE_TABLE(virtio, id_table);
   1240MODULE_DESCRIPTION("Virtio block driver");
   1241MODULE_LICENSE("GPL");