blk-core.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
blk-core.c (35544B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 1991, 1992 Linus Torvalds
      4 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
      5 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
      6 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
      7 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
      8 *	-  July2000
      9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
     10 */
     11
     12/*
     13 * This handles all read/write requests to block devices
     14 */
     15#include <linux/kernel.h>
     16#include <linux/module.h>
     17#include <linux/bio.h>
     18#include <linux/blkdev.h>
     19#include <linux/blk-pm.h>
     20#include <linux/blk-integrity.h>
     21#include <linux/highmem.h>
     22#include <linux/mm.h>
     23#include <linux/pagemap.h>
     24#include <linux/kernel_stat.h>
     25#include <linux/string.h>
     26#include <linux/init.h>
     27#include <linux/completion.h>
     28#include <linux/slab.h>
     29#include <linux/swap.h>
     30#include <linux/writeback.h>
     31#include <linux/task_io_accounting_ops.h>
     32#include <linux/fault-inject.h>
     33#include <linux/list_sort.h>
     34#include <linux/delay.h>
     35#include <linux/ratelimit.h>
     36#include <linux/pm_runtime.h>
     37#include <linux/t10-pi.h>
     38#include <linux/debugfs.h>
     39#include <linux/bpf.h>
     40#include <linux/psi.h>
     41#include <linux/part_stat.h>
     42#include <linux/sched/sysctl.h>
     43#include <linux/blk-crypto.h>
     44
     45#define CREATE_TRACE_POINTS
     46#include <trace/events/block.h>
     47
     48#include "blk.h"
     49#include "blk-mq-sched.h"
     50#include "blk-pm.h"
     51#include "blk-cgroup.h"
     52#include "blk-throttle.h"
     53
     54struct dentry *blk_debugfs_root;
     55
     56EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
     57EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
     58EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
     59EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
     60EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
     61EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
     62
     63DEFINE_IDA(blk_queue_ida);
     64
     65/*
     66 * For queue allocation
     67 */
     68struct kmem_cache *blk_requestq_cachep;
     69struct kmem_cache *blk_requestq_srcu_cachep;
     70
     71/*
     72 * Controlling structure to kblockd
     73 */
     74static struct workqueue_struct *kblockd_workqueue;
     75
     76/**
     77 * blk_queue_flag_set - atomically set a queue flag
     78 * @flag: flag to be set
     79 * @q: request queue
     80 */
     81void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
     82{
     83	set_bit(flag, &q->queue_flags);
     84}
     85EXPORT_SYMBOL(blk_queue_flag_set);
     86
     87/**
     88 * blk_queue_flag_clear - atomically clear a queue flag
     89 * @flag: flag to be cleared
     90 * @q: request queue
     91 */
     92void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
     93{
     94	clear_bit(flag, &q->queue_flags);
     95}
     96EXPORT_SYMBOL(blk_queue_flag_clear);
     97
     98/**
     99 * blk_queue_flag_test_and_set - atomically test and set a queue flag
    100 * @flag: flag to be set
    101 * @q: request queue
    102 *
    103 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
    104 * the flag was already set.
    105 */
    106bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
    107{
    108	return test_and_set_bit(flag, &q->queue_flags);
    109}
    110EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
    111
    112#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
    113static const char *const blk_op_name[] = {
    114	REQ_OP_NAME(READ),
    115	REQ_OP_NAME(WRITE),
    116	REQ_OP_NAME(FLUSH),
    117	REQ_OP_NAME(DISCARD),
    118	REQ_OP_NAME(SECURE_ERASE),
    119	REQ_OP_NAME(ZONE_RESET),
    120	REQ_OP_NAME(ZONE_RESET_ALL),
    121	REQ_OP_NAME(ZONE_OPEN),
    122	REQ_OP_NAME(ZONE_CLOSE),
    123	REQ_OP_NAME(ZONE_FINISH),
    124	REQ_OP_NAME(ZONE_APPEND),
    125	REQ_OP_NAME(WRITE_ZEROES),
    126	REQ_OP_NAME(DRV_IN),
    127	REQ_OP_NAME(DRV_OUT),
    128};
    129#undef REQ_OP_NAME
    130
    131/**
    132 * blk_op_str - Return string XXX in the REQ_OP_XXX.
    133 * @op: REQ_OP_XXX.
    134 *
    135 * Description: Centralize block layer function to convert REQ_OP_XXX into
    136 * string format. Useful in the debugging and tracing bio or request. For
    137 * invalid REQ_OP_XXX it returns string "UNKNOWN".
    138 */
    139inline const char *blk_op_str(unsigned int op)
    140{
    141	const char *op_str = "UNKNOWN";
    142
    143	if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
    144		op_str = blk_op_name[op];
    145
    146	return op_str;
    147}
    148EXPORT_SYMBOL_GPL(blk_op_str);
    149
    150static const struct {
    151	int		errno;
    152	const char	*name;
    153} blk_errors[] = {
    154	[BLK_STS_OK]		= { 0,		"" },
    155	[BLK_STS_NOTSUPP]	= { -EOPNOTSUPP, "operation not supported" },
    156	[BLK_STS_TIMEOUT]	= { -ETIMEDOUT,	"timeout" },
    157	[BLK_STS_NOSPC]		= { -ENOSPC,	"critical space allocation" },
    158	[BLK_STS_TRANSPORT]	= { -ENOLINK,	"recoverable transport" },
    159	[BLK_STS_TARGET]	= { -EREMOTEIO,	"critical target" },
    160	[BLK_STS_NEXUS]		= { -EBADE,	"critical nexus" },
    161	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
    162	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
    163	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
    164	[BLK_STS_DEV_RESOURCE]	= { -EBUSY,	"device resource" },
    165	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },
    166	[BLK_STS_OFFLINE]	= { -ENODEV,	"device offline" },
    167
    168	/* device mapper special case, should not leak out: */
    169	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },
    170
    171	/* zone device specific errors */
    172	[BLK_STS_ZONE_OPEN_RESOURCE]	= { -ETOOMANYREFS, "open zones exceeded" },
    173	[BLK_STS_ZONE_ACTIVE_RESOURCE]	= { -EOVERFLOW, "active zones exceeded" },
    174
    175	/* everything else not covered above: */
    176	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
    177};
    178
    179blk_status_t errno_to_blk_status(int errno)
    180{
    181	int i;
    182
    183	for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
    184		if (blk_errors[i].errno == errno)
    185			return (__force blk_status_t)i;
    186	}
    187
    188	return BLK_STS_IOERR;
    189}
    190EXPORT_SYMBOL_GPL(errno_to_blk_status);
    191
    192int blk_status_to_errno(blk_status_t status)
    193{
    194	int idx = (__force int)status;
    195
    196	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
    197		return -EIO;
    198	return blk_errors[idx].errno;
    199}
    200EXPORT_SYMBOL_GPL(blk_status_to_errno);
    201
    202const char *blk_status_to_str(blk_status_t status)
    203{
    204	int idx = (__force int)status;
    205
    206	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
    207		return "<null>";
    208	return blk_errors[idx].name;
    209}
    210
    211/**
    212 * blk_sync_queue - cancel any pending callbacks on a queue
    213 * @q: the queue
    214 *
    215 * Description:
    216 *     The block layer may perform asynchronous callback activity
    217 *     on a queue, such as calling the unplug function after a timeout.
    218 *     A block device may call blk_sync_queue to ensure that any
    219 *     such activity is cancelled, thus allowing it to release resources
    220 *     that the callbacks might use. The caller must already have made sure
    221 *     that its ->submit_bio will not re-add plugging prior to calling
    222 *     this function.
    223 *
    224 *     This function does not cancel any asynchronous activity arising
    225 *     out of elevator or throttling code. That would require elevator_exit()
    226 *     and blkcg_exit_queue() to be called with queue lock initialized.
    227 *
    228 */
    229void blk_sync_queue(struct request_queue *q)
    230{
    231	del_timer_sync(&q->timeout);
    232	cancel_work_sync(&q->timeout_work);
    233}
    234EXPORT_SYMBOL(blk_sync_queue);
    235
    236/**
    237 * blk_set_pm_only - increment pm_only counter
    238 * @q: request queue pointer
    239 */
    240void blk_set_pm_only(struct request_queue *q)
    241{
    242	atomic_inc(&q->pm_only);
    243}
    244EXPORT_SYMBOL_GPL(blk_set_pm_only);
    245
    246void blk_clear_pm_only(struct request_queue *q)
    247{
    248	int pm_only;
    249
    250	pm_only = atomic_dec_return(&q->pm_only);
    251	WARN_ON_ONCE(pm_only < 0);
    252	if (pm_only == 0)
    253		wake_up_all(&q->mq_freeze_wq);
    254}
    255EXPORT_SYMBOL_GPL(blk_clear_pm_only);
    256
    257/**
    258 * blk_put_queue - decrement the request_queue refcount
    259 * @q: the request_queue structure to decrement the refcount for
    260 *
    261 * Decrements the refcount of the request_queue kobject. When this reaches 0
    262 * we'll have blk_release_queue() called.
    263 *
    264 * Context: Any context, but the last reference must not be dropped from
    265 *          atomic context.
    266 */
    267void blk_put_queue(struct request_queue *q)
    268{
    269	kobject_put(&q->kobj);
    270}
    271EXPORT_SYMBOL(blk_put_queue);
    272
    273void blk_queue_start_drain(struct request_queue *q)
    274{
    275	/*
    276	 * When queue DYING flag is set, we need to block new req
    277	 * entering queue, so we call blk_freeze_queue_start() to
    278	 * prevent I/O from crossing blk_queue_enter().
    279	 */
    280	blk_freeze_queue_start(q);
    281	if (queue_is_mq(q))
    282		blk_mq_wake_waiters(q);
    283	/* Make blk_queue_enter() reexamine the DYING flag. */
    284	wake_up_all(&q->mq_freeze_wq);
    285}
    286
    287/**
    288 * blk_cleanup_queue - shutdown a request queue
    289 * @q: request queue to shutdown
    290 *
    291 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
    292 * put it.  All future requests will be failed immediately with -ENODEV.
    293 *
        * The steps below are order dependent: the queue is flagged DYING and
        * draining is started before merging is disabled, the queue is then
        * frozen, flagged DEAD, its timeout timer/work are synchronised, any
        * blk-mq state is torn down, and finally one reference is dropped with
        * blk_put_queue().
        *
    294 * Context: can sleep
    295 */
    296void blk_cleanup_queue(struct request_queue *q)
    297{
    298	/* cannot be called from atomic context */
    299	might_sleep();
    300
       	/* Warn (once) if the queue is still registered at this point. */
    301	WARN_ON_ONCE(blk_queue_registered(q));
    302
    303	/* mark @q DYING, no new request or merges will be allowed afterwards */
    304	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
    305	blk_queue_start_drain(q);
    306
    307	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
    308	blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
    309
    310	/*
    311	 * Drain all requests queued before DYING marking. Set DEAD flag to
    312	 * prevent that blk_mq_run_hw_queues() accesses the hardware queues
    313	 * after draining finished.
    314	 */
    315	blk_freeze_queue(q);
    316
    317	blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
    318
       	/*
       	 * blk_sync_queue() deletes the timeout timer and cancels the timeout
       	 * work; blk-mq queues additionally get their work cancelled and their
       	 * blk-mq state released.
       	 */
    319	blk_sync_queue(q);
    320	if (queue_is_mq(q)) {
    321		blk_mq_cancel_work_sync(q);
    322		blk_mq_exit_queue(q);
    323	}
    324
    325	/* @q is and will stay empty, shutdown and put */
    326	blk_put_queue(q);
    327}
    328EXPORT_SYMBOL(blk_cleanup_queue);
    329
    330/**
    331 * blk_queue_enter() - try to increase q->q_usage_counter
    332 * @q: request queue pointer
    333 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
    334 */
    335int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
    336{
    337	const bool pm = flags & BLK_MQ_REQ_PM;
    338
    339	while (!blk_try_enter_queue(q, pm)) {
    340		if (flags & BLK_MQ_REQ_NOWAIT)
    341			return -EBUSY;
    342
    343		/*
    344		 * read pair of barrier in blk_freeze_queue_start(), we need to
    345		 * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
    346		 * reading .mq_freeze_depth or queue dying flag, otherwise the
    347		 * following wait may never return if the two reads are
    348		 * reordered.
    349		 */
    350		smp_rmb();
    351		wait_event(q->mq_freeze_wq,
    352			   (!q->mq_freeze_depth &&
    353			    blk_pm_resume_queue(pm, q)) ||
    354			   blk_queue_dying(q));
    355		if (blk_queue_dying(q))
    356			return -ENODEV;
    357	}
    358
    359	return 0;
    360}
    361
    362int __bio_queue_enter(struct request_queue *q, struct bio *bio)
    363{
    364	while (!blk_try_enter_queue(q, false)) {
    365		struct gendisk *disk = bio->bi_bdev->bd_disk;
    366
    367		if (bio->bi_opf & REQ_NOWAIT) {
    368			if (test_bit(GD_DEAD, &disk->state))
    369				goto dead;
    370			bio_wouldblock_error(bio);
    371			return -EBUSY;
    372		}
    373
    374		/*
    375		 * read pair of barrier in blk_freeze_queue_start(), we need to
    376		 * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
    377		 * reading .mq_freeze_depth or queue dying flag, otherwise the
    378		 * following wait may never return if the two reads are
    379		 * reordered.
    380		 */
    381		smp_rmb();
    382		wait_event(q->mq_freeze_wq,
    383			   (!q->mq_freeze_depth &&
    384			    blk_pm_resume_queue(false, q)) ||
    385			   test_bit(GD_DEAD, &disk->state));
    386		if (test_bit(GD_DEAD, &disk->state))
    387			goto dead;
    388	}
    389
    390	return 0;
    391dead:
    392	bio_io_error(bio);
    393	return -ENODEV;
    394}
    395
    396void blk_queue_exit(struct request_queue *q)
    397{
    398	percpu_ref_put(&q->q_usage_counter);
    399}
    400
    401static void blk_queue_usage_counter_release(struct percpu_ref *ref)
    402{
    403	struct request_queue *q =
    404		container_of(ref, struct request_queue, q_usage_counter);
    405
    406	wake_up_all(&q->mq_freeze_wq);
    407}
    408
    409static void blk_rq_timed_out_timer(struct timer_list *t)
    410{
    411	struct request_queue *q = from_timer(q, t, timeout);
    412
    413	kblockd_schedule_work(&q->timeout_work);
    414}
    415
    416static void blk_timeout_work(struct work_struct *work)
    417{
    418}
    419
    /*
     * blk_alloc_queue - allocate and initialise a request_queue
     * @node_id: node handed to kmem_cache_alloc_node(); also stored in q->node
     * @alloc_srcu: selects the kmem cache through blk_get_queue_kmem_cache()
     *              and, when true, sets QUEUE_FLAG_HAS_SRCU and initialises
     *              q->srcu
     *
     * Returns the new queue, or NULL when any allocation or initialisation
     * step fails.  On failure the fail_* labels at the bottom undo the steps
     * that already succeeded, in reverse order.
     */
    420struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
    421{
    422	struct request_queue *q;
    423	int ret;
    424
       	/* __GFP_ZERO: the queue starts out zero-filled. */
    425	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
    426			GFP_KERNEL | __GFP_ZERO, node_id);
    427	if (!q)
    428		return NULL;
    429
    430	if (alloc_srcu) {
    431		blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
    432		if (init_srcu_struct(q->srcu) != 0)
    433			goto fail_q;
    434	}
    435
    436	q->last_merge = NULL;
    437
       	/* Take the queue id from blk_queue_ida; negative means failure. */
    438	q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
    439	if (q->id < 0)
    440		goto fail_srcu;
    441
    442	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
    443	if (ret)
    444		goto fail_id;
    445
    446	q->stats = blk_alloc_queue_stats();
    447	if (!q->stats)
    448		goto fail_split;
    449
    450	q->node = node_id;
    451
    452	atomic_set(&q->nr_active_requests_shared_tags, 0);
    453
       	/*
       	 * The timeout timer callback only schedules q->timeout_work on
       	 * kblockd (see blk_rq_timed_out_timer()).
       	 */
    454	timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
    455	INIT_WORK(&q->timeout_work, blk_timeout_work);
    456	INIT_LIST_HEAD(&q->icq_list);
    457
    458	kobject_init(&q->kobj, &blk_queue_ktype);
    459
    460	mutex_init(&q->debugfs_mutex);
    461	mutex_init(&q->sysfs_lock);
    462	mutex_init(&q->sysfs_dir_lock);
    463	spin_lock_init(&q->queue_lock);
    464
    465	init_waitqueue_head(&q->mq_freeze_wq);
    466	mutex_init(&q->mq_freeze_lock);
    467
    468	/*
    469	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
    470	 * See blk_register_queue() for details.
    471	 */
    472	if (percpu_ref_init(&q->q_usage_counter,
    473				blk_queue_usage_counter_release,
    474				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
    475		goto fail_stats;
    476
       	/* Default DMA alignment mask, queue limits and request count. */
    477	blk_queue_dma_alignment(q, 511);
    478	blk_set_default_limits(&q->limits);
    479	q->nr_requests = BLKDEV_DEFAULT_RQ;
    480
    481	return q;
    482
       	/*
       	 * Error unwinding: each label releases what was set up before the
       	 * step that failed and then falls through to the earlier ones.
       	 */
    483fail_stats:
    484	blk_free_queue_stats(q->stats);
    485fail_split:
    486	bioset_exit(&q->bio_split);
    487fail_id:
    488	ida_simple_remove(&blk_queue_ida, q->id);
    489fail_srcu:
    490	if (alloc_srcu)
    491		cleanup_srcu_struct(q->srcu);
    492fail_q:
    493	kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
    494	return NULL;
    495}
    496
    497/**
    498 * blk_get_queue - increment the request_queue refcount
    499 * @q: the request_queue structure to increment the refcount for
    500 *
    501 * Increment the refcount of the request_queue kobject.
    502 *
    503 * Context: Any context.
    504 */
    505bool blk_get_queue(struct request_queue *q)
    506{
    507	if (likely(!blk_queue_dying(q))) {
    508		__blk_get_queue(q);
    509		return true;
    510	}
    511
    512	return false;
    513}
    514EXPORT_SYMBOL(blk_get_queue);
    515
    516#ifdef CONFIG_FAIL_MAKE_REQUEST
    517
    518static DECLARE_FAULT_ATTR(fail_make_request);
    519
    520static int __init setup_fail_make_request(char *str)
    521{
    522	return setup_fault_attr(&fail_make_request, str);
    523}
    524__setup("fail_make_request=", setup_fail_make_request);
    525
    526bool should_fail_request(struct block_device *part, unsigned int bytes)
    527{
    528	return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
    529}
    530
    531static int __init fail_make_request_debugfs(void)
    532{
    533	struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
    534						NULL, &fail_make_request);
    535
    536	return PTR_ERR_OR_ZERO(dir);
    537}
    538
    539late_initcall(fail_make_request_debugfs);
    540#endif /* CONFIG_FAIL_MAKE_REQUEST */
    541
    542static inline bool bio_check_ro(struct bio *bio)
    543{
    544	if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
    545		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
    546			return false;
    547		pr_warn("Trying to write to read-only block-device %pg\n",
    548			bio->bi_bdev);
    549		/* Older lvm-tools actually trigger this */
    550		return false;
    551	}
    552
    553	return false;
    554}
    555
    556static noinline int should_fail_bio(struct bio *bio)
    557{
    558	if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
    559		return -EIO;
    560	return 0;
    561}
    562ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
    563
    564/*
    565 * Check whether this bio extends beyond the end of the device or partition.
    566 * This may well happen - the kernel calls bread() without checking the size of
    567 * the device, e.g., when mounting a file system.
    568 */
    569static inline int bio_check_eod(struct bio *bio)
    570{
    571	sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
    572	unsigned int nr_sectors = bio_sectors(bio);
    573
    574	if (nr_sectors && maxsector &&
    575	    (nr_sectors > maxsector ||
    576	     bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
    577		pr_info_ratelimited("%s: attempt to access beyond end of device\n"
    578				    "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
    579				    current->comm, bio->bi_bdev, bio->bi_opf,
    580				    bio->bi_iter.bi_sector, nr_sectors, maxsector);
    581		return -EIO;
    582	}
    583	return 0;
    584}
    585
    586/*
    587 * Remap block n of partition p to block n+start(p) of the disk.
    588 */
    589static int blk_partition_remap(struct bio *bio)
    590{
    591	struct block_device *p = bio->bi_bdev;
    592
    593	if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
    594		return -EIO;
    595	if (bio_sectors(bio)) {
    596		bio->bi_iter.bi_sector += p->bd_start_sect;
    597		trace_block_bio_remap(bio, p->bd_dev,
    598				      bio->bi_iter.bi_sector -
    599				      p->bd_start_sect);
    600	}
    601	bio_set_flag(bio, BIO_REMAPPED);
    602	return 0;
    603}
    604
    605/*
    606 * Check write append to a zoned block device.
    607 */
    608static inline blk_status_t blk_check_zone_append(struct request_queue *q,
    609						 struct bio *bio)
    610{
    611	sector_t pos = bio->bi_iter.bi_sector;
    612	int nr_sectors = bio_sectors(bio);
    613
    614	/* Only applicable to zoned block devices */
    615	if (!blk_queue_is_zoned(q))
    616		return BLK_STS_NOTSUPP;
    617
    618	/* The bio sector must point to the start of a sequential zone */
    619	if (pos & (blk_queue_zone_sectors(q) - 1) ||
    620	    !blk_queue_zone_is_seq(q, pos))
    621		return BLK_STS_IOERR;
    622
    623	/*
    624	 * Not allowed to cross zone boundaries. Otherwise, the BIO will be
    625	 * split and could result in non-contiguous sectors being written in
    626	 * different zones.
    627	 */
    628	if (nr_sectors > q->limits.chunk_sectors)
    629		return BLK_STS_IOERR;
    630
    631	/* Make sure the BIO is small enough and will not get split */
    632	if (nr_sectors > q->limits.max_zone_append_sectors)
    633		return BLK_STS_IOERR;
    634
    635	bio->bi_opf |= REQ_NOMERGE;
    636
    637	return BLK_STS_OK;
    638}
    639
    640static void __submit_bio(struct bio *bio)
    641{
    642	struct gendisk *disk = bio->bi_bdev->bd_disk;
    643
    644	if (unlikely(!blk_crypto_bio_prep(&bio)))
    645		return;
    646
    647	if (!disk->fops->submit_bio) {
    648		blk_mq_submit_bio(bio);
    649	} else if (likely(bio_queue_enter(bio) == 0)) {
    650		disk->fops->submit_bio(bio);
    651		blk_queue_exit(disk->queue);
    652	}
    653}
    654
    655/*
    656 * The loop in this function may be a bit non-obvious, and so deserves some
    657 * explanation:
    658 *
    659 *  - Before entering the loop, bio->bi_next is NULL (as all callers ensure
    660 *    that), so we have a list with a single bio.
    661 *  - We pretend that we have just taken it off a longer list, so we assign
    662 *    bio_list to a pointer to the bio_list_on_stack, thus initialising the
    663 *    bio_list of new bios to be added.  ->submit_bio() may indeed add some more
    664 *    bios through a recursive call to submit_bio_noacct.  If it did, we find a
    665 *    non-NULL value in bio_list and re-enter the loop from the top.
    666 *  - In this case we really did just take the bio off the top of the list (no
    667 *    pretending) and so remove it from bio_list, and call into ->submit_bio()
    668 *    again.
    669 *
    670 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
    671 * bio_list_on_stack[1] contains bios that were submitted before the current
    672 *	->submit_bio, but that haven't been processed yet.
        *
        * While the loop runs, current->bio_list points at bio_list_on_stack; it is
        * reset to NULL before this function returns.
    673 */
    674static void __submit_bio_noacct(struct bio *bio)
    675{
    676	struct bio_list bio_list_on_stack[2];
    677
       	/* The caller must hand in a single bio that is not linked to others. */
    678	BUG_ON(bio->bi_next);
    679
    680	bio_list_init(&bio_list_on_stack[0]);
    681	current->bio_list = bio_list_on_stack;
    682
    683	do {
       		/* Queue of the bio being submitted; used for sorting below. */
    684		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
    685		struct bio_list lower, same;
    686
    687		/*
    688		 * Create a fresh bio_list for all subordinate requests.
    689		 */
    690		bio_list_on_stack[1] = bio_list_on_stack[0];
    691		bio_list_init(&bio_list_on_stack[0]);
    692
    693		__submit_bio(bio);
    694
    695		/*
    696		 * Sort new bios into those for a lower level and those for the
    697		 * same level.
    698		 */
    699		bio_list_init(&lower);
    700		bio_list_init(&same);
    701		while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
    702			if (q == bdev_get_queue(bio->bi_bdev))
    703				bio_list_add(&same, bio);
    704			else
    705				bio_list_add(&lower, bio);
    706
    707		/*
    708		 * Now assemble so we handle the lowest level first.
    709		 */
    710		bio_list_merge(&bio_list_on_stack[0], &lower);
    711		bio_list_merge(&bio_list_on_stack[0], &same);
    712		bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
    713	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
    714
    715	current->bio_list = NULL;
    716}
    717
    718static void __submit_bio_noacct_mq(struct bio *bio)
    719{
    720	struct bio_list bio_list[2] = { };
    721
    722	current->bio_list = bio_list;
    723
    724	do {
    725		__submit_bio(bio);
    726	} while ((bio = bio_list_pop(&bio_list[0])));
    727
    728	current->bio_list = NULL;
    729}
    730
    731void submit_bio_noacct_nocheck(struct bio *bio)
    732{
    733	/*
    734	 * We only want one ->submit_bio to be active at a time, else stack
    735	 * usage with stacked devices could be a problem.  Use current->bio_list
    736	 * to collect a list of requests submited by a ->submit_bio method while
    737	 * it is active, and then process them after it returned.
    738	 */
    739	if (current->bio_list)
    740		bio_list_add(&current->bio_list[0], bio);
    741	else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
    742		__submit_bio_noacct_mq(bio);
    743	else
    744		__submit_bio_noacct(bio);
    745}
    746
    747/**
    748 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
    749 * @bio:  The bio describing the location in memory and on the device.
    750 *
    751 * This is a version of submit_bio() that shall only be used for I/O that is
    752 * resubmitted to lower level drivers by stacking block drivers.  All file
    753 * systems and other upper level users of the block layer should use
    754 * submit_bio() instead.
        *
        * The bio is validated before being passed on.  When a check rejects it,
        * bi_status is set (BLK_STS_NOTSUPP via the not_supported label, otherwise
        * the current value of the local status, which starts as BLK_STS_IOERR) and
        * the bio is completed with bio_endio() instead of being submitted.
    755 */
    756void submit_bio_noacct(struct bio *bio)
    757{
    758	struct block_device *bdev = bio->bi_bdev;
    759	struct request_queue *q = bdev_get_queue(bdev);
    760	blk_status_t status = BLK_STS_IOERR;
    761	struct blk_plug *plug;
    762
    763	might_sleep();
    764
       	/* A plug with nowait set makes this bio REQ_NOWAIT as well. */
    765	plug = blk_mq_plug(q, bio);
    766	if (plug && plug->nowait)
    767		bio->bi_opf |= REQ_NOWAIT;
    768
    769	/*
    770	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
    771	 * if queue does not support NOWAIT.
    772	 */
    773	if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
    774		goto not_supported;
    775
    776	if (should_fail_bio(bio))
    777		goto end_io;
    778	if (unlikely(bio_check_ro(bio)))
    779		goto end_io;
       	/* End-of-device check and partition remap only for unremapped bios. */
    780	if (!bio_flagged(bio, BIO_REMAPPED)) {
    781		if (unlikely(bio_check_eod(bio)))
    782			goto end_io;
    783		if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
    784			goto end_io;
    785	}
    786
    787	/*
    788	 * Filter flush bio's early so that bio based drivers without flush
    789	 * support don't have to worry about them.
    790	 */
    791	if (op_is_flush(bio->bi_opf) &&
    792	    !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
    793		bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
       		/* A flush without data has nothing left to do: complete it OK. */
    794		if (!bio_sectors(bio)) {
    795			status = BLK_STS_OK;
    796			goto end_io;
    797		}
    798	}
    799
    800	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
    801		bio_clear_polled(bio);
    802
       	/* Reject operations that the device or queue does not support. */
    803	switch (bio_op(bio)) {
    804	case REQ_OP_DISCARD:
    805		if (!bdev_max_discard_sectors(bdev))
    806			goto not_supported;
    807		break;
    808	case REQ_OP_SECURE_ERASE:
    809		if (!bdev_max_secure_erase_sectors(bdev))
    810			goto not_supported;
    811		break;
    812	case REQ_OP_ZONE_APPEND:
    813		status = blk_check_zone_append(q, bio);
    814		if (status != BLK_STS_OK)
    815			goto end_io;
    816		break;
    817	case REQ_OP_ZONE_RESET:
    818	case REQ_OP_ZONE_OPEN:
    819	case REQ_OP_ZONE_CLOSE:
    820	case REQ_OP_ZONE_FINISH:
    821		if (!blk_queue_is_zoned(q))
    822			goto not_supported;
    823		break;
    824	case REQ_OP_ZONE_RESET_ALL:
    825		if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
    826			goto not_supported;
    827		break;
    828	case REQ_OP_WRITE_ZEROES:
    829		if (!q->limits.max_write_zeroes_sectors)
    830			goto not_supported;
    831		break;
    832	default:
    833		break;
    834	}
    835
       	/*
       	 * NOTE(review): a true return presumably means the throttling code has
       	 * taken over the bio; it is neither submitted nor completed here.
       	 */
    836	if (blk_throtl_bio(bio))
    837		return;
    838
    839	blk_cgroup_bio_start(bio);
    840	blkcg_bio_issue_init(bio);
    841
    842	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
    843		trace_block_bio_queue(bio);
    844		/* Now that enqueuing has been traced, we need to trace
    845		 * completion as well.
    846		 */
    847		bio_set_flag(bio, BIO_TRACE_COMPLETION);
    848	}
    849	submit_bio_noacct_nocheck(bio);
    850	return;
    851
       	/* Failure exits: record the status and complete the bio. */
    852not_supported:
    853	status = BLK_STS_NOTSUPP;
    854end_io:
    855	bio->bi_status = status;
    856	bio_endio(bio);
    857}
    858EXPORT_SYMBOL(submit_bio_noacct);
    859
    860/**
    861 * submit_bio - submit a bio to the block device layer for I/O
    862 * @bio: The &struct bio which describes the I/O
    863 *
    864 * submit_bio() is used to submit I/O requests to block devices.  It is passed a
    865 * fully set up &struct bio that describes the I/O that needs to be done.  The
    866 * bio will be send to the device described by the bi_bdev field.
    867 *
    868 * The success/failure status of the request, along with notification of
    869 * completion, is delivered asynchronously through the ->bi_end_io() callback
    870 * in @bio.  The bio must NOT be touched by thecaller until ->bi_end_io() has
    871 * been called.
    872 */
    873void submit_bio(struct bio *bio)
    874{
    875	if (blkcg_punt_bio_submit(bio))
    876		return;
    877
    878	if (bio_op(bio) == REQ_OP_READ) {
    879		task_io_account_read(bio->bi_iter.bi_size);
    880		count_vm_events(PGPGIN, bio_sectors(bio));
    881	} else if (bio_op(bio) == REQ_OP_WRITE) {
    882		count_vm_events(PGPGOUT, bio_sectors(bio));
    883	}
    884
    885	/*
    886	 * If we're reading data that is part of the userspace workingset, count
    887	 * submission time as memory stall.  When the device is congested, or
    888	 * the submitting cgroup IO-throttled, submission can be a significant
    889	 * part of overall IO time.
    890	 */
    891	if (unlikely(bio_op(bio) == REQ_OP_READ &&
    892	    bio_flagged(bio, BIO_WORKINGSET))) {
    893		unsigned long pflags;
    894
    895		psi_memstall_enter(&pflags);
    896		submit_bio_noacct(bio);
    897		psi_memstall_leave(&pflags);
    898		return;
    899	}
    900
    901	submit_bio_noacct(bio);
    902}
    903EXPORT_SYMBOL(submit_bio);
    904
    905/**
    906 * bio_poll - poll for BIO completions
    907 * @bio: bio to poll for
    908 * @iob: batches of IO
    909 * @flags: BLK_POLL_* flags that control the behavior
    910 *
    911 * Poll for completions on queue associated with the bio. Returns number of
    912 * completed entries found.
    913 *
    914 * Note: the caller must either be the context that submitted @bio, or
    915 * be in a RCU critical section to prevent freeing of @bio.
    916 */
    917int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
    918{
    919	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
    920	blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
    921	int ret = 0;
    922
    923	if (cookie == BLK_QC_T_NONE ||
    924	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
    925		return 0;
    926
    927	blk_flush_plug(current->plug, false);
    928
    929	if (bio_queue_enter(bio))
    930		return 0;
    931	if (queue_is_mq(q)) {
    932		ret = blk_mq_poll(q, cookie, iob, flags);
    933	} else {
    934		struct gendisk *disk = q->disk;
    935
    936		if (disk && disk->fops->poll_bio)
    937			ret = disk->fops->poll_bio(bio, iob, flags);
    938	}
    939	blk_queue_exit(q);
    940	return ret;
    941}
    942EXPORT_SYMBOL_GPL(bio_poll);
    943
    944/*
    945 * Helper to implement file_operations.iopoll.  Requires the bio to be stored
    946 * in iocb->private, and cleared before freeing the bio.
    947 */
    948int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
    949		    unsigned int flags)
    950{
    951	struct bio *bio;
    952	int ret = 0;
    953
    954	/*
    955	 * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
    956	 * point to a freshly allocated bio at this point.  If that happens
    957	 * we have a few cases to consider:
    958	 *
    959	 *  1) the bio is beeing initialized and bi_bdev is NULL.  We can just
    960	 *     simply nothing in this case
    961	 *  2) the bio points to a not poll enabled device.  bio_poll will catch
    962	 *     this and return 0
    963	 *  3) the bio points to a poll capable device, including but not
    964	 *     limited to the one that the original bio pointed to.  In this
    965	 *     case we will call into the actual poll method and poll for I/O,
    966	 *     even if we don't need to, but it won't cause harm either.
    967	 *
    968	 * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
    969	 * is still allocated. Because partitions hold a reference to the whole
    970	 * device bdev and thus disk, the disk is also still valid.  Grabbing
    971	 * a reference to the queue in bio_poll() ensures the hctxs and requests
    972	 * are still valid as well.
    973	 */
    974	rcu_read_lock();
    975	bio = READ_ONCE(kiocb->private);
    976	if (bio && bio->bi_bdev)
    977		ret = bio_poll(bio, iob, flags);
    978	rcu_read_unlock();
    979
    980	return ret;
    981}
    982EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
    983
    984void update_io_ticks(struct block_device *part, unsigned long now, bool end)
    985{
    986	unsigned long stamp;
    987again:
    988	stamp = READ_ONCE(part->bd_stamp);
    989	if (unlikely(time_after(now, stamp))) {
    990		if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp))
    991			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
    992	}
    993	if (part->bd_partno) {
    994		part = bdev_whole(part);
    995		goto again;
    996	}
    997}
    998
    999unsigned long bdev_start_io_acct(struct block_device *bdev,
   1000				 unsigned int sectors, unsigned int op,
   1001				 unsigned long start_time)
   1002{
   1003	const int sgrp = op_stat_group(op);
   1004
   1005	part_stat_lock();
   1006	update_io_ticks(bdev, start_time, false);
   1007	part_stat_inc(bdev, ios[sgrp]);
   1008	part_stat_add(bdev, sectors[sgrp], sectors);
   1009	part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
   1010	part_stat_unlock();
   1011
   1012	return start_time;
   1013}
   1014EXPORT_SYMBOL(bdev_start_io_acct);
   1015
   1016/**
   1017 * bio_start_io_acct_time - start I/O accounting for bio based drivers
   1018 * @bio:	bio to start account for
   1019 * @start_time:	start time that should be passed back to bio_end_io_acct().
   1020 */
   1021void bio_start_io_acct_time(struct bio *bio, unsigned long start_time)
   1022{
   1023	bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
   1024			   bio_op(bio), start_time);
   1025}
   1026EXPORT_SYMBOL_GPL(bio_start_io_acct_time);
   1027
   1028/**
   1029 * bio_start_io_acct - start I/O accounting for bio based drivers
   1030 * @bio:	bio to start account for
   1031 *
   1032 * Returns the start time that should be passed back to bio_end_io_acct().
   1033 */
   1034unsigned long bio_start_io_acct(struct bio *bio)
   1035{
   1036	return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
   1037				  bio_op(bio), jiffies);
   1038}
   1039EXPORT_SYMBOL_GPL(bio_start_io_acct);
   1040
   1041void bdev_end_io_acct(struct block_device *bdev, unsigned int op,
   1042		      unsigned long start_time)
   1043{
   1044	const int sgrp = op_stat_group(op);
   1045	unsigned long now = READ_ONCE(jiffies);
   1046	unsigned long duration = now - start_time;
   1047
   1048	part_stat_lock();
   1049	update_io_ticks(bdev, now, true);
   1050	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
   1051	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
   1052	part_stat_unlock();
   1053}
   1054EXPORT_SYMBOL(bdev_end_io_acct);
   1055
   1056void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
   1057			      struct block_device *orig_bdev)
   1058{
   1059	bdev_end_io_acct(orig_bdev, bio_op(bio), start_time);
   1060}
   1061EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
   1062
   1063/**
   1064 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
   1065 * @q : the queue of the device being checked
   1066 *
   1067 * Description:
   1068 *    Check if underlying low-level drivers of a device are busy.
   1069 *    If the drivers want to export their busy state, they must set own
   1070 *    exporting function using blk_queue_lld_busy() first.
   1071 *
   1072 *    Basically, this function is used only by request stacking drivers
   1073 *    to stop dispatching requests to underlying devices when underlying
   1074 *    devices are busy.  This behavior helps more I/O merging on the queue
   1075 *    of the request stacking driver and prevents I/O throughput regression
   1076 *    on burst I/O load.
   1077 *
   1078 * Return:
   1079 *    0 - Not busy (The request stacking driver should dispatch request)
   1080 *    1 - Busy (The request stacking driver should stop dispatching request)
   1081 */
   1082int blk_lld_busy(struct request_queue *q)
   1083{
   1084	if (queue_is_mq(q) && q->mq_ops->busy)
   1085		return q->mq_ops->busy(q);
   1086
   1087	return 0;
   1088}
   1089EXPORT_SYMBOL_GPL(blk_lld_busy);
   1090
   1091int kblockd_schedule_work(struct work_struct *work)
   1092{
   1093	return queue_work(kblockd_workqueue, work);
   1094}
   1095EXPORT_SYMBOL(kblockd_schedule_work);
   1096
   1097int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
   1098				unsigned long delay)
   1099{
   1100	return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
   1101}
   1102EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
   1103
   1104void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
   1105{
   1106	struct task_struct *tsk = current;
   1107
   1108	/*
   1109	 * If this is a nested plug, don't actually assign it.
   1110	 */
   1111	if (tsk->plug)
   1112		return;
   1113
   1114	plug->mq_list = NULL;
   1115	plug->cached_rq = NULL;
   1116	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
   1117	plug->rq_count = 0;
   1118	plug->multiple_queues = false;
   1119	plug->has_elevator = false;
   1120	plug->nowait = false;
   1121	INIT_LIST_HEAD(&plug->cb_list);
   1122
   1123	/*
   1124	 * Store ordering should not be needed here, since a potential
   1125	 * preempt will imply a full memory barrier
   1126	 */
   1127	tsk->plug = plug;
   1128}
   1129
   1130/**
   1131 * blk_start_plug - initialize blk_plug and track it inside the task_struct
   1132 * @plug:	The &struct blk_plug that needs to be initialized
   1133 *
   1134 * Description:
   1135 *   blk_start_plug() indicates to the block layer an intent by the caller
   1136 *   to submit multiple I/O requests in a batch.  The block layer may use
   1137 *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
   1138 *   is called.  However, the block layer may choose to submit requests
   1139 *   before a call to blk_finish_plug() if the number of queued I/Os
   1140 *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
   1141 *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
   1142 *   the task schedules (see below).
   1143 *
   1144 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
   1145 *   pending I/O should the task end up blocking between blk_start_plug() and
   1146 *   blk_finish_plug(). This is important from a performance perspective, but
   1147 *   also ensures that we don't deadlock. For instance, if the task is blocking
   1148 *   for a memory allocation, memory reclaim could end up wanting to free a
   1149 *   page belonging to that request that is currently residing in our private
   1150 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
   1151 *   this kind of deadlock.
   1152 */
   1153void blk_start_plug(struct blk_plug *plug)
   1154{
   1155	blk_start_plug_nr_ios(plug, 1);
   1156}
   1157EXPORT_SYMBOL(blk_start_plug);
   1158
   1159static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
   1160{
   1161	LIST_HEAD(callbacks);
   1162
   1163	while (!list_empty(&plug->cb_list)) {
   1164		list_splice_init(&plug->cb_list, &callbacks);
   1165
   1166		while (!list_empty(&callbacks)) {
   1167			struct blk_plug_cb *cb = list_first_entry(&callbacks,
   1168							  struct blk_plug_cb,
   1169							  list);
   1170			list_del(&cb->list);
   1171			cb->callback(cb, from_schedule);
   1172		}
   1173	}
   1174}
   1175
   1176struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
   1177				      int size)
   1178{
   1179	struct blk_plug *plug = current->plug;
   1180	struct blk_plug_cb *cb;
   1181
   1182	if (!plug)
   1183		return NULL;
   1184
   1185	list_for_each_entry(cb, &plug->cb_list, list)
   1186		if (cb->callback == unplug && cb->data == data)
   1187			return cb;
   1188
   1189	/* Not currently on the callback list */
   1190	BUG_ON(size < sizeof(*cb));
   1191	cb = kzalloc(size, GFP_ATOMIC);
   1192	if (cb) {
   1193		cb->data = data;
   1194		cb->callback = unplug;
   1195		list_add(&cb->list, &plug->cb_list);
   1196	}
   1197	return cb;
   1198}
   1199EXPORT_SYMBOL(blk_check_plugged);
   1200
   1201void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
   1202{
   1203	if (!list_empty(&plug->cb_list))
   1204		flush_plug_callbacks(plug, from_schedule);
   1205	if (!rq_list_empty(plug->mq_list))
   1206		blk_mq_flush_plug_list(plug, from_schedule);
   1207	/*
   1208	 * Unconditionally flush out cached requests, even if the unplug
   1209	 * event came from schedule. Since we know hold references to the
   1210	 * queue for cached requests, we don't want a blocked task holding
   1211	 * up a queue freeze/quiesce event.
   1212	 */
   1213	if (unlikely(!rq_list_empty(plug->cached_rq)))
   1214		blk_mq_free_plug_rqs(plug);
   1215}
   1216
   1217/**
   1218 * blk_finish_plug - mark the end of a batch of submitted I/O
   1219 * @plug:	The &struct blk_plug passed to blk_start_plug()
   1220 *
   1221 * Description:
   1222 * Indicate that a batch of I/O submissions is complete.  This function
   1223 * must be paired with an initial call to blk_start_plug().  The intent
   1224 * is to allow the block layer to optimize I/O submission.  See the
   1225 * documentation for blk_start_plug() for more information.
   1226 */
   1227void blk_finish_plug(struct blk_plug *plug)
   1228{
   1229	if (plug == current->plug) {
   1230		__blk_flush_plug(plug, false);
   1231		current->plug = NULL;
   1232	}
   1233}
   1234EXPORT_SYMBOL(blk_finish_plug);
   1235
   1236void blk_io_schedule(void)
   1237{
   1238	/* Prevent hang_check timer from firing at us during very long I/O */
   1239	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
   1240
   1241	if (timeout)
   1242		io_schedule_timeout(timeout);
   1243	else
   1244		io_schedule();
   1245}
   1246EXPORT_SYMBOL_GPL(blk_io_schedule);
   1247
   1248int __init blk_dev_init(void)
   1249{
   1250	BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
   1251	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
   1252			sizeof_field(struct request, cmd_flags));
   1253	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
   1254			sizeof_field(struct bio, bi_opf));
   1255	BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
   1256			   __alignof__(struct request_queue)) !=
   1257		     sizeof(struct request_queue));
   1258
   1259	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
   1260	kblockd_workqueue = alloc_workqueue("kblockd",
   1261					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
   1262	if (!kblockd_workqueue)
   1263		panic("Failed to create kblockd\n");
   1264
   1265	blk_requestq_cachep = kmem_cache_create("request_queue",
   1266			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
   1267
   1268	blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
   1269			sizeof(struct request_queue) +
   1270			sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
   1271
   1272	blk_debugfs_root = debugfs_create_dir("block", NULL);
   1273
   1274	return 0;
   1275}