cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

blk-mq.c (121108B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Block multiqueue core code
      4 *
      5 * Copyright (C) 2013-2014 Jens Axboe
      6 * Copyright (C) 2013-2014 Christoph Hellwig
      7 */
      8#include <linux/kernel.h>
      9#include <linux/module.h>
     10#include <linux/backing-dev.h>
     11#include <linux/bio.h>
     12#include <linux/blkdev.h>
     13#include <linux/blk-integrity.h>
     14#include <linux/kmemleak.h>
     15#include <linux/mm.h>
     16#include <linux/init.h>
     17#include <linux/slab.h>
     18#include <linux/workqueue.h>
     19#include <linux/smp.h>
     20#include <linux/interrupt.h>
     21#include <linux/llist.h>
     22#include <linux/cpu.h>
     23#include <linux/cache.h>
     24#include <linux/sched/sysctl.h>
     25#include <linux/sched/topology.h>
     26#include <linux/sched/signal.h>
     27#include <linux/delay.h>
     28#include <linux/crash_dump.h>
     29#include <linux/prefetch.h>
     30#include <linux/blk-crypto.h>
     31#include <linux/part_stat.h>
     32
     33#include <trace/events/block.h>
     34
     35#include <linux/blk-mq.h>
     36#include <linux/t10-pi.h>
     37#include "blk.h"
     38#include "blk-mq.h"
     39#include "blk-mq-debugfs.h"
     40#include "blk-mq-tag.h"
     41#include "blk-pm.h"
     42#include "blk-stat.h"
     43#include "blk-mq-sched.h"
     44#include "blk-rq-qos.h"
     45
     46static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
     47
     48static void blk_mq_poll_stats_start(struct request_queue *q);
     49static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
     50
     51static int blk_mq_poll_stats_bkt(const struct request *rq)
     52{
     53	int ddir, sectors, bucket;
     54
     55	ddir = rq_data_dir(rq);
     56	sectors = blk_rq_stats_sectors(rq);
     57
     58	bucket = ddir + 2 * ilog2(sectors);
     59
     60	if (bucket < 0)
     61		return -1;
     62	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
     63		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
     64
     65	return bucket;
     66}
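
/*
 * Worked example of the bucket math above, assuming BLK_MQ_POLL_STATS_BKTS
 * is 16 as defined in blk-mq.h: rq_data_dir() is 0 for reads and 1 for
 * writes, so an 8-sector read maps to 0 + 2 * ilog2(8) = 6 and an
 * 8-sector write to 7.  A 512-sector read computes 0 + 2 * 9 = 18 and is
 * clamped to ddir + 16 - 2 = 14, the last read bucket.
 */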
     67
     68#define BLK_QC_T_SHIFT		16
     69#define BLK_QC_T_INTERNAL	(1U << 31)
     70
     71static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
     72		blk_qc_t qc)
     73{
     74	return xa_load(&q->hctx_table,
     75			(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
     76}
     77
     78static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
     79		blk_qc_t qc)
     80{
     81	unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
     82
     83	if (qc & BLK_QC_T_INTERNAL)
     84		return blk_mq_tag_to_rq(hctx->sched_tags, tag);
     85	return blk_mq_tag_to_rq(hctx->tags, tag);
     86}
     87
     88static inline blk_qc_t blk_rq_to_qc(struct request *rq)
     89{
     90	return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
     91		(rq->tag != -1 ?
     92		 rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
     93}
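
/*
 * Worked example of the cookie layout above: a request on hardware queue 2
 * holding driver tag 5 encodes to (2 << 16) | 5 = 0x00020005;
 * blk_qc_to_hctx() recovers queue 2 from the upper bits and blk_qc_to_rq()
 * looks tag 5 up in hctx->tags.  If the request only holds a scheduler
 * tag, BLK_QC_T_INTERNAL (bit 31) is set and the lookup goes through
 * hctx->sched_tags instead.
 */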
     94
     95/*
     96 * Check if any of the ctx, dispatch list or elevator
     97 * have pending work in this hardware queue.
     98 */
     99static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
    100{
    101	return !list_empty_careful(&hctx->dispatch) ||
    102		sbitmap_any_bit_set(&hctx->ctx_map) ||
    103			blk_mq_sched_has_work(hctx);
    104}
    105
    106/*
    107 * Mark this ctx as having pending work in this hardware queue
    108 */
    109static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
    110				     struct blk_mq_ctx *ctx)
    111{
    112	const int bit = ctx->index_hw[hctx->type];
    113
    114	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
    115		sbitmap_set_bit(&hctx->ctx_map, bit);
    116}
    117
    118static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
    119				      struct blk_mq_ctx *ctx)
    120{
    121	const int bit = ctx->index_hw[hctx->type];
    122
    123	sbitmap_clear_bit(&hctx->ctx_map, bit);
    124}
    125
    126struct mq_inflight {
    127	struct block_device *part;
    128	unsigned int inflight[2];
    129};
    130
    131static bool blk_mq_check_inflight(struct request *rq, void *priv,
    132				  bool reserved)
    133{
    134	struct mq_inflight *mi = priv;
    135
    136	if (rq->part && blk_do_io_stat(rq) &&
    137	    (!mi->part->bd_partno || rq->part == mi->part) &&
    138	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
    139		mi->inflight[rq_data_dir(rq)]++;
    140
    141	return true;
    142}
    143
    144unsigned int blk_mq_in_flight(struct request_queue *q,
    145		struct block_device *part)
    146{
    147	struct mq_inflight mi = { .part = part };
    148
    149	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
    150
    151	return mi.inflight[0] + mi.inflight[1];
    152}
    153
    154void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
    155		unsigned int inflight[2])
    156{
    157	struct mq_inflight mi = { .part = part };
    158
    159	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
    160	inflight[0] = mi.inflight[0];
    161	inflight[1] = mi.inflight[1];
    162}
    163
    164void blk_freeze_queue_start(struct request_queue *q)
    165{
    166	mutex_lock(&q->mq_freeze_lock);
    167	if (++q->mq_freeze_depth == 1) {
    168		percpu_ref_kill(&q->q_usage_counter);
    169		mutex_unlock(&q->mq_freeze_lock);
    170		if (queue_is_mq(q))
    171			blk_mq_run_hw_queues(q, false);
    172	} else {
    173		mutex_unlock(&q->mq_freeze_lock);
    174	}
    175}
    176EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
    177
    178void blk_mq_freeze_queue_wait(struct request_queue *q)
    179{
    180	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
    181}
    182EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
    183
    184int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
    185				     unsigned long timeout)
    186{
    187	return wait_event_timeout(q->mq_freeze_wq,
    188					percpu_ref_is_zero(&q->q_usage_counter),
    189					timeout);
    190}
    191EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
    192
    193/*
    194 * Guarantee no request is in use, so we can change any data structure of
    195 * the queue afterward.
    196 */
    197void blk_freeze_queue(struct request_queue *q)
    198{
    199	/*
    200	 * In the !blk_mq case we are only calling this to kill the
    201	 * q_usage_counter, otherwise this increases the freeze depth
    202	 * and waits for it to return to zero.  For this reason there is
    203	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
    204	 * exported to drivers as the only user for unfreeze is blk_mq.
    205	 */
    206	blk_freeze_queue_start(q);
    207	blk_mq_freeze_queue_wait(q);
    208}
    209
    210void blk_mq_freeze_queue(struct request_queue *q)
    211{
    212	/*
    213	 * ...just an alias to keep freeze and unfreeze actions balanced
    214	 * in the blk_mq_* namespace
    215	 */
    216	blk_freeze_queue(q);
    217}
    218EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
    219
    220void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
    221{
    222	mutex_lock(&q->mq_freeze_lock);
    223	if (force_atomic)
    224		q->q_usage_counter.data->force_atomic = true;
    225	q->mq_freeze_depth--;
    226	WARN_ON_ONCE(q->mq_freeze_depth < 0);
    227	if (!q->mq_freeze_depth) {
    228		percpu_ref_resurrect(&q->q_usage_counter);
    229		wake_up_all(&q->mq_freeze_wq);
    230	}
    231	mutex_unlock(&q->mq_freeze_lock);
    232}
    233
    234void blk_mq_unfreeze_queue(struct request_queue *q)
    235{
    236	__blk_mq_unfreeze_queue(q, false);
    237}
    238EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
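
/*
 * A minimal illustrative sketch of how a driver pairs the freeze/unfreeze
 * calls when it must change queue state with no requests in flight;
 * my_driver_update_limits() is a hypothetical stand-in for the real work.
 */
static void my_driver_update_limits(struct request_queue *q)
{
	blk_mq_freeze_queue(q);		/* drain in-flight requests, block new ones */
	/* ... safely update queue limits or driver state here ... */
	blk_mq_unfreeze_queue(q);	/* resume normal request processing */
}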
    239
    240/*
    241 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
    242 * mpt3sas driver such that this function can be removed.
    243 */
    244void blk_mq_quiesce_queue_nowait(struct request_queue *q)
    245{
    246	unsigned long flags;
    247
    248	spin_lock_irqsave(&q->queue_lock, flags);
    249	if (!q->quiesce_depth++)
    250		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
    251	spin_unlock_irqrestore(&q->queue_lock, flags);
    252}
    253EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
    254
    255/**
    256 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
    257 * @q: request queue.
    258 *
     259 * Note: it is the driver's responsibility to make sure that quiesce has
    260 * been started.
    261 */
    262void blk_mq_wait_quiesce_done(struct request_queue *q)
    263{
    264	if (blk_queue_has_srcu(q))
    265		synchronize_srcu(q->srcu);
    266	else
    267		synchronize_rcu();
    268}
    269EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
    270
    271/**
    272 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
    273 * @q: request queue.
    274 *
     275 * Note: this function does not prevent the struct request end_io()
     276 * callback from being invoked. Once this function has returned, we make
    277 * sure no dispatch can happen until the queue is unquiesced via
    278 * blk_mq_unquiesce_queue().
    279 */
    280void blk_mq_quiesce_queue(struct request_queue *q)
    281{
    282	blk_mq_quiesce_queue_nowait(q);
    283	blk_mq_wait_quiesce_done(q);
    284}
    285EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
    286
    287/*
    288 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
    289 * @q: request queue.
    290 *
     291 * This function restores the queue to the state it was in before
     292 * blk_mq_quiesce_queue() was called.
    293 */
    294void blk_mq_unquiesce_queue(struct request_queue *q)
    295{
    296	unsigned long flags;
    297	bool run_queue = false;
    298
    299	spin_lock_irqsave(&q->queue_lock, flags);
    300	if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
    301		;
    302	} else if (!--q->quiesce_depth) {
    303		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
    304		run_queue = true;
    305	}
    306	spin_unlock_irqrestore(&q->queue_lock, flags);
    307
    308	/* dispatch requests which are inserted during quiescing */
    309	if (run_queue)
    310		blk_mq_run_hw_queues(q, true);
    311}
    312EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
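
/*
 * A minimal illustrative sketch of the quiesce pairing; the helper
 * my_driver_pause_dispatch() is hypothetical.  Unlike freezing, quiescing
 * only stops new dispatches to ->queue_rq() and does not wait for
 * outstanding requests to complete.
 */
static void my_driver_pause_dispatch(struct request_queue *q)
{
	blk_mq_quiesce_queue(q);	/* no new ->queue_rq() calls after this returns */
	/* ... reconfigure device or driver state ... */
	blk_mq_unquiesce_queue(q);	/* dispatching may resume */
}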
    313
    314void blk_mq_wake_waiters(struct request_queue *q)
    315{
    316	struct blk_mq_hw_ctx *hctx;
    317	unsigned long i;
    318
    319	queue_for_each_hw_ctx(q, hctx, i)
    320		if (blk_mq_hw_queue_mapped(hctx))
    321			blk_mq_tag_wakeup_all(hctx->tags, true);
    322}
    323
    324void blk_rq_init(struct request_queue *q, struct request *rq)
    325{
    326	memset(rq, 0, sizeof(*rq));
    327
    328	INIT_LIST_HEAD(&rq->queuelist);
    329	rq->q = q;
    330	rq->__sector = (sector_t) -1;
    331	INIT_HLIST_NODE(&rq->hash);
    332	RB_CLEAR_NODE(&rq->rb_node);
    333	rq->tag = BLK_MQ_NO_TAG;
    334	rq->internal_tag = BLK_MQ_NO_TAG;
    335	rq->start_time_ns = ktime_get_ns();
    336	rq->part = NULL;
    337	blk_crypto_rq_set_defaults(rq);
    338}
    339EXPORT_SYMBOL(blk_rq_init);
    340
    341static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
    342		struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
    343{
    344	struct blk_mq_ctx *ctx = data->ctx;
    345	struct blk_mq_hw_ctx *hctx = data->hctx;
    346	struct request_queue *q = data->q;
    347	struct request *rq = tags->static_rqs[tag];
    348
    349	rq->q = q;
    350	rq->mq_ctx = ctx;
    351	rq->mq_hctx = hctx;
    352	rq->cmd_flags = data->cmd_flags;
    353
    354	if (data->flags & BLK_MQ_REQ_PM)
    355		data->rq_flags |= RQF_PM;
    356	if (blk_queue_io_stat(q))
    357		data->rq_flags |= RQF_IO_STAT;
    358	rq->rq_flags = data->rq_flags;
    359
    360	if (!(data->rq_flags & RQF_ELV)) {
    361		rq->tag = tag;
    362		rq->internal_tag = BLK_MQ_NO_TAG;
    363	} else {
    364		rq->tag = BLK_MQ_NO_TAG;
    365		rq->internal_tag = tag;
    366	}
    367	rq->timeout = 0;
    368
    369	if (blk_mq_need_time_stamp(rq))
    370		rq->start_time_ns = ktime_get_ns();
    371	else
    372		rq->start_time_ns = 0;
    373	rq->part = NULL;
    374#ifdef CONFIG_BLK_RQ_ALLOC_TIME
    375	rq->alloc_time_ns = alloc_time_ns;
    376#endif
    377	rq->io_start_time_ns = 0;
    378	rq->stats_sectors = 0;
    379	rq->nr_phys_segments = 0;
    380#if defined(CONFIG_BLK_DEV_INTEGRITY)
    381	rq->nr_integrity_segments = 0;
    382#endif
    383	rq->end_io = NULL;
    384	rq->end_io_data = NULL;
    385
    386	blk_crypto_rq_set_defaults(rq);
    387	INIT_LIST_HEAD(&rq->queuelist);
    388	/* tag was already set */
    389	WRITE_ONCE(rq->deadline, 0);
    390	req_ref_set(rq, 1);
    391
    392	if (rq->rq_flags & RQF_ELV) {
    393		struct elevator_queue *e = data->q->elevator;
    394
    395		INIT_HLIST_NODE(&rq->hash);
    396		RB_CLEAR_NODE(&rq->rb_node);
    397
    398		if (!op_is_flush(data->cmd_flags) &&
    399		    e->type->ops.prepare_request) {
    400			e->type->ops.prepare_request(rq);
    401			rq->rq_flags |= RQF_ELVPRIV;
    402		}
    403	}
    404
    405	return rq;
    406}
    407
    408static inline struct request *
    409__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
    410		u64 alloc_time_ns)
    411{
    412	unsigned int tag, tag_offset;
    413	struct blk_mq_tags *tags;
    414	struct request *rq;
    415	unsigned long tag_mask;
    416	int i, nr = 0;
    417
    418	tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
    419	if (unlikely(!tag_mask))
    420		return NULL;
    421
    422	tags = blk_mq_tags_from_data(data);
    423	for (i = 0; tag_mask; i++) {
    424		if (!(tag_mask & (1UL << i)))
    425			continue;
    426		tag = tag_offset + i;
    427		prefetch(tags->static_rqs[tag]);
    428		tag_mask &= ~(1UL << i);
    429		rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
    430		rq_list_add(data->cached_rq, rq);
    431		nr++;
    432	}
    433	/* caller already holds a reference, add for remainder */
    434	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
    435	data->nr_tags -= nr;
    436
    437	return rq_list_pop(data->cached_rq);
    438}
    439
    440static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
    441{
    442	struct request_queue *q = data->q;
    443	u64 alloc_time_ns = 0;
    444	struct request *rq;
    445	unsigned int tag;
    446
    447	/* alloc_time includes depth and tag waits */
    448	if (blk_queue_rq_alloc_time(q))
    449		alloc_time_ns = ktime_get_ns();
    450
    451	if (data->cmd_flags & REQ_NOWAIT)
    452		data->flags |= BLK_MQ_REQ_NOWAIT;
    453
    454	if (q->elevator) {
    455		struct elevator_queue *e = q->elevator;
    456
    457		data->rq_flags |= RQF_ELV;
    458
    459		/*
    460		 * Flush/passthrough requests are special and go directly to the
    461		 * dispatch list. Don't include reserved tags in the
    462		 * limiting, as it isn't useful.
    463		 */
    464		if (!op_is_flush(data->cmd_flags) &&
    465		    !blk_op_is_passthrough(data->cmd_flags) &&
    466		    e->type->ops.limit_depth &&
    467		    !(data->flags & BLK_MQ_REQ_RESERVED))
    468			e->type->ops.limit_depth(data->cmd_flags, data);
    469	}
    470
    471retry:
    472	data->ctx = blk_mq_get_ctx(q);
    473	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
    474	if (!(data->rq_flags & RQF_ELV))
    475		blk_mq_tag_busy(data->hctx);
    476
    477	/*
    478	 * Try batched alloc if we want more than 1 tag.
    479	 */
    480	if (data->nr_tags > 1) {
    481		rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
    482		if (rq)
    483			return rq;
    484		data->nr_tags = 1;
    485	}
    486
    487	/*
    488	 * Waiting allocations only fail because of an inactive hctx.  In that
    489	 * case just retry the hctx assignment and tag allocation as CPU hotplug
    490	 * should have migrated us to an online CPU by now.
    491	 */
    492	tag = blk_mq_get_tag(data);
    493	if (tag == BLK_MQ_NO_TAG) {
    494		if (data->flags & BLK_MQ_REQ_NOWAIT)
    495			return NULL;
    496		/*
     497		 * Give up the CPU and sleep for a short time to ensure
     498		 * that threads using a realtime scheduling class
    499		 * are migrated off the CPU, and thus off the hctx that
    500		 * is going away.
    501		 */
    502		msleep(3);
    503		goto retry;
    504	}
    505
    506	return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
    507					alloc_time_ns);
    508}
    509
    510struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
    511		blk_mq_req_flags_t flags)
    512{
    513	struct blk_mq_alloc_data data = {
    514		.q		= q,
    515		.flags		= flags,
    516		.cmd_flags	= op,
    517		.nr_tags	= 1,
    518	};
    519	struct request *rq;
    520	int ret;
    521
    522	ret = blk_queue_enter(q, flags);
    523	if (ret)
    524		return ERR_PTR(ret);
    525
    526	rq = __blk_mq_alloc_requests(&data);
    527	if (!rq)
    528		goto out_queue_exit;
    529	rq->__data_len = 0;
    530	rq->__sector = (sector_t) -1;
    531	rq->bio = rq->biotail = NULL;
    532	return rq;
    533out_queue_exit:
    534	blk_queue_exit(q);
    535	return ERR_PTR(-EWOULDBLOCK);
    536}
    537EXPORT_SYMBOL(blk_mq_alloc_request);
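
/*
 * A minimal illustrative sketch of the allocation contract; my_submit()
 * is hypothetical.  Note that failure is reported via ERR_PTR(), not
 * NULL, so callers test with IS_ERR().
 */
static int my_submit(struct request_queue *q)
{
	struct request *rq;

	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	/* ... fill in a passthrough command, execute it, check the result ... */
	blk_mq_free_request(rq);
	return 0;
}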
    538
    539struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
    540	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
    541{
    542	struct blk_mq_alloc_data data = {
    543		.q		= q,
    544		.flags		= flags,
    545		.cmd_flags	= op,
    546		.nr_tags	= 1,
    547	};
    548	u64 alloc_time_ns = 0;
    549	unsigned int cpu;
    550	unsigned int tag;
    551	int ret;
    552
    553	/* alloc_time includes depth and tag waits */
    554	if (blk_queue_rq_alloc_time(q))
    555		alloc_time_ns = ktime_get_ns();
    556
    557	/*
    558	 * If the tag allocator sleeps we could get an allocation for a
    559	 * different hardware context.  No need to complicate the low level
    560	 * allocator for this for the rare use case of a command tied to
    561	 * a specific queue.
    562	 */
    563	if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
    564		return ERR_PTR(-EINVAL);
    565
    566	if (hctx_idx >= q->nr_hw_queues)
    567		return ERR_PTR(-EIO);
    568
    569	ret = blk_queue_enter(q, flags);
    570	if (ret)
    571		return ERR_PTR(ret);
    572
    573	/*
    574	 * Check if the hardware context is actually mapped to anything.
    575	 * If not tell the caller that it should skip this queue.
    576	 */
    577	ret = -EXDEV;
    578	data.hctx = xa_load(&q->hctx_table, hctx_idx);
    579	if (!blk_mq_hw_queue_mapped(data.hctx))
    580		goto out_queue_exit;
    581	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
    582	if (cpu >= nr_cpu_ids)
    583		goto out_queue_exit;
    584	data.ctx = __blk_mq_get_ctx(q, cpu);
    585
    586	if (!q->elevator)
    587		blk_mq_tag_busy(data.hctx);
    588	else
    589		data.rq_flags |= RQF_ELV;
    590
    591	ret = -EWOULDBLOCK;
    592	tag = blk_mq_get_tag(&data);
    593	if (tag == BLK_MQ_NO_TAG)
    594		goto out_queue_exit;
    595	return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
    596					alloc_time_ns);
    597
    598out_queue_exit:
    599	blk_queue_exit(q);
    600	return ERR_PTR(ret);
    601}
    602EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
    603
    604static void __blk_mq_free_request(struct request *rq)
    605{
    606	struct request_queue *q = rq->q;
    607	struct blk_mq_ctx *ctx = rq->mq_ctx;
    608	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
    609	const int sched_tag = rq->internal_tag;
    610
    611	blk_crypto_free_request(rq);
    612	blk_pm_mark_last_busy(rq);
    613	rq->mq_hctx = NULL;
    614	if (rq->tag != BLK_MQ_NO_TAG)
    615		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
    616	if (sched_tag != BLK_MQ_NO_TAG)
    617		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
    618	blk_mq_sched_restart(hctx);
    619	blk_queue_exit(q);
    620}
    621
    622void blk_mq_free_request(struct request *rq)
    623{
    624	struct request_queue *q = rq->q;
    625	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
    626
    627	if ((rq->rq_flags & RQF_ELVPRIV) &&
    628	    q->elevator->type->ops.finish_request)
    629		q->elevator->type->ops.finish_request(rq);
    630
    631	if (rq->rq_flags & RQF_MQ_INFLIGHT)
    632		__blk_mq_dec_active_requests(hctx);
    633
    634	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
    635		laptop_io_completion(q->disk->bdi);
    636
    637	rq_qos_done(q, rq);
    638
    639	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
    640	if (req_ref_put_and_test(rq))
    641		__blk_mq_free_request(rq);
    642}
    643EXPORT_SYMBOL_GPL(blk_mq_free_request);
    644
    645void blk_mq_free_plug_rqs(struct blk_plug *plug)
    646{
    647	struct request *rq;
    648
    649	while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
    650		blk_mq_free_request(rq);
    651}
    652
    653void blk_dump_rq_flags(struct request *rq, char *msg)
    654{
    655	printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
    656		rq->q->disk ? rq->q->disk->disk_name : "?",
    657		(unsigned long long) rq->cmd_flags);
    658
    659	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
    660	       (unsigned long long)blk_rq_pos(rq),
    661	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
    662	printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
    663	       rq->bio, rq->biotail, blk_rq_bytes(rq));
    664}
    665EXPORT_SYMBOL(blk_dump_rq_flags);
    666
    667static void req_bio_endio(struct request *rq, struct bio *bio,
    668			  unsigned int nbytes, blk_status_t error)
    669{
    670	if (unlikely(error)) {
    671		bio->bi_status = error;
    672	} else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
    673		/*
    674		 * Partial zone append completions cannot be supported as the
    675		 * BIO fragments may end up not being written sequentially.
    676		 */
    677		if (bio->bi_iter.bi_size != nbytes)
    678			bio->bi_status = BLK_STS_IOERR;
    679		else
    680			bio->bi_iter.bi_sector = rq->__sector;
    681	}
    682
    683	bio_advance(bio, nbytes);
    684
    685	if (unlikely(rq->rq_flags & RQF_QUIET))
    686		bio_set_flag(bio, BIO_QUIET);
    687	/* don't actually finish bio if it's part of flush sequence */
    688	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
    689		bio_endio(bio);
    690}
    691
    692static void blk_account_io_completion(struct request *req, unsigned int bytes)
    693{
    694	if (req->part && blk_do_io_stat(req)) {
    695		const int sgrp = op_stat_group(req_op(req));
    696
    697		part_stat_lock();
    698		part_stat_add(req->part, sectors[sgrp], bytes >> 9);
    699		part_stat_unlock();
    700	}
    701}
    702
    703static void blk_print_req_error(struct request *req, blk_status_t status)
    704{
    705	printk_ratelimited(KERN_ERR
    706		"%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
    707		"phys_seg %u prio class %u\n",
    708		blk_status_to_str(status),
    709		req->q->disk ? req->q->disk->disk_name : "?",
    710		blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
    711		req->cmd_flags & ~REQ_OP_MASK,
    712		req->nr_phys_segments,
    713		IOPRIO_PRIO_CLASS(req->ioprio));
    714}
    715
    716/*
    717 * Fully end IO on a request. Does not support partial completions, or
    718 * errors.
    719 */
    720static void blk_complete_request(struct request *req)
    721{
    722	const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
    723	int total_bytes = blk_rq_bytes(req);
    724	struct bio *bio = req->bio;
    725
    726	trace_block_rq_complete(req, BLK_STS_OK, total_bytes);
    727
    728	if (!bio)
    729		return;
    730
    731#ifdef CONFIG_BLK_DEV_INTEGRITY
    732	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
    733		req->q->integrity.profile->complete_fn(req, total_bytes);
    734#endif
    735
    736	blk_account_io_completion(req, total_bytes);
    737
    738	do {
    739		struct bio *next = bio->bi_next;
    740
    741		/* Completion has already been traced */
    742		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
    743
    744		if (req_op(req) == REQ_OP_ZONE_APPEND)
    745			bio->bi_iter.bi_sector = req->__sector;
    746
    747		if (!is_flush)
    748			bio_endio(bio);
    749		bio = next;
    750	} while (bio);
    751
    752	/*
    753	 * Reset counters so that the request stacking driver
    754	 * can find how many bytes remain in the request
    755	 * later.
    756	 */
    757	req->bio = NULL;
    758	req->__data_len = 0;
    759}
    760
    761/**
    762 * blk_update_request - Complete multiple bytes without completing the request
    763 * @req:      the request being processed
    764 * @error:    block status code
    765 * @nr_bytes: number of bytes to complete for @req
    766 *
    767 * Description:
    768 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
    769 *     the request structure even if @req doesn't have leftover.
    770 *     If @req has leftover, sets it up for the next range of segments.
    771 *
    772 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
    773 *     %false return from this function.
    774 *
    775 * Note:
    776 *	The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
    777 *      except in the consistency check at the end of this function.
    778 *
    779 * Return:
    780 *     %false - this request doesn't have any more data
    781 *     %true  - this request has more data
    782 **/
    783bool blk_update_request(struct request *req, blk_status_t error,
    784		unsigned int nr_bytes)
    785{
    786	int total_bytes;
    787
    788	trace_block_rq_complete(req, error, nr_bytes);
    789
    790	if (!req->bio)
    791		return false;
    792
    793#ifdef CONFIG_BLK_DEV_INTEGRITY
    794	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
    795	    error == BLK_STS_OK)
    796		req->q->integrity.profile->complete_fn(req, nr_bytes);
    797#endif
    798
    799	if (unlikely(error && !blk_rq_is_passthrough(req) &&
    800		     !(req->rq_flags & RQF_QUIET)) &&
    801		     !test_bit(GD_DEAD, &req->q->disk->state)) {
    802		blk_print_req_error(req, error);
    803		trace_block_rq_error(req, error, nr_bytes);
    804	}
    805
    806	blk_account_io_completion(req, nr_bytes);
    807
    808	total_bytes = 0;
    809	while (req->bio) {
    810		struct bio *bio = req->bio;
    811		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
    812
    813		if (bio_bytes == bio->bi_iter.bi_size)
    814			req->bio = bio->bi_next;
    815
    816		/* Completion has already been traced */
    817		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
    818		req_bio_endio(req, bio, bio_bytes, error);
    819
    820		total_bytes += bio_bytes;
    821		nr_bytes -= bio_bytes;
    822
    823		if (!nr_bytes)
    824			break;
    825	}
    826
    827	/*
    828	 * completely done
    829	 */
    830	if (!req->bio) {
    831		/*
    832		 * Reset counters so that the request stacking driver
    833		 * can find how many bytes remain in the request
    834		 * later.
    835		 */
    836		req->__data_len = 0;
    837		return false;
    838	}
    839
    840	req->__data_len -= total_bytes;
    841
    842	/* update sector only for requests with clear definition of sector */
    843	if (!blk_rq_is_passthrough(req))
    844		req->__sector += total_bytes >> 9;
    845
    846	/* mixed attributes always follow the first bio */
    847	if (req->rq_flags & RQF_MIXED_MERGE) {
    848		req->cmd_flags &= ~REQ_FAILFAST_MASK;
    849		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
    850	}
    851
    852	if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
    853		/*
    854		 * If total number of sectors is less than the first segment
    855		 * size, something has gone terribly wrong.
    856		 */
    857		if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
    858			blk_dump_rq_flags(req, "request botched");
    859			req->__data_len = blk_rq_cur_bytes(req);
    860		}
    861
    862		/* recalculate the number of segments */
    863		req->nr_phys_segments = blk_recalc_rq_segments(req);
    864	}
    865
    866	return true;
    867}
    868EXPORT_SYMBOL_GPL(blk_update_request);
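
/*
 * Worked example of the return value above: for a request carrying three
 * 4 KiB bios (12 KiB total), blk_update_request(req, BLK_STS_OK, 8192)
 * ends the first two bios, leaves __data_len at 4096 and returns true
 * (more data remains); passing blk_rq_bytes(req) as @nr_bytes completes
 * every bio and returns false.
 */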
    869
    870static void __blk_account_io_done(struct request *req, u64 now)
    871{
    872	const int sgrp = op_stat_group(req_op(req));
    873
    874	part_stat_lock();
    875	update_io_ticks(req->part, jiffies, true);
    876	part_stat_inc(req->part, ios[sgrp]);
    877	part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
    878	part_stat_unlock();
    879}
    880
    881static inline void blk_account_io_done(struct request *req, u64 now)
    882{
    883	/*
    884	 * Account IO completion.  flush_rq isn't accounted as a
    885	 * normal IO on queueing nor completion.  Accounting the
    886	 * containing request is enough.
    887	 */
    888	if (blk_do_io_stat(req) && req->part &&
    889	    !(req->rq_flags & RQF_FLUSH_SEQ))
    890		__blk_account_io_done(req, now);
    891}
    892
    893static void __blk_account_io_start(struct request *rq)
    894{
    895	/*
    896	 * All non-passthrough requests are created from a bio with one
    897	 * exception: when a flush command that is part of a flush sequence
    898	 * generated by the state machine in blk-flush.c is cloned onto the
    899	 * lower device by dm-multipath we can get here without a bio.
    900	 */
    901	if (rq->bio)
    902		rq->part = rq->bio->bi_bdev;
    903	else
    904		rq->part = rq->q->disk->part0;
    905
    906	part_stat_lock();
    907	update_io_ticks(rq->part, jiffies, false);
    908	part_stat_unlock();
    909}
    910
    911static inline void blk_account_io_start(struct request *req)
    912{
    913	if (blk_do_io_stat(req))
    914		__blk_account_io_start(req);
    915}
    916
    917static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
    918{
    919	if (rq->rq_flags & RQF_STATS) {
    920		blk_mq_poll_stats_start(rq->q);
    921		blk_stat_add(rq, now);
    922	}
    923
    924	blk_mq_sched_completed_request(rq, now);
    925	blk_account_io_done(rq, now);
    926}
    927
    928inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
    929{
    930	if (blk_mq_need_time_stamp(rq))
    931		__blk_mq_end_request_acct(rq, ktime_get_ns());
    932
    933	if (rq->end_io) {
    934		rq_qos_done(rq->q, rq);
    935		rq->end_io(rq, error);
    936	} else {
    937		blk_mq_free_request(rq);
    938	}
    939}
    940EXPORT_SYMBOL(__blk_mq_end_request);
    941
    942void blk_mq_end_request(struct request *rq, blk_status_t error)
    943{
    944	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
    945		BUG();
    946	__blk_mq_end_request(rq, error);
    947}
    948EXPORT_SYMBOL(blk_mq_end_request);
    949
    950#define TAG_COMP_BATCH		32
    951
    952static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
    953					  int *tag_array, int nr_tags)
    954{
    955	struct request_queue *q = hctx->queue;
    956
    957	/*
    958	 * All requests should have been marked as RQF_MQ_INFLIGHT, so
    959	 * update hctx->nr_active in batch
    960	 */
    961	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
    962		__blk_mq_sub_active_requests(hctx, nr_tags);
    963
    964	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
    965	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
    966}
    967
    968void blk_mq_end_request_batch(struct io_comp_batch *iob)
    969{
    970	int tags[TAG_COMP_BATCH], nr_tags = 0;
    971	struct blk_mq_hw_ctx *cur_hctx = NULL;
    972	struct request *rq;
    973	u64 now = 0;
    974
    975	if (iob->need_ts)
    976		now = ktime_get_ns();
    977
    978	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
    979		prefetch(rq->bio);
    980		prefetch(rq->rq_next);
    981
    982		blk_complete_request(rq);
    983		if (iob->need_ts)
    984			__blk_mq_end_request_acct(rq, now);
    985
    986		rq_qos_done(rq->q, rq);
    987
    988		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
    989		if (!req_ref_put_and_test(rq))
    990			continue;
    991
    992		blk_crypto_free_request(rq);
    993		blk_pm_mark_last_busy(rq);
    994
    995		if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
    996			if (cur_hctx)
    997				blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
    998			nr_tags = 0;
    999			cur_hctx = rq->mq_hctx;
   1000		}
   1001		tags[nr_tags++] = rq->tag;
   1002	}
   1003
   1004	if (nr_tags)
   1005		blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
   1006}
   1007EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
   1008
   1009static void blk_complete_reqs(struct llist_head *list)
   1010{
   1011	struct llist_node *entry = llist_reverse_order(llist_del_all(list));
   1012	struct request *rq, *next;
   1013
   1014	llist_for_each_entry_safe(rq, next, entry, ipi_list)
   1015		rq->q->mq_ops->complete(rq);
   1016}
   1017
   1018static __latent_entropy void blk_done_softirq(struct softirq_action *h)
   1019{
   1020	blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
   1021}
   1022
   1023static int blk_softirq_cpu_dead(unsigned int cpu)
   1024{
   1025	blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
   1026	return 0;
   1027}
   1028
   1029static void __blk_mq_complete_request_remote(void *data)
   1030{
   1031	__raise_softirq_irqoff(BLOCK_SOFTIRQ);
   1032}
   1033
   1034static inline bool blk_mq_complete_need_ipi(struct request *rq)
   1035{
   1036	int cpu = raw_smp_processor_id();
   1037
   1038	if (!IS_ENABLED(CONFIG_SMP) ||
   1039	    !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
   1040		return false;
   1041	/*
   1042	 * With force threaded interrupts enabled, raising softirq from an SMP
   1043	 * function call will always result in waking the ksoftirqd thread.
   1044	 * This is probably worse than completing the request on a different
   1045	 * cache domain.
   1046	 */
   1047	if (force_irqthreads())
   1048		return false;
   1049
   1050	/* same CPU or cache domain?  Complete locally */
   1051	if (cpu == rq->mq_ctx->cpu ||
   1052	    (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
   1053	     cpus_share_cache(cpu, rq->mq_ctx->cpu)))
   1054		return false;
   1055
   1056	/* don't try to IPI to an offline CPU */
   1057	return cpu_online(rq->mq_ctx->cpu);
   1058}
   1059
   1060static void blk_mq_complete_send_ipi(struct request *rq)
   1061{
   1062	struct llist_head *list;
   1063	unsigned int cpu;
   1064
   1065	cpu = rq->mq_ctx->cpu;
   1066	list = &per_cpu(blk_cpu_done, cpu);
   1067	if (llist_add(&rq->ipi_list, list)) {
   1068		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
   1069		smp_call_function_single_async(cpu, &rq->csd);
   1070	}
   1071}
   1072
   1073static void blk_mq_raise_softirq(struct request *rq)
   1074{
   1075	struct llist_head *list;
   1076
   1077	preempt_disable();
   1078	list = this_cpu_ptr(&blk_cpu_done);
   1079	if (llist_add(&rq->ipi_list, list))
   1080		raise_softirq(BLOCK_SOFTIRQ);
   1081	preempt_enable();
   1082}
   1083
   1084bool blk_mq_complete_request_remote(struct request *rq)
   1085{
   1086	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
   1087
   1088	/*
    1089	 * For a polled request, always complete locally; it's pointless
   1090	 * to redirect the completion.
   1091	 */
   1092	if (rq->cmd_flags & REQ_POLLED)
   1093		return false;
   1094
   1095	if (blk_mq_complete_need_ipi(rq)) {
   1096		blk_mq_complete_send_ipi(rq);
   1097		return true;
   1098	}
   1099
   1100	if (rq->q->nr_hw_queues == 1) {
   1101		blk_mq_raise_softirq(rq);
   1102		return true;
   1103	}
   1104	return false;
   1105}
   1106EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
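
/*
 * A minimal illustrative sketch of the intended use from a driver's
 * interrupt handler; my_handle_cqe() is hypothetical and assumes a
 * successful, error-free completion.
 */
static void my_handle_cqe(struct request *rq)
{
	/* Completion was not deferred to another CPU, so finish it here. */
	if (!blk_mq_complete_request_remote(rq))
		blk_mq_end_request(rq, BLK_STS_OK);
}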
   1107
   1108/**
   1109 * blk_mq_complete_request - end I/O on a request
   1110 * @rq:		the request being processed
   1111 *
   1112 * Description:
   1113 *	Complete a request by scheduling the ->complete_rq operation.
   1114 **/
   1115void blk_mq_complete_request(struct request *rq)
   1116{
   1117	if (!blk_mq_complete_request_remote(rq))
   1118		rq->q->mq_ops->complete(rq);
   1119}
   1120EXPORT_SYMBOL(blk_mq_complete_request);
   1121
   1122/**
   1123 * blk_mq_start_request - Start processing a request
   1124 * @rq: Pointer to request to be started
   1125 *
   1126 * Function used by device drivers to notify the block layer that a request
    1127 * is going to be processed now, so the block layer can do proper initialization
   1128 * such as starting the timeout timer.
   1129 */
   1130void blk_mq_start_request(struct request *rq)
   1131{
   1132	struct request_queue *q = rq->q;
   1133
   1134	trace_block_rq_issue(rq);
   1135
   1136	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
   1137		rq->io_start_time_ns = ktime_get_ns();
   1138		rq->stats_sectors = blk_rq_sectors(rq);
   1139		rq->rq_flags |= RQF_STATS;
   1140		rq_qos_issue(q, rq);
   1141	}
   1142
   1143	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
   1144
   1145	blk_add_timer(rq);
   1146	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
   1147
   1148#ifdef CONFIG_BLK_DEV_INTEGRITY
   1149	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
   1150		q->integrity.profile->prepare_fn(rq);
   1151#endif
   1152	if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
   1153	        WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq));
   1154}
   1155EXPORT_SYMBOL(blk_mq_start_request);
   1156
   1157/*
   1158 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
   1159 * queues. This is important for md arrays to benefit from merging
   1160 * requests.
   1161 */
   1162static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
   1163{
   1164	if (plug->multiple_queues)
   1165		return BLK_MAX_REQUEST_COUNT * 2;
   1166	return BLK_MAX_REQUEST_COUNT;
   1167}
   1168
   1169static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
   1170{
   1171	struct request *last = rq_list_peek(&plug->mq_list);
   1172
   1173	if (!plug->rq_count) {
   1174		trace_block_plug(rq->q);
   1175	} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
   1176		   (!blk_queue_nomerges(rq->q) &&
   1177		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
   1178		blk_mq_flush_plug_list(plug, false);
   1179		trace_block_plug(rq->q);
   1180	}
   1181
   1182	if (!plug->multiple_queues && last && last->q != rq->q)
   1183		plug->multiple_queues = true;
   1184	if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
   1185		plug->has_elevator = true;
   1186	rq->rq_next = NULL;
   1187	rq_list_add(&plug->mq_list, rq);
   1188	plug->rq_count++;
   1189}
   1190
   1191/**
   1192 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
   1193 * @rq:		request to insert
   1194 * @at_head:    insert request at head or tail of queue
   1195 *
   1196 * Description:
   1197 *    Insert a fully prepared request at the back of the I/O scheduler queue
   1198 *    for execution.  Don't wait for completion.
   1199 *
   1200 * Note:
   1201 *    This function will invoke @done directly if the queue is dead.
   1202 */
   1203void blk_execute_rq_nowait(struct request *rq, bool at_head)
   1204{
   1205	WARN_ON(irqs_disabled());
   1206	WARN_ON(!blk_rq_is_passthrough(rq));
   1207
   1208	blk_account_io_start(rq);
   1209	if (current->plug)
   1210		blk_add_rq_to_plug(current->plug, rq);
   1211	else
   1212		blk_mq_sched_insert_request(rq, at_head, true, false);
   1213}
   1214EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
   1215
   1216struct blk_rq_wait {
   1217	struct completion done;
   1218	blk_status_t ret;
   1219};
   1220
   1221static void blk_end_sync_rq(struct request *rq, blk_status_t ret)
   1222{
   1223	struct blk_rq_wait *wait = rq->end_io_data;
   1224
   1225	wait->ret = ret;
   1226	complete(&wait->done);
   1227}
   1228
   1229static bool blk_rq_is_poll(struct request *rq)
   1230{
   1231	if (!rq->mq_hctx)
   1232		return false;
   1233	if (rq->mq_hctx->type != HCTX_TYPE_POLL)
   1234		return false;
   1235	if (WARN_ON_ONCE(!rq->bio))
   1236		return false;
   1237	return true;
   1238}
   1239
   1240static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
   1241{
   1242	do {
   1243		bio_poll(rq->bio, NULL, 0);
   1244		cond_resched();
   1245	} while (!completion_done(wait));
   1246}
   1247
   1248/**
   1249 * blk_execute_rq - insert a request into queue for execution
   1250 * @rq:		request to insert
   1251 * @at_head:    insert request at head or tail of queue
   1252 *
   1253 * Description:
   1254 *    Insert a fully prepared request at the back of the I/O scheduler queue
   1255 *    for execution and wait for completion.
   1256 * Return: The blk_status_t result provided to blk_mq_end_request().
   1257 */
   1258blk_status_t blk_execute_rq(struct request *rq, bool at_head)
   1259{
   1260	struct blk_rq_wait wait = {
   1261		.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
   1262	};
   1263
   1264	WARN_ON(irqs_disabled());
   1265	WARN_ON(!blk_rq_is_passthrough(rq));
   1266
   1267	rq->end_io_data = &wait;
   1268	rq->end_io = blk_end_sync_rq;
   1269
   1270	blk_account_io_start(rq);
   1271	blk_mq_sched_insert_request(rq, at_head, true, false);
   1272
   1273	if (blk_rq_is_poll(rq)) {
   1274		blk_rq_poll_completion(rq, &wait.done);
   1275	} else {
   1276		/*
   1277		 * Prevent hang_check timer from firing at us during very long
   1278		 * I/O
   1279		 */
   1280		unsigned long hang_check = sysctl_hung_task_timeout_secs;
   1281
   1282		if (hang_check)
   1283			while (!wait_for_completion_io_timeout(&wait.done,
   1284					hang_check * (HZ/2)))
   1285				;
   1286		else
   1287			wait_for_completion_io(&wait.done);
   1288	}
   1289
   1290	return wait.ret;
   1291}
   1292EXPORT_SYMBOL(blk_execute_rq);
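
/*
 * A minimal illustrative sketch of synchronous passthrough execution;
 * my_sync_cmd() is hypothetical and error handling is trimmed.
 */
static blk_status_t my_sync_cmd(struct request_queue *q)
{
	struct request *rq = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
	blk_status_t ret;

	if (IS_ERR(rq))
		return BLK_STS_RESOURCE;
	/* ... attach the passthrough payload to rq here ... */
	ret = blk_execute_rq(rq, false);	/* insert at tail and wait */
	blk_mq_free_request(rq);
	return ret;
}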
   1293
   1294static void __blk_mq_requeue_request(struct request *rq)
   1295{
   1296	struct request_queue *q = rq->q;
   1297
   1298	blk_mq_put_driver_tag(rq);
   1299
   1300	trace_block_rq_requeue(rq);
   1301	rq_qos_requeue(q, rq);
   1302
   1303	if (blk_mq_request_started(rq)) {
   1304		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
   1305		rq->rq_flags &= ~RQF_TIMED_OUT;
   1306	}
   1307}
   1308
   1309void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
   1310{
   1311	__blk_mq_requeue_request(rq);
   1312
   1313	/* this request will be re-inserted to io scheduler queue */
   1314	blk_mq_sched_requeue_request(rq);
   1315
   1316	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
   1317}
   1318EXPORT_SYMBOL(blk_mq_requeue_request);
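
/*
 * A minimal illustrative sketch of requeueing from a driver's completion
 * path; my_finish_rq() and the retryable flag are hypothetical.  With
 * kick_requeue_list set, the requeue work is scheduled immediately.
 */
static void my_finish_rq(struct request *rq, bool retryable)
{
	if (retryable)
		blk_mq_requeue_request(rq, true);	/* re-insert and kick the list */
	else
		blk_mq_end_request(rq, BLK_STS_IOERR);
}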
   1319
   1320static void blk_mq_requeue_work(struct work_struct *work)
   1321{
   1322	struct request_queue *q =
   1323		container_of(work, struct request_queue, requeue_work.work);
   1324	LIST_HEAD(rq_list);
   1325	struct request *rq, *next;
   1326
   1327	spin_lock_irq(&q->requeue_lock);
   1328	list_splice_init(&q->requeue_list, &rq_list);
   1329	spin_unlock_irq(&q->requeue_lock);
   1330
   1331	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
   1332		if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
   1333			continue;
   1334
   1335		rq->rq_flags &= ~RQF_SOFTBARRIER;
   1336		list_del_init(&rq->queuelist);
   1337		/*
    1338		 * If RQF_DONTPREP is set, rq already contains driver-specific
    1339		 * data, so insert it into the hctx dispatch list to avoid any
   1340		 * merge.
   1341		 */
   1342		if (rq->rq_flags & RQF_DONTPREP)
   1343			blk_mq_request_bypass_insert(rq, false, false);
   1344		else
   1345			blk_mq_sched_insert_request(rq, true, false, false);
   1346	}
   1347
   1348	while (!list_empty(&rq_list)) {
   1349		rq = list_entry(rq_list.next, struct request, queuelist);
   1350		list_del_init(&rq->queuelist);
   1351		blk_mq_sched_insert_request(rq, false, false, false);
   1352	}
   1353
   1354	blk_mq_run_hw_queues(q, false);
   1355}
   1356
   1357void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
   1358				bool kick_requeue_list)
   1359{
   1360	struct request_queue *q = rq->q;
   1361	unsigned long flags;
   1362
   1363	/*
   1364	 * We abuse this flag that is otherwise used by the I/O scheduler to
   1365	 * request head insertion from the workqueue.
   1366	 */
   1367	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
   1368
   1369	spin_lock_irqsave(&q->requeue_lock, flags);
   1370	if (at_head) {
   1371		rq->rq_flags |= RQF_SOFTBARRIER;
   1372		list_add(&rq->queuelist, &q->requeue_list);
   1373	} else {
   1374		list_add_tail(&rq->queuelist, &q->requeue_list);
   1375	}
   1376	spin_unlock_irqrestore(&q->requeue_lock, flags);
   1377
   1378	if (kick_requeue_list)
   1379		blk_mq_kick_requeue_list(q);
   1380}
   1381
   1382void blk_mq_kick_requeue_list(struct request_queue *q)
   1383{
   1384	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
   1385}
   1386EXPORT_SYMBOL(blk_mq_kick_requeue_list);
   1387
   1388void blk_mq_delay_kick_requeue_list(struct request_queue *q,
   1389				    unsigned long msecs)
   1390{
   1391	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
   1392				    msecs_to_jiffies(msecs));
   1393}
   1394EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
   1395
   1396static bool blk_mq_rq_inflight(struct request *rq, void *priv,
   1397			       bool reserved)
   1398{
   1399	/*
   1400	 * If we find a request that isn't idle we know the queue is busy
   1401	 * as it's checked in the iter.
   1402	 * Return false to stop the iteration.
   1403	 */
   1404	if (blk_mq_request_started(rq)) {
   1405		bool *busy = priv;
   1406
   1407		*busy = true;
   1408		return false;
   1409	}
   1410
   1411	return true;
   1412}
   1413
   1414bool blk_mq_queue_inflight(struct request_queue *q)
   1415{
   1416	bool busy = false;
   1417
   1418	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
   1419	return busy;
   1420}
   1421EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
   1422
   1423static void blk_mq_rq_timed_out(struct request *req, bool reserved)
   1424{
   1425	req->rq_flags |= RQF_TIMED_OUT;
   1426	if (req->q->mq_ops->timeout) {
   1427		enum blk_eh_timer_return ret;
   1428
   1429		ret = req->q->mq_ops->timeout(req, reserved);
   1430		if (ret == BLK_EH_DONE)
   1431			return;
   1432		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
   1433	}
   1434
   1435	blk_add_timer(req);
   1436}
   1437
   1438static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
   1439{
   1440	unsigned long deadline;
   1441
   1442	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
   1443		return false;
   1444	if (rq->rq_flags & RQF_TIMED_OUT)
   1445		return false;
   1446
   1447	deadline = READ_ONCE(rq->deadline);
   1448	if (time_after_eq(jiffies, deadline))
   1449		return true;
   1450
   1451	if (*next == 0)
   1452		*next = deadline;
   1453	else if (time_after(*next, deadline))
   1454		*next = deadline;
   1455	return false;
   1456}
   1457
   1458void blk_mq_put_rq_ref(struct request *rq)
   1459{
   1460	if (is_flush_rq(rq))
   1461		rq->end_io(rq, 0);
   1462	else if (req_ref_put_and_test(rq))
   1463		__blk_mq_free_request(rq);
   1464}
   1465
   1466static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
   1467{
   1468	unsigned long *next = priv;
   1469
   1470	/*
   1471	 * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
    1472	 * be reallocated underneath the timeout handler's processing, which
    1473	 * makes the expire check reliable. If the request is not expired, then
   1474	 * it was completed and reallocated as a new request after returning
   1475	 * from blk_mq_check_expired().
   1476	 */
   1477	if (blk_mq_req_expired(rq, next))
   1478		blk_mq_rq_timed_out(rq, reserved);
   1479	return true;
   1480}
   1481
   1482static void blk_mq_timeout_work(struct work_struct *work)
   1483{
   1484	struct request_queue *q =
   1485		container_of(work, struct request_queue, timeout_work);
   1486	unsigned long next = 0;
   1487	struct blk_mq_hw_ctx *hctx;
   1488	unsigned long i;
   1489
   1490	/* A deadlock might occur if a request is stuck requiring a
   1491	 * timeout at the same time a queue freeze is waiting
   1492	 * completion, since the timeout code would not be able to
   1493	 * acquire the queue reference here.
   1494	 *
   1495	 * That's why we don't use blk_queue_enter here; instead, we use
   1496	 * percpu_ref_tryget directly, because we need to be able to
   1497	 * obtain a reference even in the short window between the queue
   1498	 * starting to freeze, by dropping the first reference in
   1499	 * blk_freeze_queue_start, and the moment the last request is
   1500	 * consumed, marked by the instant q_usage_counter reaches
   1501	 * zero.
   1502	 */
   1503	if (!percpu_ref_tryget(&q->q_usage_counter))
   1504		return;
   1505
   1506	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
   1507
   1508	if (next != 0) {
   1509		mod_timer(&q->timeout, next);
   1510	} else {
   1511		/*
   1512		 * Request timeouts are handled as a forward rolling timer. If
   1513		 * we end up here it means that no requests are pending and
   1514		 * also that no request has been pending for a while. Mark
   1515		 * each hctx as idle.
   1516		 */
   1517		queue_for_each_hw_ctx(q, hctx, i) {
   1518			/* the hctx may be unmapped, so check it here */
   1519			if (blk_mq_hw_queue_mapped(hctx))
   1520				blk_mq_tag_idle(hctx);
   1521		}
   1522	}
   1523	blk_queue_exit(q);
   1524}
   1525
   1526struct flush_busy_ctx_data {
   1527	struct blk_mq_hw_ctx *hctx;
   1528	struct list_head *list;
   1529};
   1530
   1531static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
   1532{
   1533	struct flush_busy_ctx_data *flush_data = data;
   1534	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
   1535	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
   1536	enum hctx_type type = hctx->type;
   1537
   1538	spin_lock(&ctx->lock);
   1539	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
   1540	sbitmap_clear_bit(sb, bitnr);
   1541	spin_unlock(&ctx->lock);
   1542	return true;
   1543}
   1544
   1545/*
   1546 * Process software queues that have been marked busy, splicing them
    1547 * to the for-dispatch list.
   1548 */
   1549void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
   1550{
   1551	struct flush_busy_ctx_data data = {
   1552		.hctx = hctx,
   1553		.list = list,
   1554	};
   1555
   1556	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
   1557}
   1558EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
   1559
   1560struct dispatch_rq_data {
   1561	struct blk_mq_hw_ctx *hctx;
   1562	struct request *rq;
   1563};
   1564
   1565static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
   1566		void *data)
   1567{
   1568	struct dispatch_rq_data *dispatch_data = data;
   1569	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
   1570	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
   1571	enum hctx_type type = hctx->type;
   1572
   1573	spin_lock(&ctx->lock);
   1574	if (!list_empty(&ctx->rq_lists[type])) {
   1575		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
   1576		list_del_init(&dispatch_data->rq->queuelist);
   1577		if (list_empty(&ctx->rq_lists[type]))
   1578			sbitmap_clear_bit(sb, bitnr);
   1579	}
   1580	spin_unlock(&ctx->lock);
   1581
   1582	return !dispatch_data->rq;
   1583}
   1584
   1585struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
   1586					struct blk_mq_ctx *start)
   1587{
   1588	unsigned off = start ? start->index_hw[hctx->type] : 0;
   1589	struct dispatch_rq_data data = {
   1590		.hctx = hctx,
   1591		.rq   = NULL,
   1592	};
   1593
   1594	__sbitmap_for_each_set(&hctx->ctx_map, off,
   1595			       dispatch_rq_from_ctx, &data);
   1596
   1597	return data.rq;
   1598}
   1599
   1600static bool __blk_mq_alloc_driver_tag(struct request *rq)
   1601{
   1602	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
   1603	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
   1604	int tag;
   1605
   1606	blk_mq_tag_busy(rq->mq_hctx);
   1607
   1608	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
   1609		bt = &rq->mq_hctx->tags->breserved_tags;
   1610		tag_offset = 0;
   1611	} else {
   1612		if (!hctx_may_queue(rq->mq_hctx, bt))
   1613			return false;
   1614	}
   1615
   1616	tag = __sbitmap_queue_get(bt);
   1617	if (tag == BLK_MQ_NO_TAG)
   1618		return false;
   1619
   1620	rq->tag = tag + tag_offset;
   1621	return true;
   1622}
   1623
   1624bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
   1625{
   1626	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
   1627		return false;
   1628
   1629	if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
   1630			!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
   1631		rq->rq_flags |= RQF_MQ_INFLIGHT;
   1632		__blk_mq_inc_active_requests(hctx);
   1633	}
   1634	hctx->tags->rqs[rq->tag] = rq;
   1635	return true;
   1636}
   1637
   1638static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
   1639				int flags, void *key)
   1640{
   1641	struct blk_mq_hw_ctx *hctx;
   1642
   1643	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
   1644
   1645	spin_lock(&hctx->dispatch_wait_lock);
   1646	if (!list_empty(&wait->entry)) {
   1647		struct sbitmap_queue *sbq;
   1648
   1649		list_del_init(&wait->entry);
   1650		sbq = &hctx->tags->bitmap_tags;
   1651		atomic_dec(&sbq->ws_active);
   1652	}
   1653	spin_unlock(&hctx->dispatch_wait_lock);
   1654
   1655	blk_mq_run_hw_queue(hctx, true);
   1656	return 1;
   1657}
   1658
   1659/*
   1660 * Mark us waiting for a tag. For shared tags, this involves hooking us into
   1661 * the tag wakeups. For non-shared tags, we can simply mark us needing a
   1662 * restart. For both cases, take care to check the condition again after
   1663 * marking us as waiting.
   1664 */
   1665static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
   1666				 struct request *rq)
   1667{
   1668	struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
   1669	struct wait_queue_head *wq;
   1670	wait_queue_entry_t *wait;
   1671	bool ret;
   1672
   1673	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
   1674		blk_mq_sched_mark_restart_hctx(hctx);
   1675
   1676		/*
   1677		 * It's possible that a tag was freed in the window between the
   1678		 * allocation failure and adding the hardware queue to the wait
   1679		 * queue.
   1680		 *
   1681		 * Don't clear RESTART here, someone else could have set it.
   1682		 * At most this will cost an extra queue run.
   1683		 */
   1684		return blk_mq_get_driver_tag(rq);
   1685	}
   1686
   1687	wait = &hctx->dispatch_wait;
   1688	if (!list_empty_careful(&wait->entry))
   1689		return false;
   1690
   1691	wq = &bt_wait_ptr(sbq, hctx)->wait;
   1692
   1693	spin_lock_irq(&wq->lock);
   1694	spin_lock(&hctx->dispatch_wait_lock);
   1695	if (!list_empty(&wait->entry)) {
   1696		spin_unlock(&hctx->dispatch_wait_lock);
   1697		spin_unlock_irq(&wq->lock);
   1698		return false;
   1699	}
   1700
   1701	atomic_inc(&sbq->ws_active);
   1702	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
   1703	__add_wait_queue(wq, wait);
   1704
   1705	/*
   1706	 * It's possible that a tag was freed in the window between the
   1707	 * allocation failure and adding the hardware queue to the wait
   1708	 * queue.
   1709	 */
   1710	ret = blk_mq_get_driver_tag(rq);
   1711	if (!ret) {
   1712		spin_unlock(&hctx->dispatch_wait_lock);
   1713		spin_unlock_irq(&wq->lock);
   1714		return false;
   1715	}
   1716
   1717	/*
   1718	 * We got a tag, remove ourselves from the wait queue to ensure
   1719	 * someone else gets the wakeup.
   1720	 */
   1721	list_del_init(&wait->entry);
   1722	atomic_dec(&sbq->ws_active);
   1723	spin_unlock(&hctx->dispatch_wait_lock);
   1724	spin_unlock_irq(&wq->lock);
   1725
   1726	return true;
   1727}
   1728
   1729#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
   1730#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
   1731/*
    1732 * Update dispatch busy with the Exponentially Weighted Moving Average (EWMA):
    1733 * - EWMA is a simple way to compute a running average value
    1734 * - weights of 7/8 and 1/8 are applied so old samples decay exponentially
    1735 * - a factor of 4 is used to avoid the result rounding down to 0; the exact
    1736 *   factor doesn't matter much because the EWMA decays exponentially
   1737 */
   1738static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
   1739{
   1740	unsigned int ewma;
   1741
   1742	ewma = hctx->dispatch_busy;
   1743
   1744	if (!ewma && !busy)
   1745		return;
   1746
   1747	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
   1748	if (busy)
   1749		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
   1750	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
   1751
   1752	hctx->dispatch_busy = ewma;
   1753}
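
        /*
         * Worked example (illustrative): with BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT
         * == 8 and _FACTOR == 4, a run of "busy" samples evolves as
         * ewma = (ewma * 7 + 16) / 8 under integer division, i.e.
         * 0 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7 -> 8 -> 9 (then it stays at 9),
         * while a run of idle samples decays as ewma = ewma * 7 / 8, i.e.
         * 9 -> 7 -> 6 -> 5 -> 4 -> 3 -> 2 -> 1 -> 0.
         */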
   1754
   1755#define BLK_MQ_RESOURCE_DELAY	3		/* ms units */
   1756
   1757static void blk_mq_handle_dev_resource(struct request *rq,
   1758				       struct list_head *list)
   1759{
   1760	struct request *next =
   1761		list_first_entry_or_null(list, struct request, queuelist);
   1762
   1763	/*
   1764	 * If an I/O scheduler has been configured and we got a driver tag for
   1765	 * the next request already, free it.
   1766	 */
   1767	if (next)
   1768		blk_mq_put_driver_tag(next);
   1769
   1770	list_add(&rq->queuelist, list);
   1771	__blk_mq_requeue_request(rq);
   1772}
   1773
   1774static void blk_mq_handle_zone_resource(struct request *rq,
   1775					struct list_head *zone_list)
   1776{
   1777	/*
   1778	 * If we end up here it is because we cannot dispatch a request to a
   1779	 * specific zone due to LLD level zone-write locking or other zone
   1780	 * related resource not being available. In this case, set the request
   1781	 * aside in zone_list for retrying it later.
   1782	 */
   1783	list_add(&rq->queuelist, zone_list);
   1784	__blk_mq_requeue_request(rq);
   1785}
   1786
   1787enum prep_dispatch {
   1788	PREP_DISPATCH_OK,
   1789	PREP_DISPATCH_NO_TAG,
   1790	PREP_DISPATCH_NO_BUDGET,
   1791};
   1792
   1793static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
   1794						  bool need_budget)
   1795{
   1796	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
   1797	int budget_token = -1;
   1798
   1799	if (need_budget) {
   1800		budget_token = blk_mq_get_dispatch_budget(rq->q);
   1801		if (budget_token < 0) {
   1802			blk_mq_put_driver_tag(rq);
   1803			return PREP_DISPATCH_NO_BUDGET;
   1804		}
   1805		blk_mq_set_rq_budget_token(rq, budget_token);
   1806	}
   1807
   1808	if (!blk_mq_get_driver_tag(rq)) {
   1809		/*
   1810		 * The initial allocation attempt failed, so we need to
   1811		 * rerun the hardware queue when a tag is freed. The
   1812		 * waitqueue takes care of that. If the queue is run
   1813		 * before we add this entry back on the dispatch list,
   1814		 * we'll re-run it below.
   1815		 */
   1816		if (!blk_mq_mark_tag_wait(hctx, rq)) {
   1817			/*
    1818			 * All budgets not obtained from this function will be
    1819			 * released together when the partial dispatch is handled.
   1820			 */
   1821			if (need_budget)
   1822				blk_mq_put_dispatch_budget(rq->q, budget_token);
   1823			return PREP_DISPATCH_NO_TAG;
   1824		}
   1825	}
   1826
   1827	return PREP_DISPATCH_OK;
   1828}
   1829
   1830/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
   1831static void blk_mq_release_budgets(struct request_queue *q,
   1832		struct list_head *list)
   1833{
   1834	struct request *rq;
   1835
   1836	list_for_each_entry(rq, list, queuelist) {
   1837		int budget_token = blk_mq_get_rq_budget_token(rq);
   1838
   1839		if (budget_token >= 0)
   1840			blk_mq_put_dispatch_budget(q, budget_token);
   1841	}
   1842}
   1843
   1844/*
   1845 * Returns true if we did some work AND can potentially do more.
   1846 */
   1847bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
   1848			     unsigned int nr_budgets)
   1849{
   1850	enum prep_dispatch prep;
   1851	struct request_queue *q = hctx->queue;
   1852	struct request *rq, *nxt;
   1853	int errors, queued;
   1854	blk_status_t ret = BLK_STS_OK;
   1855	LIST_HEAD(zone_list);
   1856	bool needs_resource = false;
   1857
   1858	if (list_empty(list))
   1859		return false;
   1860
   1861	/*
   1862	 * Now process all the entries, sending them to the driver.
   1863	 */
   1864	errors = queued = 0;
   1865	do {
   1866		struct blk_mq_queue_data bd;
   1867
   1868		rq = list_first_entry(list, struct request, queuelist);
   1869
   1870		WARN_ON_ONCE(hctx != rq->mq_hctx);
   1871		prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
   1872		if (prep != PREP_DISPATCH_OK)
   1873			break;
   1874
   1875		list_del_init(&rq->queuelist);
   1876
   1877		bd.rq = rq;
   1878
   1879		/*
   1880		 * Flag last if we have no more requests, or if we have more
   1881		 * but can't assign a driver tag to it.
   1882		 */
   1883		if (list_empty(list))
   1884			bd.last = true;
   1885		else {
   1886			nxt = list_first_entry(list, struct request, queuelist);
   1887			bd.last = !blk_mq_get_driver_tag(nxt);
   1888		}
   1889
   1890		/*
    1891		 * Once the request has been queued to the LLD, there is no
    1892		 * need to account for its budget any more.
   1893		 */
   1894		if (nr_budgets)
   1895			nr_budgets--;
   1896		ret = q->mq_ops->queue_rq(hctx, &bd);
   1897		switch (ret) {
   1898		case BLK_STS_OK:
   1899			queued++;
   1900			break;
   1901		case BLK_STS_RESOURCE:
   1902			needs_resource = true;
   1903			fallthrough;
   1904		case BLK_STS_DEV_RESOURCE:
   1905			blk_mq_handle_dev_resource(rq, list);
   1906			goto out;
   1907		case BLK_STS_ZONE_RESOURCE:
   1908			/*
   1909			 * Move the request to zone_list and keep going through
   1910			 * the dispatch list to find more requests the drive can
   1911			 * accept.
   1912			 */
   1913			blk_mq_handle_zone_resource(rq, &zone_list);
   1914			needs_resource = true;
   1915			break;
   1916		default:
   1917			errors++;
   1918			blk_mq_end_request(rq, ret);
   1919		}
   1920	} while (!list_empty(list));
   1921out:
   1922	if (!list_empty(&zone_list))
   1923		list_splice_tail_init(&zone_list, list);
   1924
   1925	/* If we didn't flush the entire list, we could have told the driver
   1926	 * there was more coming, but that turned out to be a lie.
   1927	 */
   1928	if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued)
   1929		q->mq_ops->commit_rqs(hctx);
   1930	/*
   1931	 * Any items that need requeuing? Stuff them into hctx->dispatch,
   1932	 * that is where we will continue on next queue run.
   1933	 */
   1934	if (!list_empty(list)) {
   1935		bool needs_restart;
   1936		/* For non-shared tags, the RESTART check will suffice */
   1937		bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
   1938			(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
   1939
   1940		if (nr_budgets)
   1941			blk_mq_release_budgets(q, list);
   1942
   1943		spin_lock(&hctx->lock);
   1944		list_splice_tail_init(list, &hctx->dispatch);
   1945		spin_unlock(&hctx->lock);
   1946
   1947		/*
    1948		 * Order adding requests to hctx->dispatch against checking the
    1949		 * SCHED_RESTART flag. This smp_mb() pairs with the one in
    1950		 * blk_mq_sched_restart(), and prevents the restart path from
    1951		 * missing requests newly added to hctx->dispatch while
    1952		 * SCHED_RESTART is observed here.
   1953		 */
   1954		smp_mb();
   1955
   1956		/*
   1957		 * If SCHED_RESTART was set by the caller of this function and
   1958		 * it is no longer set that means that it was cleared by another
   1959		 * thread and hence that a queue rerun is needed.
   1960		 *
   1961		 * If 'no_tag' is set, that means that we failed getting
   1962		 * a driver tag with an I/O scheduler attached. If our dispatch
   1963		 * waitqueue is no longer active, ensure that we run the queue
   1964		 * AFTER adding our entries back to the list.
   1965		 *
   1966		 * If no I/O scheduler has been configured it is possible that
   1967		 * the hardware queue got stopped and restarted before requests
   1968		 * were pushed back onto the dispatch list. Rerun the queue to
   1969		 * avoid starvation. Notes:
   1970		 * - blk_mq_run_hw_queue() checks whether or not a queue has
   1971		 *   been stopped before rerunning a queue.
   1972		 * - Some but not all block drivers stop a queue before
   1973		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
   1974		 *   and dm-rq.
   1975		 *
   1976		 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
   1977		 * bit is set, run queue after a delay to avoid IO stalls
   1978		 * that could otherwise occur if the queue is idle.  We'll do
   1979		 * similar if we couldn't get budget or couldn't lock a zone
   1980		 * and SCHED_RESTART is set.
   1981		 */
   1982		needs_restart = blk_mq_sched_needs_restart(hctx);
   1983		if (prep == PREP_DISPATCH_NO_BUDGET)
   1984			needs_resource = true;
   1985		if (!needs_restart ||
   1986		    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
   1987			blk_mq_run_hw_queue(hctx, true);
   1988		else if (needs_restart && needs_resource)
   1989			blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
   1990
   1991		blk_mq_update_dispatch_busy(hctx, true);
   1992		return false;
   1993	} else
   1994		blk_mq_update_dispatch_busy(hctx, false);
   1995
   1996	return (queued + errors) != 0;
   1997}
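
        /*
         * Summary (added for clarity): the boolean returned here feeds the
         * scheduler-side dispatch loops in blk-mq-sched.c; "true" means this
         * round made progress and another pass may be worthwhile, while
         * "false" ends the current dispatch round.
         */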
   1998
   1999/**
   2000 * __blk_mq_run_hw_queue - Run a hardware queue.
   2001 * @hctx: Pointer to the hardware queue to run.
   2002 *
   2003 * Send pending requests to the hardware.
   2004 */
   2005static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
   2006{
   2007	/*
   2008	 * We can't run the queue inline with ints disabled. Ensure that
   2009	 * we catch bad users of this early.
   2010	 */
   2011	WARN_ON_ONCE(in_interrupt());
   2012
   2013	blk_mq_run_dispatch_ops(hctx->queue,
   2014			blk_mq_sched_dispatch_requests(hctx));
   2015}
   2016
   2017static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
   2018{
   2019	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
   2020
   2021	if (cpu >= nr_cpu_ids)
   2022		cpu = cpumask_first(hctx->cpumask);
   2023	return cpu;
   2024}
   2025
   2026/*
   2027 * It'd be great if the workqueue API had a way to pass
   2028 * in a mask and had some smarts for more clever placement.
   2029 * For now we just round-robin here, switching for every
   2030 * BLK_MQ_CPU_WORK_BATCH queued items.
   2031 */
   2032static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
   2033{
   2034	bool tried = false;
   2035	int next_cpu = hctx->next_cpu;
   2036
   2037	if (hctx->queue->nr_hw_queues == 1)
   2038		return WORK_CPU_UNBOUND;
   2039
   2040	if (--hctx->next_cpu_batch <= 0) {
   2041select_cpu:
   2042		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
   2043				cpu_online_mask);
   2044		if (next_cpu >= nr_cpu_ids)
   2045			next_cpu = blk_mq_first_mapped_cpu(hctx);
   2046		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
   2047	}
   2048
   2049	/*
    2050	 * Do an unbound schedule if we can't find an online CPU for this hctx;
    2051	 * this should only happen while handling a CPU DEAD event.
   2052	 */
   2053	if (!cpu_online(next_cpu)) {
   2054		if (!tried) {
   2055			tried = true;
   2056			goto select_cpu;
   2057		}
   2058
   2059		/*
    2060		 * Make sure to re-select a CPU next time, once CPUs in
    2061		 * hctx->cpumask have come back online.
   2062		 */
   2063		hctx->next_cpu = next_cpu;
   2064		hctx->next_cpu_batch = 1;
   2065		return WORK_CPU_UNBOUND;
   2066	}
   2067
   2068	hctx->next_cpu = next_cpu;
   2069	return next_cpu;
   2070}
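
        /*
         * Illustrative example (assuming BLK_MQ_CPU_WORK_BATCH == 8): with
         * hctx->cpumask == {2, 5} and both CPUs online, the returned CPU
         * switches roughly every 8 calls: CPU 2 for a batch of runs, then
         * CPU 5 for the next batch, then back to CPU 2, and so on.
         * WORK_CPU_UNBOUND is only returned for single-hw-queue devices or
         * when no CPU in the mask is online.
         */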
   2071
   2072/**
   2073 * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
   2074 * @hctx: Pointer to the hardware queue to run.
   2075 * @async: If we want to run the queue asynchronously.
   2076 * @msecs: Milliseconds of delay to wait before running the queue.
   2077 *
   2078 * If !@async, try to run the queue now. Else, run the queue asynchronously and
   2079 * with a delay of @msecs.
   2080 */
   2081static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
   2082					unsigned long msecs)
   2083{
   2084	if (unlikely(blk_mq_hctx_stopped(hctx)))
   2085		return;
   2086
   2087	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
   2088		int cpu = get_cpu();
   2089		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
   2090			__blk_mq_run_hw_queue(hctx);
   2091			put_cpu();
   2092			return;
   2093		}
   2094
   2095		put_cpu();
   2096	}
   2097
   2098	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
   2099				    msecs_to_jiffies(msecs));
   2100}
   2101
   2102/**
   2103 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
   2104 * @hctx: Pointer to the hardware queue to run.
   2105 * @msecs: Milliseconds of delay to wait before running the queue.
   2106 *
   2107 * Run a hardware queue asynchronously with a delay of @msecs.
   2108 */
   2109void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
   2110{
   2111	__blk_mq_delay_run_hw_queue(hctx, true, msecs);
   2112}
   2113EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
   2114
   2115/**
   2116 * blk_mq_run_hw_queue - Start to run a hardware queue.
   2117 * @hctx: Pointer to the hardware queue to run.
   2118 * @async: If we want to run the queue asynchronously.
   2119 *
   2120 * Check if the request queue is not in a quiesced state and if there are
   2121 * pending requests to be sent. If this is true, run the queue to send requests
   2122 * to hardware.
   2123 */
   2124void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
   2125{
   2126	bool need_run;
   2127
   2128	/*
    2129	 * When the queue is quiesced we may be switching the io scheduler,
    2130	 * updating nr_hw_queues, or doing other things, and we can't run the
    2131	 * queue any more; even __blk_mq_hctx_has_pending() can't be called safely.
    2132	 *
    2133	 * If the queue is quiesced, it will be rerun in
    2134	 * blk_mq_unquiesce_queue().
   2135	 */
   2136	__blk_mq_run_dispatch_ops(hctx->queue, false,
   2137		need_run = !blk_queue_quiesced(hctx->queue) &&
   2138		blk_mq_hctx_has_pending(hctx));
   2139
   2140	if (need_run)
   2141		__blk_mq_delay_run_hw_queue(hctx, async, 0);
   2142}
   2143EXPORT_SYMBOL(blk_mq_run_hw_queue);
   2144
   2145/*
    2146 * Return the preferred hardware queue to dispatch from (if any) for a
    2147 * non-mq-aware IO scheduler.
   2148 */
   2149static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
   2150{
   2151	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
   2152	/*
   2153	 * If the IO scheduler does not respect hardware queues when
   2154	 * dispatching, we just don't bother with multiple HW queues and
   2155	 * dispatch from hctx for the current CPU since running multiple queues
   2156	 * just causes lock contention inside the scheduler and pointless cache
   2157	 * bouncing.
   2158	 */
   2159	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
   2160
   2161	if (!blk_mq_hctx_stopped(hctx))
   2162		return hctx;
   2163	return NULL;
   2164}
   2165
   2166/**
   2167 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
   2168 * @q: Pointer to the request queue to run.
   2169 * @async: If we want to run the queue asynchronously.
   2170 */
   2171void blk_mq_run_hw_queues(struct request_queue *q, bool async)
   2172{
   2173	struct blk_mq_hw_ctx *hctx, *sq_hctx;
   2174	unsigned long i;
   2175
   2176	sq_hctx = NULL;
   2177	if (blk_queue_sq_sched(q))
   2178		sq_hctx = blk_mq_get_sq_hctx(q);
   2179	queue_for_each_hw_ctx(q, hctx, i) {
   2180		if (blk_mq_hctx_stopped(hctx))
   2181			continue;
   2182		/*
   2183		 * Dispatch from this hctx either if there's no hctx preferred
   2184		 * by IO scheduler or if it has requests that bypass the
   2185		 * scheduler.
   2186		 */
   2187		if (!sq_hctx || sq_hctx == hctx ||
   2188		    !list_empty_careful(&hctx->dispatch))
   2189			blk_mq_run_hw_queue(hctx, async);
   2190	}
   2191}
   2192EXPORT_SYMBOL(blk_mq_run_hw_queues);
   2193
   2194/**
   2195 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
   2196 * @q: Pointer to the request queue to run.
   2197 * @msecs: Milliseconds of delay to wait before running the queues.
   2198 */
   2199void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
   2200{
   2201	struct blk_mq_hw_ctx *hctx, *sq_hctx;
   2202	unsigned long i;
   2203
   2204	sq_hctx = NULL;
   2205	if (blk_queue_sq_sched(q))
   2206		sq_hctx = blk_mq_get_sq_hctx(q);
   2207	queue_for_each_hw_ctx(q, hctx, i) {
   2208		if (blk_mq_hctx_stopped(hctx))
   2209			continue;
   2210		/*
   2211		 * If there is already a run_work pending, leave the
   2212		 * pending delay untouched. Otherwise, a hctx can stall
   2213		 * if another hctx is re-delaying the other's work
   2214		 * before the work executes.
   2215		 */
   2216		if (delayed_work_pending(&hctx->run_work))
   2217			continue;
   2218		/*
   2219		 * Dispatch from this hctx either if there's no hctx preferred
   2220		 * by IO scheduler or if it has requests that bypass the
   2221		 * scheduler.
   2222		 */
   2223		if (!sq_hctx || sq_hctx == hctx ||
   2224		    !list_empty_careful(&hctx->dispatch))
   2225			blk_mq_delay_run_hw_queue(hctx, msecs);
   2226	}
   2227}
   2228EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
   2229
   2230/**
   2231 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
   2232 * @q: request queue.
   2233 *
   2234 * The caller is responsible for serializing this function against
   2235 * blk_mq_{start,stop}_hw_queue().
   2236 */
   2237bool blk_mq_queue_stopped(struct request_queue *q)
   2238{
   2239	struct blk_mq_hw_ctx *hctx;
   2240	unsigned long i;
   2241
   2242	queue_for_each_hw_ctx(q, hctx, i)
   2243		if (blk_mq_hctx_stopped(hctx))
   2244			return true;
   2245
   2246	return false;
   2247}
   2248EXPORT_SYMBOL(blk_mq_queue_stopped);
   2249
   2250/*
    2251 * This function is often used by a driver to pause .queue_rq() when it
    2252 * runs out of resources or some other condition isn't satisfied, usually
    2253 * after returning BLK_STS_RESOURCE.
   2254 *
   2255 * We do not guarantee that dispatch can be drained or blocked
   2256 * after blk_mq_stop_hw_queue() returns. Please use
   2257 * blk_mq_quiesce_queue() for that requirement.
   2258 */
   2259void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
   2260{
   2261	cancel_delayed_work(&hctx->run_work);
   2262
   2263	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
   2264}
   2265EXPORT_SYMBOL(blk_mq_stop_hw_queue);
   2266
   2267/*
    2268 * This function is often used by a driver to pause .queue_rq() when it
    2269 * runs out of resources or some other condition isn't satisfied, usually
    2270 * after returning BLK_STS_RESOURCE.
   2271 *
   2272 * We do not guarantee that dispatch can be drained or blocked
   2273 * after blk_mq_stop_hw_queues() returns. Please use
   2274 * blk_mq_quiesce_queue() for that requirement.
   2275 */
   2276void blk_mq_stop_hw_queues(struct request_queue *q)
   2277{
   2278	struct blk_mq_hw_ctx *hctx;
   2279	unsigned long i;
   2280
   2281	queue_for_each_hw_ctx(q, hctx, i)
   2282		blk_mq_stop_hw_queue(hctx);
   2283}
   2284EXPORT_SYMBOL(blk_mq_stop_hw_queues);
   2285
   2286void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
   2287{
   2288	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
   2289
   2290	blk_mq_run_hw_queue(hctx, false);
   2291}
   2292EXPORT_SYMBOL(blk_mq_start_hw_queue);
   2293
   2294void blk_mq_start_hw_queues(struct request_queue *q)
   2295{
   2296	struct blk_mq_hw_ctx *hctx;
   2297	unsigned long i;
   2298
   2299	queue_for_each_hw_ctx(q, hctx, i)
   2300		blk_mq_start_hw_queue(hctx);
   2301}
   2302EXPORT_SYMBOL(blk_mq_start_hw_queues);
   2303
   2304void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
   2305{
   2306	if (!blk_mq_hctx_stopped(hctx))
   2307		return;
   2308
   2309	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
   2310	blk_mq_run_hw_queue(hctx, async);
   2311}
   2312EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
   2313
   2314void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
   2315{
   2316	struct blk_mq_hw_ctx *hctx;
   2317	unsigned long i;
   2318
   2319	queue_for_each_hw_ctx(q, hctx, i)
   2320		blk_mq_start_stopped_hw_queue(hctx, async);
   2321}
   2322EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
   2323
   2324static void blk_mq_run_work_fn(struct work_struct *work)
   2325{
   2326	struct blk_mq_hw_ctx *hctx;
   2327
   2328	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
   2329
   2330	/*
   2331	 * If we are stopped, don't run the queue.
   2332	 */
   2333	if (blk_mq_hctx_stopped(hctx))
   2334		return;
   2335
   2336	__blk_mq_run_hw_queue(hctx);
   2337}
   2338
   2339static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
   2340					    struct request *rq,
   2341					    bool at_head)
   2342{
   2343	struct blk_mq_ctx *ctx = rq->mq_ctx;
   2344	enum hctx_type type = hctx->type;
   2345
   2346	lockdep_assert_held(&ctx->lock);
   2347
   2348	trace_block_rq_insert(rq);
   2349
   2350	if (at_head)
   2351		list_add(&rq->queuelist, &ctx->rq_lists[type]);
   2352	else
   2353		list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
   2354}
   2355
   2356void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
   2357			     bool at_head)
   2358{
   2359	struct blk_mq_ctx *ctx = rq->mq_ctx;
   2360
   2361	lockdep_assert_held(&ctx->lock);
   2362
   2363	__blk_mq_insert_req_list(hctx, rq, at_head);
   2364	blk_mq_hctx_mark_pending(hctx, ctx);
   2365}
   2366
   2367/**
   2368 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
   2369 * @rq: Pointer to request to be inserted.
   2370 * @at_head: true if the request should be inserted at the head of the list.
   2371 * @run_queue: If we should run the hardware queue after inserting the request.
   2372 *
   2373 * Should only be used carefully, when the caller knows we want to
   2374 * bypass a potential IO scheduler on the target device.
   2375 */
   2376void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
   2377				  bool run_queue)
   2378{
   2379	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
   2380
   2381	spin_lock(&hctx->lock);
   2382	if (at_head)
   2383		list_add(&rq->queuelist, &hctx->dispatch);
   2384	else
   2385		list_add_tail(&rq->queuelist, &hctx->dispatch);
   2386	spin_unlock(&hctx->lock);
   2387
   2388	if (run_queue)
   2389		blk_mq_run_hw_queue(hctx, false);
   2390}
   2391
   2392void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
   2393			    struct list_head *list)
   2394
   2395{
   2396	struct request *rq;
   2397	enum hctx_type type = hctx->type;
   2398
   2399	/*
    2400	 * Preemption doesn't flush the plug list, so it's possible that
    2401	 * ctx->cpu is offline by now.
   2402	 */
   2403	list_for_each_entry(rq, list, queuelist) {
   2404		BUG_ON(rq->mq_ctx != ctx);
   2405		trace_block_rq_insert(rq);
   2406	}
   2407
   2408	spin_lock(&ctx->lock);
   2409	list_splice_tail_init(list, &ctx->rq_lists[type]);
   2410	blk_mq_hctx_mark_pending(hctx, ctx);
   2411	spin_unlock(&ctx->lock);
   2412}
   2413
   2414static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued,
   2415			      bool from_schedule)
   2416{
   2417	if (hctx->queue->mq_ops->commit_rqs) {
   2418		trace_block_unplug(hctx->queue, *queued, !from_schedule);
   2419		hctx->queue->mq_ops->commit_rqs(hctx);
   2420	}
   2421	*queued = 0;
   2422}
   2423
   2424static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
   2425		unsigned int nr_segs)
   2426{
   2427	int err;
   2428
   2429	if (bio->bi_opf & REQ_RAHEAD)
   2430		rq->cmd_flags |= REQ_FAILFAST_MASK;
   2431
   2432	rq->__sector = bio->bi_iter.bi_sector;
   2433	blk_rq_bio_prep(rq, bio, nr_segs);
   2434
   2435	/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
   2436	err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
   2437	WARN_ON_ONCE(err);
   2438
   2439	blk_account_io_start(rq);
   2440}
   2441
   2442static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
   2443					    struct request *rq, bool last)
   2444{
   2445	struct request_queue *q = rq->q;
   2446	struct blk_mq_queue_data bd = {
   2447		.rq = rq,
   2448		.last = last,
   2449	};
   2450	blk_status_t ret;
   2451
   2452	/*
    2453	 * If queueing succeeded we are done. On a hard error the caller may
    2454	 * kill the request. For any other (busy) status, just add it back to
    2455	 * our list as we previously would have done.
   2456	 */
   2457	ret = q->mq_ops->queue_rq(hctx, &bd);
   2458	switch (ret) {
   2459	case BLK_STS_OK:
   2460		blk_mq_update_dispatch_busy(hctx, false);
   2461		break;
   2462	case BLK_STS_RESOURCE:
   2463	case BLK_STS_DEV_RESOURCE:
   2464		blk_mq_update_dispatch_busy(hctx, true);
   2465		__blk_mq_requeue_request(rq);
   2466		break;
   2467	default:
   2468		blk_mq_update_dispatch_busy(hctx, false);
   2469		break;
   2470	}
   2471
   2472	return ret;
   2473}
   2474
   2475static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
   2476						struct request *rq,
   2477						bool bypass_insert, bool last)
   2478{
   2479	struct request_queue *q = rq->q;
   2480	bool run_queue = true;
   2481	int budget_token;
   2482
   2483	/*
   2484	 * RCU or SRCU read lock is needed before checking quiesced flag.
   2485	 *
    2486	 * When the queue is stopped or quiesced, ignore 'bypass_insert' from
    2487	 * blk_mq_request_issue_directly() and return BLK_STS_OK to the caller,
    2488	 * so that the driver is not asked to dispatch again.
   2489	 */
   2490	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
   2491		run_queue = false;
   2492		bypass_insert = false;
   2493		goto insert;
   2494	}
   2495
   2496	if ((rq->rq_flags & RQF_ELV) && !bypass_insert)
   2497		goto insert;
   2498
   2499	budget_token = blk_mq_get_dispatch_budget(q);
   2500	if (budget_token < 0)
   2501		goto insert;
   2502
   2503	blk_mq_set_rq_budget_token(rq, budget_token);
   2504
   2505	if (!blk_mq_get_driver_tag(rq)) {
   2506		blk_mq_put_dispatch_budget(q, budget_token);
   2507		goto insert;
   2508	}
   2509
   2510	return __blk_mq_issue_directly(hctx, rq, last);
   2511insert:
   2512	if (bypass_insert)
   2513		return BLK_STS_RESOURCE;
   2514
   2515	blk_mq_sched_insert_request(rq, false, run_queue, false);
   2516
   2517	return BLK_STS_OK;
   2518}
   2519
   2520/**
   2521 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
   2522 * @hctx: Pointer of the associated hardware queue.
   2523 * @rq: Pointer to request to be sent.
   2524 *
   2525 * If the device has enough resources to accept a new request now, send the
    2526 * request directly to the device driver. Otherwise, insert it into the
    2527 * hctx->dispatch list so that we can try to send it again later. Requests
    2528 * inserted into this list have a higher priority.
   2529 */
   2530static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
   2531		struct request *rq)
   2532{
   2533	blk_status_t ret =
   2534		__blk_mq_try_issue_directly(hctx, rq, false, true);
   2535
   2536	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
   2537		blk_mq_request_bypass_insert(rq, false, true);
   2538	else if (ret != BLK_STS_OK)
   2539		blk_mq_end_request(rq, ret);
   2540}
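
        /*
         * Summary sketch (illustrative, condensed from blk_mq_try_issue_directly()
         * and its helper above):
         *
         *	if (stopped || quiesced || elevator attached ||
         *	    no dispatch budget || no driver tag)
         *		insert the request for a later run;	// fallback path
         *	else
         *		->queue_rq();				// issue now
         *
         * and a BLK_STS_RESOURCE/BLK_STS_DEV_RESOURCE result from ->queue_rq()
         * sends the request back to hctx->dispatch.
         */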
   2541
   2542static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
   2543{
   2544	return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last);
   2545}
   2546
   2547static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
   2548{
   2549	struct blk_mq_hw_ctx *hctx = NULL;
   2550	struct request *rq;
   2551	int queued = 0;
   2552	int errors = 0;
   2553
   2554	while ((rq = rq_list_pop(&plug->mq_list))) {
   2555		bool last = rq_list_empty(plug->mq_list);
   2556		blk_status_t ret;
   2557
   2558		if (hctx != rq->mq_hctx) {
   2559			if (hctx)
   2560				blk_mq_commit_rqs(hctx, &queued, from_schedule);
   2561			hctx = rq->mq_hctx;
   2562		}
   2563
   2564		ret = blk_mq_request_issue_directly(rq, last);
   2565		switch (ret) {
   2566		case BLK_STS_OK:
   2567			queued++;
   2568			break;
   2569		case BLK_STS_RESOURCE:
   2570		case BLK_STS_DEV_RESOURCE:
   2571			blk_mq_request_bypass_insert(rq, false, last);
   2572			blk_mq_commit_rqs(hctx, &queued, from_schedule);
   2573			return;
   2574		default:
   2575			blk_mq_end_request(rq, ret);
   2576			errors++;
   2577			break;
   2578		}
   2579	}
   2580
   2581	/*
   2582	 * If we didn't flush the entire list, we could have told the driver
   2583	 * there was more coming, but that turned out to be a lie.
   2584	 */
   2585	if (errors)
   2586		blk_mq_commit_rqs(hctx, &queued, from_schedule);
   2587}
   2588
   2589static void __blk_mq_flush_plug_list(struct request_queue *q,
   2590				     struct blk_plug *plug)
   2591{
   2592	if (blk_queue_quiesced(q))
   2593		return;
   2594	q->mq_ops->queue_rqs(&plug->mq_list);
   2595}
   2596
   2597static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
   2598{
   2599	struct blk_mq_hw_ctx *this_hctx = NULL;
   2600	struct blk_mq_ctx *this_ctx = NULL;
   2601	struct request *requeue_list = NULL;
   2602	unsigned int depth = 0;
   2603	LIST_HEAD(list);
   2604
   2605	do {
   2606		struct request *rq = rq_list_pop(&plug->mq_list);
   2607
   2608		if (!this_hctx) {
   2609			this_hctx = rq->mq_hctx;
   2610			this_ctx = rq->mq_ctx;
   2611		} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
   2612			rq_list_add(&requeue_list, rq);
   2613			continue;
   2614		}
   2615		list_add_tail(&rq->queuelist, &list);
   2616		depth++;
   2617	} while (!rq_list_empty(plug->mq_list));
   2618
   2619	plug->mq_list = requeue_list;
   2620	trace_block_unplug(this_hctx->queue, depth, !from_sched);
   2621	blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched);
   2622}
   2623
   2624void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
   2625{
   2626	struct request *rq;
   2627
   2628	if (rq_list_empty(plug->mq_list))
   2629		return;
   2630	plug->rq_count = 0;
   2631
   2632	if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
   2633		struct request_queue *q;
   2634
   2635		rq = rq_list_peek(&plug->mq_list);
   2636		q = rq->q;
   2637
   2638		/*
   2639		 * Peek first request and see if we have a ->queue_rqs() hook.
   2640		 * If we do, we can dispatch the whole plug list in one go. We
   2641		 * already know at this point that all requests belong to the
    2642		 * same queue; the caller must ensure that's the case.
   2643		 *
   2644		 * Since we pass off the full list to the driver at this point,
   2645		 * we do not increment the active request count for the queue.
   2646		 * Bypass shared tags for now because of that.
   2647		 */
   2648		if (q->mq_ops->queue_rqs &&
   2649		    !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
   2650			blk_mq_run_dispatch_ops(q,
   2651				__blk_mq_flush_plug_list(q, plug));
   2652			if (rq_list_empty(plug->mq_list))
   2653				return;
   2654		}
   2655
   2656		blk_mq_run_dispatch_ops(q,
   2657				blk_mq_plug_issue_direct(plug, false));
   2658		if (rq_list_empty(plug->mq_list))
   2659			return;
   2660	}
   2661
   2662	do {
   2663		blk_mq_dispatch_plug_list(plug, from_schedule);
   2664	} while (!rq_list_empty(plug->mq_list));
   2665}
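
        /*
         * Summary (illustrative): blk_mq_flush_plug_list() tries, in order:
         * (1) the driver's bulk ->queue_rqs() hook when the plug holds
         * requests for a single queue without an elevator or shared tags,
         * (2) per-request direct issue via blk_mq_plug_issue_direct(), and
         * (3) batched insertion into the scheduler through
         * blk_mq_dispatch_plug_list() for whatever remains.
         */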
   2666
   2667void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
   2668		struct list_head *list)
   2669{
   2670	int queued = 0;
   2671	int errors = 0;
   2672
   2673	while (!list_empty(list)) {
   2674		blk_status_t ret;
   2675		struct request *rq = list_first_entry(list, struct request,
   2676				queuelist);
   2677
   2678		list_del_init(&rq->queuelist);
   2679		ret = blk_mq_request_issue_directly(rq, list_empty(list));
   2680		if (ret != BLK_STS_OK) {
   2681			if (ret == BLK_STS_RESOURCE ||
   2682					ret == BLK_STS_DEV_RESOURCE) {
   2683				blk_mq_request_bypass_insert(rq, false,
   2684							list_empty(list));
   2685				break;
   2686			}
   2687			blk_mq_end_request(rq, ret);
   2688			errors++;
   2689		} else
   2690			queued++;
   2691	}
   2692
   2693	/*
   2694	 * If we didn't flush the entire list, we could have told
   2695	 * the driver there was more coming, but that turned out to
   2696	 * be a lie.
   2697	 */
   2698	if ((!list_empty(list) || errors) &&
   2699	     hctx->queue->mq_ops->commit_rqs && queued)
   2700		hctx->queue->mq_ops->commit_rqs(hctx);
   2701}
   2702
   2703static bool blk_mq_attempt_bio_merge(struct request_queue *q,
   2704				     struct bio *bio, unsigned int nr_segs)
   2705{
   2706	if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
   2707		if (blk_attempt_plug_merge(q, bio, nr_segs))
   2708			return true;
   2709		if (blk_mq_sched_bio_merge(q, bio, nr_segs))
   2710			return true;
   2711	}
   2712	return false;
   2713}
   2714
   2715static struct request *blk_mq_get_new_requests(struct request_queue *q,
   2716					       struct blk_plug *plug,
   2717					       struct bio *bio,
   2718					       unsigned int nsegs)
   2719{
   2720	struct blk_mq_alloc_data data = {
   2721		.q		= q,
   2722		.nr_tags	= 1,
   2723		.cmd_flags	= bio->bi_opf,
   2724	};
   2725	struct request *rq;
   2726
   2727	if (unlikely(bio_queue_enter(bio)))
   2728		return NULL;
   2729
   2730	if (blk_mq_attempt_bio_merge(q, bio, nsegs))
   2731		goto queue_exit;
   2732
   2733	rq_qos_throttle(q, bio);
   2734
   2735	if (plug) {
   2736		data.nr_tags = plug->nr_ios;
   2737		plug->nr_ios = 1;
   2738		data.cached_rq = &plug->cached_rq;
   2739	}
   2740
   2741	rq = __blk_mq_alloc_requests(&data);
   2742	if (rq)
   2743		return rq;
   2744	rq_qos_cleanup(q, bio);
   2745	if (bio->bi_opf & REQ_NOWAIT)
   2746		bio_wouldblock_error(bio);
   2747queue_exit:
   2748	blk_queue_exit(q);
   2749	return NULL;
   2750}
   2751
   2752static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
   2753		struct blk_plug *plug, struct bio **bio, unsigned int nsegs)
   2754{
   2755	struct request *rq;
   2756
   2757	if (!plug)
   2758		return NULL;
   2759	rq = rq_list_peek(&plug->cached_rq);
   2760	if (!rq || rq->q != q)
   2761		return NULL;
   2762
   2763	if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) {
   2764		*bio = NULL;
   2765		return NULL;
   2766	}
   2767
   2768	if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type)
   2769		return NULL;
   2770	if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf))
   2771		return NULL;
   2772
   2773	/*
    2774	 * If any qos ->throttle() ends up blocking, we will have flushed the
   2775	 * plug and hence killed the cached_rq list as well. Pop this entry
   2776	 * before we throttle.
   2777	 */
   2778	plug->cached_rq = rq_list_next(rq);
   2779	rq_qos_throttle(q, *bio);
   2780
   2781	rq->cmd_flags = (*bio)->bi_opf;
   2782	INIT_LIST_HEAD(&rq->queuelist);
   2783	return rq;
   2784}
   2785
   2786/**
   2787 * blk_mq_submit_bio - Create and send a request to block device.
   2788 * @bio: Bio pointer.
   2789 *
    2790 * Builds up a request structure from @q and @bio and sends it to the device.
    2791 * The request may not be queued directly to the hardware if:
   2792 * * This request can be merged with another one
   2793 * * We want to place request at plug queue for possible future merging
   2794 * * There is an IO scheduler active at this queue
   2795 *
   2796 * It will not queue the request if there is an error with the bio, or at the
   2797 * request creation.
   2798 */
   2799void blk_mq_submit_bio(struct bio *bio)
   2800{
   2801	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
   2802	struct blk_plug *plug = blk_mq_plug(q, bio);
   2803	const int is_sync = op_is_sync(bio->bi_opf);
   2804	struct request *rq;
   2805	unsigned int nr_segs = 1;
   2806	blk_status_t ret;
   2807
   2808	blk_queue_bounce(q, &bio);
   2809	if (blk_may_split(q, bio))
   2810		__blk_queue_split(q, &bio, &nr_segs);
   2811
   2812	if (!bio_integrity_prep(bio))
   2813		return;
   2814
   2815	rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs);
   2816	if (!rq) {
   2817		if (!bio)
   2818			return;
   2819		rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
   2820		if (unlikely(!rq))
   2821			return;
   2822	}
   2823
   2824	trace_block_getrq(bio);
   2825
   2826	rq_qos_track(q, rq, bio);
   2827
   2828	blk_mq_bio_to_request(rq, bio, nr_segs);
   2829
   2830	ret = blk_crypto_init_request(rq);
   2831	if (ret != BLK_STS_OK) {
   2832		bio->bi_status = ret;
   2833		bio_endio(bio);
   2834		blk_mq_free_request(rq);
   2835		return;
   2836	}
   2837
   2838	if (op_is_flush(bio->bi_opf)) {
   2839		blk_insert_flush(rq);
   2840		return;
   2841	}
   2842
   2843	if (plug)
   2844		blk_add_rq_to_plug(plug, rq);
   2845	else if ((rq->rq_flags & RQF_ELV) ||
   2846		 (rq->mq_hctx->dispatch_busy &&
   2847		  (q->nr_hw_queues == 1 || !is_sync)))
   2848		blk_mq_sched_insert_request(rq, false, true, true);
   2849	else
   2850		blk_mq_run_dispatch_ops(rq->q,
   2851				blk_mq_try_issue_directly(rq->mq_hctx, rq));
   2852}
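
        /*
         * Summary (illustrative): a submitted bio ends up on one of four paths
         * above: the flush machinery via blk_insert_flush() for
         * REQ_PREFLUSH/REQ_FUA requests, the caller's plug list when one is
         * active, the I/O scheduler when an elevator is attached or the hctx
         * is busy, or direct issue to the driver via
         * blk_mq_try_issue_directly() otherwise.
         */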
   2853
   2854#ifdef CONFIG_BLK_MQ_STACKING
   2855/**
   2856 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
   2857 * @rq: the request being queued
   2858 */
   2859blk_status_t blk_insert_cloned_request(struct request *rq)
   2860{
   2861	struct request_queue *q = rq->q;
   2862	unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
   2863	blk_status_t ret;
   2864
   2865	if (blk_rq_sectors(rq) > max_sectors) {
   2866		/*
    2867		 * A SCSI device does not have a good way to report whether
    2868		 * Write Same/Zero is actually supported. If a device rejects
    2869		 * a non-read/write command (discard, write same, etc.), the
   2870		 * low-level device driver will set the relevant queue limit to
   2871		 * 0 to prevent blk-lib from issuing more of the offending
   2872		 * operations. Commands queued prior to the queue limit being
   2873		 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
   2874		 * errors being propagated to upper layers.
   2875		 */
   2876		if (max_sectors == 0)
   2877			return BLK_STS_NOTSUPP;
   2878
   2879		printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
   2880			__func__, blk_rq_sectors(rq), max_sectors);
   2881		return BLK_STS_IOERR;
   2882	}
   2883
   2884	/*
   2885	 * The queue settings related to segment counting may differ from the
   2886	 * original queue.
   2887	 */
   2888	rq->nr_phys_segments = blk_recalc_rq_segments(rq);
   2889	if (rq->nr_phys_segments > queue_max_segments(q)) {
   2890		printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
   2891			__func__, rq->nr_phys_segments, queue_max_segments(q));
   2892		return BLK_STS_IOERR;
   2893	}
   2894
   2895	if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
   2896		return BLK_STS_IOERR;
   2897
   2898	if (blk_crypto_insert_cloned_request(rq))
   2899		return BLK_STS_IOERR;
   2900
   2901	blk_account_io_start(rq);
   2902
   2903	/*
   2904	 * Since we have a scheduler attached on the top device,
   2905	 * bypass a potential scheduler on the bottom device for
   2906	 * insert.
   2907	 */
   2908	blk_mq_run_dispatch_ops(q,
   2909			ret = blk_mq_request_issue_directly(rq, true));
   2910	if (ret)
   2911		blk_account_io_done(rq, ktime_get_ns());
   2912	return ret;
   2913}
   2914EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
   2915
   2916/**
   2917 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
   2918 * @rq: the clone request to be cleaned up
   2919 *
   2920 * Description:
   2921 *     Free all bios in @rq for a cloned request.
   2922 */
   2923void blk_rq_unprep_clone(struct request *rq)
   2924{
   2925	struct bio *bio;
   2926
   2927	while ((bio = rq->bio) != NULL) {
   2928		rq->bio = bio->bi_next;
   2929
   2930		bio_put(bio);
   2931	}
   2932}
   2933EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
   2934
   2935/**
   2936 * blk_rq_prep_clone - Helper function to setup clone request
   2937 * @rq: the request to be setup
   2938 * @rq_src: original request to be cloned
   2939 * @bs: bio_set that bios for clone are allocated from
   2940 * @gfp_mask: memory allocation mask for bio
   2941 * @bio_ctr: setup function to be called for each clone bio.
   2942 *           Returns %0 for success, non %0 for failure.
   2943 * @data: private data to be passed to @bio_ctr
   2944 *
   2945 * Description:
   2946 *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
    2947 *     Also, the pages which the original bios point to are not copied;
    2948 *     the cloned bios just point to the same pages.
   2949 *     So cloned bios must be completed before original bios, which means
   2950 *     the caller must complete @rq before @rq_src.
    2951 *     the caller must complete @rq before @rq_src.
         */
   2952int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
   2953		      struct bio_set *bs, gfp_t gfp_mask,
   2954		      int (*bio_ctr)(struct bio *, struct bio *, void *),
   2955		      void *data)
   2956{
   2957	struct bio *bio, *bio_src;
   2958
   2959	if (!bs)
   2960		bs = &fs_bio_set;
   2961
   2962	__rq_for_each_bio(bio_src, rq_src) {
   2963		bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
   2964				      bs);
   2965		if (!bio)
   2966			goto free_and_out;
   2967
   2968		if (bio_ctr && bio_ctr(bio, bio_src, data))
   2969			goto free_and_out;
   2970
   2971		if (rq->bio) {
   2972			rq->biotail->bi_next = bio;
   2973			rq->biotail = bio;
   2974		} else {
   2975			rq->bio = rq->biotail = bio;
   2976		}
   2977		bio = NULL;
   2978	}
   2979
   2980	/* Copy attributes of the original request to the clone request. */
   2981	rq->__sector = blk_rq_pos(rq_src);
   2982	rq->__data_len = blk_rq_bytes(rq_src);
   2983	if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
   2984		rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
   2985		rq->special_vec = rq_src->special_vec;
   2986	}
   2987	rq->nr_phys_segments = rq_src->nr_phys_segments;
   2988	rq->ioprio = rq_src->ioprio;
   2989
   2990	if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
   2991		goto free_and_out;
   2992
   2993	return 0;
   2994
   2995free_and_out:
   2996	if (bio)
   2997		bio_put(bio);
   2998	blk_rq_unprep_clone(rq);
   2999
   3000	return -ENOMEM;
   3001}
   3002EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
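
        /*
         * Usage sketch (illustrative only; 'bottom_q' and 'bs' stand for the
         * lower device's queue and a caller-owned bio_set): a request-based
         * stacking driver typically pairs the helpers above roughly like this:
         *
         *	clone = blk_mq_alloc_request(bottom_q, req_op(rq), 0);
         *	if (!IS_ERR(clone) &&
         *	    !blk_rq_prep_clone(clone, rq, bs, GFP_ATOMIC, NULL, NULL)) {
         *		if (blk_insert_cloned_request(clone) != BLK_STS_OK) {
         *			blk_rq_unprep_clone(clone);
         *			blk_mq_free_request(clone);
         *		}
         *	}
         *
         * completing the original rq only after the clone has completed.
         */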
   3003#endif /* CONFIG_BLK_MQ_STACKING */
   3004
   3005/*
   3006 * Steal bios from a request and add them to a bio list.
   3007 * The request must not have been partially completed before.
   3008 */
   3009void blk_steal_bios(struct bio_list *list, struct request *rq)
   3010{
   3011	if (rq->bio) {
   3012		if (list->tail)
   3013			list->tail->bi_next = rq->bio;
   3014		else
   3015			list->head = rq->bio;
   3016		list->tail = rq->biotail;
   3017
   3018		rq->bio = NULL;
   3019		rq->biotail = NULL;
   3020	}
   3021
   3022	rq->__data_len = 0;
   3023}
   3024EXPORT_SYMBOL_GPL(blk_steal_bios);
   3025
   3026static size_t order_to_size(unsigned int order)
   3027{
   3028	return (size_t)PAGE_SIZE << order;
   3029}
   3030
   3031/* called before freeing request pool in @tags */
   3032static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
   3033				    struct blk_mq_tags *tags)
   3034{
   3035	struct page *page;
   3036	unsigned long flags;
   3037
    3038	/* There is no need to clear the driver tags' own mapping */
   3039	if (drv_tags == tags)
   3040		return;
   3041
   3042	list_for_each_entry(page, &tags->page_list, lru) {
   3043		unsigned long start = (unsigned long)page_address(page);
   3044		unsigned long end = start + order_to_size(page->private);
   3045		int i;
   3046
   3047		for (i = 0; i < drv_tags->nr_tags; i++) {
   3048			struct request *rq = drv_tags->rqs[i];
   3049			unsigned long rq_addr = (unsigned long)rq;
   3050
   3051			if (rq_addr >= start && rq_addr < end) {
   3052				WARN_ON_ONCE(req_ref_read(rq) != 0);
   3053				cmpxchg(&drv_tags->rqs[i], rq, NULL);
   3054			}
   3055		}
   3056	}
   3057
   3058	/*
    3059	 * Wait until all pending iterations are done.
    3060	 *
    3061	 * The request references have been cleared, and the clearing is
    3062	 * guaranteed to be observed after ->lock is released.
   3063	 */
   3064	spin_lock_irqsave(&drv_tags->lock, flags);
   3065	spin_unlock_irqrestore(&drv_tags->lock, flags);
   3066}
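
        /*
         * Note (added for clarity): the empty lock/unlock pair above acts as a
         * barrier; any tag iterator that read a now-cleared rqs[] slot while
         * holding ->lock has dropped the lock by the time this function
         * returns, so the request pages can be freed safely afterwards.
         */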
   3067
   3068void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
   3069		     unsigned int hctx_idx)
   3070{
   3071	struct blk_mq_tags *drv_tags;
   3072	struct page *page;
   3073
   3074	if (list_empty(&tags->page_list))
   3075		return;
   3076
   3077	if (blk_mq_is_shared_tags(set->flags))
   3078		drv_tags = set->shared_tags;
   3079	else
   3080		drv_tags = set->tags[hctx_idx];
   3081
   3082	if (tags->static_rqs && set->ops->exit_request) {
   3083		int i;
   3084
   3085		for (i = 0; i < tags->nr_tags; i++) {
   3086			struct request *rq = tags->static_rqs[i];
   3087
   3088			if (!rq)
   3089				continue;
   3090			set->ops->exit_request(set, rq, hctx_idx);
   3091			tags->static_rqs[i] = NULL;
   3092		}
   3093	}
   3094
   3095	blk_mq_clear_rq_mapping(drv_tags, tags);
   3096
   3097	while (!list_empty(&tags->page_list)) {
   3098		page = list_first_entry(&tags->page_list, struct page, lru);
   3099		list_del_init(&page->lru);
   3100		/*
   3101		 * Remove kmemleak object previously allocated in
   3102		 * blk_mq_alloc_rqs().
   3103		 */
   3104		kmemleak_free(page_address(page));
   3105		__free_pages(page, page->private);
   3106	}
   3107}
   3108
   3109void blk_mq_free_rq_map(struct blk_mq_tags *tags)
   3110{
   3111	kfree(tags->rqs);
   3112	tags->rqs = NULL;
   3113	kfree(tags->static_rqs);
   3114	tags->static_rqs = NULL;
   3115
   3116	blk_mq_free_tags(tags);
   3117}
   3118
   3119static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
   3120		unsigned int hctx_idx)
   3121{
   3122	int i;
   3123
   3124	for (i = 0; i < set->nr_maps; i++) {
   3125		unsigned int start = set->map[i].queue_offset;
   3126		unsigned int end = start + set->map[i].nr_queues;
   3127
   3128		if (hctx_idx >= start && hctx_idx < end)
   3129			break;
   3130	}
   3131
   3132	if (i >= set->nr_maps)
   3133		i = HCTX_TYPE_DEFAULT;
   3134
   3135	return i;
   3136}
   3137
   3138static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
   3139		unsigned int hctx_idx)
   3140{
   3141	enum hctx_type type = hctx_idx_to_type(set, hctx_idx);
   3142
   3143	return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
   3144}
   3145
   3146static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
   3147					       unsigned int hctx_idx,
   3148					       unsigned int nr_tags,
   3149					       unsigned int reserved_tags)
   3150{
   3151	int node = blk_mq_get_hctx_node(set, hctx_idx);
   3152	struct blk_mq_tags *tags;
   3153
   3154	if (node == NUMA_NO_NODE)
   3155		node = set->numa_node;
   3156
   3157	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
   3158				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
   3159	if (!tags)
   3160		return NULL;
   3161
   3162	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
   3163				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
   3164				 node);
   3165	if (!tags->rqs) {
   3166		blk_mq_free_tags(tags);
   3167		return NULL;
   3168	}
   3169
   3170	tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
   3171					GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
   3172					node);
   3173	if (!tags->static_rqs) {
   3174		kfree(tags->rqs);
   3175		blk_mq_free_tags(tags);
   3176		return NULL;
   3177	}
   3178
   3179	return tags;
   3180}
   3181
   3182static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
   3183			       unsigned int hctx_idx, int node)
   3184{
   3185	int ret;
   3186
   3187	if (set->ops->init_request) {
   3188		ret = set->ops->init_request(set, rq, hctx_idx, node);
   3189		if (ret)
   3190			return ret;
   3191	}
   3192
   3193	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
   3194	return 0;
   3195}
   3196
   3197static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
   3198			    struct blk_mq_tags *tags,
   3199			    unsigned int hctx_idx, unsigned int depth)
   3200{
   3201	unsigned int i, j, entries_per_page, max_order = 4;
   3202	int node = blk_mq_get_hctx_node(set, hctx_idx);
   3203	size_t rq_size, left;
   3204
   3205	if (node == NUMA_NO_NODE)
   3206		node = set->numa_node;
   3207
   3208	INIT_LIST_HEAD(&tags->page_list);
   3209
   3210	/*
   3211	 * rq_size is the size of the request plus driver payload, rounded
   3212	 * to the cacheline size
   3213	 */
   3214	rq_size = round_up(sizeof(struct request) + set->cmd_size,
   3215				cache_line_size());
   3216	left = rq_size * depth;
   3217
   3218	for (i = 0; i < depth; ) {
   3219		int this_order = max_order;
   3220		struct page *page;
   3221		int to_do;
   3222		void *p;
   3223
   3224		while (this_order && left < order_to_size(this_order - 1))
   3225			this_order--;
   3226
   3227		do {
   3228			page = alloc_pages_node(node,
   3229				GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
   3230				this_order);
   3231			if (page)
   3232				break;
   3233			if (!this_order--)
   3234				break;
   3235			if (order_to_size(this_order) < rq_size)
   3236				break;
   3237		} while (1);
   3238
   3239		if (!page)
   3240			goto fail;
   3241
   3242		page->private = this_order;
   3243		list_add_tail(&page->lru, &tags->page_list);
   3244
   3245		p = page_address(page);
   3246		/*
   3247		 * Allow kmemleak to scan these pages as they contain pointers
   3248		 * to additional allocations like via ops->init_request().
   3249		 */
   3250		kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
   3251		entries_per_page = order_to_size(this_order) / rq_size;
   3252		to_do = min(entries_per_page, depth - i);
   3253		left -= to_do * rq_size;
   3254		for (j = 0; j < to_do; j++) {
   3255			struct request *rq = p;
   3256
   3257			tags->static_rqs[i] = rq;
   3258			if (blk_mq_init_request(set, rq, hctx_idx, node)) {
   3259				tags->static_rqs[i] = NULL;
   3260				goto fail;
   3261			}
   3262
   3263			p += rq_size;
   3264			i++;
   3265		}
   3266	}
   3267	return 0;
   3268
   3269fail:
   3270	blk_mq_free_rqs(set, tags, hctx_idx);
   3271	return -ENOMEM;
   3272}
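
        /*
         * Worked example (illustrative; the sizes are assumed, not taken from a
         * real configuration): with 4 KiB pages, if sizeof(struct request) +
         * cmd_size rounds up to rq_size == 640 bytes, an order-4 chunk
         * (16 pages == 65536 bytes) holds 65536 / 640 == 102 requests, so a
         * queue depth of 256 is satisfied by three such chunks
         * (102 + 102 + 52 requests).
         */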
   3273
   3274struct rq_iter_data {
   3275	struct blk_mq_hw_ctx *hctx;
   3276	bool has_rq;
   3277};
   3278
   3279static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
   3280{
   3281	struct rq_iter_data *iter_data = data;
   3282
   3283	if (rq->mq_hctx != iter_data->hctx)
   3284		return true;
   3285	iter_data->has_rq = true;
   3286	return false;
   3287}
   3288
   3289static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
   3290{
   3291	struct blk_mq_tags *tags = hctx->sched_tags ?
   3292			hctx->sched_tags : hctx->tags;
   3293	struct rq_iter_data data = {
   3294		.hctx	= hctx,
   3295	};
   3296
   3297	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
   3298	return data.has_rq;
   3299}
   3300
   3301static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
   3302		struct blk_mq_hw_ctx *hctx)
   3303{
   3304	if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
   3305		return false;
   3306	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
   3307		return false;
   3308	return true;
   3309}
   3310
   3311static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
   3312{
   3313	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
   3314			struct blk_mq_hw_ctx, cpuhp_online);
   3315
   3316	if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
   3317	    !blk_mq_last_cpu_in_hctx(cpu, hctx))
   3318		return 0;
   3319
   3320	/*
    3321	 * Prevent new requests from being allocated on the current hctx.
    3322	 *
    3323	 * The smp_mb__after_atomic() pairs with the implied barrier in
    3324	 * test_and_set_bit_lock() in sbitmap_get(). It ensures the inactive flag
    3325	 * is seen once we return from the tag allocator.
   3326	 */
   3327	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
   3328	smp_mb__after_atomic();
   3329
   3330	/*
   3331	 * Try to grab a reference to the queue and wait for any outstanding
   3332	 * requests.  If we could not grab a reference the queue has been
   3333	 * frozen and there are no requests.
   3334	 */
   3335	if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
   3336		while (blk_mq_hctx_has_requests(hctx))
   3337			msleep(5);
   3338		percpu_ref_put(&hctx->queue->q_usage_counter);
   3339	}
   3340
   3341	return 0;
   3342}
   3343
   3344static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
   3345{
   3346	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
   3347			struct blk_mq_hw_ctx, cpuhp_online);
   3348
   3349	if (cpumask_test_cpu(cpu, hctx->cpumask))
   3350		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
   3351	return 0;
   3352}
   3353
   3354/*
    3355 * 'cpu' is going away. Splice any existing rq_list entries from this
   3356 * software queue to the hw queue dispatch list, and ensure that it
   3357 * gets run.
   3358 */
   3359static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
   3360{
   3361	struct blk_mq_hw_ctx *hctx;
   3362	struct blk_mq_ctx *ctx;
   3363	LIST_HEAD(tmp);
   3364	enum hctx_type type;
   3365
   3366	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
   3367	if (!cpumask_test_cpu(cpu, hctx->cpumask))
   3368		return 0;
   3369
   3370	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
   3371	type = hctx->type;
   3372
   3373	spin_lock(&ctx->lock);
   3374	if (!list_empty(&ctx->rq_lists[type])) {
   3375		list_splice_init(&ctx->rq_lists[type], &tmp);
   3376		blk_mq_hctx_clear_pending(hctx, ctx);
   3377	}
   3378	spin_unlock(&ctx->lock);
   3379
   3380	if (list_empty(&tmp))
   3381		return 0;
   3382
   3383	spin_lock(&hctx->lock);
   3384	list_splice_tail_init(&tmp, &hctx->dispatch);
   3385	spin_unlock(&hctx->lock);
   3386
   3387	blk_mq_run_hw_queue(hctx, true);
   3388	return 0;
   3389}
   3390
   3391static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
   3392{
   3393	if (!(hctx->flags & BLK_MQ_F_STACKING))
   3394		cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
   3395						    &hctx->cpuhp_online);
   3396	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
   3397					    &hctx->cpuhp_dead);
   3398}
   3399
   3400/*
    3401 * Before freeing the hw queue, clear the flush request reference in
    3402 * tags->rqs[] to avoid a potential use-after-free.
   3403 */
   3404static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
   3405		unsigned int queue_depth, struct request *flush_rq)
   3406{
   3407	int i;
   3408	unsigned long flags;
   3409
   3410	/* The hw queue may not be mapped yet */
   3411	if (!tags)
   3412		return;
   3413
   3414	WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
   3415
   3416	for (i = 0; i < queue_depth; i++)
   3417		cmpxchg(&tags->rqs[i], flush_rq, NULL);
   3418
   3419	/*
    3420	 * Wait until all pending iterations are done.
    3421	 *
    3422	 * The request references have been cleared, and the clearing is
    3423	 * guaranteed to be observed after ->lock is released.
   3424	 */
   3425	spin_lock_irqsave(&tags->lock, flags);
   3426	spin_unlock_irqrestore(&tags->lock, flags);
   3427}
   3428
   3429/* hctx->ctxs will be freed in queue's release handler */
   3430static void blk_mq_exit_hctx(struct request_queue *q,
   3431		struct blk_mq_tag_set *set,
   3432		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
   3433{
   3434	struct request *flush_rq = hctx->fq->flush_rq;
   3435
   3436	if (blk_mq_hw_queue_mapped(hctx))
   3437		blk_mq_tag_idle(hctx);
   3438
   3439	if (blk_queue_init_done(q))
   3440		blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
   3441				set->queue_depth, flush_rq);
   3442	if (set->ops->exit_request)
   3443		set->ops->exit_request(set, flush_rq, hctx_idx);
   3444
   3445	if (set->ops->exit_hctx)
   3446		set->ops->exit_hctx(hctx, hctx_idx);
   3447
   3448	blk_mq_remove_cpuhp(hctx);
   3449
   3450	xa_erase(&q->hctx_table, hctx_idx);
   3451
   3452	spin_lock(&q->unused_hctx_lock);
   3453	list_add(&hctx->hctx_list, &q->unused_hctx_list);
   3454	spin_unlock(&q->unused_hctx_lock);
   3455}
   3456
   3457static void blk_mq_exit_hw_queues(struct request_queue *q,
   3458		struct blk_mq_tag_set *set, int nr_queue)
   3459{
   3460	struct blk_mq_hw_ctx *hctx;
   3461	unsigned long i;
   3462
   3463	queue_for_each_hw_ctx(q, hctx, i) {
   3464		if (i == nr_queue)
   3465			break;
   3466		blk_mq_exit_hctx(q, set, hctx, i);
   3467	}
   3468}
   3469
   3470static int blk_mq_init_hctx(struct request_queue *q,
   3471		struct blk_mq_tag_set *set,
   3472		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
   3473{
   3474	hctx->queue_num = hctx_idx;
   3475
   3476	if (!(hctx->flags & BLK_MQ_F_STACKING))
   3477		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
   3478				&hctx->cpuhp_online);
   3479	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
   3480
   3481	hctx->tags = set->tags[hctx_idx];
   3482
   3483	if (set->ops->init_hctx &&
   3484	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
   3485		goto unregister_cpu_notifier;
   3486
   3487	if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
   3488				hctx->numa_node))
   3489		goto exit_hctx;
   3490
   3491	if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
   3492		goto exit_flush_rq;
   3493
   3494	return 0;
   3495
   3496 exit_flush_rq:
   3497	if (set->ops->exit_request)
   3498		set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
   3499 exit_hctx:
   3500	if (set->ops->exit_hctx)
   3501		set->ops->exit_hctx(hctx, hctx_idx);
   3502 unregister_cpu_notifier:
   3503	blk_mq_remove_cpuhp(hctx);
   3504	return -1;
   3505}
   3506
   3507static struct blk_mq_hw_ctx *
   3508blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
   3509		int node)
   3510{
   3511	struct blk_mq_hw_ctx *hctx;
   3512	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
   3513
   3514	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
   3515	if (!hctx)
   3516		goto fail_alloc_hctx;
   3517
   3518	if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
   3519		goto free_hctx;
   3520
   3521	atomic_set(&hctx->nr_active, 0);
   3522	if (node == NUMA_NO_NODE)
   3523		node = set->numa_node;
   3524	hctx->numa_node = node;
   3525
   3526	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
   3527	spin_lock_init(&hctx->lock);
   3528	INIT_LIST_HEAD(&hctx->dispatch);
   3529	hctx->queue = q;
   3530	hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
   3531
   3532	INIT_LIST_HEAD(&hctx->hctx_list);
   3533
   3534	/*
   3535	 * Allocate space for all possible cpus to avoid allocation at
   3536	 * runtime
   3537	 */
   3538	hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
   3539			gfp, node);
   3540	if (!hctx->ctxs)
   3541		goto free_cpumask;
   3542
   3543	if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
   3544				gfp, node, false, false))
   3545		goto free_ctxs;
   3546	hctx->nr_ctx = 0;
   3547
   3548	spin_lock_init(&hctx->dispatch_wait_lock);
   3549	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
   3550	INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
   3551
   3552	hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
   3553	if (!hctx->fq)
   3554		goto free_bitmap;
   3555
   3556	blk_mq_hctx_kobj_init(hctx);
   3557
   3558	return hctx;
   3559
   3560 free_bitmap:
   3561	sbitmap_free(&hctx->ctx_map);
   3562 free_ctxs:
   3563	kfree(hctx->ctxs);
   3564 free_cpumask:
   3565	free_cpumask_var(hctx->cpumask);
   3566 free_hctx:
   3567	kfree(hctx);
   3568 fail_alloc_hctx:
   3569	return NULL;
   3570}
   3571
   3572static void blk_mq_init_cpu_queues(struct request_queue *q,
   3573				   unsigned int nr_hw_queues)
   3574{
   3575	struct blk_mq_tag_set *set = q->tag_set;
   3576	unsigned int i, j;
   3577
   3578	for_each_possible_cpu(i) {
   3579		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
   3580		struct blk_mq_hw_ctx *hctx;
   3581		int k;
   3582
   3583		__ctx->cpu = i;
   3584		spin_lock_init(&__ctx->lock);
   3585		for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
   3586			INIT_LIST_HEAD(&__ctx->rq_lists[k]);
   3587
   3588		__ctx->queue = q;
   3589
   3590		/*
   3591		 * Set local node, IFF we have more than one hw queue. If
   3592		 * not, we remain on the home node of the device
   3593		 */
   3594		for (j = 0; j < set->nr_maps; j++) {
   3595			hctx = blk_mq_map_queue_type(q, j, i);
   3596			if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
   3597				hctx->numa_node = cpu_to_node(i);
   3598		}
   3599	}
   3600}
   3601
   3602struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
   3603					     unsigned int hctx_idx,
   3604					     unsigned int depth)
   3605{
   3606	struct blk_mq_tags *tags;
   3607	int ret;
   3608
   3609	tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
   3610	if (!tags)
   3611		return NULL;
   3612
   3613	ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
   3614	if (ret) {
   3615		blk_mq_free_rq_map(tags);
   3616		return NULL;
   3617	}
   3618
   3619	return tags;
   3620}
   3621
   3622static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
   3623				       int hctx_idx)
   3624{
   3625	if (blk_mq_is_shared_tags(set->flags)) {
   3626		set->tags[hctx_idx] = set->shared_tags;
   3627
   3628		return true;
   3629	}
   3630
   3631	set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
   3632						       set->queue_depth);
   3633
   3634	return set->tags[hctx_idx];
   3635}
   3636
   3637void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
   3638			     struct blk_mq_tags *tags,
   3639			     unsigned int hctx_idx)
   3640{
   3641	if (tags) {
   3642		blk_mq_free_rqs(set, tags, hctx_idx);
   3643		blk_mq_free_rq_map(tags);
   3644	}
   3645}
   3646
   3647static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
   3648				      unsigned int hctx_idx)
   3649{
   3650	if (!blk_mq_is_shared_tags(set->flags))
   3651		blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
   3652
   3653	set->tags[hctx_idx] = NULL;
   3654}
   3655
   3656static void blk_mq_map_swqueue(struct request_queue *q)
   3657{
   3658	unsigned int j, hctx_idx;
   3659	unsigned long i;
   3660	struct blk_mq_hw_ctx *hctx;
   3661	struct blk_mq_ctx *ctx;
   3662	struct blk_mq_tag_set *set = q->tag_set;
   3663
   3664	queue_for_each_hw_ctx(q, hctx, i) {
   3665		cpumask_clear(hctx->cpumask);
   3666		hctx->nr_ctx = 0;
   3667		hctx->dispatch_from = NULL;
   3668	}
   3669
   3670	/*
   3671	 * Map software to hardware queues.
   3672	 *
    3673	 * If the cpu isn't present, the cpu is mapped to the first hctx.
   3674	 */
   3675	for_each_possible_cpu(i) {
   3676
   3677		ctx = per_cpu_ptr(q->queue_ctx, i);
   3678		for (j = 0; j < set->nr_maps; j++) {
   3679			if (!set->map[j].nr_queues) {
   3680				ctx->hctxs[j] = blk_mq_map_queue_type(q,
   3681						HCTX_TYPE_DEFAULT, i);
   3682				continue;
   3683			}
   3684			hctx_idx = set->map[j].mq_map[i];
   3685			/* unmapped hw queue can be remapped after CPU topo changed */
   3686			if (!set->tags[hctx_idx] &&
   3687			    !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
    3688			/*
    3689			 * If tags initialization fails for some hctx,
    3690			 * that hctx won't be brought online.  In this
    3691			 * case, remap the current ctx to hctx[0] which
    3692			 * is guaranteed to always have tags allocated.
    3693			 */
   3694				set->map[j].mq_map[i] = 0;
   3695			}
   3696
   3697			hctx = blk_mq_map_queue_type(q, j, i);
   3698			ctx->hctxs[j] = hctx;
   3699			/*
   3700			 * If the CPU is already set in the mask, then we've
   3701			 * mapped this one already. This can happen if
   3702			 * devices share queues across queue maps.
   3703			 */
   3704			if (cpumask_test_cpu(i, hctx->cpumask))
   3705				continue;
   3706
   3707			cpumask_set_cpu(i, hctx->cpumask);
   3708			hctx->type = j;
   3709			ctx->index_hw[hctx->type] = hctx->nr_ctx;
   3710			hctx->ctxs[hctx->nr_ctx++] = ctx;
   3711
   3712			/*
   3713			 * If the nr_ctx type overflows, we have exceeded the
    3714			 * number of sw queues we can support.
   3715			 */
   3716			BUG_ON(!hctx->nr_ctx);
   3717		}
   3718
   3719		for (; j < HCTX_MAX_TYPES; j++)
   3720			ctx->hctxs[j] = blk_mq_map_queue_type(q,
   3721					HCTX_TYPE_DEFAULT, i);
   3722	}
   3723
   3724	queue_for_each_hw_ctx(q, hctx, i) {
   3725		/*
   3726		 * If no software queues are mapped to this hardware queue,
   3727		 * disable it and free the request entries.
   3728		 */
   3729		if (!hctx->nr_ctx) {
    3730			/* Never unmap queue 0.  We need it as a
    3731			 * fallback in case allocation for a new
    3732			 * remap fails.
    3733			 */
   3734			if (i)
   3735				__blk_mq_free_map_and_rqs(set, i);
   3736
   3737			hctx->tags = NULL;
   3738			continue;
   3739		}
   3740
   3741		hctx->tags = set->tags[i];
   3742		WARN_ON(!hctx->tags);
   3743
   3744		/*
   3745		 * Set the map size to the number of mapped software queues.
   3746		 * This is more accurate and more efficient than looping
   3747		 * over all possibly mapped software queues.
   3748		 */
   3749		sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
   3750
   3751		/*
   3752		 * Initialize batch roundrobin counts
   3753		 */
   3754		hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
   3755		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
   3756	}
   3757}
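
/*
 * The per-type ctx->hctxs[] tables built above are what the submission
 * path consults; roughly (see blk_mq_map_queue() in blk-mq.h):
 *
 *	type = blk_mq_get_hctx_type(bio->bi_opf);
 *	ctx  = per_cpu_ptr(q->queue_ctx, raw_smp_processor_id());
 *	hctx = ctx->hctxs[type];
 *
 * so even a CPU the driver did not map still resolves to a valid hctx via
 * the HCTX_TYPE_DEFAULT fallback installed in the loops above.
 */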
   3758
   3759/*
   3760 * Caller needs to ensure that we're either frozen/quiesced, or that
   3761 * the queue isn't live yet.
   3762 */
   3763static void queue_set_hctx_shared(struct request_queue *q, bool shared)
   3764{
   3765	struct blk_mq_hw_ctx *hctx;
   3766	unsigned long i;
   3767
   3768	queue_for_each_hw_ctx(q, hctx, i) {
   3769		if (shared) {
   3770			hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
   3771		} else {
   3772			blk_mq_tag_idle(hctx);
   3773			hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
   3774		}
   3775	}
   3776}
   3777
   3778static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
   3779					 bool shared)
   3780{
   3781	struct request_queue *q;
   3782
   3783	lockdep_assert_held(&set->tag_list_lock);
   3784
   3785	list_for_each_entry(q, &set->tag_list, tag_set_list) {
   3786		blk_mq_freeze_queue(q);
   3787		queue_set_hctx_shared(q, shared);
   3788		blk_mq_unfreeze_queue(q);
   3789	}
   3790}
   3791
   3792static void blk_mq_del_queue_tag_set(struct request_queue *q)
   3793{
   3794	struct blk_mq_tag_set *set = q->tag_set;
   3795
   3796	mutex_lock(&set->tag_list_lock);
   3797	list_del(&q->tag_set_list);
   3798	if (list_is_singular(&set->tag_list)) {
   3799		/* just transitioned to unshared */
   3800		set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
   3801		/* update existing queue */
   3802		blk_mq_update_tag_set_shared(set, false);
   3803	}
   3804	mutex_unlock(&set->tag_list_lock);
   3805	INIT_LIST_HEAD(&q->tag_set_list);
   3806}
   3807
   3808static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
   3809				     struct request_queue *q)
   3810{
   3811	mutex_lock(&set->tag_list_lock);
   3812
   3813	/*
   3814	 * Check to see if we're transitioning to shared (from 1 to 2 queues).
   3815	 */
   3816	if (!list_empty(&set->tag_list) &&
   3817	    !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
   3818		set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
   3819		/* update existing queue */
   3820		blk_mq_update_tag_set_shared(set, true);
   3821	}
   3822	if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
   3823		queue_set_hctx_shared(q, true);
   3824	list_add_tail(&q->tag_set_list, &set->tag_list);
   3825
   3826	mutex_unlock(&set->tag_list_lock);
   3827}
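
/*
 * In other words, BLK_MQ_F_TAG_QUEUE_SHARED tracks "more than one request
 * queue uses this tag set": it is set here when the second queue is added
 * and cleared in blk_mq_del_queue_tag_set() when the set drops back to a
 * single queue, with every affected queue frozen across the change so that
 * hctx->flags is never flipped under active I/O.
 */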
   3828
   3829/* All allocations will be freed in release handler of q->mq_kobj */
   3830static int blk_mq_alloc_ctxs(struct request_queue *q)
   3831{
   3832	struct blk_mq_ctxs *ctxs;
   3833	int cpu;
   3834
   3835	ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
   3836	if (!ctxs)
   3837		return -ENOMEM;
   3838
   3839	ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
   3840	if (!ctxs->queue_ctx)
   3841		goto fail;
   3842
   3843	for_each_possible_cpu(cpu) {
   3844		struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
   3845		ctx->ctxs = ctxs;
   3846	}
   3847
   3848	q->mq_kobj = &ctxs->kobj;
   3849	q->queue_ctx = ctxs->queue_ctx;
   3850
   3851	return 0;
   3852 fail:
   3853	kfree(ctxs);
   3854	return -ENOMEM;
   3855}
   3856
    3857/*
    3858 * This is the actual release handler for mq, but we do it from the
    3859 * request queue's release handler to avoid use-after-free issues
    3860 * and headaches: q->mq_kobj shouldn't have been introduced,
    3861 * but we can't group the ctx/kctx kobjects without it.
    3862 */
   3863void blk_mq_release(struct request_queue *q)
   3864{
   3865	struct blk_mq_hw_ctx *hctx, *next;
   3866	unsigned long i;
   3867
   3868	queue_for_each_hw_ctx(q, hctx, i)
   3869		WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
   3870
   3871	/* all hctx are in .unused_hctx_list now */
   3872	list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
   3873		list_del_init(&hctx->hctx_list);
   3874		kobject_put(&hctx->kobj);
   3875	}
   3876
   3877	xa_destroy(&q->hctx_table);
   3878
   3879	/*
   3880	 * release .mq_kobj and sw queue's kobject now because
   3881	 * both share lifetime with request queue.
   3882	 */
   3883	blk_mq_sysfs_deinit(q);
   3884}
   3885
   3886static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
   3887		void *queuedata)
   3888{
   3889	struct request_queue *q;
   3890	int ret;
   3891
   3892	q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
   3893	if (!q)
   3894		return ERR_PTR(-ENOMEM);
   3895	q->queuedata = queuedata;
   3896	ret = blk_mq_init_allocated_queue(set, q);
   3897	if (ret) {
   3898		blk_cleanup_queue(q);
   3899		return ERR_PTR(ret);
   3900	}
   3901	return q;
   3902}
   3903
   3904struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
   3905{
   3906	return blk_mq_init_queue_data(set, NULL);
   3907}
   3908EXPORT_SYMBOL(blk_mq_init_queue);
   3909
   3910struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
   3911		struct lock_class_key *lkclass)
   3912{
   3913	struct request_queue *q;
   3914	struct gendisk *disk;
   3915
   3916	q = blk_mq_init_queue_data(set, queuedata);
   3917	if (IS_ERR(q))
   3918		return ERR_CAST(q);
   3919
   3920	disk = __alloc_disk_node(q, set->numa_node, lkclass);
   3921	if (!disk) {
   3922		blk_cleanup_queue(q);
   3923		return ERR_PTR(-ENOMEM);
   3924	}
   3925	return disk;
   3926}
   3927EXPORT_SYMBOL(__blk_mq_alloc_disk);
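
/*
 * A rough sketch of how a driver typically consumes the helpers above
 * (my_dev, my_block_ops and nr_sectors are made-up names, not kernel API):
 *
 *	struct gendisk *disk;
 *	int err;
 *
 *	disk = blk_mq_alloc_disk(&my_dev->tag_set, my_dev);
 *	if (IS_ERR(disk))
 *		return PTR_ERR(disk);
 *	disk->fops = &my_block_ops;
 *	set_capacity(disk, nr_sectors);
 *	err = add_disk(disk);
 *
 * blk_mq_alloc_disk() is the wrapper around __blk_mq_alloc_disk() that
 * supplies the lockdep class key; the tag set must already have been set
 * up with blk_mq_alloc_tag_set().
 */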
   3928
   3929static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
   3930		struct blk_mq_tag_set *set, struct request_queue *q,
   3931		int hctx_idx, int node)
   3932{
   3933	struct blk_mq_hw_ctx *hctx = NULL, *tmp;
   3934
   3935	/* reuse dead hctx first */
   3936	spin_lock(&q->unused_hctx_lock);
   3937	list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
   3938		if (tmp->numa_node == node) {
   3939			hctx = tmp;
   3940			break;
   3941		}
   3942	}
   3943	if (hctx)
   3944		list_del_init(&hctx->hctx_list);
   3945	spin_unlock(&q->unused_hctx_lock);
   3946
   3947	if (!hctx)
   3948		hctx = blk_mq_alloc_hctx(q, set, node);
   3949	if (!hctx)
   3950		goto fail;
   3951
   3952	if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
   3953		goto free_hctx;
   3954
   3955	return hctx;
   3956
   3957 free_hctx:
   3958	kobject_put(&hctx->kobj);
   3959 fail:
   3960	return NULL;
   3961}
   3962
   3963static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
   3964						struct request_queue *q)
   3965{
   3966	struct blk_mq_hw_ctx *hctx;
   3967	unsigned long i, j;
   3968
   3969	/* protect against switching io scheduler  */
   3970	mutex_lock(&q->sysfs_lock);
   3971	for (i = 0; i < set->nr_hw_queues; i++) {
   3972		int old_node;
   3973		int node = blk_mq_get_hctx_node(set, i);
   3974		struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
   3975
   3976		if (old_hctx) {
   3977			old_node = old_hctx->numa_node;
   3978			blk_mq_exit_hctx(q, set, old_hctx, i);
   3979		}
   3980
   3981		if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
   3982			if (!old_hctx)
   3983				break;
   3984			pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
   3985					node, old_node);
   3986			hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
   3987			WARN_ON_ONCE(!hctx);
   3988		}
   3989	}
    3990	/*
    3991	 * If increasing nr_hw_queues failed, free the newly allocated
    3992	 * hctxs and keep the previous q->nr_hw_queues.
    3993	 */
   3994	if (i != set->nr_hw_queues) {
   3995		j = q->nr_hw_queues;
   3996	} else {
   3997		j = i;
   3998		q->nr_hw_queues = set->nr_hw_queues;
   3999	}
   4000
   4001	xa_for_each_start(&q->hctx_table, j, hctx, j)
   4002		blk_mq_exit_hctx(q, set, hctx, j);
   4003	mutex_unlock(&q->sysfs_lock);
   4004}
   4005
   4006static void blk_mq_update_poll_flag(struct request_queue *q)
   4007{
   4008	struct blk_mq_tag_set *set = q->tag_set;
   4009
   4010	if (set->nr_maps > HCTX_TYPE_POLL &&
   4011	    set->map[HCTX_TYPE_POLL].nr_queues)
   4012		blk_queue_flag_set(QUEUE_FLAG_POLL, q);
   4013	else
   4014		blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
   4015}
   4016
   4017int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
   4018		struct request_queue *q)
   4019{
   4020	WARN_ON_ONCE(blk_queue_has_srcu(q) !=
   4021			!!(set->flags & BLK_MQ_F_BLOCKING));
   4022
   4023	/* mark the queue as mq asap */
   4024	q->mq_ops = set->ops;
   4025
   4026	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
   4027					     blk_mq_poll_stats_bkt,
   4028					     BLK_MQ_POLL_STATS_BKTS, q);
   4029	if (!q->poll_cb)
   4030		goto err_exit;
   4031
   4032	if (blk_mq_alloc_ctxs(q))
   4033		goto err_poll;
   4034
   4035	/* init q->mq_kobj and sw queues' kobjects */
   4036	blk_mq_sysfs_init(q);
   4037
   4038	INIT_LIST_HEAD(&q->unused_hctx_list);
   4039	spin_lock_init(&q->unused_hctx_lock);
   4040
   4041	xa_init(&q->hctx_table);
   4042
   4043	blk_mq_realloc_hw_ctxs(set, q);
   4044	if (!q->nr_hw_queues)
   4045		goto err_hctxs;
   4046
   4047	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
   4048	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
   4049
   4050	q->tag_set = set;
   4051
   4052	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
   4053	blk_mq_update_poll_flag(q);
   4054
   4055	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
   4056	INIT_LIST_HEAD(&q->requeue_list);
   4057	spin_lock_init(&q->requeue_lock);
   4058
   4059	q->nr_requests = set->queue_depth;
   4060
   4061	/*
   4062	 * Default to classic polling
   4063	 */
   4064	q->poll_nsec = BLK_MQ_POLL_CLASSIC;
   4065
   4066	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
   4067	blk_mq_add_queue_tag_set(set, q);
   4068	blk_mq_map_swqueue(q);
   4069	return 0;
   4070
   4071err_hctxs:
   4072	xa_destroy(&q->hctx_table);
   4073	q->nr_hw_queues = 0;
   4074	blk_mq_sysfs_deinit(q);
   4075err_poll:
   4076	blk_stat_free_callback(q->poll_cb);
   4077	q->poll_cb = NULL;
   4078err_exit:
   4079	q->mq_ops = NULL;
   4080	return -ENOMEM;
   4081}
   4082EXPORT_SYMBOL(blk_mq_init_allocated_queue);
   4083
   4084/* tags can _not_ be used after returning from blk_mq_exit_queue */
   4085void blk_mq_exit_queue(struct request_queue *q)
   4086{
   4087	struct blk_mq_tag_set *set = q->tag_set;
   4088
   4089	/* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
   4090	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
   4091	/* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
   4092	blk_mq_del_queue_tag_set(q);
   4093}
   4094
   4095static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
   4096{
   4097	int i;
   4098
   4099	if (blk_mq_is_shared_tags(set->flags)) {
   4100		set->shared_tags = blk_mq_alloc_map_and_rqs(set,
   4101						BLK_MQ_NO_HCTX_IDX,
   4102						set->queue_depth);
   4103		if (!set->shared_tags)
   4104			return -ENOMEM;
   4105	}
   4106
   4107	for (i = 0; i < set->nr_hw_queues; i++) {
   4108		if (!__blk_mq_alloc_map_and_rqs(set, i))
   4109			goto out_unwind;
   4110		cond_resched();
   4111	}
   4112
   4113	return 0;
   4114
   4115out_unwind:
   4116	while (--i >= 0)
   4117		__blk_mq_free_map_and_rqs(set, i);
   4118
   4119	if (blk_mq_is_shared_tags(set->flags)) {
   4120		blk_mq_free_map_and_rqs(set, set->shared_tags,
   4121					BLK_MQ_NO_HCTX_IDX);
   4122	}
   4123
   4124	return -ENOMEM;
   4125}
   4126
   4127/*
   4128 * Allocate the request maps associated with this tag_set. Note that this
   4129 * may reduce the depth asked for, if memory is tight. set->queue_depth
   4130 * will be updated to reflect the allocated depth.
   4131 */
   4132static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
   4133{
   4134	unsigned int depth;
   4135	int err;
   4136
   4137	depth = set->queue_depth;
   4138	do {
   4139		err = __blk_mq_alloc_rq_maps(set);
   4140		if (!err)
   4141			break;
   4142
   4143		set->queue_depth >>= 1;
   4144		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
   4145			err = -ENOMEM;
   4146			break;
   4147		}
   4148	} while (set->queue_depth);
   4149
   4150	if (!set->queue_depth || err) {
   4151		pr_err("blk-mq: failed to allocate request map\n");
   4152		return -ENOMEM;
   4153	}
   4154
   4155	if (depth != set->queue_depth)
   4156		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
   4157						depth, set->queue_depth);
   4158
   4159	return 0;
   4160}
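
/*
 * Example of the fallback above: a driver asking for a queue_depth of
 * 1024 on a memory-constrained system is retried with 512, 256, ... until
 * the per-hctx request allocations succeed (the reduced depth is then
 * reported via the pr_info() above) or the depth would drop below
 * set->reserved_tags + BLK_MQ_TAG_MIN, in which case -ENOMEM is returned.
 */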
   4161
   4162static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
   4163{
   4164	/*
   4165	 * blk_mq_map_queues() and multiple .map_queues() implementations
   4166	 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
   4167	 * number of hardware queues.
   4168	 */
   4169	if (set->nr_maps == 1)
   4170		set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
   4171
   4172	if (set->ops->map_queues && !is_kdump_kernel()) {
   4173		int i;
   4174
   4175		/*
   4176		 * transport .map_queues is usually done in the following
   4177		 * way:
   4178		 *
   4179		 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
   4180		 * 	mask = get_cpu_mask(queue)
   4181		 * 	for_each_cpu(cpu, mask)
   4182		 * 		set->map[x].mq_map[cpu] = queue;
   4183		 * }
   4184		 *
    4185		 * When we need to remap, the table has to be cleared first
    4186		 * to kill stale mappings, since a CPU may end up not mapped
    4187		 * to any hw queue.
    4188		 */
   4189		for (i = 0; i < set->nr_maps; i++)
   4190			blk_mq_clear_mq_map(&set->map[i]);
   4191
   4192		return set->ops->map_queues(set);
   4193	} else {
   4194		BUG_ON(set->nr_maps > 1);
   4195		return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
   4196	}
   4197}
   4198
   4199static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
   4200				  int cur_nr_hw_queues, int new_nr_hw_queues)
   4201{
   4202	struct blk_mq_tags **new_tags;
   4203
   4204	if (cur_nr_hw_queues >= new_nr_hw_queues)
   4205		return 0;
   4206
   4207	new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
   4208				GFP_KERNEL, set->numa_node);
   4209	if (!new_tags)
   4210		return -ENOMEM;
   4211
   4212	if (set->tags)
   4213		memcpy(new_tags, set->tags, cur_nr_hw_queues *
   4214		       sizeof(*set->tags));
   4215	kfree(set->tags);
   4216	set->tags = new_tags;
   4217	set->nr_hw_queues = new_nr_hw_queues;
   4218
   4219	return 0;
   4220}
   4221
   4222static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
   4223				int new_nr_hw_queues)
   4224{
   4225	return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
   4226}
   4227
   4228/*
   4229 * Alloc a tag set to be associated with one or more request queues.
   4230 * May fail with EINVAL for various error conditions. May adjust the
   4231 * requested depth down, if it's too large. In that case, the set
   4232 * value will be stored in set->queue_depth.
   4233 */
   4234int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
   4235{
   4236	int i, ret;
   4237
   4238	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
   4239
   4240	if (!set->nr_hw_queues)
   4241		return -EINVAL;
   4242	if (!set->queue_depth)
   4243		return -EINVAL;
   4244	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
   4245		return -EINVAL;
   4246
   4247	if (!set->ops->queue_rq)
   4248		return -EINVAL;
   4249
   4250	if (!set->ops->get_budget ^ !set->ops->put_budget)
   4251		return -EINVAL;
   4252
   4253	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
   4254		pr_info("blk-mq: reduced tag depth to %u\n",
   4255			BLK_MQ_MAX_DEPTH);
   4256		set->queue_depth = BLK_MQ_MAX_DEPTH;
   4257	}
   4258
   4259	if (!set->nr_maps)
   4260		set->nr_maps = 1;
   4261	else if (set->nr_maps > HCTX_MAX_TYPES)
   4262		return -EINVAL;
   4263
   4264	/*
   4265	 * If a crashdump is active, then we are potentially in a very
   4266	 * memory constrained environment. Limit us to 1 queue and
   4267	 * 64 tags to prevent using too much memory.
   4268	 */
   4269	if (is_kdump_kernel()) {
   4270		set->nr_hw_queues = 1;
   4271		set->nr_maps = 1;
   4272		set->queue_depth = min(64U, set->queue_depth);
   4273	}
   4274	/*
   4275	 * There is no use for more h/w queues than cpus if we just have
   4276	 * a single map
   4277	 */
   4278	if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
   4279		set->nr_hw_queues = nr_cpu_ids;
   4280
   4281	if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
   4282		return -ENOMEM;
   4283
   4284	ret = -ENOMEM;
   4285	for (i = 0; i < set->nr_maps; i++) {
   4286		set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
   4287						  sizeof(set->map[i].mq_map[0]),
   4288						  GFP_KERNEL, set->numa_node);
   4289		if (!set->map[i].mq_map)
   4290			goto out_free_mq_map;
   4291		set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
   4292	}
   4293
   4294	ret = blk_mq_update_queue_map(set);
   4295	if (ret)
   4296		goto out_free_mq_map;
   4297
   4298	ret = blk_mq_alloc_set_map_and_rqs(set);
   4299	if (ret)
   4300		goto out_free_mq_map;
   4301
   4302	mutex_init(&set->tag_list_lock);
   4303	INIT_LIST_HEAD(&set->tag_list);
   4304
   4305	return 0;
   4306
   4307out_free_mq_map:
   4308	for (i = 0; i < set->nr_maps; i++) {
   4309		kfree(set->map[i].mq_map);
   4310		set->map[i].mq_map = NULL;
   4311	}
   4312	kfree(set->tags);
   4313	set->tags = NULL;
   4314	return ret;
   4315}
   4316EXPORT_SYMBOL(blk_mq_alloc_tag_set);
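
/*
 * A rough sketch of a typical caller (my_dev, my_mq_ops and struct my_pdu
 * are made-up names, not kernel API). At minimum ops->queue_rq must be
 * provided, as checked above:
 *
 *	struct blk_mq_tag_set *set = &my_dev->tag_set;
 *	int ret;
 *
 *	memset(set, 0, sizeof(*set));
 *	set->ops = &my_mq_ops;
 *	set->nr_hw_queues = num_online_cpus();
 *	set->queue_depth = 128;
 *	set->numa_node = NUMA_NO_NODE;
 *	set->cmd_size = sizeof(struct my_pdu);
 *	set->flags = BLK_MQ_F_SHOULD_MERGE;
 *	ret = blk_mq_alloc_tag_set(set);
 *
 * For the common single-queue case, blk_mq_alloc_sq_tag_set() below wraps
 * most of this boilerplate.
 */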
   4317
   4318/* allocate and initialize a tagset for a simple single-queue device */
   4319int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
   4320		const struct blk_mq_ops *ops, unsigned int queue_depth,
   4321		unsigned int set_flags)
   4322{
   4323	memset(set, 0, sizeof(*set));
   4324	set->ops = ops;
   4325	set->nr_hw_queues = 1;
   4326	set->nr_maps = 1;
   4327	set->queue_depth = queue_depth;
   4328	set->numa_node = NUMA_NO_NODE;
   4329	set->flags = set_flags;
   4330	return blk_mq_alloc_tag_set(set);
   4331}
   4332EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
   4333
   4334void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
   4335{
   4336	int i, j;
   4337
   4338	for (i = 0; i < set->nr_hw_queues; i++)
   4339		__blk_mq_free_map_and_rqs(set, i);
   4340
   4341	if (blk_mq_is_shared_tags(set->flags)) {
   4342		blk_mq_free_map_and_rqs(set, set->shared_tags,
   4343					BLK_MQ_NO_HCTX_IDX);
   4344	}
   4345
   4346	for (j = 0; j < set->nr_maps; j++) {
   4347		kfree(set->map[j].mq_map);
   4348		set->map[j].mq_map = NULL;
   4349	}
   4350
   4351	kfree(set->tags);
   4352	set->tags = NULL;
   4353}
   4354EXPORT_SYMBOL(blk_mq_free_tag_set);
   4355
   4356int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
   4357{
   4358	struct blk_mq_tag_set *set = q->tag_set;
   4359	struct blk_mq_hw_ctx *hctx;
   4360	int ret;
   4361	unsigned long i;
   4362
   4363	if (!set)
   4364		return -EINVAL;
   4365
   4366	if (q->nr_requests == nr)
   4367		return 0;
   4368
   4369	blk_mq_freeze_queue(q);
   4370	blk_mq_quiesce_queue(q);
   4371
   4372	ret = 0;
   4373	queue_for_each_hw_ctx(q, hctx, i) {
   4374		if (!hctx->tags)
   4375			continue;
   4376		/*
   4377		 * If we're using an MQ scheduler, just update the scheduler
   4378		 * queue depth. This is similar to what the old code would do.
   4379		 */
   4380		if (hctx->sched_tags) {
   4381			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
   4382						      nr, true);
   4383		} else {
   4384			ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
   4385						      false);
   4386		}
   4387		if (ret)
   4388			break;
   4389		if (q->elevator && q->elevator->type->ops.depth_updated)
   4390			q->elevator->type->ops.depth_updated(hctx);
   4391	}
   4392	if (!ret) {
   4393		q->nr_requests = nr;
   4394		if (blk_mq_is_shared_tags(set->flags)) {
   4395			if (q->elevator)
   4396				blk_mq_tag_update_sched_shared_tags(q);
   4397			else
   4398				blk_mq_tag_resize_shared_tags(set, nr);
   4399		}
   4400	}
   4401
   4402	blk_mq_unquiesce_queue(q);
   4403	blk_mq_unfreeze_queue(q);
   4404
   4405	return ret;
   4406}
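
/*
 * This is the path behind writes to /sys/block/<dev>/queue/nr_requests:
 * the queue is frozen and quiesced, then either the scheduler tags or the
 * driver tags are resized per hctx. Shrinking below what the reserved
 * tags require is rejected by blk_mq_tag_update_depth().
 */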
   4407
   4408/*
   4409 * request_queue and elevator_type pair.
   4410 * It is just used by __blk_mq_update_nr_hw_queues to cache
   4411 * the elevator_type associated with a request_queue.
   4412 */
   4413struct blk_mq_qe_pair {
   4414	struct list_head node;
   4415	struct request_queue *q;
   4416	struct elevator_type *type;
   4417};
   4418
   4419/*
   4420 * Cache the elevator_type in qe pair list and switch the
   4421 * io scheduler to 'none'
   4422 */
   4423static bool blk_mq_elv_switch_none(struct list_head *head,
   4424		struct request_queue *q)
   4425{
   4426	struct blk_mq_qe_pair *qe;
   4427
   4428	if (!q->elevator)
   4429		return true;
   4430
   4431	qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
   4432	if (!qe)
   4433		return false;
   4434
    4435	/* q->elevator needs to be protected by ->sysfs_lock */
   4436	mutex_lock(&q->sysfs_lock);
   4437
   4438	INIT_LIST_HEAD(&qe->node);
   4439	qe->q = q;
   4440	qe->type = q->elevator->type;
   4441	list_add(&qe->node, head);
   4442
    4443	/*
    4444	 * After elevator_switch_mq, the previous elevator_queue will be
    4445	 * released by elevator_release. The reference to the io scheduler
    4446	 * module taken by elevator_get will also be put. So we need to take
    4447	 * a reference to the io scheduler module here to prevent it from
    4448	 * being removed.
    4449	 */
   4450	__module_get(qe->type->elevator_owner);
   4451	elevator_switch_mq(q, NULL);
   4452	mutex_unlock(&q->sysfs_lock);
   4453
   4454	return true;
   4455}
   4456
   4457static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
   4458						struct request_queue *q)
   4459{
   4460	struct blk_mq_qe_pair *qe;
   4461
   4462	list_for_each_entry(qe, head, node)
   4463		if (qe->q == q)
   4464			return qe;
   4465
   4466	return NULL;
   4467}
   4468
   4469static void blk_mq_elv_switch_back(struct list_head *head,
   4470				  struct request_queue *q)
   4471{
   4472	struct blk_mq_qe_pair *qe;
   4473	struct elevator_type *t;
   4474
   4475	qe = blk_lookup_qe_pair(head, q);
   4476	if (!qe)
   4477		return;
   4478	t = qe->type;
   4479	list_del(&qe->node);
   4480	kfree(qe);
   4481
   4482	mutex_lock(&q->sysfs_lock);
   4483	elevator_switch_mq(q, t);
   4484	mutex_unlock(&q->sysfs_lock);
   4485}
   4486
   4487static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
   4488							int nr_hw_queues)
   4489{
   4490	struct request_queue *q;
   4491	LIST_HEAD(head);
   4492	int prev_nr_hw_queues;
   4493
   4494	lockdep_assert_held(&set->tag_list_lock);
   4495
   4496	if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
   4497		nr_hw_queues = nr_cpu_ids;
   4498	if (nr_hw_queues < 1)
   4499		return;
   4500	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
   4501		return;
   4502
   4503	list_for_each_entry(q, &set->tag_list, tag_set_list)
   4504		blk_mq_freeze_queue(q);
   4505	/*
   4506	 * Switch IO scheduler to 'none', cleaning up the data associated
   4507	 * with the previous scheduler. We will switch back once we are done
   4508	 * updating the new sw to hw queue mappings.
   4509	 */
   4510	list_for_each_entry(q, &set->tag_list, tag_set_list)
   4511		if (!blk_mq_elv_switch_none(&head, q))
   4512			goto switch_back;
   4513
   4514	list_for_each_entry(q, &set->tag_list, tag_set_list) {
   4515		blk_mq_debugfs_unregister_hctxs(q);
   4516		blk_mq_sysfs_unregister(q);
   4517	}
   4518
   4519	prev_nr_hw_queues = set->nr_hw_queues;
   4520	if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
   4521	    0)
   4522		goto reregister;
   4523
   4524	set->nr_hw_queues = nr_hw_queues;
   4525fallback:
   4526	blk_mq_update_queue_map(set);
   4527	list_for_each_entry(q, &set->tag_list, tag_set_list) {
   4528		blk_mq_realloc_hw_ctxs(set, q);
   4529		blk_mq_update_poll_flag(q);
   4530		if (q->nr_hw_queues != set->nr_hw_queues) {
   4531			int i = prev_nr_hw_queues;
   4532
   4533			pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
   4534					nr_hw_queues, prev_nr_hw_queues);
   4535			for (; i < set->nr_hw_queues; i++)
   4536				__blk_mq_free_map_and_rqs(set, i);
   4537
   4538			set->nr_hw_queues = prev_nr_hw_queues;
   4539			blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
   4540			goto fallback;
   4541		}
   4542		blk_mq_map_swqueue(q);
   4543	}
   4544
   4545reregister:
   4546	list_for_each_entry(q, &set->tag_list, tag_set_list) {
   4547		blk_mq_sysfs_register(q);
   4548		blk_mq_debugfs_register_hctxs(q);
   4549	}
   4550
   4551switch_back:
   4552	list_for_each_entry(q, &set->tag_list, tag_set_list)
   4553		blk_mq_elv_switch_back(&head, q);
   4554
   4555	list_for_each_entry(q, &set->tag_list, tag_set_list)
   4556		blk_mq_unfreeze_queue(q);
   4557}
   4558
   4559void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
   4560{
   4561	mutex_lock(&set->tag_list_lock);
   4562	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
   4563	mutex_unlock(&set->tag_list_lock);
   4564}
   4565EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
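
/*
 * Typical caller: a driver whose hardware queue count changed at runtime
 * (for example an NVMe controller coming back from reset with fewer I/O
 * queues) passes the new count here. All request queues sharing the tag
 * set are frozen, their schedulers are temporarily switched to 'none',
 * the hctxs are reallocated and the sw-to-hw queue mappings rebuilt
 * before I/O is allowed to resume.
 */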
   4566
   4567/* Enable polling stats and return whether they were already enabled. */
   4568static bool blk_poll_stats_enable(struct request_queue *q)
   4569{
   4570	if (q->poll_stat)
   4571		return true;
   4572
   4573	return blk_stats_alloc_enable(q);
   4574}
   4575
   4576static void blk_mq_poll_stats_start(struct request_queue *q)
   4577{
   4578	/*
   4579	 * We don't arm the callback if polling stats are not enabled or the
   4580	 * callback is already active.
   4581	 */
   4582	if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
   4583		return;
   4584
   4585	blk_stat_activate_msecs(q->poll_cb, 100);
   4586}
   4587
   4588static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
   4589{
   4590	struct request_queue *q = cb->data;
   4591	int bucket;
   4592
   4593	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
   4594		if (cb->stat[bucket].nr_samples)
   4595			q->poll_stat[bucket] = cb->stat[bucket];
   4596	}
   4597}
   4598
   4599static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
   4600				       struct request *rq)
   4601{
   4602	unsigned long ret = 0;
   4603	int bucket;
   4604
   4605	/*
   4606	 * If stats collection isn't on, don't sleep but turn it on for
   4607	 * future users
   4608	 */
   4609	if (!blk_poll_stats_enable(q))
   4610		return 0;
   4611
   4612	/*
   4613	 * As an optimistic guess, use half of the mean service time
   4614	 * for this type of request. We can (and should) make this smarter.
   4615	 * For instance, if the completion latencies are tight, we can
   4616	 * get closer than just half the mean. This is especially
   4617	 * important on devices where the completion latencies are longer
   4618	 * than ~10 usec. We do use the stats for the relevant IO size
    4619	 * if available, which does lead to better estimates.
   4620	 */
   4621	bucket = blk_mq_poll_stats_bkt(rq);
   4622	if (bucket < 0)
   4623		return ret;
   4624
   4625	if (q->poll_stat[bucket].nr_samples)
   4626		ret = (q->poll_stat[bucket].mean + 1) / 2;
   4627
   4628	return ret;
   4629}
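
/*
 * Worked example of the estimate above: with a tracked mean completion
 * time of 20000ns for the request's bucket, the hybrid poller sleeps for
 * (20000 + 1) / 2 = 10000ns before falling back to busy polling, on the
 * assumption that the request is unlikely to complete during the first
 * half of its typical service time.
 */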
   4630
   4631static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
   4632{
   4633	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
   4634	struct request *rq = blk_qc_to_rq(hctx, qc);
   4635	struct hrtimer_sleeper hs;
   4636	enum hrtimer_mode mode;
   4637	unsigned int nsecs;
   4638	ktime_t kt;
   4639
   4640	/*
    4641	 * If a request has completed on a queue that uses an I/O scheduler, we
   4642	 * won't get back a request from blk_qc_to_rq.
   4643	 */
   4644	if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
   4645		return false;
   4646
   4647	/*
   4648	 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
   4649	 *
   4650	 *  0:	use half of prev avg
   4651	 * >0:	use this specific value
   4652	 */
   4653	if (q->poll_nsec > 0)
   4654		nsecs = q->poll_nsec;
   4655	else
   4656		nsecs = blk_mq_poll_nsecs(q, rq);
   4657
   4658	if (!nsecs)
   4659		return false;
   4660
   4661	rq->rq_flags |= RQF_MQ_POLL_SLEPT;
   4662
   4663	/*
   4664	 * This will be replaced with the stats tracking code, using
   4665	 * 'avg_completion_time / 2' as the pre-sleep target.
   4666	 */
   4667	kt = nsecs;
   4668
   4669	mode = HRTIMER_MODE_REL;
   4670	hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
   4671	hrtimer_set_expires(&hs.timer, kt);
   4672
   4673	do {
   4674		if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
   4675			break;
   4676		set_current_state(TASK_UNINTERRUPTIBLE);
   4677		hrtimer_sleeper_start_expires(&hs, mode);
   4678		if (hs.task)
   4679			io_schedule();
   4680		hrtimer_cancel(&hs.timer);
   4681		mode = HRTIMER_MODE_ABS;
   4682	} while (hs.task && !signal_pending(current));
   4683
   4684	__set_current_state(TASK_RUNNING);
   4685	destroy_hrtimer_on_stack(&hs.timer);
   4686
   4687	/*
   4688	 * If we sleep, have the caller restart the poll loop to reset the
   4689	 * state.  Like for the other success return cases, the caller is
   4690	 * responsible for checking if the IO completed.  If the IO isn't
   4691	 * complete, we'll get called again and will go straight to the busy
   4692	 * poll loop.
   4693	 */
   4694	return true;
   4695}
   4696
   4697static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
   4698			       struct io_comp_batch *iob, unsigned int flags)
   4699{
   4700	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
   4701	long state = get_current_state();
   4702	int ret;
   4703
   4704	do {
   4705		ret = q->mq_ops->poll(hctx, iob);
   4706		if (ret > 0) {
   4707			__set_current_state(TASK_RUNNING);
   4708			return ret;
   4709		}
   4710
   4711		if (signal_pending_state(state, current))
   4712			__set_current_state(TASK_RUNNING);
   4713		if (task_is_running(current))
   4714			return 1;
   4715
   4716		if (ret < 0 || (flags & BLK_POLL_ONESHOT))
   4717			break;
   4718		cpu_relax();
   4719	} while (!need_resched());
   4720
   4721	__set_current_state(TASK_RUNNING);
   4722	return 0;
   4723}
   4724
   4725int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
   4726		unsigned int flags)
   4727{
   4728	if (!(flags & BLK_POLL_NOSLEEP) &&
   4729	    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
   4730		if (blk_mq_poll_hybrid(q, cookie))
   4731			return 1;
   4732	}
   4733	return blk_mq_poll_classic(q, cookie, iob, flags);
   4734}
   4735
   4736unsigned int blk_mq_rq_cpu(struct request *rq)
   4737{
   4738	return rq->mq_ctx->cpu;
   4739}
   4740EXPORT_SYMBOL(blk_mq_rq_cpu);
   4741
   4742void blk_mq_cancel_work_sync(struct request_queue *q)
   4743{
   4744	if (queue_is_mq(q)) {
   4745		struct blk_mq_hw_ctx *hctx;
   4746		unsigned long i;
   4747
   4748		cancel_delayed_work_sync(&q->requeue_work);
   4749
   4750		queue_for_each_hw_ctx(q, hctx, i)
   4751			cancel_delayed_work_sync(&hctx->run_work);
   4752	}
   4753}
   4754
   4755static int __init blk_mq_init(void)
   4756{
   4757	int i;
   4758
   4759	for_each_possible_cpu(i)
   4760		init_llist_head(&per_cpu(blk_cpu_done, i));
   4761	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
   4762
   4763	cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
   4764				  "block/softirq:dead", NULL,
   4765				  blk_softirq_cpu_dead);
   4766	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
   4767				blk_mq_hctx_notify_dead);
   4768	cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
   4769				blk_mq_hctx_notify_online,
   4770				blk_mq_hctx_notify_offline);
   4771	return 0;
   4772}
   4773subsys_initcall(blk_mq_init);