cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mq-deadline.c (32519B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
      4 *  for the blk-mq scheduling framework
      5 *
      6 *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
      7 */
      8#include <linux/kernel.h>
      9#include <linux/fs.h>
     10#include <linux/blkdev.h>
     11#include <linux/blk-mq.h>
     12#include <linux/bio.h>
     13#include <linux/module.h>
     14#include <linux/slab.h>
     15#include <linux/init.h>
     16#include <linux/compiler.h>
     17#include <linux/rbtree.h>
     18#include <linux/sbitmap.h>
     19
     20#include <trace/events/block.h>
     21
     22#include "elevator.h"
     23#include "blk.h"
     24#include "blk-mq.h"
     25#include "blk-mq-debugfs.h"
     26#include "blk-mq-tag.h"
     27#include "blk-mq-sched.h"
     28
     29/*
     30 * See Documentation/block/deadline-iosched.rst
     31 */
     32static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
     33static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
     34/*
     35 * Time after which to dispatch lower priority requests even if higher
     36 * priority requests are pending.
     37 */
     38static const int prio_aging_expire = 10 * HZ;
     39static const int writes_starved = 2;    /* max times reads can starve a write */
     40static const int fifo_batch = 16;       /* # of sequential requests treated as one
     41				     by the above parameters. For throughput. */
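        /*
         * For example: since HZ is the number of jiffies per second, these
         * defaults put the read deadline at about 500 ms, the write deadline
         * at 5 s and the priority aging limit at 10 s, independent of the
         * CONFIG_HZ setting.
         */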
     42
     43enum dd_data_dir {
     44	DD_READ		= READ,
     45	DD_WRITE	= WRITE,
     46};
     47
     48enum { DD_DIR_COUNT = 2 };
     49
     50enum dd_prio {
     51	DD_RT_PRIO	= 0,
     52	DD_BE_PRIO	= 1,
     53	DD_IDLE_PRIO	= 2,
     54	DD_PRIO_MAX	= 2,
     55};
     56
     57enum { DD_PRIO_COUNT = 3 };
     58
     59/*
     60 * I/O statistics per I/O priority. It is fine if these counters overflow.
     61 * What matters is that these counters are at least as wide as
     62 * log2(max_outstanding_requests).
     63 */
     64struct io_stats_per_prio {
     65	uint32_t inserted;
     66	uint32_t merged;
     67	uint32_t dispatched;
     68	atomic_t completed;
     69};
     70
     71/*
     72 * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
     73 * present on both sort_list[] and fifo_list[].
     74 */
     75struct dd_per_prio {
     76	struct list_head dispatch;
     77	struct rb_root sort_list[DD_DIR_COUNT];
     78	struct list_head fifo_list[DD_DIR_COUNT];
     79	/* Next request in FIFO order. Read, write or both are NULL. */
     80	struct request *next_rq[DD_DIR_COUNT];
     81	struct io_stats_per_prio stats;
     82};
     83
     84struct deadline_data {
     85	/*
     86	 * run time data
     87	 */
     88
     89	struct dd_per_prio per_prio[DD_PRIO_COUNT];
     90
     91	/* Data direction of latest dispatched request. */
     92	enum dd_data_dir last_dir;
     93	unsigned int batching;		/* number of sequential requests made */
     94	unsigned int starved;		/* times reads have starved writes */
     95
     96	/*
     97	 * settings that change how the i/o scheduler behaves
     98	 */
     99	int fifo_expire[DD_DIR_COUNT];
    100	int fifo_batch;
    101	int writes_starved;
    102	int front_merges;
    103	u32 async_depth;
    104	int prio_aging_expire;
    105
    106	spinlock_t lock;
    107	spinlock_t zone_lock;
    108};
    109
    110/* Maps an I/O priority class to a deadline scheduler priority. */
    111static const enum dd_prio ioprio_class_to_prio[] = {
    112	[IOPRIO_CLASS_NONE]	= DD_BE_PRIO,
    113	[IOPRIO_CLASS_RT]	= DD_RT_PRIO,
    114	[IOPRIO_CLASS_BE]	= DD_BE_PRIO,
    115	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
    116};
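        /*
         * For example: requests from a task whose I/O priority class has been
         * set to realtime (e.g. with ionice -c 1) are queued under
         * per_prio[DD_RT_PRIO], while requests without an explicit I/O
         * priority (IOPRIO_CLASS_NONE) fall back to the best-effort queue,
         * per_prio[DD_BE_PRIO].
         */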
    117
    118static inline struct rb_root *
    119deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
    120{
    121	return &per_prio->sort_list[rq_data_dir(rq)];
    122}
    123
    124/*
    125 * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
    126 * request.
    127 */
    128static u8 dd_rq_ioclass(struct request *rq)
    129{
    130	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
    131}
    132
    133/*
    134 * get the request after `rq' in sector-sorted order
    135 */
    136static inline struct request *
    137deadline_latter_request(struct request *rq)
    138{
    139	struct rb_node *node = rb_next(&rq->rb_node);
    140
    141	if (node)
    142		return rb_entry_rq(node);
    143
    144	return NULL;
    145}
    146
    147static void
    148deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
    149{
    150	struct rb_root *root = deadline_rb_root(per_prio, rq);
    151
    152	elv_rb_add(root, rq);
    153}
    154
    155static inline void
    156deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
    157{
    158	const enum dd_data_dir data_dir = rq_data_dir(rq);
    159
    160	if (per_prio->next_rq[data_dir] == rq)
    161		per_prio->next_rq[data_dir] = deadline_latter_request(rq);
    162
    163	elv_rb_del(deadline_rb_root(per_prio, rq), rq);
    164}
    165
    166/*
    167 * remove rq from rbtree and fifo.
    168 */
    169static void deadline_remove_request(struct request_queue *q,
    170				    struct dd_per_prio *per_prio,
    171				    struct request *rq)
    172{
    173	list_del_init(&rq->queuelist);
    174
    175	/*
    176	 * We might not be on the rbtree, if we are doing an insert merge
    177	 */
    178	if (!RB_EMPTY_NODE(&rq->rb_node))
    179		deadline_del_rq_rb(per_prio, rq);
    180
    181	elv_rqhash_del(q, rq);
    182	if (q->last_merge == rq)
    183		q->last_merge = NULL;
    184}
    185
    186static void dd_request_merged(struct request_queue *q, struct request *req,
    187			      enum elv_merge type)
    188{
    189	struct deadline_data *dd = q->elevator->elevator_data;
    190	const u8 ioprio_class = dd_rq_ioclass(req);
    191	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
    192	struct dd_per_prio *per_prio = &dd->per_prio[prio];
    193
    194	/*
    195	 * if the merge was a front merge, we need to reposition request
    196	 */
    197	if (type == ELEVATOR_FRONT_MERGE) {
    198		elv_rb_del(deadline_rb_root(per_prio, req), req);
    199		deadline_add_rq_rb(per_prio, req);
    200	}
    201}
    202
    203/*
    204 * Callback function that is invoked after @next has been merged into @req.
    205 */
    206static void dd_merged_requests(struct request_queue *q, struct request *req,
    207			       struct request *next)
    208{
    209	struct deadline_data *dd = q->elevator->elevator_data;
    210	const u8 ioprio_class = dd_rq_ioclass(next);
    211	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
    212
    213	lockdep_assert_held(&dd->lock);
    214
    215	dd->per_prio[prio].stats.merged++;
    216
    217	/*
     218	 * if next expires before req, assign its expire time to req
     219	 * and move req into next's position in the fifo (next will be deleted)
    220	 */
    221	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
    222		if (time_before((unsigned long)next->fifo_time,
    223				(unsigned long)req->fifo_time)) {
    224			list_move(&req->queuelist, &next->queuelist);
    225			req->fifo_time = next->fifo_time;
    226		}
    227	}
    228
    229	/*
    230	 * kill knowledge of next, this one is a goner
    231	 */
    232	deadline_remove_request(q, &dd->per_prio[prio], next);
    233}
    234
    235/*
    236 * move an entry to dispatch queue
    237 */
    238static void
    239deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
    240		      struct request *rq)
    241{
    242	const enum dd_data_dir data_dir = rq_data_dir(rq);
    243
    244	per_prio->next_rq[data_dir] = deadline_latter_request(rq);
    245
    246	/*
    247	 * take it off the sort and fifo list
    248	 */
    249	deadline_remove_request(rq->q, per_prio, rq);
    250}
    251
    252/* Number of requests queued for a given priority level. */
    253static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
    254{
    255	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
    256
    257	lockdep_assert_held(&dd->lock);
    258
    259	return stats->inserted - atomic_read(&stats->completed);
    260}
    261
    262/*
    263 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
     264 * 1 otherwise. Requires !list_empty(&per_prio->fifo_list[data_dir])
    265 */
    266static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
    267				      enum dd_data_dir data_dir)
    268{
    269	struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
    270
    271	/*
    272	 * rq is expired!
    273	 */
    274	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
    275		return 1;
    276
    277	return 0;
    278}
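        /*
         * Example: a read inserted at jiffies == 1000 with the default
         * read_expire of HZ / 2 is given fifo_time = 1000 + HZ / 2, so this
         * helper starts returning 1 for the read FIFO once jiffies reaches
         * 1000 + HZ / 2.
         */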
    279
    280/*
    281 * For the specified data direction, return the next request to
    282 * dispatch using arrival ordered lists.
    283 */
    284static struct request *
    285deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
    286		      enum dd_data_dir data_dir)
    287{
    288	struct request *rq;
    289	unsigned long flags;
    290
    291	if (list_empty(&per_prio->fifo_list[data_dir]))
    292		return NULL;
    293
    294	rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
    295	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
    296		return rq;
    297
    298	/*
    299	 * Look for a write request that can be dispatched, that is one with
    300	 * an unlocked target zone.
    301	 */
    302	spin_lock_irqsave(&dd->zone_lock, flags);
    303	list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
    304		if (blk_req_can_dispatch_to_zone(rq))
    305			goto out;
    306	}
    307	rq = NULL;
    308out:
    309	spin_unlock_irqrestore(&dd->zone_lock, flags);
    310
    311	return rq;
    312}
    313
    314/*
    315 * For the specified data direction, return the next request to
    316 * dispatch using sector position sorted lists.
    317 */
    318static struct request *
    319deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
    320		      enum dd_data_dir data_dir)
    321{
    322	struct request *rq;
    323	unsigned long flags;
    324
    325	rq = per_prio->next_rq[data_dir];
    326	if (!rq)
    327		return NULL;
    328
    329	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
    330		return rq;
    331
    332	/*
    333	 * Look for a write request that can be dispatched, that is one with
    334	 * an unlocked target zone.
    335	 */
    336	spin_lock_irqsave(&dd->zone_lock, flags);
    337	while (rq) {
    338		if (blk_req_can_dispatch_to_zone(rq))
    339			break;
    340		rq = deadline_latter_request(rq);
    341	}
    342	spin_unlock_irqrestore(&dd->zone_lock, flags);
    343
    344	return rq;
    345}
    346
    347/*
    348 * Returns true if and only if @rq started after @latest_start where
    349 * @latest_start is in jiffies.
    350 */
    351static bool started_after(struct deadline_data *dd, struct request *rq,
    352			  unsigned long latest_start)
    353{
    354	unsigned long start_time = (unsigned long)rq->fifo_time;
    355
    356	start_time -= dd->fifo_expire[rq_data_dir(rq)];
    357
    358	return time_after(start_time, latest_start);
    359}
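        /*
         * Example: rq->fifo_time is normally the insertion time plus
         * fifo_expire, so subtracting fifo_expire recovers the insertion
         * time. With latest_start == jiffies - prio_aging_expire this returns
         * false exactly for requests that have already been queued for at
         * least prio_aging_expire jiffies.
         */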
    360
    361/*
    362 * deadline_dispatch_requests selects the best request according to
    363 * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
    364 */
    365static struct request *__dd_dispatch_request(struct deadline_data *dd,
    366					     struct dd_per_prio *per_prio,
    367					     unsigned long latest_start)
    368{
    369	struct request *rq, *next_rq;
    370	enum dd_data_dir data_dir;
    371	enum dd_prio prio;
    372	u8 ioprio_class;
    373
    374	lockdep_assert_held(&dd->lock);
    375
    376	if (!list_empty(&per_prio->dispatch)) {
    377		rq = list_first_entry(&per_prio->dispatch, struct request,
    378				      queuelist);
    379		if (started_after(dd, rq, latest_start))
    380			return NULL;
    381		list_del_init(&rq->queuelist);
    382		goto done;
    383	}
    384
    385	/*
    386	 * batches are currently reads XOR writes
    387	 */
    388	rq = deadline_next_request(dd, per_prio, dd->last_dir);
    389	if (rq && dd->batching < dd->fifo_batch)
     390		/* we have a next request and are still entitled to batch */
    391		goto dispatch_request;
    392
    393	/*
    394	 * at this point we are not running a batch. select the appropriate
    395	 * data direction (read / write)
    396	 */
    397
    398	if (!list_empty(&per_prio->fifo_list[DD_READ])) {
    399		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));
    400
    401		if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
    402		    (dd->starved++ >= dd->writes_starved))
    403			goto dispatch_writes;
    404
    405		data_dir = DD_READ;
    406
    407		goto dispatch_find_request;
    408	}
    409
    410	/*
    411	 * there are either no reads or writes have been starved
    412	 */
    413
    414	if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
    415dispatch_writes:
    416		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));
    417
    418		dd->starved = 0;
    419
    420		data_dir = DD_WRITE;
    421
    422		goto dispatch_find_request;
    423	}
    424
    425	return NULL;
    426
    427dispatch_find_request:
    428	/*
    429	 * we are not running a batch, find best request for selected data_dir
    430	 */
    431	next_rq = deadline_next_request(dd, per_prio, data_dir);
    432	if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
    433		/*
    434		 * A deadline has expired, the last request was in the other
    435		 * direction, or we have run out of higher-sectored requests.
    436		 * Start again from the request with the earliest expiry time.
    437		 */
    438		rq = deadline_fifo_request(dd, per_prio, data_dir);
    439	} else {
    440		/*
    441		 * The last req was the same dir and we have a next request in
    442		 * sort order. No expired requests so continue on from here.
    443		 */
    444		rq = next_rq;
    445	}
    446
    447	/*
    448	 * For a zoned block device, if we only have writes queued and none of
    449	 * them can be dispatched, rq will be NULL.
    450	 */
    451	if (!rq)
    452		return NULL;
    453
    454	dd->last_dir = data_dir;
    455	dd->batching = 0;
    456
    457dispatch_request:
    458	if (started_after(dd, rq, latest_start))
    459		return NULL;
    460
    461	/*
     462	 * rq is the selected request.
    463	 */
    464	dd->batching++;
    465	deadline_move_request(dd, per_prio, rq);
    466done:
    467	ioprio_class = dd_rq_ioclass(rq);
    468	prio = ioprio_class_to_prio[ioprio_class];
    469	dd->per_prio[prio].stats.dispatched++;
    470	/*
    471	 * If the request needs its target zone locked, do it.
    472	 */
    473	blk_req_zone_write_lock(rq);
    474	rq->rq_flags |= RQF_STARTED;
    475	return rq;
    476}
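        /*
         * Example of the batching behaviour: with the default fifo_batch of
         * 16, up to 16 requests in the current direction are dispatched back
         * to back in sector order before expired FIFO deadlines and the
         * read/write balance are considered again.
         */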
    477
    478/*
    479 * Check whether there are any requests with priority other than DD_RT_PRIO
    480 * that were inserted more than prio_aging_expire jiffies ago.
    481 */
    482static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
    483						      unsigned long now)
    484{
    485	struct request *rq;
    486	enum dd_prio prio;
    487	int prio_cnt;
    488
    489	lockdep_assert_held(&dd->lock);
    490
    491	prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
    492		   !!dd_queued(dd, DD_IDLE_PRIO);
    493	if (prio_cnt < 2)
    494		return NULL;
    495
    496	for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
    497		rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
    498					   now - dd->prio_aging_expire);
    499		if (rq)
    500			return rq;
    501	}
    502
    503	return NULL;
    504}
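        /*
         * Example: if requests from at least two priority classes are queued
         * and a best-effort or idle request was inserted more than
         * prio_aging_expire ago (10 s by default), it is dispatched even
         * though higher-priority requests are still pending, which bounds the
         * starvation of the lower priority classes.
         */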
    505
    506/*
    507 * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
    508 *
    509 * One confusing aspect here is that we get called for a specific
    510 * hardware queue, but we may return a request that is for a
    511 * different hardware queue. This is because mq-deadline has shared
    512 * state for all hardware queues, in terms of sorting, FIFOs, etc.
    513 */
    514static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
    515{
    516	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
    517	const unsigned long now = jiffies;
    518	struct request *rq;
    519	enum dd_prio prio;
    520
    521	spin_lock(&dd->lock);
    522	rq = dd_dispatch_prio_aged_requests(dd, now);
    523	if (rq)
    524		goto unlock;
    525
    526	/*
    527	 * Next, dispatch requests in priority order. Ignore lower priority
    528	 * requests if any higher priority requests are pending.
    529	 */
    530	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
    531		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
    532		if (rq || dd_queued(dd, prio))
    533			break;
    534	}
    535
    536unlock:
    537	spin_unlock(&dd->lock);
    538
    539	return rq;
    540}
    541
    542/*
    543 * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
    544 * function is used by __blk_mq_get_tag().
    545 */
    546static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
    547{
    548	struct deadline_data *dd = data->q->elevator->elevator_data;
    549
    550	/* Do not throttle synchronous reads. */
    551	if (op_is_sync(op) && !op_is_write(op))
    552		return;
    553
    554	/*
    555	 * Throttle asynchronous requests and writes such that these requests
    556	 * do not block the allocation of synchronous requests.
    557	 */
    558	data->shallow_depth = dd->async_depth;
    559}
    560
    561/* Called by blk_mq_update_nr_requests(). */
    562static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
    563{
    564	struct request_queue *q = hctx->queue;
    565	struct deadline_data *dd = q->elevator->elevator_data;
    566	struct blk_mq_tags *tags = hctx->sched_tags;
    567
    568	dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
    569
    570	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
    571}
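        /*
         * Example: with q->nr_requests == 64, dd_depth_updated() sets
         * async_depth to 48, so asynchronous requests and writes are limited
         * to roughly 48 of the 64 scheduler tags, keeping the remainder
         * available for synchronous reads.
         */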
    572
    573/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
    574static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
    575{
    576	dd_depth_updated(hctx);
    577	return 0;
    578}
    579
    580static void dd_exit_sched(struct elevator_queue *e)
    581{
    582	struct deadline_data *dd = e->elevator_data;
    583	enum dd_prio prio;
    584
    585	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
    586		struct dd_per_prio *per_prio = &dd->per_prio[prio];
    587		const struct io_stats_per_prio *stats = &per_prio->stats;
    588		uint32_t queued;
    589
    590		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
    591		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
    592
    593		spin_lock(&dd->lock);
    594		queued = dd_queued(dd, prio);
    595		spin_unlock(&dd->lock);
    596
    597		WARN_ONCE(queued != 0,
    598			  "statistics for priority %d: i %u m %u d %u c %u\n",
    599			  prio, stats->inserted, stats->merged,
    600			  stats->dispatched, atomic_read(&stats->completed));
    601	}
    602
    603	kfree(dd);
    604}
    605
    606/*
    607 * initialize elevator private data (deadline_data).
    608 */
    609static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
    610{
    611	struct deadline_data *dd;
    612	struct elevator_queue *eq;
    613	enum dd_prio prio;
    614	int ret = -ENOMEM;
    615
    616	eq = elevator_alloc(q, e);
    617	if (!eq)
    618		return ret;
    619
    620	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
    621	if (!dd)
    622		goto put_eq;
    623
    624	eq->elevator_data = dd;
    625
    626	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
    627		struct dd_per_prio *per_prio = &dd->per_prio[prio];
    628
    629		INIT_LIST_HEAD(&per_prio->dispatch);
    630		INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
    631		INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
    632		per_prio->sort_list[DD_READ] = RB_ROOT;
    633		per_prio->sort_list[DD_WRITE] = RB_ROOT;
    634	}
    635	dd->fifo_expire[DD_READ] = read_expire;
    636	dd->fifo_expire[DD_WRITE] = write_expire;
    637	dd->writes_starved = writes_starved;
    638	dd->front_merges = 1;
    639	dd->last_dir = DD_WRITE;
    640	dd->fifo_batch = fifo_batch;
    641	dd->prio_aging_expire = prio_aging_expire;
    642	spin_lock_init(&dd->lock);
    643	spin_lock_init(&dd->zone_lock);
    644
     645	/* We dispatch queue-wide instead of per hardware queue */
    646	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
    647
    648	q->elevator = eq;
    649	return 0;
    650
    651put_eq:
    652	kobject_put(&eq->kobj);
    653	return ret;
    654}
    655
    656/*
    657 * Try to merge @bio into an existing request. If @bio has been merged into
    658 * an existing request, store the pointer to that request into *@rq.
    659 */
    660static int dd_request_merge(struct request_queue *q, struct request **rq,
    661			    struct bio *bio)
    662{
    663	struct deadline_data *dd = q->elevator->elevator_data;
    664	const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
    665	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
    666	struct dd_per_prio *per_prio = &dd->per_prio[prio];
    667	sector_t sector = bio_end_sector(bio);
    668	struct request *__rq;
    669
    670	if (!dd->front_merges)
    671		return ELEVATOR_NO_MERGE;
    672
    673	__rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
    674	if (__rq) {
    675		BUG_ON(sector != blk_rq_pos(__rq));
    676
    677		if (elv_bio_merge_ok(__rq, bio)) {
    678			*rq = __rq;
    679			if (blk_discard_mergable(__rq))
    680				return ELEVATOR_DISCARD_MERGE;
    681			return ELEVATOR_FRONT_MERGE;
    682		}
    683	}
    684
    685	return ELEVATOR_NO_MERGE;
    686}
    687
    688/*
    689 * Attempt to merge a bio into an existing request. This function is called
    690 * before @bio is associated with a request.
    691 */
    692static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
    693		unsigned int nr_segs)
    694{
    695	struct deadline_data *dd = q->elevator->elevator_data;
    696	struct request *free = NULL;
    697	bool ret;
    698
    699	spin_lock(&dd->lock);
    700	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
    701	spin_unlock(&dd->lock);
    702
    703	if (free)
    704		blk_mq_free_request(free);
    705
    706	return ret;
    707}
    708
    709/*
    710 * add rq to rbtree and fifo
    711 */
    712static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
    713			      bool at_head)
    714{
    715	struct request_queue *q = hctx->queue;
    716	struct deadline_data *dd = q->elevator->elevator_data;
    717	const enum dd_data_dir data_dir = rq_data_dir(rq);
    718	u16 ioprio = req_get_ioprio(rq);
    719	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
    720	struct dd_per_prio *per_prio;
    721	enum dd_prio prio;
    722	LIST_HEAD(free);
    723
    724	lockdep_assert_held(&dd->lock);
    725
    726	/*
    727	 * This may be a requeue of a write request that has locked its
    728	 * target zone. If it is the case, this releases the zone lock.
    729	 */
    730	blk_req_zone_write_unlock(rq);
    731
    732	prio = ioprio_class_to_prio[ioprio_class];
    733	per_prio = &dd->per_prio[prio];
    734	if (!rq->elv.priv[0]) {
    735		per_prio->stats.inserted++;
    736		rq->elv.priv[0] = (void *)(uintptr_t)1;
    737	}
    738
    739	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
    740		blk_mq_free_requests(&free);
    741		return;
    742	}
    743
    744	trace_block_rq_insert(rq);
    745
    746	if (at_head) {
    747		list_add(&rq->queuelist, &per_prio->dispatch);
    748		rq->fifo_time = jiffies;
    749	} else {
    750		deadline_add_rq_rb(per_prio, rq);
    751
    752		if (rq_mergeable(rq)) {
    753			elv_rqhash_add(q, rq);
    754			if (!q->last_merge)
    755				q->last_merge = rq;
    756		}
    757
    758		/*
    759		 * set expire time and add to fifo list
    760		 */
    761		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
    762		list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
    763	}
    764}
    765
    766/*
    767 * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests().
    768 */
    769static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
    770			       struct list_head *list, bool at_head)
    771{
    772	struct request_queue *q = hctx->queue;
    773	struct deadline_data *dd = q->elevator->elevator_data;
    774
    775	spin_lock(&dd->lock);
    776	while (!list_empty(list)) {
    777		struct request *rq;
    778
    779		rq = list_first_entry(list, struct request, queuelist);
    780		list_del_init(&rq->queuelist);
    781		dd_insert_request(hctx, rq, at_head);
    782	}
    783	spin_unlock(&dd->lock);
    784}
    785
    786/* Callback from inside blk_mq_rq_ctx_init(). */
    787static void dd_prepare_request(struct request *rq)
    788{
    789	rq->elv.priv[0] = NULL;
    790}
    791
    792/*
    793 * Callback from inside blk_mq_free_request().
    794 *
    795 * For zoned block devices, write unlock the target zone of
    796 * completed write requests. Do this while holding the zone lock
    797 * spinlock so that the zone is never unlocked while deadline_fifo_request()
    798 * or deadline_next_request() are executing. This function is called for
    799 * all requests, whether or not these requests complete successfully.
    800 *
    801 * For a zoned block device, __dd_dispatch_request() may have stopped
    802 * dispatching requests if all the queued requests are write requests directed
    803 * at zones that are already locked due to on-going write requests. To ensure
    804 * write request dispatch progress in this case, mark the queue as needing a
    805 * restart to ensure that the queue is run again after completion of the
    806 * request and zones being unlocked.
    807 */
    808static void dd_finish_request(struct request *rq)
    809{
    810	struct request_queue *q = rq->q;
    811	struct deadline_data *dd = q->elevator->elevator_data;
    812	const u8 ioprio_class = dd_rq_ioclass(rq);
    813	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
    814	struct dd_per_prio *per_prio = &dd->per_prio[prio];
    815
    816	/*
    817	 * The block layer core may call dd_finish_request() without having
    818	 * called dd_insert_requests(). Skip requests that bypassed I/O
    819	 * scheduling. See also blk_mq_request_bypass_insert().
    820	 */
    821	if (!rq->elv.priv[0])
    822		return;
    823
    824	atomic_inc(&per_prio->stats.completed);
    825
    826	if (blk_queue_is_zoned(q)) {
    827		unsigned long flags;
    828
    829		spin_lock_irqsave(&dd->zone_lock, flags);
    830		blk_req_zone_write_unlock(rq);
    831		if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
    832			blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
    833		spin_unlock_irqrestore(&dd->zone_lock, flags);
    834	}
    835}
    836
    837static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
    838{
    839	return !list_empty_careful(&per_prio->dispatch) ||
    840		!list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
    841		!list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
    842}
    843
    844static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
    845{
    846	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
    847	enum dd_prio prio;
    848
    849	for (prio = 0; prio <= DD_PRIO_MAX; prio++)
    850		if (dd_has_work_for_prio(&dd->per_prio[prio]))
    851			return true;
    852
    853	return false;
    854}
    855
    856/*
    857 * sysfs parts below
    858 */
    859#define SHOW_INT(__FUNC, __VAR)						\
    860static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
    861{									\
    862	struct deadline_data *dd = e->elevator_data;			\
    863									\
    864	return sysfs_emit(page, "%d\n", __VAR);				\
    865}
    866#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
    867SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
    868SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
    869SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
    870SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
    871SHOW_INT(deadline_front_merges_show, dd->front_merges);
    872SHOW_INT(deadline_async_depth_show, dd->async_depth);
    873SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
    874#undef SHOW_INT
    875#undef SHOW_JIFFIES
    876
    877#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
    878static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
    879{									\
    880	struct deadline_data *dd = e->elevator_data;			\
    881	int __data, __ret;						\
    882									\
    883	__ret = kstrtoint(page, 0, &__data);				\
    884	if (__ret < 0)							\
    885		return __ret;						\
    886	if (__data < (MIN))						\
    887		__data = (MIN);						\
    888	else if (__data > (MAX))					\
    889		__data = (MAX);						\
    890	*(__PTR) = __CONV(__data);					\
    891	return count;							\
    892}
    893#define STORE_INT(__FUNC, __PTR, MIN, MAX)				\
    894	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
    895#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX)				\
    896	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
    897STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
    898STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
    899STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
    900STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
    901STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
    902STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
    903STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
    904#undef STORE_FUNCTION
    905#undef STORE_INT
    906#undef STORE_JIFFIES
    907
    908#define DD_ATTR(name) \
    909	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
    910
    911static struct elv_fs_entry deadline_attrs[] = {
    912	DD_ATTR(read_expire),
    913	DD_ATTR(write_expire),
    914	DD_ATTR(writes_starved),
    915	DD_ATTR(front_merges),
    916	DD_ATTR(async_depth),
    917	DD_ATTR(fifo_batch),
    918	DD_ATTR(prio_aging_expire),
    919	__ATTR_NULL
    920};
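        /*
         * Once mq-deadline is the active scheduler these attributes are
         * exposed under /sys/block/<disk>/queue/iosched/. For example,
         * writing 100 to the read_expire file stores msecs_to_jiffies(100)
         * in dd->fifo_expire[DD_READ].
         */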
    921
    922#ifdef CONFIG_BLK_DEBUG_FS
    923#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name)		\
    924static void *deadline_##name##_fifo_start(struct seq_file *m,		\
    925					  loff_t *pos)			\
    926	__acquires(&dd->lock)						\
    927{									\
    928	struct request_queue *q = m->private;				\
    929	struct deadline_data *dd = q->elevator->elevator_data;		\
    930	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
    931									\
    932	spin_lock(&dd->lock);						\
    933	return seq_list_start(&per_prio->fifo_list[data_dir], *pos);	\
    934}									\
    935									\
    936static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,	\
    937					 loff_t *pos)			\
    938{									\
    939	struct request_queue *q = m->private;				\
    940	struct deadline_data *dd = q->elevator->elevator_data;		\
    941	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
    942									\
    943	return seq_list_next(v, &per_prio->fifo_list[data_dir], pos);	\
    944}									\
    945									\
    946static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)	\
    947	__releases(&dd->lock)						\
    948{									\
    949	struct request_queue *q = m->private;				\
    950	struct deadline_data *dd = q->elevator->elevator_data;		\
    951									\
    952	spin_unlock(&dd->lock);						\
    953}									\
    954									\
    955static const struct seq_operations deadline_##name##_fifo_seq_ops = {	\
    956	.start	= deadline_##name##_fifo_start,				\
    957	.next	= deadline_##name##_fifo_next,				\
    958	.stop	= deadline_##name##_fifo_stop,				\
    959	.show	= blk_mq_debugfs_rq_show,				\
    960};									\
    961									\
    962static int deadline_##name##_next_rq_show(void *data,			\
    963					  struct seq_file *m)		\
    964{									\
    965	struct request_queue *q = data;					\
    966	struct deadline_data *dd = q->elevator->elevator_data;		\
    967	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
    968	struct request *rq = per_prio->next_rq[data_dir];		\
    969									\
    970	if (rq)								\
    971		__blk_mq_debugfs_rq_show(m, rq);			\
    972	return 0;							\
    973}
    974
    975DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
    976DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
    977DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
    978DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
    979DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
    980DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
    981#undef DEADLINE_DEBUGFS_DDIR_ATTRS
    982
    983static int deadline_batching_show(void *data, struct seq_file *m)
    984{
    985	struct request_queue *q = data;
    986	struct deadline_data *dd = q->elevator->elevator_data;
    987
    988	seq_printf(m, "%u\n", dd->batching);
    989	return 0;
    990}
    991
    992static int deadline_starved_show(void *data, struct seq_file *m)
    993{
    994	struct request_queue *q = data;
    995	struct deadline_data *dd = q->elevator->elevator_data;
    996
    997	seq_printf(m, "%u\n", dd->starved);
    998	return 0;
    999}
   1000
   1001static int dd_async_depth_show(void *data, struct seq_file *m)
   1002{
   1003	struct request_queue *q = data;
   1004	struct deadline_data *dd = q->elevator->elevator_data;
   1005
   1006	seq_printf(m, "%u\n", dd->async_depth);
   1007	return 0;
   1008}
   1009
   1010static int dd_queued_show(void *data, struct seq_file *m)
   1011{
   1012	struct request_queue *q = data;
   1013	struct deadline_data *dd = q->elevator->elevator_data;
   1014	u32 rt, be, idle;
   1015
   1016	spin_lock(&dd->lock);
   1017	rt = dd_queued(dd, DD_RT_PRIO);
   1018	be = dd_queued(dd, DD_BE_PRIO);
   1019	idle = dd_queued(dd, DD_IDLE_PRIO);
   1020	spin_unlock(&dd->lock);
   1021
   1022	seq_printf(m, "%u %u %u\n", rt, be, idle);
   1023
   1024	return 0;
   1025}
   1026
   1027/* Number of requests owned by the block driver for a given priority. */
   1028static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
   1029{
   1030	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
   1031
   1032	lockdep_assert_held(&dd->lock);
   1033
   1034	return stats->dispatched + stats->merged -
   1035		atomic_read(&stats->completed);
   1036}
   1037
   1038static int dd_owned_by_driver_show(void *data, struct seq_file *m)
   1039{
   1040	struct request_queue *q = data;
   1041	struct deadline_data *dd = q->elevator->elevator_data;
   1042	u32 rt, be, idle;
   1043
   1044	spin_lock(&dd->lock);
   1045	rt = dd_owned_by_driver(dd, DD_RT_PRIO);
   1046	be = dd_owned_by_driver(dd, DD_BE_PRIO);
   1047	idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
   1048	spin_unlock(&dd->lock);
   1049
   1050	seq_printf(m, "%u %u %u\n", rt, be, idle);
   1051
   1052	return 0;
   1053}
   1054
   1055#define DEADLINE_DISPATCH_ATTR(prio)					\
   1056static void *deadline_dispatch##prio##_start(struct seq_file *m,	\
   1057					     loff_t *pos)		\
   1058	__acquires(&dd->lock)						\
   1059{									\
   1060	struct request_queue *q = m->private;				\
   1061	struct deadline_data *dd = q->elevator->elevator_data;		\
   1062	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
   1063									\
   1064	spin_lock(&dd->lock);						\
   1065	return seq_list_start(&per_prio->dispatch, *pos);		\
   1066}									\
   1067									\
   1068static void *deadline_dispatch##prio##_next(struct seq_file *m,		\
   1069					    void *v, loff_t *pos)	\
   1070{									\
   1071	struct request_queue *q = m->private;				\
   1072	struct deadline_data *dd = q->elevator->elevator_data;		\
   1073	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
   1074									\
   1075	return seq_list_next(v, &per_prio->dispatch, pos);		\
   1076}									\
   1077									\
   1078static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v)	\
   1079	__releases(&dd->lock)						\
   1080{									\
   1081	struct request_queue *q = m->private;				\
   1082	struct deadline_data *dd = q->elevator->elevator_data;		\
   1083									\
   1084	spin_unlock(&dd->lock);						\
   1085}									\
   1086									\
   1087static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
   1088	.start	= deadline_dispatch##prio##_start,			\
   1089	.next	= deadline_dispatch##prio##_next,			\
   1090	.stop	= deadline_dispatch##prio##_stop,			\
   1091	.show	= blk_mq_debugfs_rq_show,				\
   1092}
   1093
   1094DEADLINE_DISPATCH_ATTR(0);
   1095DEADLINE_DISPATCH_ATTR(1);
   1096DEADLINE_DISPATCH_ATTR(2);
   1097#undef DEADLINE_DISPATCH_ATTR
   1098
   1099#define DEADLINE_QUEUE_DDIR_ATTRS(name)					\
   1100	{#name "_fifo_list", 0400,					\
   1101			.seq_ops = &deadline_##name##_fifo_seq_ops}
   1102#define DEADLINE_NEXT_RQ_ATTR(name)					\
   1103	{#name "_next_rq", 0400, deadline_##name##_next_rq_show}
   1104static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
   1105	DEADLINE_QUEUE_DDIR_ATTRS(read0),
   1106	DEADLINE_QUEUE_DDIR_ATTRS(write0),
   1107	DEADLINE_QUEUE_DDIR_ATTRS(read1),
   1108	DEADLINE_QUEUE_DDIR_ATTRS(write1),
   1109	DEADLINE_QUEUE_DDIR_ATTRS(read2),
   1110	DEADLINE_QUEUE_DDIR_ATTRS(write2),
   1111	DEADLINE_NEXT_RQ_ATTR(read0),
   1112	DEADLINE_NEXT_RQ_ATTR(write0),
   1113	DEADLINE_NEXT_RQ_ATTR(read1),
   1114	DEADLINE_NEXT_RQ_ATTR(write1),
   1115	DEADLINE_NEXT_RQ_ATTR(read2),
   1116	DEADLINE_NEXT_RQ_ATTR(write2),
   1117	{"batching", 0400, deadline_batching_show},
   1118	{"starved", 0400, deadline_starved_show},
   1119	{"async_depth", 0400, dd_async_depth_show},
   1120	{"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
   1121	{"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
   1122	{"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
   1123	{"owned_by_driver", 0400, dd_owned_by_driver_show},
   1124	{"queued", 0400, dd_queued_show},
   1125	{},
   1126};
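        /*
         * With CONFIG_BLK_DEBUG_FS enabled these entries appear under
         * /sys/kernel/debug/block/<disk>/sched/, e.g. read0_fifo_list shows
         * the realtime-priority read FIFO and dispatch2 the idle-priority
         * dispatch list.
         */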
   1127#undef DEADLINE_QUEUE_DDIR_ATTRS
   1128#endif
   1129
   1130static struct elevator_type mq_deadline = {
   1131	.ops = {
   1132		.depth_updated		= dd_depth_updated,
   1133		.limit_depth		= dd_limit_depth,
   1134		.insert_requests	= dd_insert_requests,
   1135		.dispatch_request	= dd_dispatch_request,
   1136		.prepare_request	= dd_prepare_request,
   1137		.finish_request		= dd_finish_request,
   1138		.next_request		= elv_rb_latter_request,
   1139		.former_request		= elv_rb_former_request,
   1140		.bio_merge		= dd_bio_merge,
   1141		.request_merge		= dd_request_merge,
   1142		.requests_merged	= dd_merged_requests,
   1143		.request_merged		= dd_request_merged,
   1144		.has_work		= dd_has_work,
   1145		.init_sched		= dd_init_sched,
   1146		.exit_sched		= dd_exit_sched,
   1147		.init_hctx		= dd_init_hctx,
   1148	},
   1149
   1150#ifdef CONFIG_BLK_DEBUG_FS
   1151	.queue_debugfs_attrs = deadline_queue_debugfs_attrs,
   1152#endif
   1153	.elevator_attrs = deadline_attrs,
   1154	.elevator_name = "mq-deadline",
   1155	.elevator_alias = "deadline",
   1156	.elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
   1157	.elevator_owner = THIS_MODULE,
   1158};
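        /*
         * Example: once this elevator type is registered, the scheduler can
         * be selected at run time with
         *   echo mq-deadline > /sys/block/<disk>/queue/scheduler
         * and the alias "deadline" selects it as well.
         */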
   1159MODULE_ALIAS("mq-deadline-iosched");
   1160
   1161static int __init deadline_init(void)
   1162{
   1163	return elv_register(&mq_deadline);
   1164}
   1165
   1166static void __exit deadline_exit(void)
   1167{
   1168	elv_unregister(&mq_deadline);
   1169}
   1170
   1171module_init(deadline_init);
   1172module_exit(deadline_exit);
   1173
   1174MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
   1175MODULE_LICENSE("GPL");
   1176MODULE_DESCRIPTION("MQ deadline IO scheduler");