cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

blk-mq.h (34577B)


      1/* SPDX-License-Identifier: GPL-2.0 */
      2#ifndef BLK_MQ_H
      3#define BLK_MQ_H
      4
      5#include <linux/blkdev.h>
      6#include <linux/sbitmap.h>
      7#include <linux/lockdep.h>
      8#include <linux/scatterlist.h>
      9#include <linux/prefetch.h>
     10
     11struct blk_mq_tags;
     12struct blk_flush_queue;
     13
     14#define BLKDEV_MIN_RQ	4
     15#define BLKDEV_DEFAULT_RQ	128
     16
     17typedef void (rq_end_io_fn)(struct request *, blk_status_t);
     18
      19
      20/* request flags */
     21typedef __u32 __bitwise req_flags_t;
     22
     23/* drive already may have started this one */
     24#define RQF_STARTED		((__force req_flags_t)(1 << 1))
     25/* may not be passed by ioscheduler */
     26#define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
     27/* request for flush sequence */
     28#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
     29/* merge of different types, fail separately */
     30#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
     31/* track inflight for MQ */
     32#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
     33/* don't call prep for this one */
     34#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
     35/* vaguely specified driver internal error.  Ignored by the block layer */
     36#define RQF_FAILED		((__force req_flags_t)(1 << 10))
     37/* don't warn about errors */
     38#define RQF_QUIET		((__force req_flags_t)(1 << 11))
     39/* elevator private data attached */
     40#define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
     41/* account into disk and partition IO statistics */
     42#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
     43/* runtime pm request */
     44#define RQF_PM			((__force req_flags_t)(1 << 15))
     45/* on IO scheduler merge hash */
     46#define RQF_HASHED		((__force req_flags_t)(1 << 16))
     47/* track IO completion time */
     48#define RQF_STATS		((__force req_flags_t)(1 << 17))
     49/* Look at ->special_vec for the actual data payload instead of the
     50   bio chain. */
     51#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
     52/* The per-zone write lock is held for this request */
     53#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
     54/* already slept for hybrid poll */
     55#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
     56/* ->timeout has been called, don't expire again */
     57#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
     58/* queue has elevator attached */
     59#define RQF_ELV			((__force req_flags_t)(1 << 22))
     60
     61/* flags that prevent us from merging requests: */
     62#define RQF_NOMERGE_FLAGS \
     63	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
     64
     65enum mq_rq_state {
     66	MQ_RQ_IDLE		= 0,
     67	MQ_RQ_IN_FLIGHT		= 1,
     68	MQ_RQ_COMPLETE		= 2,
     69};
     70
     71/*
     72 * Try to put the fields that are referenced together in the same cacheline.
     73 *
     74 * If you modify this structure, make sure to update blk_rq_init() and
     75 * especially blk_mq_rq_ctx_init() to take care of the added fields.
     76 */
     77struct request {
     78	struct request_queue *q;
     79	struct blk_mq_ctx *mq_ctx;
     80	struct blk_mq_hw_ctx *mq_hctx;
     81
     82	unsigned int cmd_flags;		/* op and common flags */
     83	req_flags_t rq_flags;
     84
     85	int tag;
     86	int internal_tag;
     87
     88	unsigned int timeout;
     89
     90	/* the following two fields are internal, NEVER access directly */
     91	unsigned int __data_len;	/* total data len */
     92	sector_t __sector;		/* sector cursor */
     93
     94	struct bio *bio;
     95	struct bio *biotail;
     96
     97	union {
     98		struct list_head queuelist;
     99		struct request *rq_next;
    100	};
    101
    102	struct block_device *part;
    103#ifdef CONFIG_BLK_RQ_ALLOC_TIME
    104	/* Time that the first bio started allocating this request. */
    105	u64 alloc_time_ns;
    106#endif
    107	/* Time that this request was allocated for this IO. */
    108	u64 start_time_ns;
    109	/* Time that I/O was submitted to the device. */
    110	u64 io_start_time_ns;
    111
    112#ifdef CONFIG_BLK_WBT
    113	unsigned short wbt_flags;
    114#endif
    115	/*
    116	 * rq sectors used for blk stats. It has the same value
     117	 * as blk_rq_sectors(rq), except that it is never zeroed
    118	 * by completion.
    119	 */
    120	unsigned short stats_sectors;
    121
    122	/*
    123	 * Number of scatter-gather DMA addr+len pairs after
    124	 * physical address coalescing is performed.
    125	 */
    126	unsigned short nr_phys_segments;
    127
    128#ifdef CONFIG_BLK_DEV_INTEGRITY
    129	unsigned short nr_integrity_segments;
    130#endif
    131
    132#ifdef CONFIG_BLK_INLINE_ENCRYPTION
    133	struct bio_crypt_ctx *crypt_ctx;
    134	struct blk_crypto_keyslot *crypt_keyslot;
    135#endif
    136
    137	unsigned short write_hint;
    138	unsigned short ioprio;
    139
    140	enum mq_rq_state state;
    141	atomic_t ref;
    142
    143	unsigned long deadline;
    144
    145	/*
    146	 * The hash is used inside the scheduler, and killed once the
    147	 * request reaches the dispatch list. The ipi_list is only used
    148	 * to queue the request for softirq completion, which is long
    149	 * after the request has been unhashed (and even removed from
    150	 * the dispatch list).
    151	 */
    152	union {
    153		struct hlist_node hash;	/* merge hash */
    154		struct llist_node ipi_list;
    155	};
    156
    157	/*
    158	 * The rb_node is only used inside the io scheduler, requests
    159	 * are pruned when moved to the dispatch queue. So let the
    160	 * completion_data share space with the rb_node.
    161	 */
    162	union {
    163		struct rb_node rb_node;	/* sort/lookup */
    164		struct bio_vec special_vec;
    165		void *completion_data;
    166	};
    167
    168
    169	/*
    170	 * Three pointers are available for the IO schedulers, if they need
    171	 * more they have to dynamically allocate it.  Flush requests are
    172	 * never put on the IO scheduler. So let the flush fields share
    173	 * space with the elevator data.
    174	 */
    175	union {
    176		struct {
    177			struct io_cq		*icq;
    178			void			*priv[2];
    179		} elv;
    180
    181		struct {
    182			unsigned int		seq;
    183			struct list_head	list;
    184			rq_end_io_fn		*saved_end_io;
    185		} flush;
    186	};
    187
    188	union {
    189		struct __call_single_data csd;
    190		u64 fifo_time;
    191	};
    192
    193	/*
    194	 * completion callback.
    195	 */
    196	rq_end_io_fn *end_io;
    197	void *end_io_data;
    198};
    199
    200#define req_op(req) \
    201	((req)->cmd_flags & REQ_OP_MASK)
    202
    203static inline bool blk_rq_is_passthrough(struct request *rq)
    204{
    205	return blk_op_is_passthrough(req_op(rq));
    206}
    207
    208static inline unsigned short req_get_ioprio(struct request *req)
    209{
    210	return req->ioprio;
    211}
    212
    213#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)
    214
    215#define rq_dma_dir(rq) \
    216	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
    217
    218#define rq_list_add(listptr, rq)	do {		\
    219	(rq)->rq_next = *(listptr);			\
    220	*(listptr) = rq;				\
    221} while (0)
    222
    223#define rq_list_pop(listptr)				\
    224({							\
    225	struct request *__req = NULL;			\
    226	if ((listptr) && *(listptr))	{		\
    227		__req = *(listptr);			\
    228		*(listptr) = __req->rq_next;		\
    229	}						\
    230	__req;						\
    231})
    232
    233#define rq_list_peek(listptr)				\
    234({							\
    235	struct request *__req = NULL;			\
    236	if ((listptr) && *(listptr))			\
    237		__req = *(listptr);			\
    238	__req;						\
    239})
    240
    241#define rq_list_for_each(listptr, pos)			\
    242	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
    243
    244#define rq_list_for_each_safe(listptr, pos, nxt)			\
    245	for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos);	\
    246		pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)
    247
    248#define rq_list_next(rq)	(rq)->rq_next
    249#define rq_list_empty(list)	((list) == (struct request *) NULL)
    250
    251/**
    252 * rq_list_move() - move a struct request from one list to another
    253 * @src: The source list @rq is currently in
    254 * @dst: The destination list that @rq will be appended to
    255 * @rq: The request to move
    256 * @prev: The request preceding @rq in @src (NULL if @rq is the head)
    257 */
    258static inline void rq_list_move(struct request **src, struct request **dst,
    259				struct request *rq, struct request *prev)
    260{
    261	if (prev)
    262		prev->rq_next = rq->rq_next;
    263	else
    264		*src = rq->rq_next;
    265	rq_list_add(dst, rq);
    266}
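
/*
 * Illustrative sketch of how the rq_list helpers above combine: a list is
 * just a NULL-initialised struct request pointer, rq_list_add() pushes onto
 * the head and rq_list_for_each() walks the chain. The function name is
 * hypothetical, not part of the blk-mq API.
 */
static inline unsigned int example_rq_list_usage(struct request *rq)
{
	struct request *rqlist = NULL;	/* an empty list */
	struct request *pos;
	unsigned int count = 0;

	rq_list_add(&rqlist, rq);	/* push rq onto the head */

	rq_list_for_each(&rqlist, pos)	/* walk from head to tail */
		count++;

	return count;			/* 1 for a single-entry list */
}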
    267
    268enum blk_eh_timer_return {
     269	BLK_EH_DONE,		/* driver has completed the command */
    270	BLK_EH_RESET_TIMER,	/* reset timer and try again */
    271};
    272
    273#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
    274#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
    275
    276/**
    277 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
    278 * block device
    279 */
    280struct blk_mq_hw_ctx {
    281	struct {
    282		/** @lock: Protects the dispatch list. */
    283		spinlock_t		lock;
    284		/**
    285		 * @dispatch: Used for requests that are ready to be
    286		 * dispatched to the hardware but for some reason (e.g. lack of
    287		 * resources) could not be sent to the hardware. As soon as the
     288		 * driver can send new requests, requests on this list will
    289		 * be sent first for a fairer dispatch.
    290		 */
    291		struct list_head	dispatch;
    292		 /**
    293		  * @state: BLK_MQ_S_* flags. Defines the state of the hw
    294		  * queue (active, scheduled to restart, stopped).
    295		  */
    296		unsigned long		state;
    297	} ____cacheline_aligned_in_smp;
    298
    299	/**
    300	 * @run_work: Used for scheduling a hardware queue run at a later time.
    301	 */
    302	struct delayed_work	run_work;
    303	/** @cpumask: Map of available CPUs where this hctx can run. */
    304	cpumask_var_t		cpumask;
    305	/**
    306	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
    307	 * selection from @cpumask.
    308	 */
    309	int			next_cpu;
    310	/**
     311	 * @next_cpu_batch: Counter of how many queue runs are left in the
     312	 * batch before changing to the next CPU.
    313	 */
    314	int			next_cpu_batch;
    315
    316	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
    317	unsigned long		flags;
    318
    319	/**
    320	 * @sched_data: Pointer owned by the IO scheduler attached to a request
    321	 * queue. It's up to the IO scheduler how to use this pointer.
    322	 */
    323	void			*sched_data;
    324	/**
    325	 * @queue: Pointer to the request queue that owns this hardware context.
    326	 */
    327	struct request_queue	*queue;
    328	/** @fq: Queue of requests that need to perform a flush operation. */
    329	struct blk_flush_queue	*fq;
    330
    331	/**
    332	 * @driver_data: Pointer to data owned by the block driver that created
    333	 * this hctx
    334	 */
    335	void			*driver_data;
    336
    337	/**
     338	 * @ctx_map: Bitmap for each software queue. If a bit is set, there is a
    339	 * pending request in that software queue.
    340	 */
    341	struct sbitmap		ctx_map;
    342
    343	/**
    344	 * @dispatch_from: Software queue to be used when no scheduler was
    345	 * selected.
    346	 */
    347	struct blk_mq_ctx	*dispatch_from;
    348	/**
    349	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
     350	 * decide if the hw_queue is busy using an Exponential Weighted Moving
    351	 * Average algorithm.
    352	 */
    353	unsigned int		dispatch_busy;
    354
    355	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
    356	unsigned short		type;
    357	/** @nr_ctx: Number of software queues. */
    358	unsigned short		nr_ctx;
    359	/** @ctxs: Array of software queues. */
    360	struct blk_mq_ctx	**ctxs;
    361
    362	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
    363	spinlock_t		dispatch_wait_lock;
    364	/**
    365	 * @dispatch_wait: Waitqueue to put requests when there is no tag
    366	 * available at the moment, to wait for another try in the future.
    367	 */
    368	wait_queue_entry_t	dispatch_wait;
    369
    370	/**
    371	 * @wait_index: Index of next available dispatch_wait queue to insert
    372	 * requests.
    373	 */
    374	atomic_t		wait_index;
    375
    376	/**
     377	 * @tags: Tags owned by the block driver. A tag in this set is only
    378	 * assigned when a request is dispatched from a hardware queue.
    379	 */
    380	struct blk_mq_tags	*tags;
    381	/**
    382	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
    383	 * scheduler associated with a request queue, a tag is assigned when
     384	 * a request is allocated. Otherwise, this member is not used.
    385	 */
    386	struct blk_mq_tags	*sched_tags;
    387
    388	/** @queued: Number of queued requests. */
    389	unsigned long		queued;
    390	/** @run: Number of dispatched requests. */
    391	unsigned long		run;
    392
    393	/** @numa_node: NUMA node the storage adapter has been connected to. */
    394	unsigned int		numa_node;
    395	/** @queue_num: Index of this hardware queue. */
    396	unsigned int		queue_num;
    397
    398	/**
    399	 * @nr_active: Number of active requests. Only used when a tag set is
    400	 * shared across request queues.
    401	 */
    402	atomic_t		nr_active;
    403
     404	/** @cpuhp_online: List to store requests if a CPU is going to die. */
     405	struct hlist_node	cpuhp_online;
     406	/** @cpuhp_dead: List to store requests if some CPU dies. */
    407	struct hlist_node	cpuhp_dead;
    408	/** @kobj: Kernel object for sysfs. */
    409	struct kobject		kobj;
    410
    411#ifdef CONFIG_BLK_DEBUG_FS
    412	/**
    413	 * @debugfs_dir: debugfs directory for this hardware queue. Named
    414	 * as cpu<cpu_number>.
    415	 */
    416	struct dentry		*debugfs_dir;
    417	/** @sched_debugfs_dir:	debugfs directory for the scheduler. */
    418	struct dentry		*sched_debugfs_dir;
    419#endif
    420
    421	/**
    422	 * @hctx_list: if this hctx is not in use, this is an entry in
    423	 * q->unused_hctx_list.
    424	 */
    425	struct list_head	hctx_list;
    426};
    427
    428/**
    429 * struct blk_mq_queue_map - Map software queues to hardware queues
    430 * @mq_map:       CPU ID to hardware queue index map. This is an array
    431 *	with nr_cpu_ids elements. Each element has a value in the range
    432 *	[@queue_offset, @queue_offset + @nr_queues).
    433 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
    434 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
    435 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
    436 *	set of hardware queues.
    437 */
    438struct blk_mq_queue_map {
    439	unsigned int *mq_map;
    440	unsigned int nr_queues;
    441	unsigned int queue_offset;
    442};
    443
    444/**
    445 * enum hctx_type - Type of hardware queue
    446 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
    447 * @HCTX_TYPE_READ:	Just for READ I/O.
    448 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
    449 * @HCTX_MAX_TYPES:	Number of types of hctx.
    450 */
    451enum hctx_type {
    452	HCTX_TYPE_DEFAULT,
    453	HCTX_TYPE_READ,
    454	HCTX_TYPE_POLL,
    455
    456	HCTX_MAX_TYPES,
    457};
    458
    459/**
    460 * struct blk_mq_tag_set - tag set that can be shared between request queues
    461 * @map:	   One or more ctx -> hctx mappings. One map exists for each
    462 *		   hardware queue type (enum hctx_type) that the driver wishes
    463 *		   to support. There are no restrictions on maps being of the
    464 *		   same size, and it's perfectly legal to share maps between
    465 *		   types.
    466 * @nr_maps:	   Number of elements in the @map array. A number in the range
    467 *		   [1, HCTX_MAX_TYPES].
    468 * @ops:	   Pointers to functions that implement block driver behavior.
    469 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
    470 *		   owns this data structure.
    471 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
    472 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
    473 *		   allocations.
    474 * @cmd_size:	   Number of additional bytes to allocate per request. The block
    475 *		   driver owns these additional bytes.
    476 * @numa_node:	   NUMA node the storage adapter has been connected to.
    477 * @timeout:	   Request processing timeout in jiffies.
    478 * @flags:	   Zero or more BLK_MQ_F_* flags.
    479 * @driver_data:   Pointer to data owned by the block driver that created this
    480 *		   tag set.
    481 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
    482 *		   elements.
    483 * @shared_tags:
    484 *		   Shared set of tags. Has @nr_hw_queues elements. If set,
    485 *		   shared by all @tags.
    486 * @tag_list_lock: Serializes tag_list accesses.
    487 * @tag_list:	   List of the request queues that use this tag set. See also
    488 *		   request_queue.tag_set_list.
    489 */
    490struct blk_mq_tag_set {
    491	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
    492	unsigned int		nr_maps;
    493	const struct blk_mq_ops	*ops;
    494	unsigned int		nr_hw_queues;
    495	unsigned int		queue_depth;
    496	unsigned int		reserved_tags;
    497	unsigned int		cmd_size;
    498	int			numa_node;
    499	unsigned int		timeout;
    500	unsigned int		flags;
    501	void			*driver_data;
    502
    503	struct blk_mq_tags	**tags;
    504
    505	struct blk_mq_tags	*shared_tags;
    506
    507	struct mutex		tag_list_lock;
    508	struct list_head	tag_list;
    509};
    510
    511/**
    512 * struct blk_mq_queue_data - Data about a request inserted in a queue
    513 *
    514 * @rq:   Request pointer.
    515 * @last: If it is the last request in the queue.
    516 */
    517struct blk_mq_queue_data {
    518	struct request *rq;
    519	bool last;
    520};
    521
    522typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
    523
    524/**
     525 * struct blk_mq_ops - Callback functions that implement block driver
    526 * behaviour.
    527 */
    528struct blk_mq_ops {
    529	/**
    530	 * @queue_rq: Queue a new request from block IO.
    531	 */
    532	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
    533				 const struct blk_mq_queue_data *);
    534
    535	/**
    536	 * @commit_rqs: If a driver uses bd->last to judge when to submit
    537	 * requests to hardware, it must define this function. In case of errors
    538	 * that make us stop issuing further requests, this hook serves the
    539	 * purpose of kicking the hardware (which the last request otherwise
    540	 * would have done).
    541	 */
    542	void (*commit_rqs)(struct blk_mq_hw_ctx *);
    543
    544	/**
     545	 * @queue_rqs: Queue a list of new requests. The driver is guaranteed
    546	 * that each request belongs to the same queue. If the driver doesn't
    547	 * empty the @rqlist completely, then the rest will be queued
    548	 * individually by the block layer upon return.
    549	 */
    550	void (*queue_rqs)(struct request **rqlist);
    551
    552	/**
     553	 * @get_budget: Reserve budget before queueing a request; once .queue_rq
     554	 * is run, it is the driver's responsibility to release the
     555	 * reserved budget. The failure case of .get_budget must also
     556	 * be handled to avoid I/O deadlock.
    557	 */
    558	int (*get_budget)(struct request_queue *);
    559
    560	/**
    561	 * @put_budget: Release the reserved budget.
    562	 */
    563	void (*put_budget)(struct request_queue *, int);
    564
    565	/**
    566	 * @set_rq_budget_token: store rq's budget token
    567	 */
    568	void (*set_rq_budget_token)(struct request *, int);
    569	/**
    570	 * @get_rq_budget_token: retrieve rq's budget token
    571	 */
    572	int (*get_rq_budget_token)(struct request *);
    573
    574	/**
    575	 * @timeout: Called on request timeout.
    576	 */
    577	enum blk_eh_timer_return (*timeout)(struct request *, bool);
    578
    579	/**
    580	 * @poll: Called to poll for completion of a specific tag.
    581	 */
    582	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);
    583
    584	/**
    585	 * @complete: Mark the request as complete.
    586	 */
    587	void (*complete)(struct request *);
    588
    589	/**
    590	 * @init_hctx: Called when the block layer side of a hardware queue has
    591	 * been set up, allowing the driver to allocate/init matching
    592	 * structures.
    593	 */
    594	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
    595	/**
    596	 * @exit_hctx: Ditto for exit/teardown.
    597	 */
    598	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
    599
    600	/**
    601	 * @init_request: Called for every command allocated by the block layer
    602	 * to allow the driver to set up driver specific data.
    603	 *
     604	 * A tag greater than or equal to queue_depth is used for setting up
     605	 * a flush request.
    606	 */
    607	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
    608			    unsigned int, unsigned int);
    609	/**
    610	 * @exit_request: Ditto for exit/teardown.
    611	 */
    612	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
    613			     unsigned int);
    614
    615	/**
     616	 * @cleanup_rq: Called before freeing a request that has not completed
     617	 * yet, usually to free the driver's private data.
    618	 */
    619	void (*cleanup_rq)(struct request *);
    620
    621	/**
     622	 * @busy: If set, returns whether or not this queue is currently busy.
    623	 */
    624	bool (*busy)(struct request_queue *);
    625
    626	/**
     627	 * @map_queues: This allows drivers to specify their own queue mapping by
    628	 * overriding the setup-time function that builds the mq_map.
    629	 */
    630	int (*map_queues)(struct blk_mq_tag_set *set);
    631
    632#ifdef CONFIG_BLK_DEBUG_FS
    633	/**
    634	 * @show_rq: Used by the debugfs implementation to show driver-specific
    635	 * information about a request.
    636	 */
    637	void (*show_rq)(struct seq_file *m, struct request *rq);
    638#endif
    639};
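
/*
 * Illustrative sketch of the minimal shape of a blk_mq_ops implementation;
 * only ->queue_rq is mandatory. The "exampledev" names and the immediate
 * in-line completion are hypothetical placeholders for real hardware
 * handling, not code from this header.
 */
static blk_status_t exampledev_queue_rq(struct blk_mq_hw_ctx *hctx,
					const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);	/* mark in flight, arm the timeout */

	/* ... hand the request to the (fictional) hardware here ... */

	blk_mq_end_request(rq, BLK_STS_OK);
	return BLK_STS_OK;
}

static const struct blk_mq_ops exampledev_mq_ops = {
	.queue_rq	= exampledev_queue_rq,
};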
    640
    641enum {
    642	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
    643	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
    644	/*
    645	 * Set when this device requires underlying blk-mq device for
    646	 * completing IO:
    647	 */
    648	BLK_MQ_F_STACKING	= 1 << 2,
    649	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
    650	BLK_MQ_F_BLOCKING	= 1 << 5,
    651	/* Do not allow an I/O scheduler to be configured. */
    652	BLK_MQ_F_NO_SCHED	= 1 << 6,
    653	/*
    654	 * Select 'none' during queue registration in case of a single hwq
    655	 * or shared hwqs instead of 'mq-deadline'.
    656	 */
    657	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
    658	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
    659	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
    660
    661	BLK_MQ_S_STOPPED	= 0,
    662	BLK_MQ_S_TAG_ACTIVE	= 1,
    663	BLK_MQ_S_SCHED_RESTART	= 2,
    664
    665	/* hw queue is inactive after all its CPUs become offline */
    666	BLK_MQ_S_INACTIVE	= 3,
    667
    668	BLK_MQ_MAX_DEPTH	= 10240,
    669
    670	BLK_MQ_CPU_WORK_BATCH	= 8,
    671};
    672#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
    673	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
    674		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
    675#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
    676	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
    677		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
    678
    679#define BLK_MQ_NO_HCTX_IDX	(-1U)
    680
    681struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
    682		struct lock_class_key *lkclass);
    683#define blk_mq_alloc_disk(set, queuedata)				\
    684({									\
    685	static struct lock_class_key __key;				\
    686									\
    687	__blk_mq_alloc_disk(set, queuedata, &__key);			\
    688})
    689struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
    690int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
    691		struct request_queue *q);
    692void blk_mq_unregister_dev(struct device *, struct request_queue *);
    693
    694int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
    695int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
    696		const struct blk_mq_ops *ops, unsigned int queue_depth,
    697		unsigned int set_flags);
    698void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
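
/*
 * Illustrative sketch of one plausible way to set up a tag set and allocate
 * a gendisk with the helpers declared above. All "exampledev" names are
 * hypothetical and error handling is reduced to the essentials.
 */
static int exampledev_setup(struct blk_mq_tag_set *set, struct gendisk **disk)
{
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = &exampledev_mq_ops;		/* see the sketch above */
	set->nr_hw_queues = 1;
	set->queue_depth = BLKDEV_DEFAULT_RQ;
	set->numa_node = NUMA_NO_NODE;
	set->cmd_size = 64;			/* per-request driver PDU bytes */
	set->flags = BLK_MQ_F_SHOULD_MERGE;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	*disk = blk_mq_alloc_disk(set, NULL);
	if (IS_ERR(*disk)) {
		blk_mq_free_tag_set(set);
		return PTR_ERR(*disk);
	}
	return 0;
}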
    699
    700void blk_mq_free_request(struct request *rq);
    701
    702bool blk_mq_queue_inflight(struct request_queue *q);
    703
    704enum {
    705	/* return when out of requests */
    706	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
    707	/* allocate from reserved pool */
    708	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
    709	/* set RQF_PM */
    710	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
    711};
    712
    713struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
    714		blk_mq_req_flags_t flags);
    715struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
    716		unsigned int op, blk_mq_req_flags_t flags,
    717		unsigned int hctx_idx);
    718
    719/*
    720 * Tag address space map.
    721 */
    722struct blk_mq_tags {
    723	unsigned int nr_tags;
    724	unsigned int nr_reserved_tags;
    725
    726	atomic_t active_queues;
    727
    728	struct sbitmap_queue bitmap_tags;
    729	struct sbitmap_queue breserved_tags;
    730
    731	struct request **rqs;
    732	struct request **static_rqs;
    733	struct list_head page_list;
    734
    735	/*
    736	 * used to clear request reference in rqs[] before freeing one
    737	 * request pool
    738	 */
    739	spinlock_t lock;
    740};
    741
    742static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
    743					       unsigned int tag)
    744{
    745	if (tag < tags->nr_tags) {
    746		prefetch(tags->rqs[tag]);
    747		return tags->rqs[tag];
    748	}
    749
    750	return NULL;
    751}
    752
    753enum {
    754	BLK_MQ_UNIQUE_TAG_BITS = 16,
    755	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
    756};
    757
    758u32 blk_mq_unique_tag(struct request *rq);
    759
    760static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
    761{
    762	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
    763}
    764
    765static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
    766{
    767	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
    768}
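
/*
 * Illustrative sketch: a unique tag packs the hardware queue index into the
 * upper 16 bits and the per-queue tag into the lower 16 bits, and the two
 * helpers above undo that packing. The function name is hypothetical.
 */
static inline void example_decode_unique_tag(struct request *rq,
					     u16 *hwq, u16 *tag)
{
	u32 unique = blk_mq_unique_tag(rq);

	*hwq = blk_mq_unique_tag_to_hwq(unique);	/* unique >> 16 */
	*tag = blk_mq_unique_tag_to_tag(unique);	/* unique & 0xffff */
}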
    769
    770/**
    771 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
    772 * @rq: target request.
    773 */
    774static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
    775{
    776	return READ_ONCE(rq->state);
    777}
    778
    779static inline int blk_mq_request_started(struct request *rq)
    780{
    781	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
    782}
    783
    784static inline int blk_mq_request_completed(struct request *rq)
    785{
    786	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
    787}
    788
    789/*
    790 * 
    791 * Set the state to complete when completing a request from inside ->queue_rq.
    792 * This is used by drivers that want to ensure special complete actions that
    793 * need access to the request are called on failure, e.g. by nvme for
    794 * multipathing.
    795 */
    796static inline void blk_mq_set_request_complete(struct request *rq)
    797{
    798	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
    799}
    800
    801/*
    802 * Complete the request directly instead of deferring it to softirq or
     803 * completing it on another CPU. Useful in preemptible rather than interrupt context.
    804 */
    805static inline void blk_mq_complete_request_direct(struct request *rq,
    806		   void (*complete)(struct request *rq))
    807{
    808	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
    809	complete(rq);
    810}
    811
    812void blk_mq_start_request(struct request *rq);
    813void blk_mq_end_request(struct request *rq, blk_status_t error);
    814void __blk_mq_end_request(struct request *rq, blk_status_t error);
    815void blk_mq_end_request_batch(struct io_comp_batch *ib);
    816
    817/*
    818 * Only need start/end time stamping if we have iostat or
     819 * blk stats enabled, or are using an IO scheduler.
    820 */
    821static inline bool blk_mq_need_time_stamp(struct request *rq)
    822{
    823	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
    824}
    825
    826/*
    827 * Batched completions only work when there is no I/O error and no special
    828 * ->end_io handler.
    829 */
    830static inline bool blk_mq_add_to_batch(struct request *req,
    831				       struct io_comp_batch *iob, int ioerror,
    832				       void (*complete)(struct io_comp_batch *))
    833{
    834	if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
    835		return false;
    836	if (!iob->complete)
    837		iob->complete = complete;
    838	else if (iob->complete != complete)
    839		return false;
    840	iob->need_ts |= blk_mq_need_time_stamp(req);
    841	rq_list_add(&iob->req_list, req);
    842	return true;
    843}
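
/*
 * Illustrative sketch of how a driver's completion path might use
 * blk_mq_add_to_batch(): if the request cannot be batched (I/O error,
 * ->end_io set, or no batch supplied), fall back to the regular completion
 * path. The names here are hypothetical.
 */
static inline void example_complete_rq(struct request *rq, int ioerror,
				       struct io_comp_batch *iob,
				       void (*batch_fn)(struct io_comp_batch *))
{
	if (!blk_mq_add_to_batch(rq, iob, ioerror, batch_fn))
		blk_mq_complete_request(rq);
}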
    844
    845void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
    846void blk_mq_kick_requeue_list(struct request_queue *q);
    847void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
    848void blk_mq_complete_request(struct request *rq);
    849bool blk_mq_complete_request_remote(struct request *rq);
    850bool blk_mq_queue_stopped(struct request_queue *q);
    851void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
    852void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
    853void blk_mq_stop_hw_queues(struct request_queue *q);
    854void blk_mq_start_hw_queues(struct request_queue *q);
    855void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
    856void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
    857void blk_mq_quiesce_queue(struct request_queue *q);
    858void blk_mq_wait_quiesce_done(struct request_queue *q);
    859void blk_mq_unquiesce_queue(struct request_queue *q);
    860void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
    861void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
    862void blk_mq_run_hw_queues(struct request_queue *q, bool async);
    863void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
    864void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
    865		busy_tag_iter_fn *fn, void *priv);
    866void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
    867void blk_mq_freeze_queue(struct request_queue *q);
    868void blk_mq_unfreeze_queue(struct request_queue *q);
    869void blk_freeze_queue_start(struct request_queue *q);
    870void blk_mq_freeze_queue_wait(struct request_queue *q);
    871int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
    872				     unsigned long timeout);
    873
    874int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
    875void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
    876
    877void blk_mq_quiesce_queue_nowait(struct request_queue *q);
    878
    879unsigned int blk_mq_rq_cpu(struct request *rq);
    880
    881bool __blk_should_fake_timeout(struct request_queue *q);
    882static inline bool blk_should_fake_timeout(struct request_queue *q)
    883{
    884	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
    885	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
    886		return __blk_should_fake_timeout(q);
    887	return false;
    888}
    889
    890/**
    891 * blk_mq_rq_from_pdu - cast a PDU to a request
     892 * @pdu: the PDU (Protocol Data Unit) to be cast
    893 *
    894 * Return: request
    895 *
    896 * Driver command data is immediately after the request. So subtract request
    897 * size to get back to the original request.
    898 */
    899static inline struct request *blk_mq_rq_from_pdu(void *pdu)
    900{
    901	return pdu - sizeof(struct request);
    902}
    903
    904/**
    905 * blk_mq_rq_to_pdu - cast a request to a PDU
     906 * @rq: the request to be cast
    907 *
    908 * Return: pointer to the PDU
    909 *
    910 * Driver command data is immediately after the request. So add request to get
    911 * the PDU.
    912 */
    913static inline void *blk_mq_rq_to_pdu(struct request *rq)
    914{
    915	return rq + 1;
    916}
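
/*
 * Illustrative sketch: with cmd_size set in the tag set, the block layer
 * places driver per-command data directly behind each request, and the two
 * helpers above convert between the two views. struct exampledev_cmd and
 * the function names are hypothetical.
 */
struct exampledev_cmd {
	u32 status;
};

static inline struct exampledev_cmd *exampledev_cmd_from_rq(struct request *rq)
{
	return blk_mq_rq_to_pdu(rq);	/* memory right after *rq */
}

static inline struct request *exampledev_rq_from_cmd(struct exampledev_cmd *cmd)
{
	return blk_mq_rq_from_pdu(cmd);	/* step back over the request */
}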
    917
    918#define queue_for_each_hw_ctx(q, hctx, i)				\
    919	xa_for_each(&(q)->hctx_table, (i), (hctx))
    920
    921#define hctx_for_each_ctx(hctx, ctx, i)					\
    922	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
    923	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
    924
    925static inline void blk_mq_cleanup_rq(struct request *rq)
    926{
    927	if (rq->q->mq_ops->cleanup_rq)
    928		rq->q->mq_ops->cleanup_rq(rq);
    929}
    930
    931static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
    932		unsigned int nr_segs)
    933{
    934	rq->nr_phys_segments = nr_segs;
    935	rq->__data_len = bio->bi_iter.bi_size;
    936	rq->bio = rq->biotail = bio;
    937	rq->ioprio = bio_prio(bio);
    938}
    939
    940void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
    941		struct lock_class_key *key);
    942
    943static inline bool rq_is_sync(struct request *rq)
    944{
    945	return op_is_sync(rq->cmd_flags);
    946}
    947
    948void blk_rq_init(struct request_queue *q, struct request *rq);
    949int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
    950		struct bio_set *bs, gfp_t gfp_mask,
    951		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
    952void blk_rq_unprep_clone(struct request *rq);
    953blk_status_t blk_insert_cloned_request(struct request *rq);
    954
    955struct rq_map_data {
    956	struct page **pages;
    957	int page_order;
    958	int nr_entries;
    959	unsigned long offset;
    960	int null_mapped;
    961	int from_user;
    962};
    963
    964int blk_rq_map_user(struct request_queue *, struct request *,
    965		struct rq_map_data *, void __user *, unsigned long, gfp_t);
    966int blk_rq_map_user_iov(struct request_queue *, struct request *,
    967		struct rq_map_data *, const struct iov_iter *, gfp_t);
    968int blk_rq_unmap_user(struct bio *);
    969int blk_rq_map_kern(struct request_queue *, struct request *, void *,
    970		unsigned int, gfp_t);
    971int blk_rq_append_bio(struct request *rq, struct bio *bio);
    972void blk_execute_rq_nowait(struct request *rq, bool at_head);
    973blk_status_t blk_execute_rq(struct request *rq, bool at_head);
    974
    975struct req_iterator {
    976	struct bvec_iter iter;
    977	struct bio *bio;
    978};
    979
    980#define __rq_for_each_bio(_bio, rq)	\
    981	if ((rq->bio))			\
    982		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
    983
    984#define rq_for_each_segment(bvl, _rq, _iter)			\
    985	__rq_for_each_bio(_iter.bio, _rq)			\
    986		bio_for_each_segment(bvl, _iter.bio, _iter.iter)
    987
    988#define rq_for_each_bvec(bvl, _rq, _iter)			\
    989	__rq_for_each_bio(_iter.bio, _rq)			\
    990		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
    991
    992#define rq_iter_last(bvec, _iter)				\
    993		(_iter.bio->bi_next == NULL &&			\
    994		 bio_iter_last(bvec, _iter.iter))
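
/*
 * Illustrative sketch that sums the data bytes of a request by walking its
 * bios with the iterator macros above. The function name is hypothetical;
 * real code would normally just use blk_rq_bytes().
 */
static inline unsigned int example_sum_segment_bytes(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bvec;
	unsigned int bytes = 0;

	rq_for_each_segment(bvec, rq, iter)
		bytes += bvec.bv_len;

	return bytes;
}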
    995
    996/*
    997 * blk_rq_pos()			: the current sector
    998 * blk_rq_bytes()		: bytes left in the entire request
    999 * blk_rq_cur_bytes()		: bytes left in the current segment
   1000 * blk_rq_sectors()		: sectors left in the entire request
   1001 * blk_rq_cur_sectors()		: sectors left in the current segment
   1002 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
   1003 */
   1004static inline sector_t blk_rq_pos(const struct request *rq)
   1005{
   1006	return rq->__sector;
   1007}
   1008
   1009static inline unsigned int blk_rq_bytes(const struct request *rq)
   1010{
   1011	return rq->__data_len;
   1012}
   1013
   1014static inline int blk_rq_cur_bytes(const struct request *rq)
   1015{
   1016	if (!rq->bio)
   1017		return 0;
   1018	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
   1019		return rq->bio->bi_iter.bi_size;
   1020	return bio_iovec(rq->bio).bv_len;
   1021}
   1022
   1023static inline unsigned int blk_rq_sectors(const struct request *rq)
   1024{
   1025	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
   1026}
   1027
   1028static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
   1029{
   1030	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
   1031}
   1032
   1033static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
   1034{
   1035	return rq->stats_sectors;
   1036}
   1037
   1038/*
   1039 * Some commands like WRITE SAME have a payload or data transfer size which
   1040 * is different from the size of the request.  Any driver that supports such
   1041 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
   1042 * calculate the data transfer size.
   1043 */
   1044static inline unsigned int blk_rq_payload_bytes(struct request *rq)
   1045{
   1046	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
   1047		return rq->special_vec.bv_len;
   1048	return blk_rq_bytes(rq);
   1049}
   1050
   1051/*
    1052 * Return the first full biovec in the request.  The caller needs to check
    1053 * that there is at least one bvec before calling this helper.
   1054 */
   1055static inline struct bio_vec req_bvec(struct request *rq)
   1056{
   1057	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
   1058		return rq->special_vec;
   1059	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
   1060}
   1061
   1062static inline unsigned int blk_rq_count_bios(struct request *rq)
   1063{
   1064	unsigned int nr_bios = 0;
   1065	struct bio *bio;
   1066
   1067	__rq_for_each_bio(bio, rq)
   1068		nr_bios++;
   1069
   1070	return nr_bios;
   1071}
   1072
   1073void blk_steal_bios(struct bio_list *list, struct request *rq);
   1074
   1075/*
   1076 * Request completion related functions.
   1077 *
   1078 * blk_update_request() completes given number of bytes and updates
   1079 * the request without completing it.
   1080 */
   1081bool blk_update_request(struct request *rq, blk_status_t error,
   1082			       unsigned int nr_bytes);
   1083void blk_abort_request(struct request *);
   1084
   1085/*
   1086 * Number of physical segments as sent to the device.
   1087 *
   1088 * Normally this is the number of discontiguous data segments sent by the
    1089 * submitter.  But for data-less commands like discard we might have no
    1090 * actual data segments submitted, but the driver might have to add its
   1091 * own special payload.  In that case we still return 1 here so that this
   1092 * special payload will be mapped.
   1093 */
   1094static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
   1095{
   1096	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
   1097		return 1;
   1098	return rq->nr_phys_segments;
   1099}
   1100
   1101/*
   1102 * Number of discard segments (or ranges) the driver needs to fill in.
   1103 * Each discard bio merged into a request is counted as one segment.
   1104 */
   1105static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
   1106{
   1107	return max_t(unsigned short, rq->nr_phys_segments, 1);
   1108}
   1109
   1110int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
   1111		struct scatterlist *sglist, struct scatterlist **last_sg);
   1112static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
   1113		struct scatterlist *sglist)
   1114{
   1115	struct scatterlist *last_sg = NULL;
   1116
   1117	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
   1118}
   1119void blk_dump_rq_flags(struct request *, char *);
   1120
   1121#ifdef CONFIG_BLK_DEV_ZONED
   1122static inline unsigned int blk_rq_zone_no(struct request *rq)
   1123{
   1124	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
   1125}
   1126
   1127static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
   1128{
   1129	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
   1130}
   1131
   1132bool blk_req_needs_zone_write_lock(struct request *rq);
   1133bool blk_req_zone_write_trylock(struct request *rq);
   1134void __blk_req_zone_write_lock(struct request *rq);
   1135void __blk_req_zone_write_unlock(struct request *rq);
   1136
   1137static inline void blk_req_zone_write_lock(struct request *rq)
   1138{
   1139	if (blk_req_needs_zone_write_lock(rq))
   1140		__blk_req_zone_write_lock(rq);
   1141}
   1142
   1143static inline void blk_req_zone_write_unlock(struct request *rq)
   1144{
   1145	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
   1146		__blk_req_zone_write_unlock(rq);
   1147}
   1148
   1149static inline bool blk_req_zone_is_write_locked(struct request *rq)
   1150{
   1151	return rq->q->seq_zones_wlock &&
   1152		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
   1153}
   1154
   1155static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
   1156{
   1157	if (!blk_req_needs_zone_write_lock(rq))
   1158		return true;
   1159	return !blk_req_zone_is_write_locked(rq);
   1160}
   1161#else /* CONFIG_BLK_DEV_ZONED */
   1162static inline bool blk_req_needs_zone_write_lock(struct request *rq)
   1163{
   1164	return false;
   1165}
   1166
   1167static inline void blk_req_zone_write_lock(struct request *rq)
   1168{
   1169}
   1170
   1171static inline void blk_req_zone_write_unlock(struct request *rq)
   1172{
   1173}
   1174static inline bool blk_req_zone_is_write_locked(struct request *rq)
   1175{
   1176	return false;
   1177}
   1178
   1179static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
   1180{
   1181	return true;
   1182}
   1183#endif /* CONFIG_BLK_DEV_ZONED */
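
/*
 * Illustrative sketch of how a dispatch path can use the zone write locking
 * helpers above to serialise writes to a sequential zone; with
 * CONFIG_BLK_DEV_ZONED disabled the helpers degrade to no-ops. The function
 * name is hypothetical.
 */
static inline bool example_prep_zoned_write(struct request *rq)
{
	/* Defer the request if its target zone is already write-locked. */
	if (!blk_req_can_dispatch_to_zone(rq))
		return false;

	/* Paired with blk_req_zone_write_unlock() at completion time. */
	blk_req_zone_write_lock(rq);
	return true;
}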
   1184
   1185#endif /* BLK_MQ_H */