blk-mq.h (34577B)
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>

struct blk_mq_tags;
struct blk_flush_queue;

#define BLKDEV_MIN_RQ		4
#define BLKDEV_DEFAULT_RQ	128

typedef void (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * request flags */
typedef __u32 __bitwise req_flags_t;

/* drive already may have started this one */
#define RQF_STARTED		((__force req_flags_t)(1 << 1))
/* may not be passed by ioscheduler */
#define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
/* request for flush sequence */
#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
/* track inflight for MQ */
#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
/* don't call prep for this one */
#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
/* vaguely specified driver internal error. Ignored by the block layer */
#define RQF_FAILED		((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET		((__force req_flags_t)(1 << 11))
/* elevator private data attached */
#define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
/* runtime pm request */
#define RQF_PM			((__force req_flags_t)(1 << 15))
/* on IO scheduler merge hash */
#define RQF_HASHED		((__force req_flags_t)(1 << 16))
/* track IO completion time */
#define RQF_STATS		((__force req_flags_t)(1 << 17))
/* Look at ->special_vec for the actual data payload instead of the
   bio chain. */
#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
/* The per-zone write lock is held for this request */
#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
/* already slept for hybrid poll */
#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
/* queue has elevator attached */
#define RQF_ELV			((__force req_flags_t)(1 << 22))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

enum mq_rq_state {
	MQ_RQ_IDLE		= 0,
	MQ_RQ_IN_FLIGHT		= 1,
	MQ_RQ_COMPLETE		= 2,
};

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;
	struct blk_mq_hw_ctx *mq_hctx;

	unsigned int cmd_flags;		/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	unsigned int timeout;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	union {
		struct list_head queuelist;
		struct request *rq_next;
	};

	struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
	unsigned short wbt_flags;
#endif
	/*
	 * rq sectors used for blk stats. It has the same value as
	 * blk_rq_sectors(rq), except that it is never zeroed out by
	 * completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	unsigned short nr_integrity_segments;
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx *crypt_ctx;
	struct blk_crypto_keyslot *crypt_keyslot;
#endif

	unsigned short write_hint;
	unsigned short ioprio;

	enum mq_rq_state state;
	atomic_t ref;

	unsigned long deadline;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct llist_node ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		struct bio_vec special_vec;
		void *completion_data;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it. Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq		*icq;
			void			*priv[2];
		} elv;

		struct {
			unsigned int		seq;
			struct list_head	list;
			rq_end_io_fn		*saved_end_io;
		} flush;
	};

	union {
		struct __call_single_data csd;
		u64 fifo_time;
	};

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};

#define req_op(req) \
	((req)->cmd_flags & REQ_OP_MASK)

static inline bool blk_rq_is_passthrough(struct request *rq)
{
	return blk_op_is_passthrough(req_op(rq));
}

static inline unsigned short req_get_ioprio(struct request *req)
{
	return req->ioprio;
}

#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)

#define rq_dma_dir(rq) \
	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
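
/*
 * Illustrative sketch (not part of the original header): how a driver can
 * classify a request with the helpers above. "foo_dev" and the foo_issue_*
 * functions are hypothetical names.
 *
 *	static blk_status_t foo_classify_rq(struct foo_dev *fd, struct request *rq)
 *	{
 *		if (blk_rq_is_passthrough(rq))
 *			return foo_issue_passthrough(fd, rq);
 *
 *		switch (req_op(rq)) {
 *		case REQ_OP_READ:
 *		case REQ_OP_WRITE:
 *			// rq_data_dir() gives READ/WRITE, rq_dma_dir() the
 *			// matching DMA_FROM_DEVICE/DMA_TO_DEVICE direction.
 *			return foo_issue_rw(fd, rq, rq_dma_dir(rq));
 *		case REQ_OP_FLUSH:
 *			return foo_issue_flush(fd);
 *		default:
 *			return BLK_STS_NOTSUPP;
 *		}
 *	}
 */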

#define rq_list_add(listptr, rq)	do {		\
	(rq)->rq_next = *(listptr);			\
	*(listptr) = rq;				\
} while (0)

#define rq_list_pop(listptr)				\
({							\
	struct request *__req = NULL;			\
	if ((listptr) && *(listptr)) {			\
		__req = *(listptr);			\
		*(listptr) = __req->rq_next;		\
	}						\
	__req;						\
})

#define rq_list_peek(listptr)				\
({							\
	struct request *__req = NULL;			\
	if ((listptr) && *(listptr))			\
		__req = *(listptr);			\
	__req;						\
})

#define rq_list_for_each(listptr, pos)			\
	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))

#define rq_list_for_each_safe(listptr, pos, nxt)			\
	for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos);	\
		pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)

#define rq_list_next(rq)	(rq)->rq_next
#define rq_list_empty(list)	((list) == (struct request *) NULL)

/**
 * rq_list_move() - move a struct request from one list to another
 * @src: The source list @rq is currently in
 * @dst: The destination list that @rq will be appended to
 * @rq: The request to move
 * @prev: The request preceding @rq in @src (NULL if @rq is the head)
 */
static inline void rq_list_move(struct request **src, struct request **dst,
				struct request *rq, struct request *prev)
{
	if (prev)
		prev->rq_next = rq->rq_next;
	else
		*src = rq->rq_next;
	rq_list_add(dst, rq);
}
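
/*
 * Illustrative sketch (not part of the original header): building and
 * draining a singly linked request list with the rq_list_* helpers above.
 * "rq1", "rq2" and foo_submit() are hypothetical.
 *
 *	struct request *rqlist = NULL;
 *	struct request *rq;
 *
 *	// Collect requests; rq_list_add() pushes at the head.
 *	rq_list_add(&rqlist, rq1);
 *	rq_list_add(&rqlist, rq2);
 *
 *	// Walk the list without modifying it.
 *	rq_list_for_each(&rqlist, rq)
 *		pr_debug("queued tag %d\n", rq->tag);
 *
 *	// Drain the list; rq_list_pop() unlinks from the head.
 *	while ((rq = rq_list_pop(&rqlist)))
 *		foo_submit(rq);
 */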

enum blk_eh_timer_return {
	BLK_EH_DONE,		/* driver has completed the command */
	BLK_EH_RESET_TIMER,	/* reset timer and try again */
};

#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t		lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests in this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head	dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long		state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work	run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t		cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int			next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many work items are left in the
	 * batch before changing to the next CPU.
	 */
	int			next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long		flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void			*sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue	*queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue	*fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void			*driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap		ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx	*dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using the Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int		dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short		type;
	/** @nr_ctx: Number of software queues. */
	unsigned short		nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx	**ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t		dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t	dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t		wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags	*tags;
	/**
	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags	*sched_tags;

	/** @queued: Number of queued requests. */
	unsigned long		queued;
	/** @run: Number of dispatched requests. */
	unsigned long		run;

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int		numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int		queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t		nr_active;

	/** @cpuhp_online: List to store requests when a CPU is going to die. */
	struct hlist_node	cpuhp_online;
	/** @cpuhp_dead: List to store requests when some CPU dies. */
	struct hlist_node	cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject		kobj;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry		*debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry		*sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: if this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head	hctx_list;
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map: CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues: Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};
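
/*
 * Illustrative sketch (not part of the original header): a hypothetical
 * driver's queue-mapping callback spreading its hardware queues over the
 * per-type maps described above, in the style used by several in-tree
 * drivers. "foo_submit_queues" and "foo_poll_queues" are hypothetical;
 * blk_mq_map_queues() is the generic CPU-to-queue assignment declared
 * later in this header.
 *
 *	static int foo_map_queues(struct blk_mq_tag_set *set)
 *	{
 *		unsigned int qoff = 0;
 *		int i;
 *
 *		for (i = 0; i < set->nr_maps; i++) {
 *			struct blk_mq_queue_map *map = &set->map[i];
 *
 *			if (i == HCTX_TYPE_READ) {
 *				map->nr_queues = 0;	// reads fall back to DEFAULT
 *				continue;
 *			}
 *			map->nr_queues = (i == HCTX_TYPE_POLL) ?
 *					foo_poll_queues : foo_submit_queues;
 *			map->queue_offset = qoff;
 *			qoff += map->nr_queues;
 *			blk_mq_map_queues(map);
 *		}
 *		return 0;
 *	}
 */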

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @shared_tags:
 *		   Shared set of tags. Has @nr_hw_queues elements. If set,
 *		   shared by all @tags.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
	unsigned int		nr_maps;
	const struct blk_mq_ops	*ops;
	unsigned int		nr_hw_queues;
	unsigned int		queue_depth;
	unsigned int		reserved_tags;
	unsigned int		cmd_size;
	int			numa_node;
	unsigned int		timeout;
	unsigned int		flags;
	void			*driver_data;

	struct blk_mq_tags	**tags;

	struct blk_mq_tags	*shared_tags;

	struct mutex		tag_list_lock;
	struct list_head	tag_list;
};
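
/*
 * Illustrative sketch (not part of the original header): minimal tag set
 * initialization and disk allocation, roughly as simple drivers do it.
 * "foo_dev", "foo_cmd", "foo_mq_ops" and the queue depth are hypothetical;
 * the blk_mq_* calls used here are declared further down in this header.
 *
 *	static int foo_init_tag_set(struct foo_dev *fd)
 *	{
 *		struct blk_mq_tag_set *set = &fd->tag_set;
 *		struct gendisk *disk;
 *		int ret;
 *
 *		memset(set, 0, sizeof(*set));
 *		set->ops = &foo_mq_ops;
 *		set->nr_hw_queues = 1;
 *		set->queue_depth = 128;
 *		set->numa_node = NUMA_NO_NODE;
 *		set->cmd_size = sizeof(struct foo_cmd);	// per-request PDU
 *		set->flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *		ret = blk_mq_alloc_tag_set(set);
 *		if (ret)
 *			return ret;
 *
 *		disk = blk_mq_alloc_disk(set, fd);
 *		if (IS_ERR(disk)) {
 *			blk_mq_free_tag_set(set);
 *			return PTR_ERR(disk);
 *		}
 *		fd->disk = disk;
 *		return 0;
 *	}
 */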

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @queue_rqs: Queue a list of new requests. Driver is guaranteed
	 * that each request belongs to the same queue. If the driver doesn't
	 * empty the @rqlist completely, then the rest will be queued
	 * individually by the block layer upon return.
	 */
	void (*queue_rqs)(struct request **rqlist);

	/**
	 * @get_budget: Reserve budget before queueing a request; once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget also has to be
	 * handled to avoid I/O deadlock.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *, int);

	/**
	 * @set_rq_budget_token: store rq's budget token
	 */
	void (*set_rq_budget_token)(struct request *, int);
	/**
	 * @get_rq_budget_token: retrieve rq's budget token
	 */
	int (*get_rq_budget_token)(struct request *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *, bool);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * A tag greater than or equal to queue_depth is used for setting up
	 * the flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @cleanup_rq: Called before freeing one request which isn't completed
	 * yet, and usually used to free the driver private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	int (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
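
/*
 * Illustrative sketch (not part of the original header): the shape of a
 * minimal ->queue_rq() implementation. "foo_dev", "foo_cmd" and the
 * foo_hw_* functions are hypothetical; the blk_mq_* helpers are declared
 * in this header.
 *
 *	static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					 const struct blk_mq_queue_data *bd)
 *	{
 *		struct foo_dev *fd = hctx->queue->queuedata;
 *		struct request *rq = bd->rq;
 *		struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *		if (!foo_hw_has_room(fd))
 *			return BLK_STS_DEV_RESOURCE;	// retried later
 *
 *		blk_mq_start_request(rq);
 *		// Completion later calls blk_mq_complete_request(rq).
 *		foo_hw_submit(fd, cmd, bd->last);
 *		return BLK_STS_OK;
 *	}
 */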

enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires underlying blk-mq device for
	 * completing IO:
	 */
	BLK_MQ_F_STACKING	= 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
	BLK_MQ_F_BLOCKING	= 1 << 5,
	/* Do not allow an I/O scheduler to be configured. */
	BLK_MQ_F_NO_SCHED	= 1 << 6,
	/*
	 * Select 'none' during queue registration in case of a single hwq
	 * or shared hwqs instead of 'mq-deadline'.
	 */
	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,

	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE	= 3,

	BLK_MQ_MAX_DEPTH	= 10240,

	BLK_MQ_CPU_WORK_BATCH	= 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)

#define BLK_MQ_NO_HCTX_IDX	(-1U)

struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
		struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, queuedata)			\
({								\
	static struct lock_class_key __key;			\
								\
	__blk_mq_alloc_disk(set, queuedata, &__key);		\
})
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
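
/*
 * Illustrative sketch (not part of the original header): allocating and
 * synchronously executing a driver-private (passthrough) request. The
 * "foo" naming is hypothetical; REQ_OP_DRV_IN, blk_status_to_errno() and
 * blk_execute_rq() are existing kernel interfaces.
 *
 *	static int foo_send_internal_cmd(struct request_queue *q)
 *	{
 *		struct request *rq;
 *		blk_status_t status;
 *
 *		rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
 *		if (IS_ERR(rq))
 *			return PTR_ERR(rq);
 *
 *		// Driver-specific command setup would go into the PDU here,
 *		// see blk_mq_rq_to_pdu() below.
 *
 *		status = blk_execute_rq(rq, false);
 *		blk_mq_free_request(rq);
 *		return blk_status_to_errno(status);
 *	}
 */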

/*
 * Tag address space map.
 */
struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;

	atomic_t active_queues;

	struct sbitmap_queue bitmap_tags;
	struct sbitmap_queue breserved_tags;

	struct request **rqs;
	struct request **static_rqs;
	struct list_head page_list;

	/*
	 * used to clear request reference in rqs[] before freeing one
	 * request pool
	 */
	spinlock_t lock;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
					       unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU. Useful in preemptible context instead of in
 * interrupt context.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
		   void (*complete)(struct request *rq))
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	complete(rq);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
}

/*
 * Batched completions only work when there is no I/O error and no special
 * ->end_io handler.
 */
static inline bool blk_mq_add_to_batch(struct request *req,
				       struct io_comp_batch *iob, int ioerror,
				       void (*complete)(struct io_comp_batch *))
{
	if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
		return false;
	if (!iob->complete)
		iob->complete = complete;
	else if (iob->complete != complete)
		return false;
	iob->need_ts |= blk_mq_need_time_stamp(req);
	rq_list_add(&iob->req_list, req);
	return true;
}
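
/*
 * Illustrative sketch (not part of the original header): a completion
 * handler that looks up the request for a hardware completion entry and
 * either batches it (when the caller supplied an io_comp_batch, e.g. from
 * the polling path) or completes it individually. "foo_cqe" and
 * foo_end_batch() are hypothetical.
 *
 *	static void foo_handle_cqe(struct blk_mq_hw_ctx *hctx,
 *				   struct foo_cqe *cqe,
 *				   struct io_comp_batch *iob)
 *	{
 *		struct request *rq = blk_mq_tag_to_rq(hctx->tags, cqe->tag);
 *
 *		if (!rq)
 *			return;
 *
 *		if (!blk_mq_add_to_batch(rq, iob, cqe->error, foo_end_batch))
 *			blk_mq_complete_request(rq);	// per-request fallback
 *	}
 *
 * foo_end_batch() would then finish the whole batch with
 * blk_mq_end_request_batch(iob).
 */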

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
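
/*
 * Illustrative sketch (not part of the original header): the PDU is the
 * per-request driver area sized by blk_mq_tag_set.cmd_size and allocated
 * directly behind struct request. "foo_cmd" is hypothetical.
 *
 *	struct foo_cmd {
 *		u32 opcode;
 *		u32 flags;
 *	};
 *
 *	// At tag set setup: set->cmd_size = sizeof(struct foo_cmd);
 *
 *	struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);	// request -> PDU
 *	struct request *rq2 = blk_mq_rq_from_pdu(cmd);	// PDU -> request, rq2 == rq
 */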

#define queue_for_each_hw_ctx(q, hctx, i)				\
	xa_for_each(&(q)->hctx_table, (i), (hctx))

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
		unsigned int nr_segs)
{
	rq->nr_phys_segments = nr_segs;
	rq->__data_len = bio->bi_iter.bi_size;
	rq->bio = rq->biotail = bio;
	rq->ioprio = bio_prio(bio);
}

void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

static inline bool rq_is_sync(struct request *rq)
{
	return op_is_sync(rq->cmd_flags);
}

void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request *rq);

struct rq_map_data {
	struct page **pages;
	int page_order;
	int nr_entries;
	unsigned long offset;
	int null_mapped;
	int from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
		struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
		struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request_queue *, struct request *, void *,
		unsigned int, gfp_t);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);

struct req_iterator {
	struct bvec_iter iter;
	struct bio *bio;
};

#define __rq_for_each_bio(_bio, rq)	\
	if ((rq->bio))			\
		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

#define rq_for_each_segment(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_segment(bvl, _iter.bio, _iter.iter)

#define rq_for_each_bvec(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

#define rq_iter_last(bvec, _iter)				\
		(_iter.bio->bi_next == NULL &&			\
		 bio_iter_last(bvec, _iter.iter))
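
/*
 * Illustrative sketch (not part of the original header): walking the data
 * of a request segment by segment with the iterators above. It assumes the
 * pages are directly addressable (lowmem); foo_copy_segment() is
 * hypothetical.
 *
 *	struct req_iterator iter;
 *	struct bio_vec bvec;
 *
 *	rq_for_each_segment(bvec, rq, iter) {
 *		void *buf = page_address(bvec.bv_page) + bvec.bv_offset;
 *
 *		// Process bvec.bv_len bytes at buf; rq_iter_last() tells
 *		// whether this is the final segment of the request.
 *		foo_copy_segment(buf, bvec.bv_len, rq_iter_last(bvec, iter));
 *	}
 */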

/*
 * blk_rq_pos()			: the current sector
 * blk_rq_bytes()		: bytes left in the entire request
 * blk_rq_cur_bytes()		: bytes left in the current segment
 * blk_rq_sectors()		: sectors left in the entire request
 * blk_rq_cur_sectors()		: sectors left in the current segment
 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
	return rq->__sector;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
	if (!rq->bio)
		return 0;
	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
		return rq->bio->bi_iter.bi_size;
	return bio_iovec(rq->bio).bv_len;
}

static inline unsigned int blk_rq_sectors(const struct request *rq)
{
	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
	return rq->stats_sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request. Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}

/*
 * Return the first full biovec in the request. The caller needs to check that
 * there are any bvecs before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec;
	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
	unsigned int nr_bios = 0;
	struct bio *bio;

	__rq_for_each_bio(bio, rq)
		nr_bios++;

	return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
			unsigned int nr_bytes);
void blk_abort_request(struct request *);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter. But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload. In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
	return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist, struct scatterlist **last_sg);
static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist)
{
	struct scatterlist *last_sg = NULL;

	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
}
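
/*
 * Illustrative sketch (not part of the original header): mapping a request
 * for DMA. The scatterlist is assumed to have been sized for the queue's
 * maximum segment count; dma_map_sg() comes from <linux/dma-mapping.h>,
 * and "foo_map_rq" is a hypothetical name.
 *
 *	static int foo_map_rq(struct device *dev, struct request *rq,
 *			      struct scatterlist *sgl)
 *	{
 *		int nents;
 *
 *		sg_init_table(sgl, blk_rq_nr_phys_segments(rq));
 *		nents = blk_rq_map_sg(rq->q, rq, sgl);
 *		if (!nents)
 *			return -EIO;
 *
 *		return dma_map_sg(dev, sgl, nents, rq_dma_dir(rq)) ? 0 : -EIO;
 *	}
 */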

void blk_dump_rq_flags(struct request *, char *);

#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
}

static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
{
	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
}

bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
void __blk_req_zone_write_unlock(struct request *rq);

static inline void blk_req_zone_write_lock(struct request *rq)
{
	if (blk_req_needs_zone_write_lock(rq))
		__blk_req_zone_write_lock(rq);
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
		__blk_req_zone_write_unlock(rq);
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return rq->q->seq_zones_wlock &&
		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	if (!blk_req_needs_zone_write_lock(rq))
		return true;
	return !blk_req_zone_is_write_locked(rq);
}
#else /* CONFIG_BLK_DEV_ZONED */
static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
	return false;
}

static inline void blk_req_zone_write_lock(struct request *rq)
{
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return false;
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	return true;
}
#endif /* CONFIG_BLK_DEV_ZONED */

#endif /* BLK_MQ_H */