zoned.c (16712B)
// SPDX-License-Identifier: GPL-2.0
#include <linux/vmalloc.h>
#include <linux/bitmap.h>
#include "null_blk.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

#undef pr_fmt
#define pr_fmt(fmt) "null_blk: " fmt

static inline sector_t mb_to_sects(unsigned long mb)
{
        return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT;
}

static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
{
        return sect >> ilog2(dev->zone_size_sects);
}

static inline void null_lock_zone_res(struct nullb_device *dev)
{
        if (dev->need_zone_res_mgmt)
                spin_lock_irq(&dev->zone_res_lock);
}

static inline void null_unlock_zone_res(struct nullb_device *dev)
{
        if (dev->need_zone_res_mgmt)
                spin_unlock_irq(&dev->zone_res_lock);
}

static inline void null_init_zone_lock(struct nullb_device *dev,
                                       struct nullb_zone *zone)
{
        if (!dev->memory_backed)
                spin_lock_init(&zone->spinlock);
        else
                mutex_init(&zone->mutex);
}

static inline void null_lock_zone(struct nullb_device *dev,
                                  struct nullb_zone *zone)
{
        if (!dev->memory_backed)
                spin_lock_irq(&zone->spinlock);
        else
                mutex_lock(&zone->mutex);
}

static inline void null_unlock_zone(struct nullb_device *dev,
                                    struct nullb_zone *zone)
{
        if (!dev->memory_backed)
                spin_unlock_irq(&zone->spinlock);
        else
                mutex_unlock(&zone->mutex);
}

int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
{
        sector_t dev_capacity_sects, zone_capacity_sects;
        struct nullb_zone *zone;
        sector_t sector = 0;
        unsigned int i;

        if (!is_power_of_2(dev->zone_size)) {
                pr_err("zone_size must be power-of-two\n");
                return -EINVAL;
        }
        if (dev->zone_size > dev->size) {
                pr_err("Zone size larger than device capacity\n");
                return -EINVAL;
        }

        if (!dev->zone_capacity)
                dev->zone_capacity = dev->zone_size;

        if (dev->zone_capacity > dev->zone_size) {
                pr_err("zone capacity (%lu MB) larger than zone size (%lu MB)\n",
                       dev->zone_capacity, dev->zone_size);
                return -EINVAL;
        }

        zone_capacity_sects = mb_to_sects(dev->zone_capacity);
        dev_capacity_sects = mb_to_sects(dev->size);
        dev->zone_size_sects = mb_to_sects(dev->zone_size);
        dev->nr_zones = round_up(dev_capacity_sects, dev->zone_size_sects)
                >> ilog2(dev->zone_size_sects);

        dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone),
                                    GFP_KERNEL | __GFP_ZERO);
        if (!dev->zones)
                return -ENOMEM;

        spin_lock_init(&dev->zone_res_lock);

        if (dev->zone_nr_conv >= dev->nr_zones) {
                dev->zone_nr_conv = dev->nr_zones - 1;
                pr_info("changed the number of conventional zones to %u\n",
                        dev->zone_nr_conv);
        }

        /* Max active zones has to be < number of seq zones in order to be enforceable */
        if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
                dev->zone_max_active = 0;
                pr_info("zone_max_active limit disabled, limit >= zone count\n");
        }

        /* Max open zones has to be <= max active zones */
        if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
                dev->zone_max_open = dev->zone_max_active;
                pr_info("changed the maximum number of open zones to %u\n",
                        dev->zone_max_open);
        } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
                dev->zone_max_open = 0;
                pr_info("zone_max_open limit disabled, limit >= zone count\n");
        }
        dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open;
        dev->imp_close_zone_no = dev->zone_nr_conv;

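        /*
         * Initialize the zone array: the first zone_nr_conv zones are
         * conventional (not write pointer managed) and the remaining zones
         * are sequential write required. The last sequential zone may be
         * smaller than the configured zone size if the device capacity is
         * not a multiple of the zone size.
         */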
        for (i = 0; i < dev->zone_nr_conv; i++) {
                zone = &dev->zones[i];

                null_init_zone_lock(dev, zone);
                zone->start = sector;
                zone->len = dev->zone_size_sects;
                zone->capacity = zone->len;
                zone->wp = zone->start + zone->len;
                zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
                zone->cond = BLK_ZONE_COND_NOT_WP;

                sector += dev->zone_size_sects;
        }

        for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
                zone = &dev->zones[i];

                null_init_zone_lock(dev, zone);
                zone->start = zone->wp = sector;
                if (zone->start + dev->zone_size_sects > dev_capacity_sects)
                        zone->len = dev_capacity_sects - zone->start;
                else
                        zone->len = dev->zone_size_sects;
                zone->capacity =
                        min_t(sector_t, zone->len, zone_capacity_sects);
                zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
                zone->cond = BLK_ZONE_COND_EMPTY;

                sector += dev->zone_size_sects;
        }

        return 0;
}

int null_register_zoned_dev(struct nullb *nullb)
{
        struct nullb_device *dev = nullb->dev;
        struct request_queue *q = nullb->q;

        blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM);
        blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
        blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);

        if (queue_is_mq(q)) {
                int ret = blk_revalidate_disk_zones(nullb->disk, NULL);

                if (ret)
                        return ret;
        } else {
                blk_queue_chunk_sectors(q, dev->zone_size_sects);
                q->nr_zones = blkdev_nr_zones(nullb->disk);
        }

        blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
        blk_queue_max_open_zones(q, dev->zone_max_open);
        blk_queue_max_active_zones(q, dev->zone_max_active);

        return 0;
}

void null_free_zoned_dev(struct nullb_device *dev)
{
        kvfree(dev->zones);
        dev->zones = NULL;
}

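/*
 * Report up to @nr_zones zones, starting from the zone containing @sector,
 * by invoking @cb on a private copy of each zone descriptor. Returns the
 * number of zones reported, or an error code if @cb fails.
 */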
int null_report_zones(struct gendisk *disk, sector_t sector,
                      unsigned int nr_zones, report_zones_cb cb, void *data)
{
        struct nullb *nullb = disk->private_data;
        struct nullb_device *dev = nullb->dev;
        unsigned int first_zone, i;
        struct nullb_zone *zone;
        struct blk_zone blkz;
        int error;

        first_zone = null_zone_no(dev, sector);
        if (first_zone >= dev->nr_zones)
                return 0;

        nr_zones = min(nr_zones, dev->nr_zones - first_zone);
        trace_nullb_report_zones(nullb, nr_zones);

        memset(&blkz, 0, sizeof(struct blk_zone));
        zone = &dev->zones[first_zone];
        for (i = 0; i < nr_zones; i++, zone++) {
                /*
                 * Stacked DM target drivers will remap the zone information by
                 * modifying the zone information passed to the report callback.
                 * So use a local copy to avoid corruption of the device zone
                 * array.
                 */
                null_lock_zone(dev, zone);
                blkz.start = zone->start;
                blkz.len = zone->len;
                blkz.wp = zone->wp;
                blkz.type = zone->type;
                blkz.cond = zone->cond;
                blkz.capacity = zone->capacity;
                null_unlock_zone(dev, zone);

                error = cb(&blkz, i, data);
                if (error)
                        return error;
        }

        return nr_zones;
}

/*
 * This is called in the case of memory backing from null_process_cmd()
 * with the target zone already locked.
 */
size_t null_zone_valid_read_len(struct nullb *nullb,
                                sector_t sector, unsigned int len)
{
        struct nullb_device *dev = nullb->dev;
        struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
        unsigned int nr_sectors = len >> SECTOR_SHIFT;

        /* Read must be below the write pointer position */
        if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
            sector + nr_sectors <= zone->wp)
                return len;

        if (sector > zone->wp)
                return 0;

        return (zone->wp - sector) << SECTOR_SHIFT;
}

static blk_status_t __null_close_zone(struct nullb_device *dev,
                                      struct nullb_zone *zone)
{
        switch (zone->cond) {
        case BLK_ZONE_COND_CLOSED:
                /* close operation on closed is not an error */
                return BLK_STS_OK;
        case BLK_ZONE_COND_IMP_OPEN:
                dev->nr_zones_imp_open--;
                break;
        case BLK_ZONE_COND_EXP_OPEN:
                dev->nr_zones_exp_open--;
                break;
        case BLK_ZONE_COND_EMPTY:
        case BLK_ZONE_COND_FULL:
        default:
                return BLK_STS_IOERR;
        }

        if (zone->wp == zone->start) {
                zone->cond = BLK_ZONE_COND_EMPTY;
        } else {
                zone->cond = BLK_ZONE_COND_CLOSED;
                dev->nr_zones_closed++;
        }

        return BLK_STS_OK;
}

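/*
 * Close the first implicitly open zone found when scanning the sequential
 * zones in a round-robin manner, starting from the zone that follows the
 * last implicitly closed zone (dev->imp_close_zone_no).
 */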
static void null_close_imp_open_zone(struct nullb_device *dev)
{
        struct nullb_zone *zone;
        unsigned int zno, i;

        zno = dev->imp_close_zone_no;
        if (zno >= dev->nr_zones)
                zno = dev->zone_nr_conv;

        for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
                zone = &dev->zones[zno];
                zno++;
                if (zno >= dev->nr_zones)
                        zno = dev->zone_nr_conv;

                if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
                        __null_close_zone(dev, zone);
                        dev->imp_close_zone_no = zno;
                        return;
                }
        }
}

static blk_status_t null_check_active(struct nullb_device *dev)
{
        if (!dev->zone_max_active)
                return BLK_STS_OK;

        if (dev->nr_zones_exp_open + dev->nr_zones_imp_open +
                        dev->nr_zones_closed < dev->zone_max_active)
                return BLK_STS_OK;

        return BLK_STS_ZONE_ACTIVE_RESOURCE;
}

static blk_status_t null_check_open(struct nullb_device *dev)
{
        if (!dev->zone_max_open)
                return BLK_STS_OK;

        if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
                return BLK_STS_OK;

        if (dev->nr_zones_imp_open) {
                if (null_check_active(dev) == BLK_STS_OK) {
                        null_close_imp_open_zone(dev);
                        return BLK_STS_OK;
                }
        }

        return BLK_STS_ZONE_OPEN_RESOURCE;
}

/*
 * This function matches the manage open zone resources function in the ZBC standard,
 * with the addition of max active zones support (added in the ZNS standard).
 *
 * The function determines if a zone can transition to implicit open or explicit open,
 * while maintaining the max open zone (and max active zone) limit(s). It may close an
 * implicit open zone in order to make additional zone resources available.
 *
 * ZBC states that an implicit open zone shall be closed only if there is not
 * room within the open limit. However, with the addition of an active limit,
 * it is not certain that closing an implicit open zone will allow a new zone
 * to be opened, since we might already be at the active limit capacity.
 */
static blk_status_t null_check_zone_resources(struct nullb_device *dev,
                                              struct nullb_zone *zone)
{
        blk_status_t ret;

        switch (zone->cond) {
        case BLK_ZONE_COND_EMPTY:
                ret = null_check_active(dev);
                if (ret != BLK_STS_OK)
                        return ret;
                fallthrough;
        case BLK_ZONE_COND_CLOSED:
                return null_check_open(dev);
        default:
                /* Should never be called for other states */
                WARN_ON(1);
                return BLK_STS_IOERR;
        }
}

static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
                                    unsigned int nr_sectors, bool append)
{
        struct nullb_device *dev = cmd->nq->dev;
        unsigned int zno = null_zone_no(dev, sector);
        struct nullb_zone *zone = &dev->zones[zno];
        blk_status_t ret;

        trace_nullb_zone_op(cmd, zno, zone->cond);

        if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
                if (append)
                        return BLK_STS_IOERR;
                return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
        }

        null_lock_zone(dev, zone);

        if (zone->cond == BLK_ZONE_COND_FULL) {
                /* Cannot write to a full zone */
                ret = BLK_STS_IOERR;
                goto unlock;
        }

        /*
         * Regular writes must be at the write pointer position.
         * Zone append writes are automatically issued at the write
         * pointer and the position returned using the request or BIO
         * sector.
         */
        if (append) {
                sector = zone->wp;
                if (dev->queue_mode == NULL_Q_MQ)
                        cmd->rq->__sector = sector;
                else
                        cmd->bio->bi_iter.bi_sector = sector;
        } else if (sector != zone->wp) {
                ret = BLK_STS_IOERR;
                goto unlock;
        }

        if (zone->wp + nr_sectors > zone->start + zone->capacity) {
                ret = BLK_STS_IOERR;
                goto unlock;
        }

        if (zone->cond == BLK_ZONE_COND_CLOSED ||
            zone->cond == BLK_ZONE_COND_EMPTY) {
                null_lock_zone_res(dev);

                ret = null_check_zone_resources(dev, zone);
                if (ret != BLK_STS_OK) {
                        null_unlock_zone_res(dev);
                        goto unlock;
                }
                if (zone->cond == BLK_ZONE_COND_CLOSED) {
                        dev->nr_zones_closed--;
                        dev->nr_zones_imp_open++;
                } else if (zone->cond == BLK_ZONE_COND_EMPTY) {
                        dev->nr_zones_imp_open++;
                }

                if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
                        zone->cond = BLK_ZONE_COND_IMP_OPEN;

                null_unlock_zone_res(dev);
        }

        ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
        if (ret != BLK_STS_OK)
                goto unlock;

        zone->wp += nr_sectors;
        if (zone->wp == zone->start + zone->capacity) {
                null_lock_zone_res(dev);
                if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
                        dev->nr_zones_exp_open--;
                else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
                        dev->nr_zones_imp_open--;
                zone->cond = BLK_ZONE_COND_FULL;
                null_unlock_zone_res(dev);
        }

        ret = BLK_STS_OK;

unlock:
        null_unlock_zone(dev, zone);

        return ret;
}

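/*
 * Explicitly open a zone (REQ_OP_ZONE_OPEN), checking the open and active
 * zone resource limits when the zone transitions from the empty or closed
 * condition.
 */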
static blk_status_t null_open_zone(struct nullb_device *dev,
                                   struct nullb_zone *zone)
{
        blk_status_t ret = BLK_STS_OK;

        if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
                return BLK_STS_IOERR;

        null_lock_zone_res(dev);

        switch (zone->cond) {
        case BLK_ZONE_COND_EXP_OPEN:
                /* open operation on exp open is not an error */
                goto unlock;
        case BLK_ZONE_COND_EMPTY:
                ret = null_check_zone_resources(dev, zone);
                if (ret != BLK_STS_OK)
                        goto unlock;
                break;
        case BLK_ZONE_COND_IMP_OPEN:
                dev->nr_zones_imp_open--;
                break;
        case BLK_ZONE_COND_CLOSED:
                ret = null_check_zone_resources(dev, zone);
                if (ret != BLK_STS_OK)
                        goto unlock;
                dev->nr_zones_closed--;
                break;
        case BLK_ZONE_COND_FULL:
        default:
                ret = BLK_STS_IOERR;
                goto unlock;
        }

        zone->cond = BLK_ZONE_COND_EXP_OPEN;
        dev->nr_zones_exp_open++;

unlock:
        null_unlock_zone_res(dev);

        return ret;
}

static blk_status_t null_close_zone(struct nullb_device *dev,
                                    struct nullb_zone *zone)
{
        blk_status_t ret;

        if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
                return BLK_STS_IOERR;

        null_lock_zone_res(dev);
        ret = __null_close_zone(dev, zone);
        null_unlock_zone_res(dev);

        return ret;
}

static blk_status_t null_finish_zone(struct nullb_device *dev,
                                     struct nullb_zone *zone)
{
        blk_status_t ret = BLK_STS_OK;

        if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
                return BLK_STS_IOERR;

        null_lock_zone_res(dev);

        switch (zone->cond) {
        case BLK_ZONE_COND_FULL:
                /* finish operation on full is not an error */
                goto unlock;
        case BLK_ZONE_COND_EMPTY:
                ret = null_check_zone_resources(dev, zone);
                if (ret != BLK_STS_OK)
                        goto unlock;
                break;
        case BLK_ZONE_COND_IMP_OPEN:
                dev->nr_zones_imp_open--;
                break;
        case BLK_ZONE_COND_EXP_OPEN:
                dev->nr_zones_exp_open--;
                break;
        case BLK_ZONE_COND_CLOSED:
                ret = null_check_zone_resources(dev, zone);
                if (ret != BLK_STS_OK)
                        goto unlock;
                dev->nr_zones_closed--;
                break;
        default:
                ret = BLK_STS_IOERR;
                goto unlock;
        }

        zone->cond = BLK_ZONE_COND_FULL;
        zone->wp = zone->start + zone->len;

unlock:
        null_unlock_zone_res(dev);

        return ret;
}

static blk_status_t null_reset_zone(struct nullb_device *dev,
                                    struct nullb_zone *zone)
{
        if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
                return BLK_STS_IOERR;

        null_lock_zone_res(dev);

        switch (zone->cond) {
        case BLK_ZONE_COND_EMPTY:
                /* reset operation on empty is not an error */
                null_unlock_zone_res(dev);
                return BLK_STS_OK;
        case BLK_ZONE_COND_IMP_OPEN:
                dev->nr_zones_imp_open--;
                break;
        case BLK_ZONE_COND_EXP_OPEN:
                dev->nr_zones_exp_open--;
                break;
        case BLK_ZONE_COND_CLOSED:
                dev->nr_zones_closed--;
                break;
        case BLK_ZONE_COND_FULL:
                break;
        default:
                null_unlock_zone_res(dev);
                return BLK_STS_IOERR;
        }

        zone->cond = BLK_ZONE_COND_EMPTY;
        zone->wp = zone->start;

        null_unlock_zone_res(dev);

        if (dev->memory_backed)
                return null_handle_discard(dev, zone->start, zone->len);

        return BLK_STS_OK;
}

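/*
 * Handle zone management operations. REQ_OP_ZONE_RESET_ALL resets all
 * sequential zones that are not already empty; the other operations apply
 * to the single zone containing @sector.
 */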
static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
                                   sector_t sector)
{
        struct nullb_device *dev = cmd->nq->dev;
        unsigned int zone_no;
        struct nullb_zone *zone;
        blk_status_t ret;
        size_t i;

        if (op == REQ_OP_ZONE_RESET_ALL) {
                for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
                        zone = &dev->zones[i];
                        null_lock_zone(dev, zone);
                        if (zone->cond != BLK_ZONE_COND_EMPTY) {
                                null_reset_zone(dev, zone);
                                trace_nullb_zone_op(cmd, i, zone->cond);
                        }
                        null_unlock_zone(dev, zone);
                }
                return BLK_STS_OK;
        }

        zone_no = null_zone_no(dev, sector);
        zone = &dev->zones[zone_no];

        null_lock_zone(dev, zone);

        switch (op) {
        case REQ_OP_ZONE_RESET:
                ret = null_reset_zone(dev, zone);
                break;
        case REQ_OP_ZONE_OPEN:
                ret = null_open_zone(dev, zone);
                break;
        case REQ_OP_ZONE_CLOSE:
                ret = null_close_zone(dev, zone);
                break;
        case REQ_OP_ZONE_FINISH:
                ret = null_finish_zone(dev, zone);
                break;
        default:
                ret = BLK_STS_NOTSUPP;
                break;
        }

        if (ret == BLK_STS_OK)
                trace_nullb_zone_op(cmd, zone_no, zone->cond);

        null_unlock_zone(dev, zone);

        return ret;
}

blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
                                    sector_t sector, sector_t nr_sectors)
{
        struct nullb_device *dev;
        struct nullb_zone *zone;
        blk_status_t sts;

        switch (op) {
        case REQ_OP_WRITE:
                return null_zone_write(cmd, sector, nr_sectors, false);
        case REQ_OP_ZONE_APPEND:
                return null_zone_write(cmd, sector, nr_sectors, true);
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_RESET_ALL:
        case REQ_OP_ZONE_OPEN:
        case REQ_OP_ZONE_CLOSE:
        case REQ_OP_ZONE_FINISH:
                return null_zone_mgmt(cmd, op, sector);
        default:
                dev = cmd->nq->dev;
                zone = &dev->zones[null_zone_no(dev, sector)];

                null_lock_zone(dev, zone);
                sts = null_process_cmd(cmd, op, sector, nr_sectors);
                null_unlock_zone(dev, zone);
                return sts;
        }
}