zram_drv.c (51850B)
1/* 2 * Compressed RAM block device 3 * 4 * Copyright (C) 2008, 2009, 2010 Nitin Gupta 5 * 2012, 2013 Minchan Kim 6 * 7 * This code is released using a dual license strategy: BSD/GPL 8 * You can choose the licence that better fits your requirements. 9 * 10 * Released under the terms of 3-clause BSD License 11 * Released under the terms of GNU General Public License Version 2.0 12 * 13 */ 14 15#define KMSG_COMPONENT "zram" 16#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 17 18#include <linux/module.h> 19#include <linux/kernel.h> 20#include <linux/bio.h> 21#include <linux/bitops.h> 22#include <linux/blkdev.h> 23#include <linux/buffer_head.h> 24#include <linux/device.h> 25#include <linux/highmem.h> 26#include <linux/slab.h> 27#include <linux/backing-dev.h> 28#include <linux/string.h> 29#include <linux/vmalloc.h> 30#include <linux/err.h> 31#include <linux/idr.h> 32#include <linux/sysfs.h> 33#include <linux/debugfs.h> 34#include <linux/cpuhotplug.h> 35#include <linux/part_stat.h> 36 37#include "zram_drv.h" 38 39static DEFINE_IDR(zram_index_idr); 40/* idr index must be protected */ 41static DEFINE_MUTEX(zram_index_mutex); 42 43static int zram_major; 44static const char *default_compressor = CONFIG_ZRAM_DEF_COMP; 45 46/* Module params (documentation at end) */ 47static unsigned int num_devices = 1; 48/* 49 * Pages that compress to sizes equals or greater than this are stored 50 * uncompressed in memory. 
51 */ 52static size_t huge_class_size; 53 54static const struct block_device_operations zram_devops; 55static const struct block_device_operations zram_wb_devops; 56 57static void zram_free_page(struct zram *zram, size_t index); 58static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, 59 u32 index, int offset, struct bio *bio); 60 61 62static int zram_slot_trylock(struct zram *zram, u32 index) 63{ 64 return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); 65} 66 67static void zram_slot_lock(struct zram *zram, u32 index) 68{ 69 bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); 70} 71 72static void zram_slot_unlock(struct zram *zram, u32 index) 73{ 74 bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); 75} 76 77static inline bool init_done(struct zram *zram) 78{ 79 return zram->disksize; 80} 81 82static inline struct zram *dev_to_zram(struct device *dev) 83{ 84 return (struct zram *)dev_to_disk(dev)->private_data; 85} 86 87static unsigned long zram_get_handle(struct zram *zram, u32 index) 88{ 89 return zram->table[index].handle; 90} 91 92static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) 93{ 94 zram->table[index].handle = handle; 95} 96 97/* flag operations require table entry bit_spin_lock() being held */ 98static bool zram_test_flag(struct zram *zram, u32 index, 99 enum zram_pageflags flag) 100{ 101 return zram->table[index].flags & BIT(flag); 102} 103 104static void zram_set_flag(struct zram *zram, u32 index, 105 enum zram_pageflags flag) 106{ 107 zram->table[index].flags |= BIT(flag); 108} 109 110static void zram_clear_flag(struct zram *zram, u32 index, 111 enum zram_pageflags flag) 112{ 113 zram->table[index].flags &= ~BIT(flag); 114} 115 116static inline void zram_set_element(struct zram *zram, u32 index, 117 unsigned long element) 118{ 119 zram->table[index].element = element; 120} 121 122static unsigned long zram_get_element(struct zram *zram, u32 index) 123{ 124 return zram->table[index].element; 125} 126 
127static size_t zram_get_obj_size(struct zram *zram, u32 index) 128{ 129 return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); 130} 131 132static void zram_set_obj_size(struct zram *zram, 133 u32 index, size_t size) 134{ 135 unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; 136 137 zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; 138} 139 140static inline bool zram_allocated(struct zram *zram, u32 index) 141{ 142 return zram_get_obj_size(zram, index) || 143 zram_test_flag(zram, index, ZRAM_SAME) || 144 zram_test_flag(zram, index, ZRAM_WB); 145} 146 147#if PAGE_SIZE != 4096 148static inline bool is_partial_io(struct bio_vec *bvec) 149{ 150 return bvec->bv_len != PAGE_SIZE; 151} 152#else 153static inline bool is_partial_io(struct bio_vec *bvec) 154{ 155 return false; 156} 157#endif 158 159/* 160 * Check if request is within bounds and aligned on zram logical blocks. 161 */ 162static inline bool valid_io_request(struct zram *zram, 163 sector_t start, unsigned int size) 164{ 165 u64 end, bound; 166 167 /* unaligned request */ 168 if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) 169 return false; 170 if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) 171 return false; 172 173 end = start + (size >> SECTOR_SHIFT); 174 bound = zram->disksize >> SECTOR_SHIFT; 175 /* out of range range */ 176 if (unlikely(start >= bound || end > bound || start > end)) 177 return false; 178 179 /* I/O request is valid */ 180 return true; 181} 182 183static void update_position(u32 *index, int *offset, struct bio_vec *bvec) 184{ 185 *index += (*offset + bvec->bv_len) / PAGE_SIZE; 186 *offset = (*offset + bvec->bv_len) % PAGE_SIZE; 187} 188 189static inline void update_used_max(struct zram *zram, 190 const unsigned long pages) 191{ 192 unsigned long old_max, cur_max; 193 194 old_max = atomic_long_read(&zram->stats.max_used_pages); 195 196 do { 197 cur_max = old_max; 198 if (pages > cur_max) 199 old_max = atomic_long_cmpxchg( 200 
&zram->stats.max_used_pages, cur_max, pages); 201 } while (old_max != cur_max); 202} 203 204static inline void zram_fill_page(void *ptr, unsigned long len, 205 unsigned long value) 206{ 207 WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long))); 208 memset_l(ptr, value, len / sizeof(unsigned long)); 209} 210 211static bool page_same_filled(void *ptr, unsigned long *element) 212{ 213 unsigned long *page; 214 unsigned long val; 215 unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; 216 217 page = (unsigned long *)ptr; 218 val = page[0]; 219 220 if (val != page[last_pos]) 221 return false; 222 223 for (pos = 1; pos < last_pos; pos++) { 224 if (val != page[pos]) 225 return false; 226 } 227 228 *element = val; 229 230 return true; 231} 232 233static ssize_t initstate_show(struct device *dev, 234 struct device_attribute *attr, char *buf) 235{ 236 u32 val; 237 struct zram *zram = dev_to_zram(dev); 238 239 down_read(&zram->init_lock); 240 val = init_done(zram); 241 up_read(&zram->init_lock); 242 243 return scnprintf(buf, PAGE_SIZE, "%u\n", val); 244} 245 246static ssize_t disksize_show(struct device *dev, 247 struct device_attribute *attr, char *buf) 248{ 249 struct zram *zram = dev_to_zram(dev); 250 251 return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); 252} 253 254static ssize_t mem_limit_store(struct device *dev, 255 struct device_attribute *attr, const char *buf, size_t len) 256{ 257 u64 limit; 258 char *tmp; 259 struct zram *zram = dev_to_zram(dev); 260 261 limit = memparse(buf, &tmp); 262 if (buf == tmp) /* no chars parsed, invalid input */ 263 return -EINVAL; 264 265 down_write(&zram->init_lock); 266 zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; 267 up_write(&zram->init_lock); 268 269 return len; 270} 271 272static ssize_t mem_used_max_store(struct device *dev, 273 struct device_attribute *attr, const char *buf, size_t len) 274{ 275 int err; 276 unsigned long val; 277 struct zram *zram = dev_to_zram(dev); 278 279 err = kstrtoul(buf, 10, 
&val); 280 if (err || val != 0) 281 return -EINVAL; 282 283 down_read(&zram->init_lock); 284 if (init_done(zram)) { 285 atomic_long_set(&zram->stats.max_used_pages, 286 zs_get_total_pages(zram->mem_pool)); 287 } 288 up_read(&zram->init_lock); 289 290 return len; 291} 292 293/* 294 * Mark all pages which are older than or equal to cutoff as IDLE. 295 * Callers should hold the zram init lock in read mode 296 */ 297static void mark_idle(struct zram *zram, ktime_t cutoff) 298{ 299 int is_idle = 1; 300 unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; 301 int index; 302 303 for (index = 0; index < nr_pages; index++) { 304 /* 305 * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race. 306 * See the comment in writeback_store. 307 */ 308 zram_slot_lock(zram, index); 309 if (zram_allocated(zram, index) && 310 !zram_test_flag(zram, index, ZRAM_UNDER_WB)) { 311#ifdef CONFIG_ZRAM_MEMORY_TRACKING 312 is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time); 313#endif 314 if (is_idle) 315 zram_set_flag(zram, index, ZRAM_IDLE); 316 } 317 zram_slot_unlock(zram, index); 318 } 319} 320 321static ssize_t idle_store(struct device *dev, 322 struct device_attribute *attr, const char *buf, size_t len) 323{ 324 struct zram *zram = dev_to_zram(dev); 325 ktime_t cutoff_time = 0; 326 ssize_t rv = -EINVAL; 327 328 if (!sysfs_streq(buf, "all")) { 329 /* 330 * If it did not parse as 'all' try to treat it as an integer when 331 * we have memory tracking enabled. 
332 */ 333 u64 age_sec; 334 335 if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec)) 336 cutoff_time = ktime_sub(ktime_get_boottime(), 337 ns_to_ktime(age_sec * NSEC_PER_SEC)); 338 else 339 goto out; 340 } 341 342 down_read(&zram->init_lock); 343 if (!init_done(zram)) 344 goto out_unlock; 345 346 /* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */ 347 mark_idle(zram, cutoff_time); 348 rv = len; 349 350out_unlock: 351 up_read(&zram->init_lock); 352out: 353 return rv; 354} 355 356#ifdef CONFIG_ZRAM_WRITEBACK 357static ssize_t writeback_limit_enable_store(struct device *dev, 358 struct device_attribute *attr, const char *buf, size_t len) 359{ 360 struct zram *zram = dev_to_zram(dev); 361 u64 val; 362 ssize_t ret = -EINVAL; 363 364 if (kstrtoull(buf, 10, &val)) 365 return ret; 366 367 down_read(&zram->init_lock); 368 spin_lock(&zram->wb_limit_lock); 369 zram->wb_limit_enable = val; 370 spin_unlock(&zram->wb_limit_lock); 371 up_read(&zram->init_lock); 372 ret = len; 373 374 return ret; 375} 376 377static ssize_t writeback_limit_enable_show(struct device *dev, 378 struct device_attribute *attr, char *buf) 379{ 380 bool val; 381 struct zram *zram = dev_to_zram(dev); 382 383 down_read(&zram->init_lock); 384 spin_lock(&zram->wb_limit_lock); 385 val = zram->wb_limit_enable; 386 spin_unlock(&zram->wb_limit_lock); 387 up_read(&zram->init_lock); 388 389 return scnprintf(buf, PAGE_SIZE, "%d\n", val); 390} 391 392static ssize_t writeback_limit_store(struct device *dev, 393 struct device_attribute *attr, const char *buf, size_t len) 394{ 395 struct zram *zram = dev_to_zram(dev); 396 u64 val; 397 ssize_t ret = -EINVAL; 398 399 if (kstrtoull(buf, 10, &val)) 400 return ret; 401 402 down_read(&zram->init_lock); 403 spin_lock(&zram->wb_limit_lock); 404 zram->bd_wb_limit = val; 405 spin_unlock(&zram->wb_limit_lock); 406 up_read(&zram->init_lock); 407 ret = len; 408 409 return ret; 410} 411 412static ssize_t writeback_limit_show(struct 
device *dev, 413 struct device_attribute *attr, char *buf) 414{ 415 u64 val; 416 struct zram *zram = dev_to_zram(dev); 417 418 down_read(&zram->init_lock); 419 spin_lock(&zram->wb_limit_lock); 420 val = zram->bd_wb_limit; 421 spin_unlock(&zram->wb_limit_lock); 422 up_read(&zram->init_lock); 423 424 return scnprintf(buf, PAGE_SIZE, "%llu\n", val); 425} 426 427static void reset_bdev(struct zram *zram) 428{ 429 struct block_device *bdev; 430 431 if (!zram->backing_dev) 432 return; 433 434 bdev = zram->bdev; 435 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 436 /* hope filp_close flush all of IO */ 437 filp_close(zram->backing_dev, NULL); 438 zram->backing_dev = NULL; 439 zram->bdev = NULL; 440 zram->disk->fops = &zram_devops; 441 kvfree(zram->bitmap); 442 zram->bitmap = NULL; 443} 444 445static ssize_t backing_dev_show(struct device *dev, 446 struct device_attribute *attr, char *buf) 447{ 448 struct file *file; 449 struct zram *zram = dev_to_zram(dev); 450 char *p; 451 ssize_t ret; 452 453 down_read(&zram->init_lock); 454 file = zram->backing_dev; 455 if (!file) { 456 memcpy(buf, "none\n", 5); 457 up_read(&zram->init_lock); 458 return 5; 459 } 460 461 p = file_path(file, buf, PAGE_SIZE - 1); 462 if (IS_ERR(p)) { 463 ret = PTR_ERR(p); 464 goto out; 465 } 466 467 ret = strlen(p); 468 memmove(buf, p, ret); 469 buf[ret++] = '\n'; 470out: 471 up_read(&zram->init_lock); 472 return ret; 473} 474 475static ssize_t backing_dev_store(struct device *dev, 476 struct device_attribute *attr, const char *buf, size_t len) 477{ 478 char *file_name; 479 size_t sz; 480 struct file *backing_dev = NULL; 481 struct inode *inode; 482 struct address_space *mapping; 483 unsigned int bitmap_sz; 484 unsigned long nr_pages, *bitmap = NULL; 485 struct block_device *bdev = NULL; 486 int err; 487 struct zram *zram = dev_to_zram(dev); 488 489 file_name = kmalloc(PATH_MAX, GFP_KERNEL); 490 if (!file_name) 491 return -ENOMEM; 492 493 down_write(&zram->init_lock); 494 if (init_done(zram)) { 495 
pr_info("Can't setup backing device for initialized device\n"); 496 err = -EBUSY; 497 goto out; 498 } 499 500 strlcpy(file_name, buf, PATH_MAX); 501 /* ignore trailing newline */ 502 sz = strlen(file_name); 503 if (sz > 0 && file_name[sz - 1] == '\n') 504 file_name[sz - 1] = 0x00; 505 506 backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0); 507 if (IS_ERR(backing_dev)) { 508 err = PTR_ERR(backing_dev); 509 backing_dev = NULL; 510 goto out; 511 } 512 513 mapping = backing_dev->f_mapping; 514 inode = mapping->host; 515 516 /* Support only block device in this moment */ 517 if (!S_ISBLK(inode->i_mode)) { 518 err = -ENOTBLK; 519 goto out; 520 } 521 522 bdev = blkdev_get_by_dev(inode->i_rdev, 523 FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram); 524 if (IS_ERR(bdev)) { 525 err = PTR_ERR(bdev); 526 bdev = NULL; 527 goto out; 528 } 529 530 nr_pages = i_size_read(inode) >> PAGE_SHIFT; 531 bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long); 532 bitmap = kvzalloc(bitmap_sz, GFP_KERNEL); 533 if (!bitmap) { 534 err = -ENOMEM; 535 goto out; 536 } 537 538 reset_bdev(zram); 539 540 zram->bdev = bdev; 541 zram->backing_dev = backing_dev; 542 zram->bitmap = bitmap; 543 zram->nr_pages = nr_pages; 544 /* 545 * With writeback feature, zram does asynchronous IO so it's no longer 546 * synchronous device so let's remove synchronous io flag. Othewise, 547 * upper layer(e.g., swap) could wait IO completion rather than 548 * (submit and return), which will cause system sluggish. 549 * Furthermore, when the IO function returns(e.g., swap_readpage), 550 * upper layer expects IO was done so it could deallocate the page 551 * freely but in fact, IO is going on so finally could cause 552 * use-after-free when the IO is really done. 
553 */ 554 zram->disk->fops = &zram_wb_devops; 555 up_write(&zram->init_lock); 556 557 pr_info("setup backing device %s\n", file_name); 558 kfree(file_name); 559 560 return len; 561out: 562 kvfree(bitmap); 563 564 if (bdev) 565 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 566 567 if (backing_dev) 568 filp_close(backing_dev, NULL); 569 570 up_write(&zram->init_lock); 571 572 kfree(file_name); 573 574 return err; 575} 576 577static unsigned long alloc_block_bdev(struct zram *zram) 578{ 579 unsigned long blk_idx = 1; 580retry: 581 /* skip 0 bit to confuse zram.handle = 0 */ 582 blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx); 583 if (blk_idx == zram->nr_pages) 584 return 0; 585 586 if (test_and_set_bit(blk_idx, zram->bitmap)) 587 goto retry; 588 589 atomic64_inc(&zram->stats.bd_count); 590 return blk_idx; 591} 592 593static void free_block_bdev(struct zram *zram, unsigned long blk_idx) 594{ 595 int was_set; 596 597 was_set = test_and_clear_bit(blk_idx, zram->bitmap); 598 WARN_ON_ONCE(!was_set); 599 atomic64_dec(&zram->stats.bd_count); 600} 601 602static void zram_page_end_io(struct bio *bio) 603{ 604 struct page *page = bio_first_page_all(bio); 605 606 page_endio(page, op_is_write(bio_op(bio)), 607 blk_status_to_errno(bio->bi_status)); 608 bio_put(bio); 609} 610 611/* 612 * Returns 1 if the submission is successful. 613 */ 614static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, 615 unsigned long entry, struct bio *parent) 616{ 617 struct bio *bio; 618 619 bio = bio_alloc(zram->bdev, 1, parent ? 
parent->bi_opf : REQ_OP_READ, 620 GFP_NOIO); 621 if (!bio) 622 return -ENOMEM; 623 624 bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); 625 if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) { 626 bio_put(bio); 627 return -EIO; 628 } 629 630 if (!parent) 631 bio->bi_end_io = zram_page_end_io; 632 else 633 bio_chain(bio, parent); 634 635 submit_bio(bio); 636 return 1; 637} 638 639#define PAGE_WB_SIG "page_index=" 640 641#define PAGE_WRITEBACK 0 642#define HUGE_WRITEBACK (1<<0) 643#define IDLE_WRITEBACK (1<<1) 644 645 646static ssize_t writeback_store(struct device *dev, 647 struct device_attribute *attr, const char *buf, size_t len) 648{ 649 struct zram *zram = dev_to_zram(dev); 650 unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; 651 unsigned long index = 0; 652 struct bio bio; 653 struct bio_vec bio_vec; 654 struct page *page; 655 ssize_t ret = len; 656 int mode, err; 657 unsigned long blk_idx = 0; 658 659 if (sysfs_streq(buf, "idle")) 660 mode = IDLE_WRITEBACK; 661 else if (sysfs_streq(buf, "huge")) 662 mode = HUGE_WRITEBACK; 663 else if (sysfs_streq(buf, "huge_idle")) 664 mode = IDLE_WRITEBACK | HUGE_WRITEBACK; 665 else { 666 if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) 667 return -EINVAL; 668 669 if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || 670 index >= nr_pages) 671 return -EINVAL; 672 673 nr_pages = 1; 674 mode = PAGE_WRITEBACK; 675 } 676 677 down_read(&zram->init_lock); 678 if (!init_done(zram)) { 679 ret = -EINVAL; 680 goto release_init_lock; 681 } 682 683 if (!zram->backing_dev) { 684 ret = -ENODEV; 685 goto release_init_lock; 686 } 687 688 page = alloc_page(GFP_KERNEL); 689 if (!page) { 690 ret = -ENOMEM; 691 goto release_init_lock; 692 } 693 694 for (; nr_pages != 0; index++, nr_pages--) { 695 struct bio_vec bvec; 696 697 bvec.bv_page = page; 698 bvec.bv_len = PAGE_SIZE; 699 bvec.bv_offset = 0; 700 701 spin_lock(&zram->wb_limit_lock); 702 if (zram->wb_limit_enable && !zram->bd_wb_limit) { 703 
spin_unlock(&zram->wb_limit_lock); 704 ret = -EIO; 705 break; 706 } 707 spin_unlock(&zram->wb_limit_lock); 708 709 if (!blk_idx) { 710 blk_idx = alloc_block_bdev(zram); 711 if (!blk_idx) { 712 ret = -ENOSPC; 713 break; 714 } 715 } 716 717 zram_slot_lock(zram, index); 718 if (!zram_allocated(zram, index)) 719 goto next; 720 721 if (zram_test_flag(zram, index, ZRAM_WB) || 722 zram_test_flag(zram, index, ZRAM_SAME) || 723 zram_test_flag(zram, index, ZRAM_UNDER_WB)) 724 goto next; 725 726 if (mode & IDLE_WRITEBACK && 727 !zram_test_flag(zram, index, ZRAM_IDLE)) 728 goto next; 729 if (mode & HUGE_WRITEBACK && 730 !zram_test_flag(zram, index, ZRAM_HUGE)) 731 goto next; 732 /* 733 * Clearing ZRAM_UNDER_WB is duty of caller. 734 * IOW, zram_free_page never clear it. 735 */ 736 zram_set_flag(zram, index, ZRAM_UNDER_WB); 737 /* Need for hugepage writeback racing */ 738 zram_set_flag(zram, index, ZRAM_IDLE); 739 zram_slot_unlock(zram, index); 740 if (zram_bvec_read(zram, &bvec, index, 0, NULL)) { 741 zram_slot_lock(zram, index); 742 zram_clear_flag(zram, index, ZRAM_UNDER_WB); 743 zram_clear_flag(zram, index, ZRAM_IDLE); 744 zram_slot_unlock(zram, index); 745 continue; 746 } 747 748 bio_init(&bio, zram->bdev, &bio_vec, 1, 749 REQ_OP_WRITE | REQ_SYNC); 750 bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); 751 752 bio_add_page(&bio, bvec.bv_page, bvec.bv_len, 753 bvec.bv_offset); 754 /* 755 * XXX: A single page IO would be inefficient for write 756 * but it would be not bad as starter. 757 */ 758 err = submit_bio_wait(&bio); 759 if (err) { 760 zram_slot_lock(zram, index); 761 zram_clear_flag(zram, index, ZRAM_UNDER_WB); 762 zram_clear_flag(zram, index, ZRAM_IDLE); 763 zram_slot_unlock(zram, index); 764 /* 765 * Return last IO error unless every IO were 766 * not suceeded. 767 */ 768 ret = err; 769 continue; 770 } 771 772 atomic64_inc(&zram->stats.bd_writes); 773 /* 774 * We released zram_slot_lock so need to check if the slot was 775 * changed. 
If there is freeing for the slot, we can catch it 776 * easily by zram_allocated. 777 * A subtle case is the slot is freed/reallocated/marked as 778 * ZRAM_IDLE again. To close the race, idle_store doesn't 779 * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB. 780 * Thus, we could close the race by checking ZRAM_IDLE bit. 781 */ 782 zram_slot_lock(zram, index); 783 if (!zram_allocated(zram, index) || 784 !zram_test_flag(zram, index, ZRAM_IDLE)) { 785 zram_clear_flag(zram, index, ZRAM_UNDER_WB); 786 zram_clear_flag(zram, index, ZRAM_IDLE); 787 goto next; 788 } 789 790 zram_free_page(zram, index); 791 zram_clear_flag(zram, index, ZRAM_UNDER_WB); 792 zram_set_flag(zram, index, ZRAM_WB); 793 zram_set_element(zram, index, blk_idx); 794 blk_idx = 0; 795 atomic64_inc(&zram->stats.pages_stored); 796 spin_lock(&zram->wb_limit_lock); 797 if (zram->wb_limit_enable && zram->bd_wb_limit > 0) 798 zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); 799 spin_unlock(&zram->wb_limit_lock); 800next: 801 zram_slot_unlock(zram, index); 802 } 803 804 if (blk_idx) 805 free_block_bdev(zram, blk_idx); 806 __free_page(page); 807release_init_lock: 808 up_read(&zram->init_lock); 809 810 return ret; 811} 812 813struct zram_work { 814 struct work_struct work; 815 struct zram *zram; 816 unsigned long entry; 817 struct bio *bio; 818 struct bio_vec bvec; 819}; 820 821#if PAGE_SIZE != 4096 822static void zram_sync_read(struct work_struct *work) 823{ 824 struct zram_work *zw = container_of(work, struct zram_work, work); 825 struct zram *zram = zw->zram; 826 unsigned long entry = zw->entry; 827 struct bio *bio = zw->bio; 828 829 read_from_bdev_async(zram, &zw->bvec, entry, bio); 830} 831 832/* 833 * Block layer want one ->submit_bio to be active at a time, so if we use 834 * chained IO with parent IO in same context, it's a deadlock. To avoid that, 835 * use a worker thread context. 
836 */ 837static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, 838 unsigned long entry, struct bio *bio) 839{ 840 struct zram_work work; 841 842 work.bvec = *bvec; 843 work.zram = zram; 844 work.entry = entry; 845 work.bio = bio; 846 847 INIT_WORK_ONSTACK(&work.work, zram_sync_read); 848 queue_work(system_unbound_wq, &work.work); 849 flush_work(&work.work); 850 destroy_work_on_stack(&work.work); 851 852 return 1; 853} 854#else 855static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, 856 unsigned long entry, struct bio *bio) 857{ 858 WARN_ON(1); 859 return -EIO; 860} 861#endif 862 863static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, 864 unsigned long entry, struct bio *parent, bool sync) 865{ 866 atomic64_inc(&zram->stats.bd_reads); 867 if (sync) 868 return read_from_bdev_sync(zram, bvec, entry, parent); 869 else 870 return read_from_bdev_async(zram, bvec, entry, parent); 871} 872#else 873static inline void reset_bdev(struct zram *zram) {}; 874static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, 875 unsigned long entry, struct bio *parent, bool sync) 876{ 877 return -EIO; 878} 879 880static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {}; 881#endif 882 883#ifdef CONFIG_ZRAM_MEMORY_TRACKING 884 885static struct dentry *zram_debugfs_root; 886 887static void zram_debugfs_create(void) 888{ 889 zram_debugfs_root = debugfs_create_dir("zram", NULL); 890} 891 892static void zram_debugfs_destroy(void) 893{ 894 debugfs_remove_recursive(zram_debugfs_root); 895} 896 897static void zram_accessed(struct zram *zram, u32 index) 898{ 899 zram_clear_flag(zram, index, ZRAM_IDLE); 900 zram->table[index].ac_time = ktime_get_boottime(); 901} 902 903static ssize_t read_block_state(struct file *file, char __user *buf, 904 size_t count, loff_t *ppos) 905{ 906 char *kbuf; 907 ssize_t index, written = 0; 908 struct zram *zram = file->private_data; 909 unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; 
910 struct timespec64 ts; 911 912 kbuf = kvmalloc(count, GFP_KERNEL); 913 if (!kbuf) 914 return -ENOMEM; 915 916 down_read(&zram->init_lock); 917 if (!init_done(zram)) { 918 up_read(&zram->init_lock); 919 kvfree(kbuf); 920 return -EINVAL; 921 } 922 923 for (index = *ppos; index < nr_pages; index++) { 924 int copied; 925 926 zram_slot_lock(zram, index); 927 if (!zram_allocated(zram, index)) 928 goto next; 929 930 ts = ktime_to_timespec64(zram->table[index].ac_time); 931 copied = snprintf(kbuf + written, count, 932 "%12zd %12lld.%06lu %c%c%c%c\n", 933 index, (s64)ts.tv_sec, 934 ts.tv_nsec / NSEC_PER_USEC, 935 zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', 936 zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', 937 zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', 938 zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.'); 939 940 if (count <= copied) { 941 zram_slot_unlock(zram, index); 942 break; 943 } 944 written += copied; 945 count -= copied; 946next: 947 zram_slot_unlock(zram, index); 948 *ppos += 1; 949 } 950 951 up_read(&zram->init_lock); 952 if (copy_to_user(buf, kbuf, written)) 953 written = -EFAULT; 954 kvfree(kbuf); 955 956 return written; 957} 958 959static const struct file_operations proc_zram_block_state_op = { 960 .open = simple_open, 961 .read = read_block_state, 962 .llseek = default_llseek, 963}; 964 965static void zram_debugfs_register(struct zram *zram) 966{ 967 if (!zram_debugfs_root) 968 return; 969 970 zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name, 971 zram_debugfs_root); 972 debugfs_create_file("block_state", 0400, zram->debugfs_dir, 973 zram, &proc_zram_block_state_op); 974} 975 976static void zram_debugfs_unregister(struct zram *zram) 977{ 978 debugfs_remove_recursive(zram->debugfs_dir); 979} 980#else 981static void zram_debugfs_create(void) {}; 982static void zram_debugfs_destroy(void) {}; 983static void zram_accessed(struct zram *zram, u32 index) 984{ 985 zram_clear_flag(zram, index, ZRAM_IDLE); 986}; 987static void 
zram_debugfs_register(struct zram *zram) {}; 988static void zram_debugfs_unregister(struct zram *zram) {}; 989#endif 990 991/* 992 * We switched to per-cpu streams and this attr is not needed anymore. 993 * However, we will keep it around for some time, because: 994 * a) we may revert per-cpu streams in the future 995 * b) it's visible to user space and we need to follow our 2 years 996 * retirement rule; but we already have a number of 'soon to be 997 * altered' attrs, so max_comp_streams need to wait for the next 998 * layoff cycle. 999 */ 1000static ssize_t max_comp_streams_show(struct device *dev, 1001 struct device_attribute *attr, char *buf) 1002{ 1003 return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus()); 1004} 1005 1006static ssize_t max_comp_streams_store(struct device *dev, 1007 struct device_attribute *attr, const char *buf, size_t len) 1008{ 1009 return len; 1010} 1011 1012static ssize_t comp_algorithm_show(struct device *dev, 1013 struct device_attribute *attr, char *buf) 1014{ 1015 size_t sz; 1016 struct zram *zram = dev_to_zram(dev); 1017 1018 down_read(&zram->init_lock); 1019 sz = zcomp_available_show(zram->compressor, buf); 1020 up_read(&zram->init_lock); 1021 1022 return sz; 1023} 1024 1025static ssize_t comp_algorithm_store(struct device *dev, 1026 struct device_attribute *attr, const char *buf, size_t len) 1027{ 1028 struct zram *zram = dev_to_zram(dev); 1029 char compressor[ARRAY_SIZE(zram->compressor)]; 1030 size_t sz; 1031 1032 strlcpy(compressor, buf, sizeof(compressor)); 1033 /* ignore trailing newline */ 1034 sz = strlen(compressor); 1035 if (sz > 0 && compressor[sz - 1] == '\n') 1036 compressor[sz - 1] = 0x00; 1037 1038 if (!zcomp_available_algorithm(compressor)) 1039 return -EINVAL; 1040 1041 down_write(&zram->init_lock); 1042 if (init_done(zram)) { 1043 up_write(&zram->init_lock); 1044 pr_info("Can't change algorithm for initialized device\n"); 1045 return -EBUSY; 1046 } 1047 1048 strcpy(zram->compressor, compressor); 1049 
up_write(&zram->init_lock); 1050 return len; 1051} 1052 1053static ssize_t compact_store(struct device *dev, 1054 struct device_attribute *attr, const char *buf, size_t len) 1055{ 1056 struct zram *zram = dev_to_zram(dev); 1057 1058 down_read(&zram->init_lock); 1059 if (!init_done(zram)) { 1060 up_read(&zram->init_lock); 1061 return -EINVAL; 1062 } 1063 1064 zs_compact(zram->mem_pool); 1065 up_read(&zram->init_lock); 1066 1067 return len; 1068} 1069 1070static ssize_t io_stat_show(struct device *dev, 1071 struct device_attribute *attr, char *buf) 1072{ 1073 struct zram *zram = dev_to_zram(dev); 1074 ssize_t ret; 1075 1076 down_read(&zram->init_lock); 1077 ret = scnprintf(buf, PAGE_SIZE, 1078 "%8llu %8llu %8llu %8llu\n", 1079 (u64)atomic64_read(&zram->stats.failed_reads), 1080 (u64)atomic64_read(&zram->stats.failed_writes), 1081 (u64)atomic64_read(&zram->stats.invalid_io), 1082 (u64)atomic64_read(&zram->stats.notify_free)); 1083 up_read(&zram->init_lock); 1084 1085 return ret; 1086} 1087 1088static ssize_t mm_stat_show(struct device *dev, 1089 struct device_attribute *attr, char *buf) 1090{ 1091 struct zram *zram = dev_to_zram(dev); 1092 struct zs_pool_stats pool_stats; 1093 u64 orig_size, mem_used = 0; 1094 long max_used; 1095 ssize_t ret; 1096 1097 memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats)); 1098 1099 down_read(&zram->init_lock); 1100 if (init_done(zram)) { 1101 mem_used = zs_get_total_pages(zram->mem_pool); 1102 zs_pool_stats(zram->mem_pool, &pool_stats); 1103 } 1104 1105 orig_size = atomic64_read(&zram->stats.pages_stored); 1106 max_used = atomic_long_read(&zram->stats.max_used_pages); 1107 1108 ret = scnprintf(buf, PAGE_SIZE, 1109 "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n", 1110 orig_size << PAGE_SHIFT, 1111 (u64)atomic64_read(&zram->stats.compr_data_size), 1112 mem_used << PAGE_SHIFT, 1113 zram->limit_pages << PAGE_SHIFT, 1114 max_used << PAGE_SHIFT, 1115 (u64)atomic64_read(&zram->stats.same_pages), 1116 
atomic_long_read(&pool_stats.pages_compacted), 1117 (u64)atomic64_read(&zram->stats.huge_pages), 1118 (u64)atomic64_read(&zram->stats.huge_pages_since)); 1119 up_read(&zram->init_lock); 1120 1121 return ret; 1122} 1123 1124#ifdef CONFIG_ZRAM_WRITEBACK 1125#define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12))) 1126static ssize_t bd_stat_show(struct device *dev, 1127 struct device_attribute *attr, char *buf) 1128{ 1129 struct zram *zram = dev_to_zram(dev); 1130 ssize_t ret; 1131 1132 down_read(&zram->init_lock); 1133 ret = scnprintf(buf, PAGE_SIZE, 1134 "%8llu %8llu %8llu\n", 1135 FOUR_K((u64)atomic64_read(&zram->stats.bd_count)), 1136 FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)), 1137 FOUR_K((u64)atomic64_read(&zram->stats.bd_writes))); 1138 up_read(&zram->init_lock); 1139 1140 return ret; 1141} 1142#endif 1143 1144static ssize_t debug_stat_show(struct device *dev, 1145 struct device_attribute *attr, char *buf) 1146{ 1147 int version = 2; 1148 struct zram *zram = dev_to_zram(dev); 1149 ssize_t ret; 1150 1151 down_read(&zram->init_lock); 1152 ret = scnprintf(buf, PAGE_SIZE, 1153 "version: %d\n%8llu\n", 1154 version, 1155 (u64)atomic64_read(&zram->stats.miss_free)); 1156 up_read(&zram->init_lock); 1157 1158 return ret; 1159} 1160 1161static DEVICE_ATTR_RO(io_stat); 1162static DEVICE_ATTR_RO(mm_stat); 1163#ifdef CONFIG_ZRAM_WRITEBACK 1164static DEVICE_ATTR_RO(bd_stat); 1165#endif 1166static DEVICE_ATTR_RO(debug_stat); 1167 1168static void zram_meta_free(struct zram *zram, u64 disksize) 1169{ 1170 size_t num_pages = disksize >> PAGE_SHIFT; 1171 size_t index; 1172 1173 /* Free all pages that are still in this zram device */ 1174 for (index = 0; index < num_pages; index++) 1175 zram_free_page(zram, index); 1176 1177 zs_destroy_pool(zram->mem_pool); 1178 vfree(zram->table); 1179} 1180 1181static bool zram_meta_alloc(struct zram *zram, u64 disksize) 1182{ 1183 size_t num_pages; 1184 1185 num_pages = disksize >> PAGE_SHIFT; 1186 zram->table = vzalloc(array_size(num_pages, 
sizeof(*zram->table))); 1187 if (!zram->table) 1188 return false; 1189 1190 zram->mem_pool = zs_create_pool(zram->disk->disk_name); 1191 if (!zram->mem_pool) { 1192 vfree(zram->table); 1193 return false; 1194 } 1195 1196 if (!huge_class_size) 1197 huge_class_size = zs_huge_class_size(zram->mem_pool); 1198 return true; 1199} 1200 1201/* 1202 * To protect concurrent access to the same index entry, 1203 * caller should hold this table index entry's bit_spinlock to 1204 * indicate this index entry is accessing. 1205 */ 1206static void zram_free_page(struct zram *zram, size_t index) 1207{ 1208 unsigned long handle; 1209 1210#ifdef CONFIG_ZRAM_MEMORY_TRACKING 1211 zram->table[index].ac_time = 0; 1212#endif 1213 if (zram_test_flag(zram, index, ZRAM_IDLE)) 1214 zram_clear_flag(zram, index, ZRAM_IDLE); 1215 1216 if (zram_test_flag(zram, index, ZRAM_HUGE)) { 1217 zram_clear_flag(zram, index, ZRAM_HUGE); 1218 atomic64_dec(&zram->stats.huge_pages); 1219 } 1220 1221 if (zram_test_flag(zram, index, ZRAM_WB)) { 1222 zram_clear_flag(zram, index, ZRAM_WB); 1223 free_block_bdev(zram, zram_get_element(zram, index)); 1224 goto out; 1225 } 1226 1227 /* 1228 * No memory is allocated for same element filled pages. 1229 * Simply clear same page flag. 
1230 */ 1231 if (zram_test_flag(zram, index, ZRAM_SAME)) { 1232 zram_clear_flag(zram, index, ZRAM_SAME); 1233 atomic64_dec(&zram->stats.same_pages); 1234 goto out; 1235 } 1236 1237 handle = zram_get_handle(zram, index); 1238 if (!handle) 1239 return; 1240 1241 zs_free(zram->mem_pool, handle); 1242 1243 atomic64_sub(zram_get_obj_size(zram, index), 1244 &zram->stats.compr_data_size); 1245out: 1246 atomic64_dec(&zram->stats.pages_stored); 1247 zram_set_handle(zram, index, 0); 1248 zram_set_obj_size(zram, index, 0); 1249 WARN_ON_ONCE(zram->table[index].flags & 1250 ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); 1251} 1252 1253static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, 1254 struct bio *bio, bool partial_io) 1255{ 1256 struct zcomp_strm *zstrm; 1257 unsigned long handle; 1258 unsigned int size; 1259 void *src, *dst; 1260 int ret; 1261 1262 zram_slot_lock(zram, index); 1263 if (zram_test_flag(zram, index, ZRAM_WB)) { 1264 struct bio_vec bvec; 1265 1266 zram_slot_unlock(zram, index); 1267 1268 bvec.bv_page = page; 1269 bvec.bv_len = PAGE_SIZE; 1270 bvec.bv_offset = 0; 1271 return read_from_bdev(zram, &bvec, 1272 zram_get_element(zram, index), 1273 bio, partial_io); 1274 } 1275 1276 handle = zram_get_handle(zram, index); 1277 if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { 1278 unsigned long value; 1279 void *mem; 1280 1281 value = handle ? 
zram_get_element(zram, index) : 0; 1282 mem = kmap_atomic(page); 1283 zram_fill_page(mem, PAGE_SIZE, value); 1284 kunmap_atomic(mem); 1285 zram_slot_unlock(zram, index); 1286 return 0; 1287 } 1288 1289 size = zram_get_obj_size(zram, index); 1290 1291 if (size != PAGE_SIZE) 1292 zstrm = zcomp_stream_get(zram->comp); 1293 1294 src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); 1295 if (size == PAGE_SIZE) { 1296 dst = kmap_atomic(page); 1297 memcpy(dst, src, PAGE_SIZE); 1298 kunmap_atomic(dst); 1299 ret = 0; 1300 } else { 1301 dst = kmap_atomic(page); 1302 ret = zcomp_decompress(zstrm, src, size, dst); 1303 kunmap_atomic(dst); 1304 zcomp_stream_put(zram->comp); 1305 } 1306 zs_unmap_object(zram->mem_pool, handle); 1307 zram_slot_unlock(zram, index); 1308 1309 /* Should NEVER happen. Return bio error if it does. */ 1310 if (WARN_ON(ret)) 1311 pr_err("Decompression failed! err=%d, page=%u\n", ret, index); 1312 1313 return ret; 1314} 1315 1316static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, 1317 u32 index, int offset, struct bio *bio) 1318{ 1319 int ret; 1320 struct page *page; 1321 1322 page = bvec->bv_page; 1323 if (is_partial_io(bvec)) { 1324 /* Use a temporary buffer to decompress the page */ 1325 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); 1326 if (!page) 1327 return -ENOMEM; 1328 } 1329 1330 ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec)); 1331 if (unlikely(ret)) 1332 goto out; 1333 1334 if (is_partial_io(bvec)) { 1335 void *src = kmap_atomic(page); 1336 1337 memcpy_to_bvec(bvec, src + offset); 1338 kunmap_atomic(src); 1339 } 1340out: 1341 if (is_partial_io(bvec)) 1342 __free_page(page); 1343 1344 return ret; 1345} 1346 1347static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, 1348 u32 index, struct bio *bio) 1349{ 1350 int ret = 0; 1351 unsigned long alloced_pages; 1352 unsigned long handle = 0; 1353 unsigned int comp_len = 0; 1354 void *src, *dst, *mem; 1355 struct zcomp_strm *zstrm; 1356 struct page *page = 
bvec->bv_page; 1357 unsigned long element = 0; 1358 enum zram_pageflags flags = 0; 1359 1360 mem = kmap_atomic(page); 1361 if (page_same_filled(mem, &element)) { 1362 kunmap_atomic(mem); 1363 /* Free memory associated with this sector now. */ 1364 flags = ZRAM_SAME; 1365 atomic64_inc(&zram->stats.same_pages); 1366 goto out; 1367 } 1368 kunmap_atomic(mem); 1369 1370 zstrm = zcomp_stream_get(zram->comp); 1371 src = kmap_atomic(page); 1372 ret = zcomp_compress(zstrm, src, &comp_len); 1373 kunmap_atomic(src); 1374 1375 if (unlikely(ret)) { 1376 zcomp_stream_put(zram->comp); 1377 pr_err("Compression failed! err=%d\n", ret); 1378 return ret; 1379 } 1380 1381 if (comp_len >= huge_class_size) 1382 comp_len = PAGE_SIZE; 1383 1384 handle = zs_malloc(zram->mem_pool, comp_len, 1385 __GFP_KSWAPD_RECLAIM | 1386 __GFP_NOWARN | 1387 __GFP_HIGHMEM | 1388 __GFP_MOVABLE); 1389 1390 if (unlikely(!handle)) { 1391 zcomp_stream_put(zram->comp); 1392 return -ENOMEM; 1393 } 1394 1395 alloced_pages = zs_get_total_pages(zram->mem_pool); 1396 update_used_max(zram, alloced_pages); 1397 1398 if (zram->limit_pages && alloced_pages > zram->limit_pages) { 1399 zcomp_stream_put(zram->comp); 1400 zs_free(zram->mem_pool, handle); 1401 return -ENOMEM; 1402 } 1403 1404 dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); 1405 1406 src = zstrm->buffer; 1407 if (comp_len == PAGE_SIZE) 1408 src = kmap_atomic(page); 1409 memcpy(dst, src, comp_len); 1410 if (comp_len == PAGE_SIZE) 1411 kunmap_atomic(src); 1412 1413 zcomp_stream_put(zram->comp); 1414 zs_unmap_object(zram->mem_pool, handle); 1415 atomic64_add(comp_len, &zram->stats.compr_data_size); 1416out: 1417 /* 1418 * Free memory associated with this sector 1419 * before overwriting unused sectors. 
1420 */ 1421 zram_slot_lock(zram, index); 1422 zram_free_page(zram, index); 1423 1424 if (comp_len == PAGE_SIZE) { 1425 zram_set_flag(zram, index, ZRAM_HUGE); 1426 atomic64_inc(&zram->stats.huge_pages); 1427 atomic64_inc(&zram->stats.huge_pages_since); 1428 } 1429 1430 if (flags) { 1431 zram_set_flag(zram, index, flags); 1432 zram_set_element(zram, index, element); 1433 } else { 1434 zram_set_handle(zram, index, handle); 1435 zram_set_obj_size(zram, index, comp_len); 1436 } 1437 zram_slot_unlock(zram, index); 1438 1439 /* Update stats */ 1440 atomic64_inc(&zram->stats.pages_stored); 1441 return ret; 1442} 1443 1444static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, 1445 u32 index, int offset, struct bio *bio) 1446{ 1447 int ret; 1448 struct page *page = NULL; 1449 struct bio_vec vec; 1450 1451 vec = *bvec; 1452 if (is_partial_io(bvec)) { 1453 void *dst; 1454 /* 1455 * This is a partial IO. We need to read the full page 1456 * before to write the changes. 1457 */ 1458 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); 1459 if (!page) 1460 return -ENOMEM; 1461 1462 ret = __zram_bvec_read(zram, page, index, bio, true); 1463 if (ret) 1464 goto out; 1465 1466 dst = kmap_atomic(page); 1467 memcpy_from_bvec(dst + offset, bvec); 1468 kunmap_atomic(dst); 1469 1470 vec.bv_page = page; 1471 vec.bv_len = PAGE_SIZE; 1472 vec.bv_offset = 0; 1473 } 1474 1475 ret = __zram_bvec_write(zram, &vec, index, bio); 1476out: 1477 if (is_partial_io(bvec)) 1478 __free_page(page); 1479 return ret; 1480} 1481 1482/* 1483 * zram_bio_discard - handler on discard request 1484 * @index: physical block index in PAGE_SIZE units 1485 * @offset: byte offset within physical block 1486 */ 1487static void zram_bio_discard(struct zram *zram, u32 index, 1488 int offset, struct bio *bio) 1489{ 1490 size_t n = bio->bi_iter.bi_size; 1491 1492 /* 1493 * zram manages data in physical block size units. 
Because logical block 1494 * size isn't identical with physical block size on some arch, we 1495 * could get a discard request pointing to a specific offset within a 1496 * certain physical block. Although we can handle this request by 1497 * reading that physiclal block and decompressing and partially zeroing 1498 * and re-compressing and then re-storing it, this isn't reasonable 1499 * because our intent with a discard request is to save memory. So 1500 * skipping this logical block is appropriate here. 1501 */ 1502 if (offset) { 1503 if (n <= (PAGE_SIZE - offset)) 1504 return; 1505 1506 n -= (PAGE_SIZE - offset); 1507 index++; 1508 } 1509 1510 while (n >= PAGE_SIZE) { 1511 zram_slot_lock(zram, index); 1512 zram_free_page(zram, index); 1513 zram_slot_unlock(zram, index); 1514 atomic64_inc(&zram->stats.notify_free); 1515 index++; 1516 n -= PAGE_SIZE; 1517 } 1518} 1519 1520/* 1521 * Returns errno if it has some problem. Otherwise return 0 or 1. 1522 * Returns 0 if IO request was done synchronously 1523 * Returns 1 if IO request was successfully submitted. 
1524 */ 1525static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, 1526 int offset, unsigned int op, struct bio *bio) 1527{ 1528 int ret; 1529 1530 if (!op_is_write(op)) { 1531 atomic64_inc(&zram->stats.num_reads); 1532 ret = zram_bvec_read(zram, bvec, index, offset, bio); 1533 flush_dcache_page(bvec->bv_page); 1534 } else { 1535 atomic64_inc(&zram->stats.num_writes); 1536 ret = zram_bvec_write(zram, bvec, index, offset, bio); 1537 } 1538 1539 zram_slot_lock(zram, index); 1540 zram_accessed(zram, index); 1541 zram_slot_unlock(zram, index); 1542 1543 if (unlikely(ret < 0)) { 1544 if (!op_is_write(op)) 1545 atomic64_inc(&zram->stats.failed_reads); 1546 else 1547 atomic64_inc(&zram->stats.failed_writes); 1548 } 1549 1550 return ret; 1551} 1552 1553static void __zram_make_request(struct zram *zram, struct bio *bio) 1554{ 1555 int offset; 1556 u32 index; 1557 struct bio_vec bvec; 1558 struct bvec_iter iter; 1559 unsigned long start_time; 1560 1561 index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; 1562 offset = (bio->bi_iter.bi_sector & 1563 (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; 1564 1565 switch (bio_op(bio)) { 1566 case REQ_OP_DISCARD: 1567 case REQ_OP_WRITE_ZEROES: 1568 zram_bio_discard(zram, index, offset, bio); 1569 bio_endio(bio); 1570 return; 1571 default: 1572 break; 1573 } 1574 1575 start_time = bio_start_io_acct(bio); 1576 bio_for_each_segment(bvec, bio, iter) { 1577 struct bio_vec bv = bvec; 1578 unsigned int unwritten = bvec.bv_len; 1579 1580 do { 1581 bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, 1582 unwritten); 1583 if (zram_bvec_rw(zram, &bv, index, offset, 1584 bio_op(bio), bio) < 0) { 1585 bio->bi_status = BLK_STS_IOERR; 1586 break; 1587 } 1588 1589 bv.bv_offset += bv.bv_len; 1590 unwritten -= bv.bv_len; 1591 1592 update_position(&index, &offset, &bv); 1593 } while (unwritten); 1594 } 1595 bio_end_io_acct(bio, start_time); 1596 bio_endio(bio); 1597} 1598 1599/* 1600 * Handler function for all zram I/O requests. 
1601 */ 1602static void zram_submit_bio(struct bio *bio) 1603{ 1604 struct zram *zram = bio->bi_bdev->bd_disk->private_data; 1605 1606 if (!valid_io_request(zram, bio->bi_iter.bi_sector, 1607 bio->bi_iter.bi_size)) { 1608 atomic64_inc(&zram->stats.invalid_io); 1609 bio_io_error(bio); 1610 return; 1611 } 1612 1613 __zram_make_request(zram, bio); 1614} 1615 1616static void zram_slot_free_notify(struct block_device *bdev, 1617 unsigned long index) 1618{ 1619 struct zram *zram; 1620 1621 zram = bdev->bd_disk->private_data; 1622 1623 atomic64_inc(&zram->stats.notify_free); 1624 if (!zram_slot_trylock(zram, index)) { 1625 atomic64_inc(&zram->stats.miss_free); 1626 return; 1627 } 1628 1629 zram_free_page(zram, index); 1630 zram_slot_unlock(zram, index); 1631} 1632 1633static int zram_rw_page(struct block_device *bdev, sector_t sector, 1634 struct page *page, unsigned int op) 1635{ 1636 int offset, ret; 1637 u32 index; 1638 struct zram *zram; 1639 struct bio_vec bv; 1640 unsigned long start_time; 1641 1642 if (PageTransHuge(page)) 1643 return -ENOTSUPP; 1644 zram = bdev->bd_disk->private_data; 1645 1646 if (!valid_io_request(zram, sector, PAGE_SIZE)) { 1647 atomic64_inc(&zram->stats.invalid_io); 1648 ret = -EINVAL; 1649 goto out; 1650 } 1651 1652 index = sector >> SECTORS_PER_PAGE_SHIFT; 1653 offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; 1654 1655 bv.bv_page = page; 1656 bv.bv_len = PAGE_SIZE; 1657 bv.bv_offset = 0; 1658 1659 start_time = bdev_start_io_acct(bdev->bd_disk->part0, 1660 SECTORS_PER_PAGE, op, jiffies); 1661 ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); 1662 bdev_end_io_acct(bdev->bd_disk->part0, op, start_time); 1663out: 1664 /* 1665 * If I/O fails, just return error(ie, non-zero) without 1666 * calling page_endio. 
1667 * It causes resubmit the I/O with bio request by upper functions 1668 * of rw_page(e.g., swap_readpage, __swap_writepage) and 1669 * bio->bi_end_io does things to handle the error 1670 * (e.g., SetPageError, set_page_dirty and extra works). 1671 */ 1672 if (unlikely(ret < 0)) 1673 return ret; 1674 1675 switch (ret) { 1676 case 0: 1677 page_endio(page, op_is_write(op), 0); 1678 break; 1679 case 1: 1680 ret = 0; 1681 break; 1682 default: 1683 WARN_ON(1); 1684 } 1685 return ret; 1686} 1687 1688static void zram_reset_device(struct zram *zram) 1689{ 1690 struct zcomp *comp; 1691 u64 disksize; 1692 1693 down_write(&zram->init_lock); 1694 1695 zram->limit_pages = 0; 1696 1697 if (!init_done(zram)) { 1698 up_write(&zram->init_lock); 1699 return; 1700 } 1701 1702 comp = zram->comp; 1703 disksize = zram->disksize; 1704 zram->disksize = 0; 1705 1706 set_capacity_and_notify(zram->disk, 0); 1707 part_stat_set_all(zram->disk->part0, 0); 1708 1709 /* I/O operation under all of CPU are done so let's free */ 1710 zram_meta_free(zram, disksize); 1711 memset(&zram->stats, 0, sizeof(zram->stats)); 1712 zcomp_destroy(comp); 1713 reset_bdev(zram); 1714 1715 up_write(&zram->init_lock); 1716} 1717 1718static ssize_t disksize_store(struct device *dev, 1719 struct device_attribute *attr, const char *buf, size_t len) 1720{ 1721 u64 disksize; 1722 struct zcomp *comp; 1723 struct zram *zram = dev_to_zram(dev); 1724 int err; 1725 1726 disksize = memparse(buf, NULL); 1727 if (!disksize) 1728 return -EINVAL; 1729 1730 down_write(&zram->init_lock); 1731 if (init_done(zram)) { 1732 pr_info("Cannot change disksize for initialized device\n"); 1733 err = -EBUSY; 1734 goto out_unlock; 1735 } 1736 1737 disksize = PAGE_ALIGN(disksize); 1738 if (!zram_meta_alloc(zram, disksize)) { 1739 err = -ENOMEM; 1740 goto out_unlock; 1741 } 1742 1743 comp = zcomp_create(zram->compressor); 1744 if (IS_ERR(comp)) { 1745 pr_err("Cannot initialise %s compressing backend\n", 1746 zram->compressor); 1747 err = 
PTR_ERR(comp); 1748 goto out_free_meta; 1749 } 1750 1751 zram->comp = comp; 1752 zram->disksize = disksize; 1753 set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT); 1754 up_write(&zram->init_lock); 1755 1756 return len; 1757 1758out_free_meta: 1759 zram_meta_free(zram, disksize); 1760out_unlock: 1761 up_write(&zram->init_lock); 1762 return err; 1763} 1764 1765static ssize_t reset_store(struct device *dev, 1766 struct device_attribute *attr, const char *buf, size_t len) 1767{ 1768 int ret; 1769 unsigned short do_reset; 1770 struct zram *zram; 1771 struct gendisk *disk; 1772 1773 ret = kstrtou16(buf, 10, &do_reset); 1774 if (ret) 1775 return ret; 1776 1777 if (!do_reset) 1778 return -EINVAL; 1779 1780 zram = dev_to_zram(dev); 1781 disk = zram->disk; 1782 1783 mutex_lock(&disk->open_mutex); 1784 /* Do not reset an active device or claimed device */ 1785 if (disk_openers(disk) || zram->claim) { 1786 mutex_unlock(&disk->open_mutex); 1787 return -EBUSY; 1788 } 1789 1790 /* From now on, anyone can't open /dev/zram[0-9] */ 1791 zram->claim = true; 1792 mutex_unlock(&disk->open_mutex); 1793 1794 /* Make sure all the pending I/O are finished */ 1795 sync_blockdev(disk->part0); 1796 zram_reset_device(zram); 1797 1798 mutex_lock(&disk->open_mutex); 1799 zram->claim = false; 1800 mutex_unlock(&disk->open_mutex); 1801 1802 return len; 1803} 1804 1805static int zram_open(struct block_device *bdev, fmode_t mode) 1806{ 1807 int ret = 0; 1808 struct zram *zram; 1809 1810 WARN_ON(!mutex_is_locked(&bdev->bd_disk->open_mutex)); 1811 1812 zram = bdev->bd_disk->private_data; 1813 /* zram was claimed to reset so open request fails */ 1814 if (zram->claim) 1815 ret = -EBUSY; 1816 1817 return ret; 1818} 1819 1820static const struct block_device_operations zram_devops = { 1821 .open = zram_open, 1822 .submit_bio = zram_submit_bio, 1823 .swap_slot_free_notify = zram_slot_free_notify, 1824 .rw_page = zram_rw_page, 1825 .owner = THIS_MODULE 1826}; 1827 1828#ifdef 
CONFIG_ZRAM_WRITEBACK 1829static const struct block_device_operations zram_wb_devops = { 1830 .open = zram_open, 1831 .submit_bio = zram_submit_bio, 1832 .swap_slot_free_notify = zram_slot_free_notify, 1833 .owner = THIS_MODULE 1834}; 1835#endif 1836 1837static DEVICE_ATTR_WO(compact); 1838static DEVICE_ATTR_RW(disksize); 1839static DEVICE_ATTR_RO(initstate); 1840static DEVICE_ATTR_WO(reset); 1841static DEVICE_ATTR_WO(mem_limit); 1842static DEVICE_ATTR_WO(mem_used_max); 1843static DEVICE_ATTR_WO(idle); 1844static DEVICE_ATTR_RW(max_comp_streams); 1845static DEVICE_ATTR_RW(comp_algorithm); 1846#ifdef CONFIG_ZRAM_WRITEBACK 1847static DEVICE_ATTR_RW(backing_dev); 1848static DEVICE_ATTR_WO(writeback); 1849static DEVICE_ATTR_RW(writeback_limit); 1850static DEVICE_ATTR_RW(writeback_limit_enable); 1851#endif 1852 1853static struct attribute *zram_disk_attrs[] = { 1854 &dev_attr_disksize.attr, 1855 &dev_attr_initstate.attr, 1856 &dev_attr_reset.attr, 1857 &dev_attr_compact.attr, 1858 &dev_attr_mem_limit.attr, 1859 &dev_attr_mem_used_max.attr, 1860 &dev_attr_idle.attr, 1861 &dev_attr_max_comp_streams.attr, 1862 &dev_attr_comp_algorithm.attr, 1863#ifdef CONFIG_ZRAM_WRITEBACK 1864 &dev_attr_backing_dev.attr, 1865 &dev_attr_writeback.attr, 1866 &dev_attr_writeback_limit.attr, 1867 &dev_attr_writeback_limit_enable.attr, 1868#endif 1869 &dev_attr_io_stat.attr, 1870 &dev_attr_mm_stat.attr, 1871#ifdef CONFIG_ZRAM_WRITEBACK 1872 &dev_attr_bd_stat.attr, 1873#endif 1874 &dev_attr_debug_stat.attr, 1875 NULL, 1876}; 1877 1878ATTRIBUTE_GROUPS(zram_disk); 1879 1880/* 1881 * Allocate and initialize new zram device. the function returns 1882 * '>= 0' device_id upon success, and negative value otherwise. 
1883 */ 1884static int zram_add(void) 1885{ 1886 struct zram *zram; 1887 int ret, device_id; 1888 1889 zram = kzalloc(sizeof(struct zram), GFP_KERNEL); 1890 if (!zram) 1891 return -ENOMEM; 1892 1893 ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL); 1894 if (ret < 0) 1895 goto out_free_dev; 1896 device_id = ret; 1897 1898 init_rwsem(&zram->init_lock); 1899#ifdef CONFIG_ZRAM_WRITEBACK 1900 spin_lock_init(&zram->wb_limit_lock); 1901#endif 1902 1903 /* gendisk structure */ 1904 zram->disk = blk_alloc_disk(NUMA_NO_NODE); 1905 if (!zram->disk) { 1906 pr_err("Error allocating disk structure for device %d\n", 1907 device_id); 1908 ret = -ENOMEM; 1909 goto out_free_idr; 1910 } 1911 1912 zram->disk->major = zram_major; 1913 zram->disk->first_minor = device_id; 1914 zram->disk->minors = 1; 1915 zram->disk->flags |= GENHD_FL_NO_PART; 1916 zram->disk->fops = &zram_devops; 1917 zram->disk->private_data = zram; 1918 snprintf(zram->disk->disk_name, 16, "zram%d", device_id); 1919 1920 /* Actual capacity set using syfs (/sys/block/zram<id>/disksize */ 1921 set_capacity(zram->disk, 0); 1922 /* zram devices sort of resembles non-rotational disks */ 1923 blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); 1924 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); 1925 1926 /* 1927 * To ensure that we always get PAGE_SIZE aligned 1928 * and n*PAGE_SIZED sized I/O requests. 1929 */ 1930 blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); 1931 blk_queue_logical_block_size(zram->disk->queue, 1932 ZRAM_LOGICAL_BLOCK_SIZE); 1933 blk_queue_io_min(zram->disk->queue, PAGE_SIZE); 1934 blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); 1935 zram->disk->queue->limits.discard_granularity = PAGE_SIZE; 1936 blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); 1937 1938 /* 1939 * zram_bio_discard() will clear all logical blocks if logical block 1940 * size is identical with physical block size(PAGE_SIZE). 
But if it is 1941 * different, we will skip discarding some parts of logical blocks in 1942 * the part of the request range which isn't aligned to physical block 1943 * size. So we can't ensure that all discarded logical blocks are 1944 * zeroed. 1945 */ 1946 if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) 1947 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); 1948 1949 ret = device_add_disk(NULL, zram->disk, zram_disk_groups); 1950 if (ret) 1951 goto out_cleanup_disk; 1952 1953 strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); 1954 1955 zram_debugfs_register(zram); 1956 pr_info("Added device: %s\n", zram->disk->disk_name); 1957 return device_id; 1958 1959out_cleanup_disk: 1960 blk_cleanup_disk(zram->disk); 1961out_free_idr: 1962 idr_remove(&zram_index_idr, device_id); 1963out_free_dev: 1964 kfree(zram); 1965 return ret; 1966} 1967 1968static int zram_remove(struct zram *zram) 1969{ 1970 bool claimed; 1971 1972 mutex_lock(&zram->disk->open_mutex); 1973 if (disk_openers(zram->disk)) { 1974 mutex_unlock(&zram->disk->open_mutex); 1975 return -EBUSY; 1976 } 1977 1978 claimed = zram->claim; 1979 if (!claimed) 1980 zram->claim = true; 1981 mutex_unlock(&zram->disk->open_mutex); 1982 1983 zram_debugfs_unregister(zram); 1984 1985 if (claimed) { 1986 /* 1987 * If we were claimed by reset_store(), del_gendisk() will 1988 * wait until reset_store() is done, so nothing need to do. 
1989 */ 1990 ; 1991 } else { 1992 /* Make sure all the pending I/O are finished */ 1993 sync_blockdev(zram->disk->part0); 1994 zram_reset_device(zram); 1995 } 1996 1997 pr_info("Removed device: %s\n", zram->disk->disk_name); 1998 1999 del_gendisk(zram->disk); 2000 2001 /* del_gendisk drains pending reset_store */ 2002 WARN_ON_ONCE(claimed && zram->claim); 2003 2004 /* 2005 * disksize_store() may be called in between zram_reset_device() 2006 * and del_gendisk(), so run the last reset to avoid leaking 2007 * anything allocated with disksize_store() 2008 */ 2009 zram_reset_device(zram); 2010 2011 blk_cleanup_disk(zram->disk); 2012 kfree(zram); 2013 return 0; 2014} 2015 2016/* zram-control sysfs attributes */ 2017 2018/* 2019 * NOTE: hot_add attribute is not the usual read-only sysfs attribute. In a 2020 * sense that reading from this file does alter the state of your system -- it 2021 * creates a new un-initialized zram device and returns back this device's 2022 * device_id (or an error code if it fails to create a new device). 
2023 */ 2024static ssize_t hot_add_show(struct class *class, 2025 struct class_attribute *attr, 2026 char *buf) 2027{ 2028 int ret; 2029 2030 mutex_lock(&zram_index_mutex); 2031 ret = zram_add(); 2032 mutex_unlock(&zram_index_mutex); 2033 2034 if (ret < 0) 2035 return ret; 2036 return scnprintf(buf, PAGE_SIZE, "%d\n", ret); 2037} 2038static struct class_attribute class_attr_hot_add = 2039 __ATTR(hot_add, 0400, hot_add_show, NULL); 2040 2041static ssize_t hot_remove_store(struct class *class, 2042 struct class_attribute *attr, 2043 const char *buf, 2044 size_t count) 2045{ 2046 struct zram *zram; 2047 int ret, dev_id; 2048 2049 /* dev_id is gendisk->first_minor, which is `int' */ 2050 ret = kstrtoint(buf, 10, &dev_id); 2051 if (ret) 2052 return ret; 2053 if (dev_id < 0) 2054 return -EINVAL; 2055 2056 mutex_lock(&zram_index_mutex); 2057 2058 zram = idr_find(&zram_index_idr, dev_id); 2059 if (zram) { 2060 ret = zram_remove(zram); 2061 if (!ret) 2062 idr_remove(&zram_index_idr, dev_id); 2063 } else { 2064 ret = -ENODEV; 2065 } 2066 2067 mutex_unlock(&zram_index_mutex); 2068 return ret ? 
ret : count; 2069} 2070static CLASS_ATTR_WO(hot_remove); 2071 2072static struct attribute *zram_control_class_attrs[] = { 2073 &class_attr_hot_add.attr, 2074 &class_attr_hot_remove.attr, 2075 NULL, 2076}; 2077ATTRIBUTE_GROUPS(zram_control_class); 2078 2079static struct class zram_control_class = { 2080 .name = "zram-control", 2081 .owner = THIS_MODULE, 2082 .class_groups = zram_control_class_groups, 2083}; 2084 2085static int zram_remove_cb(int id, void *ptr, void *data) 2086{ 2087 WARN_ON_ONCE(zram_remove(ptr)); 2088 return 0; 2089} 2090 2091static void destroy_devices(void) 2092{ 2093 class_unregister(&zram_control_class); 2094 idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); 2095 zram_debugfs_destroy(); 2096 idr_destroy(&zram_index_idr); 2097 unregister_blkdev(zram_major, "zram"); 2098 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); 2099} 2100 2101static int __init zram_init(void) 2102{ 2103 int ret; 2104 2105 ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", 2106 zcomp_cpu_up_prepare, zcomp_cpu_dead); 2107 if (ret < 0) 2108 return ret; 2109 2110 ret = class_register(&zram_control_class); 2111 if (ret) { 2112 pr_err("Unable to register zram-control class\n"); 2113 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); 2114 return ret; 2115 } 2116 2117 zram_debugfs_create(); 2118 zram_major = register_blkdev(0, "zram"); 2119 if (zram_major <= 0) { 2120 pr_err("Unable to get major number\n"); 2121 class_unregister(&zram_control_class); 2122 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); 2123 return -EBUSY; 2124 } 2125 2126 while (num_devices != 0) { 2127 mutex_lock(&zram_index_mutex); 2128 ret = zram_add(); 2129 mutex_unlock(&zram_index_mutex); 2130 if (ret < 0) 2131 goto out_error; 2132 num_devices--; 2133 } 2134 2135 return 0; 2136 2137out_error: 2138 destroy_devices(); 2139 return ret; 2140} 2141 2142static void __exit zram_exit(void) 2143{ 2144 destroy_devices(); 2145} 2146 2147module_init(zram_init); 2148module_exit(zram_exit); 2149 
/* Number of devices zram_init() creates when the module loads (default 1) */
module_param(num_devices, uint, 0);
MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Block Device");