cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

compression.c (51397B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2008 Oracle.  All rights reserved.
      4 */
      5
      6#include <linux/kernel.h>
      7#include <linux/bio.h>
      8#include <linux/file.h>
      9#include <linux/fs.h>
     10#include <linux/pagemap.h>
     11#include <linux/highmem.h>
     12#include <linux/kthread.h>
     13#include <linux/time.h>
     14#include <linux/init.h>
     15#include <linux/string.h>
     16#include <linux/backing-dev.h>
     17#include <linux/writeback.h>
     18#include <linux/slab.h>
     19#include <linux/sched/mm.h>
     20#include <linux/log2.h>
     21#include <crypto/hash.h>
     22#include "misc.h"
     23#include "ctree.h"
     24#include "disk-io.h"
     25#include "transaction.h"
     26#include "btrfs_inode.h"
     27#include "volumes.h"
     28#include "ordered-data.h"
     29#include "compression.h"
     30#include "extent_io.h"
     31#include "extent_map.h"
     32#include "subpage.h"
     33#include "zoned.h"
     34
     35static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
     36
     37const char* btrfs_compress_type2str(enum btrfs_compression_type type)
     38{
     39	switch (type) {
     40	case BTRFS_COMPRESS_ZLIB:
     41	case BTRFS_COMPRESS_LZO:
     42	case BTRFS_COMPRESS_ZSTD:
     43	case BTRFS_COMPRESS_NONE:
     44		return btrfs_compress_types[type];
     45	default:
     46		break;
     47	}
     48
     49	return NULL;
     50}
     51
     52bool btrfs_compress_is_valid_type(const char *str, size_t len)
     53{
     54	int i;
     55
     56	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
     57		size_t comp_len = strlen(btrfs_compress_types[i]);
     58
     59		if (len < comp_len)
     60			continue;
     61
     62		if (!strncmp(btrfs_compress_types[i], str, comp_len))
     63			return true;
     64	}
     65	return false;
     66}
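        /*
         * Illustrative note (not part of the original file): since only the
         * first strlen(btrfs_compress_types[i]) bytes are compared, a value
         * carrying a level suffix such as "zstd:3" is still reported as a
         * valid type, while the empty entry at index 0 is skipped so an empty
         * string is always rejected.
         */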
     67
     68static int compression_compress_pages(int type, struct list_head *ws,
     69               struct address_space *mapping, u64 start, struct page **pages,
     70               unsigned long *out_pages, unsigned long *total_in,
     71               unsigned long *total_out)
     72{
     73	switch (type) {
     74	case BTRFS_COMPRESS_ZLIB:
     75		return zlib_compress_pages(ws, mapping, start, pages,
     76				out_pages, total_in, total_out);
     77	case BTRFS_COMPRESS_LZO:
     78		return lzo_compress_pages(ws, mapping, start, pages,
     79				out_pages, total_in, total_out);
     80	case BTRFS_COMPRESS_ZSTD:
     81		return zstd_compress_pages(ws, mapping, start, pages,
     82				out_pages, total_in, total_out);
     83	case BTRFS_COMPRESS_NONE:
     84	default:
     85		/*
     86		 * This can happen when compression races with remount setting
      87		 * it to 'no compress', while the caller doesn't call
     88		 * inode_need_compress() to check if we really need to
     89		 * compress.
     90		 *
      91		 * Not a big deal, we just need to inform the caller that we
     92		 * haven't allocated any pages yet.
     93		 */
     94		*out_pages = 0;
     95		return -E2BIG;
     96	}
     97}
     98
     99static int compression_decompress_bio(struct list_head *ws,
    100				      struct compressed_bio *cb)
    101{
    102	switch (cb->compress_type) {
    103	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
    104	case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
    105	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
    106	case BTRFS_COMPRESS_NONE:
    107	default:
    108		/*
    109		 * This can't happen, the type is validated several times
    110		 * before we get here.
    111		 */
    112		BUG();
    113	}
    114}
    115
    116static int compression_decompress(int type, struct list_head *ws,
    117               unsigned char *data_in, struct page *dest_page,
    118               unsigned long start_byte, size_t srclen, size_t destlen)
    119{
    120	switch (type) {
    121	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
    122						start_byte, srclen, destlen);
    123	case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_page,
    124						start_byte, srclen, destlen);
    125	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
    126						start_byte, srclen, destlen);
    127	case BTRFS_COMPRESS_NONE:
    128	default:
    129		/*
    130		 * This can't happen, the type is validated several times
    131		 * before we get here.
    132		 */
    133		BUG();
    134	}
    135}
    136
    137static int btrfs_decompress_bio(struct compressed_bio *cb);
    138
    139static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
    140				      unsigned long disk_size)
    141{
    142	return sizeof(struct compressed_bio) +
    143		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
    144}
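        /*
         * Worked sizing example (illustrative, not from the original source):
         * with a 4KiB sectorsize and the default 4-byte crc32c checksum, a
         * 128KiB compressed extent yields DIV_ROUND_UP(128K, 4K) = 32 sectors,
         * so 32 * 4 = 128 bytes are reserved after the struct for cb->sums.
         */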
    145
    146static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
    147				 u64 disk_start)
    148{
    149	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    150	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
    151	const u32 csum_size = fs_info->csum_size;
    152	const u32 sectorsize = fs_info->sectorsize;
    153	struct page *page;
    154	unsigned int i;
    155	char *kaddr;
    156	u8 csum[BTRFS_CSUM_SIZE];
    157	struct compressed_bio *cb = bio->bi_private;
    158	u8 *cb_sum = cb->sums;
    159
    160	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
    161	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
    162		return 0;
    163
    164	shash->tfm = fs_info->csum_shash;
    165
    166	for (i = 0; i < cb->nr_pages; i++) {
    167		u32 pg_offset;
    168		u32 bytes_left = PAGE_SIZE;
    169		page = cb->compressed_pages[i];
    170
    171		/* Determine the remaining bytes inside the page first */
    172		if (i == cb->nr_pages - 1)
    173			bytes_left = cb->compressed_len - i * PAGE_SIZE;
    174
    175		/* Hash through the page sector by sector */
    176		for (pg_offset = 0; pg_offset < bytes_left;
    177		     pg_offset += sectorsize) {
    178			kaddr = kmap_atomic(page);
    179			crypto_shash_digest(shash, kaddr + pg_offset,
    180					    sectorsize, csum);
    181			kunmap_atomic(kaddr);
    182
    183			if (memcmp(&csum, cb_sum, csum_size) != 0) {
    184				btrfs_print_data_csum_error(inode, disk_start,
    185						csum, cb_sum, cb->mirror_num);
    186				if (btrfs_bio(bio)->device)
    187					btrfs_dev_stat_inc_and_print(
    188						btrfs_bio(bio)->device,
    189						BTRFS_DEV_STAT_CORRUPTION_ERRS);
    190				return -EIO;
    191			}
    192			cb_sum += csum_size;
    193			disk_start += sectorsize;
    194		}
    195	}
    196	return 0;
    197}
    198
    199/*
    200 * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
    201 *
    202 * Return true if there is no pending bio nor io.
    203 * Return false otherwise.
    204 */
    205static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
    206{
    207	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
    208	unsigned int bi_size = 0;
    209	bool last_io = false;
    210	struct bio_vec *bvec;
    211	struct bvec_iter_all iter_all;
    212
    213	/*
    214	 * At endio time, bi_iter.bi_size doesn't represent the real bio size.
    215	 * Thus here we have to iterate through all segments to grab correct
    216	 * bio size.
    217	 */
    218	bio_for_each_segment_all(bvec, bio, iter_all)
    219		bi_size += bvec->bv_len;
    220
    221	if (bio->bi_status)
    222		cb->status = bio->bi_status;
    223
    224	ASSERT(bi_size && bi_size <= cb->compressed_len);
    225	last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
    226					&cb->pending_sectors);
    227	/*
    228	 * Here we must wake up the possible error handler after all other
    229	 * operations on @cb finished, or we can race with
    230	 * finish_compressed_bio_*() which may free @cb.
    231	 */
    232	wake_up_var(cb);
    233
    234	return last_io;
    235}
    236
    237static void finish_compressed_bio_read(struct compressed_bio *cb)
    238{
    239	unsigned int index;
    240	struct page *page;
    241
    242	/* Release the compressed pages */
    243	for (index = 0; index < cb->nr_pages; index++) {
    244		page = cb->compressed_pages[index];
    245		page->mapping = NULL;
    246		put_page(page);
    247	}
    248
    249	/* Do io completion on the original bio */
    250	if (cb->status != BLK_STS_OK) {
    251		cb->orig_bio->bi_status = cb->status;
    252		bio_endio(cb->orig_bio);
    253	} else {
    254		struct bio_vec *bvec;
    255		struct bvec_iter_all iter_all;
    256
    257		/*
    258		 * We have verified the checksum already, set page checked so
    259		 * the end_io handlers know about it
    260		 */
    261		ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
    262		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
    263			u64 bvec_start = page_offset(bvec->bv_page) +
    264					 bvec->bv_offset;
    265
    266			btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
    267					bvec->bv_page, bvec_start,
    268					bvec->bv_len);
    269		}
    270
    271		bio_endio(cb->orig_bio);
    272	}
    273
    274	/* Finally free the cb struct */
    275	kfree(cb->compressed_pages);
    276	kfree(cb);
    277}
    278
    279/* when we finish reading compressed pages from the disk, we
    280 * decompress them and then run the bio end_io routines on the
    281 * decompressed pages (in the inode address space).
    282 *
    283 * This allows the checksumming and other IO error handling routines
    284 * to work normally
    285 *
    286 * The compressed pages are freed here, and it must be run
    287 * in process context
    288 */
    289static void end_compressed_bio_read(struct bio *bio)
    290{
    291	struct compressed_bio *cb = bio->bi_private;
    292	struct inode *inode;
    293	unsigned int mirror = btrfs_bio(bio)->mirror_num;
    294	int ret = 0;
    295
    296	if (!dec_and_test_compressed_bio(cb, bio))
    297		goto out;
    298
    299	/*
    300	 * Record the correct mirror_num in cb->orig_bio so that
    301	 * read-repair can work properly.
    302	 */
    303	btrfs_bio(cb->orig_bio)->mirror_num = mirror;
    304	cb->mirror_num = mirror;
    305
    306	/*
     307	 * Some IO in this cb has failed, just skip the checksum as there
    308	 * is no way it could be correct.
    309	 */
    310	if (cb->status != BLK_STS_OK)
    311		goto csum_failed;
    312
    313	inode = cb->inode;
    314	ret = check_compressed_csum(BTRFS_I(inode), bio,
    315				    bio->bi_iter.bi_sector << 9);
    316	if (ret)
    317		goto csum_failed;
    318
     319	/* ok, we're the last bio for this extent, let's start
    320	 * the decompression.
    321	 */
    322	ret = btrfs_decompress_bio(cb);
    323
    324csum_failed:
    325	if (ret)
    326		cb->status = errno_to_blk_status(ret);
    327	finish_compressed_bio_read(cb);
    328out:
    329	bio_put(bio);
    330}
    331
    332/*
    333 * Clear the writeback bits on all of the file
    334 * pages for a compressed write
    335 */
    336static noinline void end_compressed_writeback(struct inode *inode,
    337					      const struct compressed_bio *cb)
    338{
    339	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    340	unsigned long index = cb->start >> PAGE_SHIFT;
    341	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
    342	struct page *pages[16];
    343	unsigned long nr_pages = end_index - index + 1;
    344	const int errno = blk_status_to_errno(cb->status);
    345	int i;
    346	int ret;
    347
    348	if (errno)
    349		mapping_set_error(inode->i_mapping, errno);
    350
    351	while (nr_pages > 0) {
    352		ret = find_get_pages_contig(inode->i_mapping, index,
    353				     min_t(unsigned long,
    354				     nr_pages, ARRAY_SIZE(pages)), pages);
    355		if (ret == 0) {
    356			nr_pages -= 1;
    357			index += 1;
    358			continue;
    359		}
    360		for (i = 0; i < ret; i++) {
    361			if (errno)
    362				SetPageError(pages[i]);
    363			btrfs_page_clamp_clear_writeback(fs_info, pages[i],
    364							 cb->start, cb->len);
    365			put_page(pages[i]);
    366		}
    367		nr_pages -= ret;
    368		index += ret;
    369	}
    370	/* the inode may be gone now */
    371}
    372
    373static void finish_compressed_bio_write(struct compressed_bio *cb)
    374{
    375	struct inode *inode = cb->inode;
    376	unsigned int index;
    377
    378	/*
    379	 * Ok, we're the last bio for this extent, step one is to call back
    380	 * into the FS and do all the end_io operations.
    381	 */
    382	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
    383			cb->start, cb->start + cb->len - 1,
    384			cb->status == BLK_STS_OK);
    385
    386	if (cb->writeback)
    387		end_compressed_writeback(inode, cb);
    388	/* Note, our inode could be gone now */
    389
    390	/*
    391	 * Release the compressed pages, these came from alloc_page and
    392	 * are not attached to the inode at all
    393	 */
    394	for (index = 0; index < cb->nr_pages; index++) {
    395		struct page *page = cb->compressed_pages[index];
    396
    397		page->mapping = NULL;
    398		put_page(page);
    399	}
    400
    401	/* Finally free the cb struct */
    402	kfree(cb->compressed_pages);
    403	kfree(cb);
    404}
    405
    406/*
    407 * Do the cleanup once all the compressed pages hit the disk.  This will clear
    408 * writeback on the file pages and free the compressed pages.
    409 *
    410 * This also calls the writeback end hooks for the file pages so that metadata
    411 * and checksums can be updated in the file.
    412 */
    413static void end_compressed_bio_write(struct bio *bio)
    414{
    415	struct compressed_bio *cb = bio->bi_private;
    416
    417	if (!dec_and_test_compressed_bio(cb, bio))
    418		goto out;
    419
    420	btrfs_record_physical_zoned(cb->inode, cb->start, bio);
    421
    422	finish_compressed_bio_write(cb);
    423out:
    424	bio_put(bio);
    425}
    426
    427static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
    428					  struct bio *bio, int mirror_num)
    429{
    430	blk_status_t ret;
    431
    432	ASSERT(bio->bi_iter.bi_size);
    433	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
    434	if (ret)
    435		return ret;
    436	ret = btrfs_map_bio(fs_info, bio, mirror_num);
    437	return ret;
    438}
    439
    440/*
    441 * Allocate a compressed_bio, which will be used to read/write on-disk
     442 * (aka, compressed) data.
    443 *
    444 * @cb:                 The compressed_bio structure, which records all the needed
    445 *                      information to bind the compressed data to the uncompressed
    446 *                      page cache.
     447 * @disk_bytenr:        The logical bytenr where the compressed data will be read
    448 *                      from or written to.
    449 * @endio_func:         The endio function to call after the IO for compressed data
    450 *                      is finished.
     451 * @next_stripe_start:  Return value of the logical bytenr where the next stripe starts.
    452 *                      Let the caller know to only fill the bio up to the stripe
    453 *                      boundary.
    454 */
    455
    456
    457static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
    458					unsigned int opf, bio_end_io_t endio_func,
    459					u64 *next_stripe_start)
    460{
    461	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
    462	struct btrfs_io_geometry geom;
    463	struct extent_map *em;
    464	struct bio *bio;
    465	int ret;
    466
    467	bio = btrfs_bio_alloc(BIO_MAX_VECS);
    468
    469	bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
    470	bio->bi_opf = opf;
    471	bio->bi_private = cb;
    472	bio->bi_end_io = endio_func;
    473
    474	em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
    475	if (IS_ERR(em)) {
    476		bio_put(bio);
    477		return ERR_CAST(em);
    478	}
    479
    480	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
    481		bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
    482
    483	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
    484	free_extent_map(em);
    485	if (ret < 0) {
    486		bio_put(bio);
    487		return ERR_PTR(ret);
    488	}
    489	*next_stripe_start = disk_bytenr + geom.len;
    490
    491	return bio;
    492}
    493
    494/*
    495 * worker function to build and submit bios for previously compressed pages.
    496 * The corresponding pages in the inode should be marked for writeback
    497 * and the compressed pages should have a reference on them for dropping
    498 * when the IO is complete.
    499 *
    500 * This also checksums the file bytes and gets things ready for
    501 * the end io hooks.
    502 */
    503blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
    504				 unsigned int len, u64 disk_start,
    505				 unsigned int compressed_len,
    506				 struct page **compressed_pages,
    507				 unsigned int nr_pages,
    508				 unsigned int write_flags,
    509				 struct cgroup_subsys_state *blkcg_css,
    510				 bool writeback)
    511{
    512	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    513	struct bio *bio = NULL;
    514	struct compressed_bio *cb;
    515	u64 cur_disk_bytenr = disk_start;
    516	u64 next_stripe_start;
    517	blk_status_t ret;
    518	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
    519	const bool use_append = btrfs_use_zone_append(inode, disk_start);
    520	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
    521
    522	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
    523	       IS_ALIGNED(len, fs_info->sectorsize));
    524	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
    525	if (!cb)
    526		return BLK_STS_RESOURCE;
    527	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
    528	cb->status = BLK_STS_OK;
    529	cb->inode = &inode->vfs_inode;
    530	cb->start = start;
    531	cb->len = len;
    532	cb->mirror_num = 0;
    533	cb->compressed_pages = compressed_pages;
    534	cb->compressed_len = compressed_len;
    535	cb->writeback = writeback;
    536	cb->orig_bio = NULL;
    537	cb->nr_pages = nr_pages;
    538
    539	if (blkcg_css)
    540		kthread_associate_blkcg(blkcg_css);
    541
    542	while (cur_disk_bytenr < disk_start + compressed_len) {
    543		u64 offset = cur_disk_bytenr - disk_start;
    544		unsigned int index = offset >> PAGE_SHIFT;
    545		unsigned int real_size;
    546		unsigned int added;
    547		struct page *page = compressed_pages[index];
    548		bool submit = false;
    549
     550		/* Allocate a new bio if the previous one was submitted or none exists yet */
    551		if (!bio) {
    552			bio = alloc_compressed_bio(cb, cur_disk_bytenr,
    553				bio_op | write_flags, end_compressed_bio_write,
    554				&next_stripe_start);
    555			if (IS_ERR(bio)) {
    556				ret = errno_to_blk_status(PTR_ERR(bio));
    557				bio = NULL;
    558				goto finish_cb;
    559			}
    560			if (blkcg_css)
    561				bio->bi_opf |= REQ_CGROUP_PUNT;
    562		}
    563		/*
     564		 * We should never reach next_stripe_start as we will
     565		 * submit the bio immediately when we reach the boundary.
    566		 */
    567		ASSERT(cur_disk_bytenr != next_stripe_start);
    568
    569		/*
     570		 * We have various limits on the real write size:
    571		 * - stripe boundary
    572		 * - page boundary
    573		 * - compressed length boundary
    574		 */
    575		real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
    576		real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
    577		real_size = min_t(u64, real_size, compressed_len - offset);
    578		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
    579
    580		if (use_append)
    581			added = bio_add_zone_append_page(bio, page, real_size,
    582					offset_in_page(offset));
    583		else
    584			added = bio_add_page(bio, page, real_size,
    585					offset_in_page(offset));
    586		/* Reached zoned boundary */
    587		if (added == 0)
    588			submit = true;
    589
    590		cur_disk_bytenr += added;
    591		/* Reached stripe boundary */
    592		if (cur_disk_bytenr == next_stripe_start)
    593			submit = true;
    594
    595		/* Finished the range */
    596		if (cur_disk_bytenr == disk_start + compressed_len)
    597			submit = true;
    598
    599		if (submit) {
    600			if (!skip_sum) {
    601				ret = btrfs_csum_one_bio(inode, bio, start, true);
    602				if (ret)
    603					goto finish_cb;
    604			}
    605
    606			ret = submit_compressed_bio(fs_info, bio, 0);
    607			if (ret)
    608				goto finish_cb;
    609			bio = NULL;
    610		}
    611		cond_resched();
    612	}
    613	if (blkcg_css)
    614		kthread_associate_blkcg(NULL);
    615
    616	return 0;
    617
    618finish_cb:
    619	if (blkcg_css)
    620		kthread_associate_blkcg(NULL);
    621
    622	if (bio) {
    623		bio->bi_status = ret;
    624		bio_endio(bio);
    625	}
    626	/* Last byte of @cb is submitted, endio will free @cb */
    627	if (cur_disk_bytenr == disk_start + compressed_len)
    628		return ret;
    629
    630	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
    631			   (disk_start + compressed_len - cur_disk_bytenr) >>
    632			   fs_info->sectorsize_bits);
    633	/*
     634	 * Even with the previous bio ended, we should still have IO not yet
     635	 * submitted, thus we need to finish manually.
    636	 */
    637	ASSERT(refcount_read(&cb->pending_sectors));
    638	/* Now we are the only one referring @cb, can finish it safely. */
    639	finish_compressed_bio_write(cb);
    640	return ret;
    641}
    642
    643static u64 bio_end_offset(struct bio *bio)
    644{
    645	struct bio_vec *last = bio_last_bvec_all(bio);
    646
    647	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
    648}
    649
    650/*
    651 * Add extra pages in the same compressed file extent so that we don't need to
    652 * re-read the same extent again and again.
    653 *
     654 * NOTE: this won't work well for subpage, as for a subpage read we lock the
     655 * full page then submit a bio for each compressed/regular extent.
     656 *
     657 * This means, if several sectors in the same page point to the same
     658 * on-disk compressed data, we will re-read the same extent many times and
     659 * this function can only help for the next page.
    660 */
    661static noinline int add_ra_bio_pages(struct inode *inode,
    662				     u64 compressed_end,
    663				     struct compressed_bio *cb)
    664{
    665	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    666	unsigned long end_index;
    667	u64 cur = bio_end_offset(cb->orig_bio);
    668	u64 isize = i_size_read(inode);
    669	int ret;
    670	struct page *page;
    671	struct extent_map *em;
    672	struct address_space *mapping = inode->i_mapping;
    673	struct extent_map_tree *em_tree;
    674	struct extent_io_tree *tree;
    675	int sectors_missed = 0;
    676
    677	em_tree = &BTRFS_I(inode)->extent_tree;
    678	tree = &BTRFS_I(inode)->io_tree;
    679
    680	if (isize == 0)
    681		return 0;
    682
    683	/*
    684	 * For current subpage support, we only support 64K page size,
    685	 * which means maximum compressed extent size (128K) is just 2x page
    686	 * size.
     687	 * This makes readahead less effective, so disable readahead for
     688	 * subpage for now, until full compressed write is supported.
    689	 */
    690	if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
    691		return 0;
    692
    693	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
    694
    695	while (cur < compressed_end) {
    696		u64 page_end;
    697		u64 pg_index = cur >> PAGE_SHIFT;
    698		u32 add_size;
    699
    700		if (pg_index > end_index)
    701			break;
    702
    703		page = xa_load(&mapping->i_pages, pg_index);
    704		if (page && !xa_is_value(page)) {
    705			sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
    706					  fs_info->sectorsize_bits;
    707
    708			/* Beyond threshold, no need to continue */
    709			if (sectors_missed > 4)
    710				break;
    711
    712			/*
    713			 * Jump to next page start as we already have page for
    714			 * current offset.
    715			 */
    716			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
    717			continue;
    718		}
    719
    720		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
    721								 ~__GFP_FS));
    722		if (!page)
    723			break;
    724
    725		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
    726			put_page(page);
    727			/* There is already a page, skip to page end */
    728			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
    729			continue;
    730		}
    731
    732		ret = set_page_extent_mapped(page);
    733		if (ret < 0) {
    734			unlock_page(page);
    735			put_page(page);
    736			break;
    737		}
    738
    739		page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
    740		lock_extent(tree, cur, page_end);
    741		read_lock(&em_tree->lock);
    742		em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
    743		read_unlock(&em_tree->lock);
    744
    745		/*
    746		 * At this point, we have a locked page in the page cache for
    747		 * these bytes in the file.  But, we have to make sure they map
    748		 * to this compressed extent on disk.
    749		 */
    750		if (!em || cur < em->start ||
    751		    (cur + fs_info->sectorsize > extent_map_end(em)) ||
    752		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
    753			free_extent_map(em);
    754			unlock_extent(tree, cur, page_end);
    755			unlock_page(page);
    756			put_page(page);
    757			break;
    758		}
    759		free_extent_map(em);
    760
    761		if (page->index == end_index) {
    762			size_t zero_offset = offset_in_page(isize);
    763
    764			if (zero_offset) {
    765				int zeros;
    766				zeros = PAGE_SIZE - zero_offset;
    767				memzero_page(page, zero_offset, zeros);
    768				flush_dcache_page(page);
    769			}
    770		}
    771
    772		add_size = min(em->start + em->len, page_end + 1) - cur;
    773		ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
    774		if (ret != add_size) {
    775			unlock_extent(tree, cur, page_end);
    776			unlock_page(page);
    777			put_page(page);
    778			break;
    779		}
    780		/*
    781		 * If it's subpage, we also need to increase its
    782		 * subpage::readers number, as at endio we will decrease
     783		 * subpage::readers and unlock the page.
    784		 */
    785		if (fs_info->sectorsize < PAGE_SIZE)
    786			btrfs_subpage_start_reader(fs_info, page, cur, add_size);
    787		put_page(page);
    788		cur += add_size;
    789	}
    790	return 0;
    791}
    792
    793/*
    794 * for a compressed read, the bio we get passed has all the inode pages
    795 * in it.  We don't actually do IO on those pages but allocate new ones
    796 * to hold the compressed pages on disk.
    797 *
    798 * bio->bi_iter.bi_sector points to the compressed extent on disk
    799 * bio->bi_io_vec points to all of the inode pages
    800 *
    801 * After the compressed pages are read, we copy the bytes into the
     802 * bio we were passed and then call the bio end_io routines
    803 */
    804void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
    805				  int mirror_num)
    806{
    807	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    808	struct extent_map_tree *em_tree;
    809	struct compressed_bio *cb;
    810	unsigned int compressed_len;
    811	struct bio *comp_bio = NULL;
    812	const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
    813	u64 cur_disk_byte = disk_bytenr;
    814	u64 next_stripe_start;
    815	u64 file_offset;
    816	u64 em_len;
    817	u64 em_start;
    818	struct extent_map *em;
    819	blk_status_t ret;
    820	int ret2;
    821	int i;
    822	u8 *sums;
    823
    824	em_tree = &BTRFS_I(inode)->extent_tree;
    825
    826	file_offset = bio_first_bvec_all(bio)->bv_offset +
    827		      page_offset(bio_first_page_all(bio));
    828
    829	/* we need the actual starting offset of this extent in the file */
    830	read_lock(&em_tree->lock);
    831	em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
    832	read_unlock(&em_tree->lock);
    833	if (!em) {
    834		ret = BLK_STS_IOERR;
    835		goto out;
    836	}
    837
    838	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
    839	compressed_len = em->block_len;
    840	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
    841	if (!cb) {
    842		ret = BLK_STS_RESOURCE;
    843		goto out;
    844	}
    845
    846	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
    847	cb->status = BLK_STS_OK;
    848	cb->inode = inode;
    849	cb->mirror_num = mirror_num;
    850	sums = cb->sums;
    851
    852	cb->start = em->orig_start;
    853	em_len = em->len;
    854	em_start = em->start;
    855
    856	cb->len = bio->bi_iter.bi_size;
    857	cb->compressed_len = compressed_len;
    858	cb->compress_type = em->compress_type;
    859	cb->orig_bio = bio;
    860
    861	free_extent_map(em);
    862	em = NULL;
    863
    864	cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
    865	cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
    866	if (!cb->compressed_pages) {
    867		ret = BLK_STS_RESOURCE;
    868		goto fail;
    869	}
    870
    871	ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
    872	if (ret2) {
    873		ret = BLK_STS_RESOURCE;
    874		goto fail;
    875	}
    876
    877	add_ra_bio_pages(inode, em_start + em_len, cb);
    878
     879	/* include any pages we added in add_ra_bio_pages */
    880	cb->len = bio->bi_iter.bi_size;
    881
    882	while (cur_disk_byte < disk_bytenr + compressed_len) {
    883		u64 offset = cur_disk_byte - disk_bytenr;
    884		unsigned int index = offset >> PAGE_SHIFT;
    885		unsigned int real_size;
    886		unsigned int added;
    887		struct page *page = cb->compressed_pages[index];
    888		bool submit = false;
    889
     890		/* Allocate a new bio if the previous one was submitted or none exists yet */
    891		if (!comp_bio) {
    892			comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
    893					REQ_OP_READ, end_compressed_bio_read,
    894					&next_stripe_start);
    895			if (IS_ERR(comp_bio)) {
    896				ret = errno_to_blk_status(PTR_ERR(comp_bio));
    897				comp_bio = NULL;
    898				goto finish_cb;
    899			}
    900		}
    901		/*
     902		 * We should never reach next_stripe_start as we will
     903		 * submit comp_bio immediately when we reach the boundary.
    904		 */
    905		ASSERT(cur_disk_byte != next_stripe_start);
    906		/*
     907		 * We have various limits on the real read size:
    908		 * - stripe boundary
    909		 * - page boundary
    910		 * - compressed length boundary
    911		 */
    912		real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
    913		real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
    914		real_size = min_t(u64, real_size, compressed_len - offset);
    915		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
    916
    917		added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
    918		/*
     919		 * The maximum compressed extent is smaller than the bio size limit,
     920		 * thus bio_add_page() should always succeed.
    921		 */
    922		ASSERT(added == real_size);
    923		cur_disk_byte += added;
    924
    925		/* Reached stripe boundary, need to submit */
    926		if (cur_disk_byte == next_stripe_start)
    927			submit = true;
    928
     929		/* Finished the range, need to submit */
    930		if (cur_disk_byte == disk_bytenr + compressed_len)
    931			submit = true;
    932
    933		if (submit) {
    934			unsigned int nr_sectors;
    935
    936			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
    937			if (ret)
    938				goto finish_cb;
    939
    940			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
    941						  fs_info->sectorsize);
    942			sums += fs_info->csum_size * nr_sectors;
    943
    944			ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
    945			if (ret)
    946				goto finish_cb;
    947			comp_bio = NULL;
    948		}
    949	}
    950	return;
    951
    952fail:
    953	if (cb->compressed_pages) {
    954		for (i = 0; i < cb->nr_pages; i++) {
    955			if (cb->compressed_pages[i])
    956				__free_page(cb->compressed_pages[i]);
    957		}
    958	}
    959
    960	kfree(cb->compressed_pages);
    961	kfree(cb);
    962out:
    963	free_extent_map(em);
    964	bio->bi_status = ret;
    965	bio_endio(bio);
    966	return;
    967finish_cb:
    968	if (comp_bio) {
    969		comp_bio->bi_status = ret;
    970		bio_endio(comp_bio);
    971	}
    972	/* All bytes of @cb is submitted, endio will free @cb */
    973	if (cur_disk_byte == disk_bytenr + compressed_len)
    974		return;
    975
    976	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
    977			   (disk_bytenr + compressed_len - cur_disk_byte) >>
    978			   fs_info->sectorsize_bits);
    979	/*
     980	 * Even with the previous bio ended, we should still have IO not yet
     981	 * submitted, thus we need to finish @cb manually.
    982	 */
    983	ASSERT(refcount_read(&cb->pending_sectors));
    984	/* Now we are the only one referring @cb, can finish it safely. */
    985	finish_compressed_bio_read(cb);
    986}
    987
    988/*
     989 * The heuristic uses systematic sampling to collect data from the input data
     990 * range; the logic can be tuned by the following constants:
     991 *
     992 * @SAMPLING_READ_SIZE - how many bytes will be copied for each sample
     993 * @SAMPLING_INTERVAL  - range from which the sampled data can be collected
    994 */
    995#define SAMPLING_READ_SIZE	(16)
    996#define SAMPLING_INTERVAL	(256)
    997
    998/*
    999 * For statistical analysis of the input data we consider bytes that form a
    1000 * Galois Field of 256 objects. Each object has an attribute count, i.e. how
   1001 * many times the object appeared in the sample.
   1002 */
   1003#define BUCKET_SIZE		(256)
   1004
   1005/*
   1006 * The size of the sample is based on a statistical sampling rule of thumb.
   1007 * The common way is to perform sampling tests as long as the number of
   1008 * elements in each cell is at least 5.
   1009 *
   1010 * Instead of 5, we choose 32 to obtain more accurate results.
   1011 * If the data contain the maximum number of symbols, which is 256, we obtain a
   1012 * sample size bound by 8192.
   1013 *
   1014 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
   1015 * from up to 512 locations.
   1016 */
   1017#define MAX_SAMPLE_SIZE		(BTRFS_MAX_UNCOMPRESSED *		\
   1018				 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
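        /*
         * Illustrative arithmetic for the bound described above, assuming the
         * 128KiB BTRFS_MAX_UNCOMPRESSED chunk size mentioned in
         * heuristic_collect_sample() below:
         *   131072 / 256 = 512 sample locations, 512 * 16 = 8192 bytes max.
         */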
   1019
   1020struct bucket_item {
   1021	u32 count;
   1022};
   1023
   1024struct heuristic_ws {
   1025	/* Partial copy of input data */
   1026	u8 *sample;
   1027	u32 sample_size;
   1028	/* Buckets store counters for each byte value */
   1029	struct bucket_item *bucket;
   1030	/* Sorting buffer */
   1031	struct bucket_item *bucket_b;
   1032	struct list_head list;
   1033};
   1034
   1035static struct workspace_manager heuristic_wsm;
   1036
   1037static void free_heuristic_ws(struct list_head *ws)
   1038{
   1039	struct heuristic_ws *workspace;
   1040
   1041	workspace = list_entry(ws, struct heuristic_ws, list);
   1042
   1043	kvfree(workspace->sample);
   1044	kfree(workspace->bucket);
   1045	kfree(workspace->bucket_b);
   1046	kfree(workspace);
   1047}
   1048
   1049static struct list_head *alloc_heuristic_ws(unsigned int level)
   1050{
   1051	struct heuristic_ws *ws;
   1052
   1053	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
   1054	if (!ws)
   1055		return ERR_PTR(-ENOMEM);
   1056
   1057	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
   1058	if (!ws->sample)
   1059		goto fail;
   1060
   1061	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
   1062	if (!ws->bucket)
   1063		goto fail;
   1064
   1065	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
   1066	if (!ws->bucket_b)
   1067		goto fail;
   1068
   1069	INIT_LIST_HEAD(&ws->list);
   1070	return &ws->list;
   1071fail:
   1072	free_heuristic_ws(&ws->list);
   1073	return ERR_PTR(-ENOMEM);
   1074}
   1075
   1076const struct btrfs_compress_op btrfs_heuristic_compress = {
   1077	.workspace_manager = &heuristic_wsm,
   1078};
   1079
   1080static const struct btrfs_compress_op * const btrfs_compress_op[] = {
   1081	/* The heuristic is represented as compression type 0 */
   1082	&btrfs_heuristic_compress,
   1083	&btrfs_zlib_compress,
   1084	&btrfs_lzo_compress,
   1085	&btrfs_zstd_compress,
   1086};
   1087
   1088static struct list_head *alloc_workspace(int type, unsigned int level)
   1089{
   1090	switch (type) {
   1091	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
   1092	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
   1093	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level);
   1094	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
   1095	default:
   1096		/*
   1097		 * This can't happen, the type is validated several times
   1098		 * before we get here.
   1099		 */
   1100		BUG();
   1101	}
   1102}
   1103
   1104static void free_workspace(int type, struct list_head *ws)
   1105{
   1106	switch (type) {
   1107	case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
   1108	case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
   1109	case BTRFS_COMPRESS_LZO:  return lzo_free_workspace(ws);
   1110	case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
   1111	default:
   1112		/*
   1113		 * This can't happen, the type is validated several times
   1114		 * before we get here.
   1115		 */
   1116		BUG();
   1117	}
   1118}
   1119
   1120static void btrfs_init_workspace_manager(int type)
   1121{
   1122	struct workspace_manager *wsm;
   1123	struct list_head *workspace;
   1124
   1125	wsm = btrfs_compress_op[type]->workspace_manager;
   1126	INIT_LIST_HEAD(&wsm->idle_ws);
   1127	spin_lock_init(&wsm->ws_lock);
   1128	atomic_set(&wsm->total_ws, 0);
   1129	init_waitqueue_head(&wsm->ws_wait);
   1130
   1131	/*
   1132	 * Preallocate one workspace for each compression type so we can
   1133	 * guarantee forward progress in the worst case
   1134	 */
   1135	workspace = alloc_workspace(type, 0);
   1136	if (IS_ERR(workspace)) {
   1137		pr_warn(
   1138	"BTRFS: cannot preallocate compression workspace, will try later\n");
   1139	} else {
   1140		atomic_set(&wsm->total_ws, 1);
   1141		wsm->free_ws = 1;
   1142		list_add(workspace, &wsm->idle_ws);
   1143	}
   1144}
   1145
   1146static void btrfs_cleanup_workspace_manager(int type)
   1147{
   1148	struct workspace_manager *wsman;
   1149	struct list_head *ws;
   1150
   1151	wsman = btrfs_compress_op[type]->workspace_manager;
   1152	while (!list_empty(&wsman->idle_ws)) {
   1153		ws = wsman->idle_ws.next;
   1154		list_del(ws);
   1155		free_workspace(type, ws);
   1156		atomic_dec(&wsman->total_ws);
   1157	}
   1158}
   1159
   1160/*
   1161 * This finds an available workspace or allocates a new one.
    1162 * If it's not possible to allocate a new one, it waits until one is available.
    1163 * Preallocation guarantees forward progress and we do not return
    1164 * errors.
   1165 */
   1166struct list_head *btrfs_get_workspace(int type, unsigned int level)
   1167{
   1168	struct workspace_manager *wsm;
   1169	struct list_head *workspace;
   1170	int cpus = num_online_cpus();
   1171	unsigned nofs_flag;
   1172	struct list_head *idle_ws;
   1173	spinlock_t *ws_lock;
   1174	atomic_t *total_ws;
   1175	wait_queue_head_t *ws_wait;
   1176	int *free_ws;
   1177
   1178	wsm = btrfs_compress_op[type]->workspace_manager;
   1179	idle_ws	 = &wsm->idle_ws;
   1180	ws_lock	 = &wsm->ws_lock;
   1181	total_ws = &wsm->total_ws;
   1182	ws_wait	 = &wsm->ws_wait;
   1183	free_ws	 = &wsm->free_ws;
   1184
   1185again:
   1186	spin_lock(ws_lock);
   1187	if (!list_empty(idle_ws)) {
   1188		workspace = idle_ws->next;
   1189		list_del(workspace);
   1190		(*free_ws)--;
   1191		spin_unlock(ws_lock);
   1192		return workspace;
   1193
   1194	}
   1195	if (atomic_read(total_ws) > cpus) {
   1196		DEFINE_WAIT(wait);
   1197
   1198		spin_unlock(ws_lock);
   1199		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
   1200		if (atomic_read(total_ws) > cpus && !*free_ws)
   1201			schedule();
   1202		finish_wait(ws_wait, &wait);
   1203		goto again;
   1204	}
   1205	atomic_inc(total_ws);
   1206	spin_unlock(ws_lock);
   1207
   1208	/*
    1209	 * Allocation helpers call vmalloc, which can't use GFP_NOFS, so we have
   1210	 * to turn it off here because we might get called from the restricted
   1211	 * context of btrfs_compress_bio/btrfs_compress_pages
   1212	 */
   1213	nofs_flag = memalloc_nofs_save();
   1214	workspace = alloc_workspace(type, level);
   1215	memalloc_nofs_restore(nofs_flag);
   1216
   1217	if (IS_ERR(workspace)) {
   1218		atomic_dec(total_ws);
   1219		wake_up(ws_wait);
   1220
   1221		/*
   1222		 * Do not return the error but go back to waiting. There's a
   1223		 * workspace preallocated for each type and the compression
   1224		 * time is bounded so we get to a workspace eventually. This
   1225		 * makes our caller's life easier.
   1226		 *
   1227		 * To prevent silent and low-probability deadlocks (when the
   1228		 * initial preallocation fails), check if there are any
   1229		 * workspaces at all.
   1230		 */
   1231		if (atomic_read(total_ws) == 0) {
   1232			static DEFINE_RATELIMIT_STATE(_rs,
   1233					/* once per minute */ 60 * HZ,
   1234					/* no burst */ 1);
   1235
   1236			if (__ratelimit(&_rs)) {
   1237				pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
   1238			}
   1239		}
   1240		goto again;
   1241	}
   1242	return workspace;
   1243}
   1244
   1245static struct list_head *get_workspace(int type, int level)
   1246{
   1247	switch (type) {
   1248	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
   1249	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
   1250	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
   1251	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
   1252	default:
   1253		/*
   1254		 * This can't happen, the type is validated several times
   1255		 * before we get here.
   1256		 */
   1257		BUG();
   1258	}
   1259}
   1260
   1261/*
   1262 * put a workspace struct back on the list or free it if we have enough
   1263 * idle ones sitting around
   1264 */
   1265void btrfs_put_workspace(int type, struct list_head *ws)
   1266{
   1267	struct workspace_manager *wsm;
   1268	struct list_head *idle_ws;
   1269	spinlock_t *ws_lock;
   1270	atomic_t *total_ws;
   1271	wait_queue_head_t *ws_wait;
   1272	int *free_ws;
   1273
   1274	wsm = btrfs_compress_op[type]->workspace_manager;
   1275	idle_ws	 = &wsm->idle_ws;
   1276	ws_lock	 = &wsm->ws_lock;
   1277	total_ws = &wsm->total_ws;
   1278	ws_wait	 = &wsm->ws_wait;
   1279	free_ws	 = &wsm->free_ws;
   1280
   1281	spin_lock(ws_lock);
   1282	if (*free_ws <= num_online_cpus()) {
   1283		list_add(ws, idle_ws);
   1284		(*free_ws)++;
   1285		spin_unlock(ws_lock);
   1286		goto wake;
   1287	}
   1288	spin_unlock(ws_lock);
   1289
   1290	free_workspace(type, ws);
   1291	atomic_dec(total_ws);
   1292wake:
   1293	cond_wake_up(ws_wait);
   1294}
   1295
   1296static void put_workspace(int type, struct list_head *ws)
   1297{
   1298	switch (type) {
   1299	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
   1300	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
   1301	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
   1302	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
   1303	default:
   1304		/*
   1305		 * This can't happen, the type is validated several times
   1306		 * before we get here.
   1307		 */
   1308		BUG();
   1309	}
   1310}
   1311
   1312/*
   1313 * Adjust @level according to the limits of the compression algorithm or
    1314 * fall back to the default
   1315 */
   1316static unsigned int btrfs_compress_set_level(int type, unsigned level)
   1317{
   1318	const struct btrfs_compress_op *ops = btrfs_compress_op[type];
   1319
   1320	if (level == 0)
   1321		level = ops->default_level;
   1322	else
   1323		level = min(level, ops->max_level);
   1324
   1325	return level;
   1326}
   1327
   1328/*
   1329 * Given an address space and start and length, compress the bytes into @pages
   1330 * that are allocated on demand.
   1331 *
   1332 * @type_level is encoded algorithm and level, where level 0 means whatever
   1333 * default the algorithm chooses and is opaque here;
    1334 * - the compression algorithm occupies bits 0-3
    1335 * - the level occupies bits 4-7
    1336 *
    1337 * @out_pages is an in/out parameter that holds the maximum number of pages to
    1338 * allocate and returns the number of actually allocated pages
    1339 *
    1340 * @total_in is used to return the number of bytes actually read.  It
    1341 * may be smaller than the input length if we had to exit early because we
    1342 * ran out of room in the pages array or because we crossed the
    1343 * max_out threshold.
   1344 *
   1345 * @total_out is an in/out parameter, must be set to the input length and will
   1346 * be also used to return the total number of compressed bytes
   1347 */
   1348int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
   1349			 u64 start, struct page **pages,
   1350			 unsigned long *out_pages,
   1351			 unsigned long *total_in,
   1352			 unsigned long *total_out)
   1353{
   1354	int type = btrfs_compress_type(type_level);
   1355	int level = btrfs_compress_level(type_level);
   1356	struct list_head *workspace;
   1357	int ret;
   1358
   1359	level = btrfs_compress_set_level(type, level);
   1360	workspace = get_workspace(type, level);
   1361	ret = compression_compress_pages(type, workspace, mapping, start, pages,
   1362					 out_pages, total_in, total_out);
   1363	put_workspace(type, workspace);
   1364	return ret;
   1365}
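        /*
         * Encoding example (illustrative): zlib at level 9 would be passed as
         * type_level = BTRFS_COMPRESS_ZLIB | (9 << 4), i.e. the type in bits
         * 0-3 and the level in bits 4-7, matching what btrfs_compress_type()
         * and btrfs_compress_level() above extract.
         */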
   1366
   1367static int btrfs_decompress_bio(struct compressed_bio *cb)
   1368{
   1369	struct list_head *workspace;
   1370	int ret;
   1371	int type = cb->compress_type;
   1372
   1373	workspace = get_workspace(type, 0);
   1374	ret = compression_decompress_bio(workspace, cb);
   1375	put_workspace(type, workspace);
   1376
   1377	return ret;
   1378}
   1379
   1380/*
   1381 * a less complex decompression routine.  Our compressed data fits in a
   1382 * single page, and we want to read a single page out of it.
   1383 * start_byte tells us the offset into the compressed data we're interested in
   1384 */
   1385int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
   1386		     unsigned long start_byte, size_t srclen, size_t destlen)
   1387{
   1388	struct list_head *workspace;
   1389	int ret;
   1390
   1391	workspace = get_workspace(type, 0);
   1392	ret = compression_decompress(type, workspace, data_in, dest_page,
   1393				     start_byte, srclen, destlen);
   1394	put_workspace(type, workspace);
   1395
   1396	return ret;
   1397}
   1398
   1399void __init btrfs_init_compress(void)
   1400{
   1401	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
   1402	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
   1403	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
   1404	zstd_init_workspace_manager();
   1405}
   1406
   1407void __cold btrfs_exit_compress(void)
   1408{
   1409	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
   1410	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
   1411	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
   1412	zstd_cleanup_workspace_manager();
   1413}
   1414
   1415/*
   1416 * Copy decompressed data from working buffer to pages.
   1417 *
   1418 * @buf:		The decompressed data buffer
   1419 * @buf_len:		The decompressed data length
   1420 * @decompressed:	Number of bytes that are already decompressed inside the
   1421 * 			compressed extent
   1422 * @cb:			The compressed extent descriptor
   1423 * @orig_bio:		The original bio that the caller wants to read for
   1424 *
   1425 * An easier to understand graph is like below:
   1426 *
   1427 * 		|<- orig_bio ->|     |<- orig_bio->|
   1428 * 	|<-------      full decompressed extent      ----->|
   1429 * 	|<-----------    @cb range   ---->|
   1430 * 	|			|<-- @buf_len -->|
   1431 * 	|<--- @decompressed --->|
   1432 *
    1433 * Note that @cb can be a subpage of the full decompressed extent, but
    1434 * @cb->start always has the same value as the orig_file_offset of the full
    1435 * decompressed extent.
    1436 *
    1437 * When reading a compressed extent, we have to read the full compressed extent,
    1438 * while @orig_bio may only want part of the range.
    1439 * Thus this function will ensure that only data covered by @orig_bio will be
    1440 * copied.
    1441 *
    1442 * Return 0 if we have copied all needed contents for @orig_bio.
    1443 * Return >0 if we need to continue decompressing.
   1444 */
   1445int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
   1446			      struct compressed_bio *cb, u32 decompressed)
   1447{
   1448	struct bio *orig_bio = cb->orig_bio;
   1449	/* Offset inside the full decompressed extent */
   1450	u32 cur_offset;
   1451
   1452	cur_offset = decompressed;
   1453	/* The main loop to do the copy */
   1454	while (cur_offset < decompressed + buf_len) {
   1455		struct bio_vec bvec;
   1456		size_t copy_len;
   1457		u32 copy_start;
   1458		/* Offset inside the full decompressed extent */
   1459		u32 bvec_offset;
   1460
   1461		bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
   1462		/*
   1463		 * cb->start may underflow, but subtracting that value can still
   1464		 * give us correct offset inside the full decompressed extent.
   1465		 */
   1466		bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
   1467
   1468		/* Haven't reached the bvec range, exit */
   1469		if (decompressed + buf_len <= bvec_offset)
   1470			return 1;
   1471
   1472		copy_start = max(cur_offset, bvec_offset);
   1473		copy_len = min(bvec_offset + bvec.bv_len,
   1474			       decompressed + buf_len) - copy_start;
   1475		ASSERT(copy_len);
   1476
   1477		/*
   1478		 * Extra range check to ensure we didn't go beyond
   1479		 * @buf + @buf_len.
   1480		 */
   1481		ASSERT(copy_start - decompressed < buf_len);
   1482		memcpy_to_page(bvec.bv_page, bvec.bv_offset,
   1483			       buf + copy_start - decompressed, copy_len);
   1484		flush_dcache_page(bvec.bv_page);
   1485		cur_offset += copy_len;
   1486
   1487		bio_advance(orig_bio, copy_len);
   1488		/* Finished the bio */
   1489		if (!orig_bio->bi_iter.bi_size)
   1490			return 0;
   1491	}
   1492	return 1;
   1493}
   1494
   1495/*
   1496 * Shannon Entropy calculation
   1497 *
   1498 * Pure byte distribution analysis fails to determine compressibility of data.
   1499 * Try calculating entropy to estimate the average minimum number of bits
   1500 * needed to encode the sampled data.
   1501 *
   1502 * For convenience, return the percentage of needed bits, instead of amount of
   1503 * bits directly.
   1504 *
   1505 * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
   1506 *			    and can be compressible with high probability
   1507 *
   1508 * @ENTROPY_LVL_HIGH - data are not compressible with high probability
   1509 *
   1510 * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
   1511 */
   1512#define ENTROPY_LVL_ACEPTABLE		(65)
   1513#define ENTROPY_LVL_HIGH		(80)
   1514
   1515/*
    1516 * For increased precision in the shannon_entropy calculation,
    1517 * let's do pow(n, M) to save more digits after the decimal point:
   1518 *
   1519 * - maximum int bit length is 64
   1520 * - ilog2(MAX_SAMPLE_SIZE)	-> 13
   1521 * - 13 * 4 = 52 < 64		-> M = 4
   1522 *
   1523 * So use pow(n, 4).
   1524 */
   1525static inline u32 ilog2_w(u64 n)
   1526{
   1527	return ilog2(n * n * n * n);
   1528}
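        /*
         * Worked values for the fixed-point scheme above (not in the original
         * source): ilog2_w(8192) = ilog2(8192^4) = 4 * 13 = 52, which still
         * fits in 64 bits, and every log2 value used below is scaled by 4.
         */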
   1529
   1530static u32 shannon_entropy(struct heuristic_ws *ws)
   1531{
   1532	const u32 entropy_max = 8 * ilog2_w(2);
   1533	u32 entropy_sum = 0;
   1534	u32 p, p_base, sz_base;
   1535	u32 i;
   1536
   1537	sz_base = ilog2_w(ws->sample_size);
   1538	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
   1539		p = ws->bucket[i].count;
   1540		p_base = ilog2_w(p);
   1541		entropy_sum += p * (sz_base - p_base);
   1542	}
   1543
   1544	entropy_sum /= ws->sample_size;
   1545	return entropy_sum * 100 / entropy_max;
   1546}
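        /*
         * Illustrative sanity check of the scaling (not in the original
         * source): entropy_max = 8 * ilog2_w(2) = 32 stands for 8 bits per
         * byte. For a uniform 8192-byte sample every bucket holds 32, so
         * entropy_sum = 256 * 32 * (ilog2_w(8192) - ilog2_w(32)) / 8192
         *             = 256 * 32 * (52 - 20) / 8192 = 32,
         * and the function returns 100 (percent).
         */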
   1547
   1548#define RADIX_BASE		4U
   1549#define COUNTERS_SIZE		(1U << RADIX_BASE)
   1550
   1551static u8 get4bits(u64 num, int shift) {
   1552	u8 low4bits;
   1553
   1554	num >>= shift;
   1555	/* Reverse order */
   1556	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
   1557	return low4bits;
   1558}
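        /*
         * Example of the reversal (illustrative): with shift == 0 a count of 9
         * maps to 15 - 9 = 6 while a count of 5 maps to 15 - 5 = 10, so larger
         * counts get smaller radix digits and radix_sort() below produces the
         * descending order that byte_core_set_size() relies on.
         */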
   1559
   1560/*
   1561 * Use 4 bits as radix base
   1562 * Use 16 u32 counters for calculating new position in buf array
   1563 *
   1564 * @array     - array that will be sorted
   1565 * @array_buf - buffer array to store sorting results
   1566 *              must be equal in size to @array
   1567 * @num       - array size
   1568 */
   1569static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
   1570		       int num)
   1571{
   1572	u64 max_num;
   1573	u64 buf_num;
   1574	u32 counters[COUNTERS_SIZE];
   1575	u32 new_addr;
   1576	u32 addr;
   1577	int bitlen;
   1578	int shift;
   1579	int i;
   1580
   1581	/*
    1582	 * Try to avoid useless loop iterations for small numbers stored in big
   1583	 * counters.  Example: 48 33 4 ... in 64bit array
   1584	 */
   1585	max_num = array[0].count;
   1586	for (i = 1; i < num; i++) {
   1587		buf_num = array[i].count;
   1588		if (buf_num > max_num)
   1589			max_num = buf_num;
   1590	}
   1591
   1592	buf_num = ilog2(max_num);
   1593	bitlen = ALIGN(buf_num, RADIX_BASE * 2);
   1594
   1595	shift = 0;
   1596	while (shift < bitlen) {
   1597		memset(counters, 0, sizeof(counters));
   1598
   1599		for (i = 0; i < num; i++) {
   1600			buf_num = array[i].count;
   1601			addr = get4bits(buf_num, shift);
   1602			counters[addr]++;
   1603		}
   1604
   1605		for (i = 1; i < COUNTERS_SIZE; i++)
   1606			counters[i] += counters[i - 1];
   1607
   1608		for (i = num - 1; i >= 0; i--) {
   1609			buf_num = array[i].count;
   1610			addr = get4bits(buf_num, shift);
   1611			counters[addr]--;
   1612			new_addr = counters[addr];
   1613			array_buf[new_addr] = array[i];
   1614		}
   1615
   1616		shift += RADIX_BASE;
   1617
   1618		/*
    1619		 * Normal radix sort expects to move data from a temporary array to
    1620		 * the main one.  But that requires some CPU time. Avoid that
    1621		 * by doing another sort iteration into the original array instead
    1622		 * of a memcpy().
   1623		 */
   1624		memset(counters, 0, sizeof(counters));
   1625
   1626		for (i = 0; i < num; i ++) {
   1627			buf_num = array_buf[i].count;
   1628			addr = get4bits(buf_num, shift);
   1629			counters[addr]++;
   1630		}
   1631
   1632		for (i = 1; i < COUNTERS_SIZE; i++)
   1633			counters[i] += counters[i - 1];
   1634
   1635		for (i = num - 1; i >= 0; i--) {
   1636			buf_num = array_buf[i].count;
   1637			addr = get4bits(buf_num, shift);
   1638			counters[addr]--;
   1639			new_addr = counters[addr];
   1640			array[new_addr] = array_buf[i];
   1641		}
   1642
   1643		shift += RADIX_BASE;
   1644	}
   1645}
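        /*
         * Note (not in the original source): each while iteration above runs
         * two 4-bit passes, one into @array_buf and one back into @array, and
         * bitlen is aligned to RADIX_BASE * 2, so the loop always ends with
         * the sorted result in @array and no final memcpy() is needed.
         */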
   1646
   1647/*
   1648 * Size of the core byte set - how many bytes cover 90% of the sample
   1649 *
   1650 * There are several types of structured binary data that use nearly all byte
   1651 * values. The distribution can be uniform and counts in all buckets will be
   1652 * nearly the same (eg. encrypted data). Unlikely to be compressible.
   1653 *
    1654 * Another possibility is a normal (Gaussian) distribution, where the data
    1655 * could be potentially compressible, but we have to take a few more steps
    1656 * to decide how much.
    1657 *
    1658 * @BYTE_CORE_SET_LOW  - the main part of the byte values repeats frequently,
    1659 *                       a compression algorithm can easily exploit that
    1660 * @BYTE_CORE_SET_HIGH - the data has a uniform distribution and is with high
    1661 *                       probability not compressible
   1662 */
   1663#define BYTE_CORE_SET_LOW		(64)
   1664#define BYTE_CORE_SET_HIGH		(200)
   1665
   1666static int byte_core_set_size(struct heuristic_ws *ws)
   1667{
   1668	u32 i;
   1669	u32 coreset_sum = 0;
   1670	const u32 core_set_threshold = ws->sample_size * 90 / 100;
   1671	struct bucket_item *bucket = ws->bucket;
   1672
   1673	/* Sort in reverse order */
   1674	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
   1675
   1676	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
   1677		coreset_sum += bucket[i].count;
   1678
   1679	if (coreset_sum > core_set_threshold)
   1680		return i;
   1681
   1682	for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
   1683		coreset_sum += bucket[i].count;
   1684		if (coreset_sum > core_set_threshold)
   1685			break;
   1686	}
   1687
   1688	return i;
   1689}
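        /*
         * Two illustrative outcomes, assuming the usual 8KiB sample, ie. a
         * 90% threshold of 8192 * 90 / 100 == 7372:
         *
         * - ASCII-heavy text: the 64 most frequent byte values typically
         *   cover well over 7372 sample bytes, so the first loop already
         *   satisfies the threshold and 64 is returned; the caller treats
         *   i <= BYTE_CORE_SET_LOW as compressible.
         *
         * - encrypted or already compressed data: the counts are spread
         *   almost evenly (about 32 per bucket), so even 200 buckets cover
         *   only about 6400 bytes; the second loop stops at
         *   BYTE_CORE_SET_HIGH and the caller gives up on compression.
         */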
   1690
   1691/*
   1692 * Count byte values in buckets.
    1693 * This heuristic can detect textual data (configs, xml, json, html, etc),
    1694 * because in most text-like data the byte set is restricted to a limited
    1695 * number of possible characters, and that restriction in most cases makes
    1696 * the data easy to compress.
   1697 *
   1698 * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
   1699 *	less - compressible
   1700 *	more - need additional analysis
   1701 */
   1702#define BYTE_SET_THRESHOLD		(64)
   1703
   1704static u32 byte_set_size(const struct heuristic_ws *ws)
   1705{
   1706	u32 i;
   1707	u32 byte_set_size = 0;
   1708
   1709	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
   1710		if (ws->bucket[i].count > 0)
   1711			byte_set_size++;
   1712	}
   1713
   1714	/*
    1715	 * Continue collecting the count of byte values in buckets.  If the
    1716	 * byte set size is bigger than the threshold, it's pointless to
    1717	 * continue, the detection technique would fail for this type of data.
   1718	 */
   1719	for (; i < BUCKET_SIZE; i++) {
   1720		if (ws->bucket[i].count > 0) {
   1721			byte_set_size++;
   1722			if (byte_set_size > BYTE_SET_THRESHOLD)
   1723				return byte_set_size;
   1724		}
   1725	}
   1726
   1727	return byte_set_size;
   1728}
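        /*
         * Example: a plain ASCII config file usually touches only around
         * 40-60 distinct byte values, so the second loop adds nothing and
         * the caller sees a result below BYTE_SET_THRESHOLD (compressible).
         * Random binary data crosses the threshold almost immediately and
         * the function returns 65 without scanning the remaining buckets,
         * sending the caller on to the more expensive checks.
         */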
   1729
   1730static bool sample_repeated_patterns(struct heuristic_ws *ws)
   1731{
   1732	const u32 half_of_sample = ws->sample_size / 2;
   1733	const u8 *data = ws->sample;
   1734
   1735	return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
   1736}
   1737
   1738static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
   1739				     struct heuristic_ws *ws)
   1740{
   1741	struct page *page;
   1742	u64 index, index_end;
   1743	u32 i, curr_sample_pos;
   1744	u8 *in_data;
   1745
   1746	/*
    1747	 * Compression handles the input data in chunks of 128KiB
    1748	 * (defined by BTRFS_MAX_UNCOMPRESSED).
    1749	 *
    1750	 * We do the same for the heuristic and loop over the whole range.
    1751	 *
    1752	 * MAX_SAMPLE_SIZE is calculated under the assumption that the heuristic
    1753	 * will process no more than BTRFS_MAX_UNCOMPRESSED at a time.
   1754	 */
   1755	if (end - start > BTRFS_MAX_UNCOMPRESSED)
   1756		end = start + BTRFS_MAX_UNCOMPRESSED;
   1757
   1758	index = start >> PAGE_SHIFT;
   1759	index_end = end >> PAGE_SHIFT;
   1760
   1761	/* Don't miss unaligned end */
   1762	if (!IS_ALIGNED(end, PAGE_SIZE))
   1763		index_end++;
   1764
   1765	curr_sample_pos = 0;
   1766	while (index < index_end) {
   1767		page = find_get_page(inode->i_mapping, index);
   1768		in_data = kmap_local_page(page);
   1769		/* Handle case where the start is not aligned to PAGE_SIZE */
   1770		i = start % PAGE_SIZE;
   1771		while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
   1772			/* Don't sample any garbage from the last page */
   1773			if (start > end - SAMPLING_READ_SIZE)
   1774				break;
   1775			memcpy(&ws->sample[curr_sample_pos], &in_data[i],
   1776					SAMPLING_READ_SIZE);
   1777			i += SAMPLING_INTERVAL;
   1778			start += SAMPLING_INTERVAL;
   1779			curr_sample_pos += SAMPLING_READ_SIZE;
   1780		}
   1781		kunmap_local(in_data);
   1782		put_page(page);
   1783
   1784		index++;
   1785	}
   1786
   1787	ws->sample_size = curr_sample_pos;
   1788}
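        /*
         * Back-of-the-envelope numbers, assuming the SAMPLING_READ_SIZE == 16
         * and SAMPLING_INTERVAL == 256 values defined earlier in this file: a
         * full 128KiB range yields 128KiB / 256 * 16 == 8KiB of sample data,
         * so roughly 1/16th of the input is actually inspected by the
         * heuristic.
         */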
   1789
   1790/*
   1791 * Compression heuristic.
   1792 *
    1793 * For now it's a naive and optimistic 'return true'; we'll extend the logic
    1794 * to quickly (compared to direct compression) detect data characteristics
    1795 * (compressible/uncompressible) in order to avoid wasting CPU time on
    1796 * uncompressible data.
   1797 *
   1798 * The following types of analysis can be performed:
   1799 * - detect mostly zero data
   1800 * - detect data with low "byte set" size (text, etc)
   1801 * - detect data with low/high "core byte" set
   1802 *
   1803 * Return non-zero if the compression should be done, 0 otherwise.
   1804 */
   1805int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
   1806{
   1807	struct list_head *ws_list = get_workspace(0, 0);
   1808	struct heuristic_ws *ws;
   1809	u32 i;
   1810	u8 byte;
   1811	int ret = 0;
   1812
   1813	ws = list_entry(ws_list, struct heuristic_ws, list);
   1814
   1815	heuristic_collect_sample(inode, start, end, ws);
   1816
   1817	if (sample_repeated_patterns(ws)) {
   1818		ret = 1;
   1819		goto out;
   1820	}
   1821
    1822	memset(ws->bucket, 0, sizeof(*ws->bucket) * BUCKET_SIZE);
   1823
   1824	for (i = 0; i < ws->sample_size; i++) {
   1825		byte = ws->sample[i];
   1826		ws->bucket[byte].count++;
   1827	}
   1828
   1829	i = byte_set_size(ws);
   1830	if (i < BYTE_SET_THRESHOLD) {
   1831		ret = 2;
   1832		goto out;
   1833	}
   1834
   1835	i = byte_core_set_size(ws);
   1836	if (i <= BYTE_CORE_SET_LOW) {
   1837		ret = 3;
   1838		goto out;
   1839	}
   1840
   1841	if (i >= BYTE_CORE_SET_HIGH) {
   1842		ret = 0;
   1843		goto out;
   1844	}
   1845
   1846	i = shannon_entropy(ws);
   1847	if (i <= ENTROPY_LVL_ACEPTABLE) {
   1848		ret = 4;
   1849		goto out;
   1850	}
   1851
   1852	/*
   1853	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
   1854	 * needed to give green light to compression.
   1855	 *
   1856	 * For now just assume that compression at that level is not worth the
   1857	 * resources because:
   1858	 *
   1859	 * 1. it is possible to defrag the data later
   1860	 *
    1861	 * 2. the data could turn out to be hardly compressible, eg. 150 distinct
    1862	 * byte values with every bucket count at around ~54. The heuristic would
    1863	 * be confused. This can happen when the data has some internal repeated
    1864	 * patterns like "abbacbbc...", which could only be detected by analyzing
    1865	 * pairs of bytes, and that is too costly.
   1866	 */
   1867	if (i < ENTROPY_LVL_HIGH) {
   1868		ret = 5;
   1869		goto out;
   1870	} else {
   1871		ret = 0;
   1872		goto out;
   1873	}
   1874
   1875out:
   1876	put_workspace(0, ws_list);
   1877	return ret;
   1878}
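        /*
         * The return values above form a small decision ladder (any non-zero
         * value means "compress"):
         *
         *   1 - the two halves of the sample are identical
         *   2 - fewer than BYTE_SET_THRESHOLD distinct byte values
         *   3 - at most BYTE_CORE_SET_LOW byte values cover 90% of the sample
         *   4 - Shannon entropy at or below ENTROPY_LVL_ACEPTABLE
         *   5 - entropy above ENTROPY_LVL_ACEPTABLE but below ENTROPY_LVL_HIGH
         *   0 - anything else (large core byte set or entropy at or above
         *       ENTROPY_LVL_HIGH): skip compression
         */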
   1879
   1880/*
    1881 * Convert the compression suffix (eg. the part after "zlib", starting with
    1882 * ":") to a level; an unrecognized string sets the default level.
   1883 */
   1884unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
   1885{
   1886	unsigned int level = 0;
   1887	int ret;
   1888
   1889	if (!type)
   1890		return 0;
   1891
   1892	if (str[0] == ':') {
   1893		ret = kstrtouint(str + 1, 10, &level);
   1894		if (ret)
   1895			level = 0;
   1896	}
   1897
   1898	level = btrfs_compress_set_level(type, level);
   1899
   1900	return level;
   1901}
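        /*
         * A minimal usage sketch (hypothetical call sites, for illustration):
         *
         *	unsigned int level;
         *
         *	level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, ":3");
         *	// level == 3
         *
         *	level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, "");
         *	// no ':' suffix, so btrfs_compress_set_level() falls back to
         *	// the default zlib level
         */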