cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

inode.c (331543B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2007 Oracle.  All rights reserved.
      4 */
      5
      6#include <crypto/hash.h>
      7#include <linux/kernel.h>
      8#include <linux/bio.h>
      9#include <linux/blk-cgroup.h>
     10#include <linux/file.h>
     11#include <linux/fs.h>
     12#include <linux/pagemap.h>
     13#include <linux/highmem.h>
     14#include <linux/time.h>
     15#include <linux/init.h>
     16#include <linux/string.h>
     17#include <linux/backing-dev.h>
     18#include <linux/writeback.h>
     19#include <linux/compat.h>
     20#include <linux/xattr.h>
     21#include <linux/posix_acl.h>
     22#include <linux/falloc.h>
     23#include <linux/slab.h>
     24#include <linux/ratelimit.h>
     25#include <linux/btrfs.h>
     26#include <linux/blkdev.h>
     27#include <linux/posix_acl_xattr.h>
     28#include <linux/uio.h>
     29#include <linux/magic.h>
     30#include <linux/iversion.h>
     31#include <linux/swap.h>
     32#include <linux/migrate.h>
     33#include <linux/sched/mm.h>
     34#include <linux/iomap.h>
     35#include <asm/unaligned.h>
     36#include <linux/fsverity.h>
     37#include "misc.h"
     38#include "ctree.h"
     39#include "disk-io.h"
     40#include "transaction.h"
     41#include "btrfs_inode.h"
     42#include "print-tree.h"
     43#include "ordered-data.h"
     44#include "xattr.h"
     45#include "tree-log.h"
     46#include "volumes.h"
     47#include "compression.h"
     48#include "locking.h"
     49#include "free-space-cache.h"
     50#include "props.h"
     51#include "qgroup.h"
     52#include "delalloc-space.h"
     53#include "block-group.h"
     54#include "space-info.h"
     55#include "zoned.h"
     56#include "subpage.h"
     57#include "inode-item.h"
     58
     59struct btrfs_iget_args {
     60	u64 ino;
     61	struct btrfs_root *root;
     62};
     63
     64struct btrfs_dio_data {
     65	ssize_t submitted;
     66	struct extent_changeset *data_reserved;
     67	bool data_space_reserved;
     68	bool nocow_done;
     69};
     70
     71struct btrfs_dio_private {
     72	struct inode *inode;
     73
     74	/*
     75	 * Since DIO can use anonymous page, we cannot use page_offset() to
     76	 * grab the file offset, thus need a dedicated member for file offset.
     77	 */
     78	u64 file_offset;
     79	/* Used for bio::bi_size */
     80	u32 bytes;
     81
     82	/*
     83	 * References to this structure. There is one reference per in-flight
     84	 * bio plus one while we're still setting up.
     85	 */
     86	refcount_t refs;
     87
     88	/* Array of checksums */
     89	u8 *csums;
     90
     91	/* This must be last */
     92	struct bio bio;
     93};
     94
     95static struct bio_set btrfs_dio_bioset;
     96
     97struct btrfs_rename_ctx {
     98	/* Output field. Stores the index number of the old directory entry. */
     99	u64 index;
    100};
    101
    102static const struct inode_operations btrfs_dir_inode_operations;
    103static const struct inode_operations btrfs_symlink_inode_operations;
    104static const struct inode_operations btrfs_special_inode_operations;
    105static const struct inode_operations btrfs_file_inode_operations;
    106static const struct address_space_operations btrfs_aops;
    107static const struct file_operations btrfs_dir_file_operations;
    108
    109static struct kmem_cache *btrfs_inode_cachep;
    110struct kmem_cache *btrfs_trans_handle_cachep;
    111struct kmem_cache *btrfs_path_cachep;
    112struct kmem_cache *btrfs_free_space_cachep;
    113struct kmem_cache *btrfs_free_space_bitmap_cachep;
    114
    115static int btrfs_setsize(struct inode *inode, struct iattr *attr);
    116static int btrfs_truncate(struct inode *inode, bool skip_writeback);
    117static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
    118static noinline int cow_file_range(struct btrfs_inode *inode,
    119				   struct page *locked_page,
    120				   u64 start, u64 end, int *page_started,
    121				   unsigned long *nr_written, int unlock);
    122static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
    123				       u64 len, u64 orig_start, u64 block_start,
    124				       u64 block_len, u64 orig_block_len,
    125				       u64 ram_bytes, int compress_type,
    126				       int type);
    127
    128static void __endio_write_update_ordered(struct btrfs_inode *inode,
    129					 const u64 offset, const u64 bytes,
    130					 const bool uptodate);
    131
    132/*
    133 * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
    134 *
    135 * ilock_flags can have the following bits set:
    136 *
    137 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
    138 * BTRFS_ILOCK_TRY - try to acquire the lock; if that fails on the first
    139 *		     attempt, return -EAGAIN
    140 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
    141 */
    142int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
    143{
    144	if (ilock_flags & BTRFS_ILOCK_SHARED) {
    145		if (ilock_flags & BTRFS_ILOCK_TRY) {
    146			if (!inode_trylock_shared(inode))
    147				return -EAGAIN;
    148			else
    149				return 0;
    150		}
    151		inode_lock_shared(inode);
    152	} else {
    153		if (ilock_flags & BTRFS_ILOCK_TRY) {
    154			if (!inode_trylock(inode))
    155				return -EAGAIN;
    156			else
    157				return 0;
    158		}
    159		inode_lock(inode);
    160	}
    161	if (ilock_flags & BTRFS_ILOCK_MMAP)
    162		down_write(&BTRFS_I(inode)->i_mmap_lock);
    163	return 0;
    164}
    165
    166/*
    167 * btrfs_inode_unlock - unlock inode i_rwsem
    168 *
    169 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
    170 * to decide whether the lock acquired is shared or exclusive.
    171 */
    172void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
    173{
    174	if (ilock_flags & BTRFS_ILOCK_MMAP)
    175		up_write(&BTRFS_I(inode)->i_mmap_lock);
    176	if (ilock_flags & BTRFS_ILOCK_SHARED)
    177		inode_unlock_shared(inode);
    178	else
    179		inode_unlock(inode);
    180}
    181
    182/*
    183 * Clean up all submitted ordered extents in the specified range to handle errors
    184 * from the btrfs_run_delalloc_range() callback.
    185 *
    186 * NOTE: caller must ensure that when an error happens, it can not call
    187 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
    188 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
    189 * to be released, which we want to happen only when finishing the ordered
    190 * extent (btrfs_finish_ordered_io()).
    191 */
    192static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
    193						 struct page *locked_page,
    194						 u64 offset, u64 bytes)
    195{
    196	unsigned long index = offset >> PAGE_SHIFT;
    197	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
    198	u64 page_start = page_offset(locked_page);
    199	u64 page_end = page_start + PAGE_SIZE - 1;
    200
    201	struct page *page;
    202
    203	while (index <= end_index) {
    204		/*
    205		 * For locked page, we will call end_extent_writepage() on it
    206		 * in run_delalloc_range() for the error handling.  That
    207		 * end_extent_writepage() function will call
    208		 * btrfs_mark_ordered_io_finished() to clear page Ordered and
    209		 * run the ordered extent accounting.
    210		 *
    211		 * Here we can't just clear the Ordered bit, or
    212		 * btrfs_mark_ordered_io_finished() would skip the accounting
    213		 * for the page range, and the ordered extent will never finish.
    214		 */
    215		if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
    216			index++;
    217			continue;
    218		}
    219		page = find_get_page(inode->vfs_inode.i_mapping, index);
    220		index++;
    221		if (!page)
    222			continue;
    223
    224		/*
    225		 * Here we just clear all Ordered bits for every page in the
    226		 * range, then __endio_write_update_ordered() will handle
    227		 * the ordered extent accounting for the range.
    228		 */
    229		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
    230					       offset, bytes);
    231		put_page(page);
    232	}
    233
    234	/* The locked page covers the full range, nothing needs to be done */
    235	if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
    236		return;
    237	/*
    238	 * In case this page belongs to the delalloc range being instantiated
    239	 * then skip it, since the first page of a range is going to be
    240	 * properly cleaned up by the caller of run_delalloc_range
    241	 */
    242	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
    243		bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
    244		offset = page_offset(locked_page) + PAGE_SIZE;
    245	}
    246
    247	return __endio_write_update_ordered(inode, offset, bytes, false);
    248}
    249
    250static int btrfs_dirty_inode(struct inode *inode);
    251
    252static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
    253				     struct btrfs_new_inode_args *args)
    254{
    255	int err;
    256
    257	if (args->default_acl) {
    258		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
    259				      ACL_TYPE_DEFAULT);
    260		if (err)
    261			return err;
    262	}
    263	if (args->acl) {
    264		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
    265		if (err)
    266			return err;
    267	}
    268	if (!args->default_acl && !args->acl)
    269		cache_no_acl(args->inode);
    270	return btrfs_xattr_security_init(trans, args->inode, args->dir,
    271					 &args->dentry->d_name);
    272}
    273
    274/*
    275 * this does all the hard work for inserting an inline extent into
    276 * the btree.  The caller should have done a btrfs_drop_extents so that
    277 * no overlapping inline items exist in the btree
    278 */
    279static int insert_inline_extent(struct btrfs_trans_handle *trans,
    280				struct btrfs_path *path,
    281				struct btrfs_inode *inode, bool extent_inserted,
    282				size_t size, size_t compressed_size,
    283				int compress_type,
    284				struct page **compressed_pages,
    285				bool update_i_size)
    286{
    287	struct btrfs_root *root = inode->root;
    288	struct extent_buffer *leaf;
    289	struct page *page = NULL;
    290	char *kaddr;
    291	unsigned long ptr;
    292	struct btrfs_file_extent_item *ei;
    293	int ret;
    294	size_t cur_size = size;
    295	u64 i_size;
    296
    297	ASSERT((compressed_size > 0 && compressed_pages) ||
    298	       (compressed_size == 0 && !compressed_pages));
    299
    300	if (compressed_size && compressed_pages)
    301		cur_size = compressed_size;
    302
    303	if (!extent_inserted) {
    304		struct btrfs_key key;
    305		size_t datasize;
    306
    307		key.objectid = btrfs_ino(inode);
    308		key.offset = 0;
    309		key.type = BTRFS_EXTENT_DATA_KEY;
    310
    311		datasize = btrfs_file_extent_calc_inline_size(cur_size);
    312		ret = btrfs_insert_empty_item(trans, root, path, &key,
    313					      datasize);
    314		if (ret)
    315			goto fail;
    316	}
    317	leaf = path->nodes[0];
    318	ei = btrfs_item_ptr(leaf, path->slots[0],
    319			    struct btrfs_file_extent_item);
    320	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
    321	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
    322	btrfs_set_file_extent_encryption(leaf, ei, 0);
    323	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
    324	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
    325	ptr = btrfs_file_extent_inline_start(ei);
    326
    327	if (compress_type != BTRFS_COMPRESS_NONE) {
    328		struct page *cpage;
    329		int i = 0;
    330		while (compressed_size > 0) {
    331			cpage = compressed_pages[i];
    332			cur_size = min_t(unsigned long, compressed_size,
    333				       PAGE_SIZE);
    334
    335			kaddr = kmap_atomic(cpage);
    336			write_extent_buffer(leaf, kaddr, ptr, cur_size);
    337			kunmap_atomic(kaddr);
    338
    339			i++;
    340			ptr += cur_size;
    341			compressed_size -= cur_size;
    342		}
    343		btrfs_set_file_extent_compression(leaf, ei,
    344						  compress_type);
    345	} else {
    346		page = find_get_page(inode->vfs_inode.i_mapping, 0);
    347		btrfs_set_file_extent_compression(leaf, ei, 0);
    348		kaddr = kmap_atomic(page);
    349		write_extent_buffer(leaf, kaddr, ptr, size);
    350		kunmap_atomic(kaddr);
    351		put_page(page);
    352	}
    353	btrfs_mark_buffer_dirty(leaf);
    354	btrfs_release_path(path);
    355
    356	/*
    357	 * We align size to sectorsize for inline extents just for simplicity's
    358	 * sake.
    359	 */
    360	ret = btrfs_inode_set_file_extent_range(inode, 0,
    361					ALIGN(size, root->fs_info->sectorsize));
    362	if (ret)
    363		goto fail;
    364
    365	/*
    366	 * We're an inline extent, so nobody can extend the file past i_size
    367	 * without locking a page we already have locked.
    368	 *
    369	 * We must do any i_size and inode updates before we unlock the pages.
    370	 * Otherwise we could end up racing with unlink.
    371	 */
    372	i_size = i_size_read(&inode->vfs_inode);
    373	if (update_i_size && size > i_size) {
    374		i_size_write(&inode->vfs_inode, size);
    375		i_size = size;
    376	}
    377	inode->disk_i_size = i_size;
    378
    379fail:
    380	return ret;
    381}
    382
    383
    384/*
    385 * conditionally insert an inline extent into the file.  This
    386 * does the checks required to make sure the data is small enough
    387 * to fit as an inline extent.
    388 */
    389static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
    390					  size_t compressed_size,
    391					  int compress_type,
    392					  struct page **compressed_pages,
    393					  bool update_i_size)
    394{
    395	struct btrfs_drop_extents_args drop_args = { 0 };
    396	struct btrfs_root *root = inode->root;
    397	struct btrfs_fs_info *fs_info = root->fs_info;
    398	struct btrfs_trans_handle *trans;
    399	u64 data_len = (compressed_size ?: size);
    400	int ret;
    401	struct btrfs_path *path;
    402
    403	/*
    404	 * We can create an inline extent if it ends at or beyond the current
    405	 * i_size, is no larger than a sector (decompressed), and the (possibly
    406	 * compressed) data fits in a leaf and the configured maximum inline
    407	 * size.
    408	 */
    409	if (size < i_size_read(&inode->vfs_inode) ||
    410	    size > fs_info->sectorsize ||
    411	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
    412	    data_len > fs_info->max_inline)
    413		return 1;
    414
    415	path = btrfs_alloc_path();
    416	if (!path)
    417		return -ENOMEM;
    418
    419	trans = btrfs_join_transaction(root);
    420	if (IS_ERR(trans)) {
    421		btrfs_free_path(path);
    422		return PTR_ERR(trans);
    423	}
    424	trans->block_rsv = &inode->block_rsv;
    425
    426	drop_args.path = path;
    427	drop_args.start = 0;
    428	drop_args.end = fs_info->sectorsize;
    429	drop_args.drop_cache = true;
    430	drop_args.replace_extent = true;
    431	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
    432	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
    433	if (ret) {
    434		btrfs_abort_transaction(trans, ret);
    435		goto out;
    436	}
    437
    438	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
    439				   size, compressed_size, compress_type,
    440				   compressed_pages, update_i_size);
    441	if (ret && ret != -ENOSPC) {
    442		btrfs_abort_transaction(trans, ret);
    443		goto out;
    444	} else if (ret == -ENOSPC) {
    445		ret = 1;
    446		goto out;
    447	}
    448
    449	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
    450	ret = btrfs_update_inode(trans, root, inode);
    451	if (ret && ret != -ENOSPC) {
    452		btrfs_abort_transaction(trans, ret);
    453		goto out;
    454	} else if (ret == -ENOSPC) {
    455		ret = 1;
    456		goto out;
    457	}
    458
    459	btrfs_set_inode_full_sync(inode);
    460out:
    461	/*
    462	 * Don't forget to free the reserved space, as an inlined extent
    463	 * won't count as a data extent, so free the reservation directly here.
    464	 * At reserve time it's always aligned to page size, so
    465	 * just free one page here.
    466	 */
    467	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
    468	btrfs_free_path(path);
    469	btrfs_end_transaction(trans);
    470	return ret;
    471}
    472
    473struct async_extent {
    474	u64 start;
    475	u64 ram_size;
    476	u64 compressed_size;
    477	struct page **pages;
    478	unsigned long nr_pages;
    479	int compress_type;
    480	struct list_head list;
    481};
    482
    483struct async_chunk {
    484	struct inode *inode;
    485	struct page *locked_page;
    486	u64 start;
    487	u64 end;
    488	unsigned int write_flags;
    489	struct list_head extents;
    490	struct cgroup_subsys_state *blkcg_css;
    491	struct btrfs_work work;
    492	struct async_cow *async_cow;
    493};
    494
    495struct async_cow {
    496	atomic_t num_chunks;
    497	struct async_chunk chunks[];
    498};
    499
    500static noinline int add_async_extent(struct async_chunk *cow,
    501				     u64 start, u64 ram_size,
    502				     u64 compressed_size,
    503				     struct page **pages,
    504				     unsigned long nr_pages,
    505				     int compress_type)
    506{
    507	struct async_extent *async_extent;
    508
    509	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
    510	BUG_ON(!async_extent); /* -ENOMEM */
    511	async_extent->start = start;
    512	async_extent->ram_size = ram_size;
    513	async_extent->compressed_size = compressed_size;
    514	async_extent->pages = pages;
    515	async_extent->nr_pages = nr_pages;
    516	async_extent->compress_type = compress_type;
    517	list_add_tail(&async_extent->list, &cow->extents);
    518	return 0;
    519}
    520
    521/*
    522 * Check if the inode needs to be submitted to compression, based on mount
    523 * options, defragmentation, properties or heuristics.
    524 */
    525static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
    526				      u64 end)
    527{
    528	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    529
    530	if (!btrfs_inode_can_compress(inode)) {
    531		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
    532			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
    533			btrfs_ino(inode));
    534		return 0;
    535	}
    536	/*
    537	 * Special check for subpage.
    538	 *
    539	 * We lock the full page then run each delalloc range in the page, thus
    540	 * for the following case, we will hit some subpage specific corner case:
    541	 *
    542	 * 0		32K		64K
    543	 * |	|///////|	|///////|
    544	 *		\- A		\- B
    545	 *
    546	 * In the above case, both range A and range B will try to unlock the full
    547	 * page [0, 64K), so whichever finishes later will find the page
    548	 * unlocked already, triggering various page lock requirement BUG_ON()s.
    549	 *
    550	 * So here we add an artificial limit that subpage compression can only happen
    551	 * if the range is fully page aligned.
    552	 *
    553	 * In theory we only need to ensure the first page is fully covered, but
    554	 * the trailing partial page will be locked until the full compression
    555	 * finishes, delaying the write of other ranges.
    556	 *
    557	 * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges
    558	 * first to prevent any submitted async extent from unlocking the full page.
    559	 * This way we can ensure, for the subpage case, that only the last async_cow
    560	 * will unlock the full page.
    561	 */
    562	if (fs_info->sectorsize < PAGE_SIZE) {
    563		if (!IS_ALIGNED(start, PAGE_SIZE) ||
    564		    !IS_ALIGNED(end + 1, PAGE_SIZE))
    565			return 0;
    566	}
    567
    568	/* force compress */
    569	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
    570		return 1;
    571	/* defrag ioctl */
    572	if (inode->defrag_compress)
    573		return 1;
    574	/* bad compression ratios */
    575	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
    576		return 0;
    577	if (btrfs_test_opt(fs_info, COMPRESS) ||
    578	    inode->flags & BTRFS_INODE_COMPRESS ||
    579	    inode->prop_compress)
    580		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
    581	return 0;
    582}
    583
    584static inline void inode_should_defrag(struct btrfs_inode *inode,
    585		u64 start, u64 end, u64 num_bytes, u32 small_write)
    586{
    587	/* If this is a small write inside eof, kick off a defrag */
    588	if (num_bytes < small_write &&
    589	    (start > 0 || end + 1 < inode->disk_i_size))
    590		btrfs_add_inode_defrag(NULL, inode, small_write);
    591}
    592
    593/*
    594 * we create compressed extents in two phases.  The first
    595 * phase compresses a range of pages that have already been
    596 * locked (both pages and state bits are locked).
    597 *
    598 * This is done inside an ordered work queue, and the compression
    599 * is spread across many cpus.  The actual IO submission is step
    600 * two, and the ordered work queue takes care of making sure that
    601 * happens in the same order things were put onto the queue by
    602 * writepages and friends.
    603 *
    604 * If this code finds it can't get good compression, it puts an
    605 * entry onto the work queue to write the uncompressed bytes.  This
    606 * makes sure that both compressed inodes and uncompressed inodes
    607 * are written in the same order that the flusher thread sent them
    608 * down.
    609 */
    610static noinline int compress_file_range(struct async_chunk *async_chunk)
    611{
    612	struct inode *inode = async_chunk->inode;
    613	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    614	u64 blocksize = fs_info->sectorsize;
    615	u64 start = async_chunk->start;
    616	u64 end = async_chunk->end;
    617	u64 actual_end;
    618	u64 i_size;
    619	int ret = 0;
    620	struct page **pages = NULL;
    621	unsigned long nr_pages;
    622	unsigned long total_compressed = 0;
    623	unsigned long total_in = 0;
    624	int i;
    625	int will_compress;
    626	int compress_type = fs_info->compress_type;
    627	int compressed_extents = 0;
    628	int redirty = 0;
    629
    630	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
    631			SZ_16K);
    632
    633	/*
    634	 * We need to save i_size before now because it could change in between
    635	 * us evaluating the size and assigning it.  This is because we lock and
    636	 * unlock the page in truncate and fallocate, and then modify the i_size
    637	 * later on.
    638	 *
    639	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
    640	 * does that for us.
    641	 */
    642	barrier();
    643	i_size = i_size_read(inode);
    644	barrier();
    645	actual_end = min_t(u64, i_size, end + 1);
    646again:
    647	will_compress = 0;
    648	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
    649	nr_pages = min_t(unsigned long, nr_pages,
    650			BTRFS_MAX_COMPRESSED / PAGE_SIZE);
    651
    652	/*
    653	 * we don't want to send crud past the end of i_size through
    654	 * compression, that's just a waste of CPU time.  So, if the
    655	 * end of the file is before the start of our current
    656	 * requested range of bytes, we bail out to the uncompressed
    657	 * cleanup code that can deal with all of this.
    658	 *
    659	 * It isn't really the fastest way to fix things, but this is a
    660	 * very uncommon corner.
    661	 */
    662	if (actual_end <= start)
    663		goto cleanup_and_bail_uncompressed;
    664
    665	total_compressed = actual_end - start;
    666
    667	/*
    668	 * Skip compression for a small file range (<= blocksize) that
    669	 * isn't an inline extent, since it doesn't save disk space at all.
    670	 */
    671	if (total_compressed <= blocksize &&
    672	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
    673		goto cleanup_and_bail_uncompressed;
    674
    675	/*
    676	 * For subpage case, we require full page alignment for the sector
    677	 * aligned range.
    678	 * Thus we must also check against @actual_end, not just @end.
    679	 */
    680	if (blocksize < PAGE_SIZE) {
    681		if (!IS_ALIGNED(start, PAGE_SIZE) ||
    682		    !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
    683			goto cleanup_and_bail_uncompressed;
    684	}
    685
    686	total_compressed = min_t(unsigned long, total_compressed,
    687			BTRFS_MAX_UNCOMPRESSED);
    688	total_in = 0;
    689	ret = 0;
    690
    691	/*
    692	 * we do compression for mount -o compress and when the
    693	 * inode has not been flagged as nocompress.  This flag can
    694	 * change at any time if we discover bad compression ratios.
    695	 */
    696	if (inode_need_compress(BTRFS_I(inode), start, end)) {
    697		WARN_ON(pages);
    698		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
    699		if (!pages) {
    700			/* just bail out to the uncompressed code */
    701			nr_pages = 0;
    702			goto cont;
    703		}
    704
    705		if (BTRFS_I(inode)->defrag_compress)
    706			compress_type = BTRFS_I(inode)->defrag_compress;
    707		else if (BTRFS_I(inode)->prop_compress)
    708			compress_type = BTRFS_I(inode)->prop_compress;
    709
    710		/*
    711		 * we need to call clear_page_dirty_for_io on each
    712		 * page in the range.  Otherwise applications with the file
    713		 * mmap'd can wander in and change the page contents while
    714		 * we are compressing them.
    715		 *
    716		 * If the compression fails for any reason, we set the pages
    717		 * dirty again later on.
    718		 *
    719		 * Note that the remaining part is redirtied, the start pointer
    720		 * has moved, the end is the original one.
    721		 */
    722		if (!redirty) {
    723			extent_range_clear_dirty_for_io(inode, start, end);
    724			redirty = 1;
    725		}
    726
    727		/* Compression level is applied here and only here */
    728		ret = btrfs_compress_pages(
    729			compress_type | (fs_info->compress_level << 4),
    730					   inode->i_mapping, start,
    731					   pages,
    732					   &nr_pages,
    733					   &total_in,
    734					   &total_compressed);
    735
    736		if (!ret) {
    737			unsigned long offset = offset_in_page(total_compressed);
    738			struct page *page = pages[nr_pages - 1];
    739
    740			/* zero the tail end of the last page, we might be
    741			 * sending it down to disk
    742			 */
    743			if (offset)
    744				memzero_page(page, offset, PAGE_SIZE - offset);
    745			will_compress = 1;
    746		}
    747	}
    748cont:
    749	/*
    750	 * Check cow_file_range() for why we don't even try to create inline
    751	 * extent for subpage case.
    752	 */
    753	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
    754		/* let's try to make an inline extent */
    755		if (ret || total_in < actual_end) {
    756			/* we didn't compress the entire range, try
    757			 * to make an uncompressed inline extent.
    758			 */
    759			ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
    760						    0, BTRFS_COMPRESS_NONE,
    761						    NULL, false);
    762		} else {
    763			/* try making a compressed inline extent */
    764			ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
    765						    total_compressed,
    766						    compress_type, pages,
    767						    false);
    768		}
    769		if (ret <= 0) {
    770			unsigned long clear_flags = EXTENT_DELALLOC |
    771				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
    772				EXTENT_DO_ACCOUNTING;
    773			unsigned long page_error_op;
    774
    775			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
    776
    777			/*
    778			 * inline extent creation worked or returned error,
    779			 * we don't need to create any more async work items.
    780			 * Unlock and free up our temp pages.
    781			 *
    782			 * We use DO_ACCOUNTING here because we need the
    783			 * delalloc_release_metadata to be done _after_ we drop
    784			 * our outstanding extent for clearing delalloc for this
    785			 * range.
    786			 */
    787			extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
    788						     NULL,
    789						     clear_flags,
    790						     PAGE_UNLOCK |
    791						     PAGE_START_WRITEBACK |
    792						     page_error_op |
    793						     PAGE_END_WRITEBACK);
    794
    795			/*
    796			 * Ensure we only free the compressed pages if we have
    797			 * them allocated, as we can still reach here with
    798			 * inode_need_compress() == false.
    799			 */
    800			if (pages) {
    801				for (i = 0; i < nr_pages; i++) {
    802					WARN_ON(pages[i]->mapping);
    803					put_page(pages[i]);
    804				}
    805				kfree(pages);
    806			}
    807			return 0;
    808		}
    809	}
    810
    811	if (will_compress) {
    812		/*
    813		 * we aren't doing an inline extent, so round the compressed size
    814		 * up to a block size boundary so the allocator does sane
    815		 * things
    816		 */
    817		total_compressed = ALIGN(total_compressed, blocksize);
    818
    819		/*
    820		 * one last check to make sure the compression is really a
    821		 * win, compare the page count read with the blocks on disk,
    822		 * compression must free at least one sector size
    823		 */
    824		total_in = round_up(total_in, fs_info->sectorsize);
    825		if (total_compressed + blocksize <= total_in) {
    826			compressed_extents++;
    827
    828			/*
    829			 * The async work queues will take care of doing actual
    830			 * allocation on disk for these compressed pages, and
    831			 * will submit them to the elevator.
    832			 */
    833			add_async_extent(async_chunk, start, total_in,
    834					total_compressed, pages, nr_pages,
    835					compress_type);
    836
    837			if (start + total_in < end) {
    838				start += total_in;
    839				pages = NULL;
    840				cond_resched();
    841				goto again;
    842			}
    843			return compressed_extents;
    844		}
    845	}
    846	if (pages) {
    847		/*
    848		 * the compression code ran but failed to make things smaller,
    849		 * free any pages it allocated and our page pointer array
    850		 */
    851		for (i = 0; i < nr_pages; i++) {
    852			WARN_ON(pages[i]->mapping);
    853			put_page(pages[i]);
    854		}
    855		kfree(pages);
    856		pages = NULL;
    857		total_compressed = 0;
    858		nr_pages = 0;
    859
    860		/* flag the file so we don't compress in the future */
    861		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
    862		    !(BTRFS_I(inode)->prop_compress)) {
    863			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
    864		}
    865	}
    866cleanup_and_bail_uncompressed:
    867	/*
    868	 * No compression, but we still need to write the pages in the file
    869	 * we've been given so far.  redirty the locked page if it corresponds
    870	 * to our extent and set things up for the async work queue to run
    871	 * cow_file_range to do the normal delalloc dance.
    872	 */
    873	if (async_chunk->locked_page &&
    874	    (page_offset(async_chunk->locked_page) >= start &&
    875	     page_offset(async_chunk->locked_page) <= end)) {
    876		__set_page_dirty_nobuffers(async_chunk->locked_page);
    877		/* unlocked later on in the async handlers */
    878	}
    879
    880	if (redirty)
    881		extent_range_redirty_for_io(inode, start, end);
    882	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
    883			 BTRFS_COMPRESS_NONE);
    884	compressed_extents++;
    885
    886	return compressed_extents;
    887}
    888
    889static void free_async_extent_pages(struct async_extent *async_extent)
    890{
    891	int i;
    892
    893	if (!async_extent->pages)
    894		return;
    895
    896	for (i = 0; i < async_extent->nr_pages; i++) {
    897		WARN_ON(async_extent->pages[i]->mapping);
    898		put_page(async_extent->pages[i]);
    899	}
    900	kfree(async_extent->pages);
    901	async_extent->nr_pages = 0;
    902	async_extent->pages = NULL;
    903}
    904
    905static int submit_uncompressed_range(struct btrfs_inode *inode,
    906				     struct async_extent *async_extent,
    907				     struct page *locked_page)
    908{
    909	u64 start = async_extent->start;
    910	u64 end = async_extent->start + async_extent->ram_size - 1;
    911	unsigned long nr_written = 0;
    912	int page_started = 0;
    913	int ret;
    914
    915	/*
    916	 * Call cow_file_range() to run the delalloc range directly, since we
    917	 * won't go to NOCOW or async path again.
    918	 *
    919	 * Also we call cow_file_range() with @unlock_page == 0, so that we
    920	 * can directly submit them without interruption.
    921	 */
    922	ret = cow_file_range(inode, locked_page, start, end, &page_started,
    923			     &nr_written, 0);
    924	/* Inline extent inserted, page gets unlocked and everything is done */
    925	if (page_started) {
    926		ret = 0;
    927		goto out;
    928	}
    929	if (ret < 0) {
    930		if (locked_page)
    931			unlock_page(locked_page);
    932		goto out;
    933	}
    934
    935	ret = extent_write_locked_range(&inode->vfs_inode, start, end);
    936	/* All pages will be unlocked, including @locked_page */
    937out:
    938	kfree(async_extent);
    939	return ret;
    940}
    941
    942static int submit_one_async_extent(struct btrfs_inode *inode,
    943				   struct async_chunk *async_chunk,
    944				   struct async_extent *async_extent,
    945				   u64 *alloc_hint)
    946{
    947	struct extent_io_tree *io_tree = &inode->io_tree;
    948	struct btrfs_root *root = inode->root;
    949	struct btrfs_fs_info *fs_info = root->fs_info;
    950	struct btrfs_key ins;
    951	struct page *locked_page = NULL;
    952	struct extent_map *em;
    953	int ret = 0;
    954	u64 start = async_extent->start;
    955	u64 end = async_extent->start + async_extent->ram_size - 1;
    956
    957	/*
    958	 * If async_chunk->locked_page is in the async_extent range, we need to
    959	 * handle it.
    960	 */
    961	if (async_chunk->locked_page) {
    962		u64 locked_page_start = page_offset(async_chunk->locked_page);
    963		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
    964
    965		if (!(start >= locked_page_end || end <= locked_page_start))
    966			locked_page = async_chunk->locked_page;
    967	}
    968	lock_extent(io_tree, start, end);
    969
    970	/* We have fallen back to an uncompressed write */
    971	if (!async_extent->pages)
    972		return submit_uncompressed_range(inode, async_extent, locked_page);
    973
    974	ret = btrfs_reserve_extent(root, async_extent->ram_size,
    975				   async_extent->compressed_size,
    976				   async_extent->compressed_size,
    977				   0, *alloc_hint, &ins, 1, 1);
    978	if (ret) {
    979		free_async_extent_pages(async_extent);
    980		/*
    981		 * Here we used to try again by going back to non-compressed
    982		 * path for ENOSPC.  But we can't reserve space even for
    983		 * compressed size, how could it work for uncompressed size
    984		 * which requires larger size?  So here we directly go error
    985		 * path.
    986		 */
    987		goto out_free;
    988	}
    989
    990	/* Here we're doing allocation and writeback of the compressed pages */
    991	em = create_io_em(inode, start,
    992			  async_extent->ram_size,	/* len */
    993			  start,			/* orig_start */
    994			  ins.objectid,			/* block_start */
    995			  ins.offset,			/* block_len */
    996			  ins.offset,			/* orig_block_len */
    997			  async_extent->ram_size,	/* ram_bytes */
    998			  async_extent->compress_type,
    999			  BTRFS_ORDERED_COMPRESSED);
   1000	if (IS_ERR(em)) {
   1001		ret = PTR_ERR(em);
   1002		goto out_free_reserve;
   1003	}
   1004	free_extent_map(em);
   1005
   1006	ret = btrfs_add_ordered_extent(inode, start,		/* file_offset */
   1007				       async_extent->ram_size,	/* num_bytes */
   1008				       async_extent->ram_size,	/* ram_bytes */
   1009				       ins.objectid,		/* disk_bytenr */
   1010				       ins.offset,		/* disk_num_bytes */
   1011				       0,			/* offset */
   1012				       1 << BTRFS_ORDERED_COMPRESSED,
   1013				       async_extent->compress_type);
   1014	if (ret) {
   1015		btrfs_drop_extent_cache(inode, start, end, 0);
   1016		goto out_free_reserve;
   1017	}
   1018	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
   1019
   1020	/* Clear dirty, set writeback and unlock the pages. */
   1021	extent_clear_unlock_delalloc(inode, start, end,
   1022			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
   1023			PAGE_UNLOCK | PAGE_START_WRITEBACK);
   1024	if (btrfs_submit_compressed_write(inode, start,	/* file_offset */
   1025			    async_extent->ram_size,	/* num_bytes */
   1026			    ins.objectid,		/* disk_bytenr */
   1027			    ins.offset,			/* compressed_len */
   1028			    async_extent->pages,	/* compressed_pages */
   1029			    async_extent->nr_pages,
   1030			    async_chunk->write_flags,
   1031			    async_chunk->blkcg_css, true)) {
   1032		const u64 start = async_extent->start;
   1033		const u64 end = start + async_extent->ram_size - 1;
   1034
   1035		btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
   1036
   1037		extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
   1038					     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
   1039		free_async_extent_pages(async_extent);
   1040	}
   1041	*alloc_hint = ins.objectid + ins.offset;
   1042	kfree(async_extent);
   1043	return ret;
   1044
   1045out_free_reserve:
   1046	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
   1047	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
   1048out_free:
   1049	extent_clear_unlock_delalloc(inode, start, end,
   1050				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
   1051				     EXTENT_DELALLOC_NEW |
   1052				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
   1053				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
   1054				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
   1055	free_async_extent_pages(async_extent);
   1056	kfree(async_extent);
   1057	return ret;
   1058}
   1059
   1060/*
   1061 * Phase two of compressed writeback.  This is the ordered portion of the code,
   1062 * which only gets called in the order the work was queued.  We walk all the
   1063 * async extents created by compress_file_range and send them down to the disk.
   1064 */
   1065static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
   1066{
   1067	struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
   1068	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   1069	struct async_extent *async_extent;
   1070	u64 alloc_hint = 0;
   1071	int ret = 0;
   1072
   1073	while (!list_empty(&async_chunk->extents)) {
   1074		u64 extent_start;
   1075		u64 ram_size;
   1076
   1077		async_extent = list_entry(async_chunk->extents.next,
   1078					  struct async_extent, list);
   1079		list_del(&async_extent->list);
   1080		extent_start = async_extent->start;
   1081		ram_size = async_extent->ram_size;
   1082
   1083		ret = submit_one_async_extent(inode, async_chunk, async_extent,
   1084					      &alloc_hint);
   1085		btrfs_debug(fs_info,
   1086"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
   1087			    inode->root->root_key.objectid,
   1088			    btrfs_ino(inode), extent_start, ram_size, ret);
   1089	}
   1090}
   1091
   1092static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
   1093				      u64 num_bytes)
   1094{
   1095	struct extent_map_tree *em_tree = &inode->extent_tree;
   1096	struct extent_map *em;
   1097	u64 alloc_hint = 0;
   1098
   1099	read_lock(&em_tree->lock);
   1100	em = search_extent_mapping(em_tree, start, num_bytes);
   1101	if (em) {
   1102		/*
   1103		 * if block start isn't an actual block number then find the
   1104		 * first block in this inode and use that as a hint.  If that
   1105		 * block is also bogus then just don't worry about it.
   1106		 */
   1107		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
   1108			free_extent_map(em);
   1109			em = search_extent_mapping(em_tree, 0, 0);
   1110			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
   1111				alloc_hint = em->block_start;
   1112			if (em)
   1113				free_extent_map(em);
   1114		} else {
   1115			alloc_hint = em->block_start;
   1116			free_extent_map(em);
   1117		}
   1118	}
   1119	read_unlock(&em_tree->lock);
   1120
   1121	return alloc_hint;
   1122}
   1123
   1124/*
   1125 * when extent_io.c finds a delayed allocation range in the file,
   1126 * the callbacks end up in this code.  The basic idea is to
   1127 * allocate extents on disk for the range, and create ordered data structs
   1128 * in ram to track those extents.
   1129 *
   1130 * locked_page is the page that writepage had locked already.  We use
   1131 * it to make sure we don't do extra locks or unlocks.
   1132 *
   1133 * *page_started is set to one if we unlock locked_page and do everything
   1134 * required to start IO on it.  It may be clean and already done with
   1135 * IO when we return.
   1136 */
   1137static noinline int cow_file_range(struct btrfs_inode *inode,
   1138				   struct page *locked_page,
   1139				   u64 start, u64 end, int *page_started,
   1140				   unsigned long *nr_written, int unlock)
   1141{
   1142	struct btrfs_root *root = inode->root;
   1143	struct btrfs_fs_info *fs_info = root->fs_info;
   1144	u64 alloc_hint = 0;
   1145	u64 num_bytes;
   1146	unsigned long ram_size;
   1147	u64 cur_alloc_size = 0;
   1148	u64 min_alloc_size;
   1149	u64 blocksize = fs_info->sectorsize;
   1150	struct btrfs_key ins;
   1151	struct extent_map *em;
   1152	unsigned clear_bits;
   1153	unsigned long page_ops;
   1154	bool extent_reserved = false;
   1155	int ret = 0;
   1156
   1157	if (btrfs_is_free_space_inode(inode)) {
   1158		ret = -EINVAL;
   1159		goto out_unlock;
   1160	}
   1161
   1162	num_bytes = ALIGN(end - start + 1, blocksize);
   1163	num_bytes = max(blocksize,  num_bytes);
   1164	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
   1165
   1166	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
   1167
   1168	/*
   1169	 * Due to the page size limit, for subpage we can only trigger the
   1170	 * writeback for the dirty sectors of page, that means data writeback
   1171	 * is doing more writeback than what we want.
   1172	 *
   1173	 * This is especially unexpected for some call sites like fallocate,
   1174	 * where we only increase i_size after everything is done.
   1175	 * This means we can trigger inline extent even if we didn't want to.
   1176	 * So here we skip inline extent creation completely.
   1177	 */
   1178	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
   1179		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
   1180				       end + 1);
   1181
   1182		/* let's try to make an inline extent */
   1183		ret = cow_file_range_inline(inode, actual_end, 0,
   1184					    BTRFS_COMPRESS_NONE, NULL, false);
   1185		if (ret == 0) {
   1186			/*
   1187			 * We use DO_ACCOUNTING here because we need the
   1188			 * delalloc_release_metadata to be run _after_ we drop
   1189			 * our outstanding extent for clearing delalloc for this
   1190			 * range.
   1191			 */
   1192			extent_clear_unlock_delalloc(inode, start, end,
   1193				     locked_page,
   1194				     EXTENT_LOCKED | EXTENT_DELALLOC |
   1195				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
   1196				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
   1197				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
   1198			*nr_written = *nr_written +
   1199			     (end - start + PAGE_SIZE) / PAGE_SIZE;
   1200			*page_started = 1;
   1201			/*
   1202			 * locked_page is locked by the caller of
   1203			 * writepage_delalloc(), not locked by
   1204			 * __process_pages_contig().
   1205			 *
   1206			 * We can't let __process_pages_contig() unlock it,
   1207			 * as it doesn't have any subpage::writers recorded.
   1208			 *
   1209			 * Here we manually unlock the page, since the caller
   1210			 * can't use page_started to determine if it's an
   1211			 * inline extent or a compressed extent.
   1212			 */
   1213			unlock_page(locked_page);
   1214			goto out;
   1215		} else if (ret < 0) {
   1216			goto out_unlock;
   1217		}
   1218	}
   1219
   1220	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
   1221	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
   1222
   1223	/*
   1224	 * Relocation relies on the relocated extents to have exactly the same
   1225	 * size as the original extents. Normally writeback for relocation data
   1226	 * extents follows a NOCOW path because relocation preallocates the
   1227	 * extents. However, due to an operation such as scrub turning a block
   1228	 * group to RO mode, it may fallback to COW mode, so we must make sure
   1229	 * an extent allocated during COW has exactly the requested size and can
   1230	 * not be split into smaller extents, otherwise relocation breaks and
   1231	 * fails during the stage where it updates the bytenr of file extent
   1232	 * items.
   1233	 */
   1234	if (btrfs_is_data_reloc_root(root))
   1235		min_alloc_size = num_bytes;
   1236	else
   1237		min_alloc_size = fs_info->sectorsize;
   1238
   1239	while (num_bytes > 0) {
   1240		cur_alloc_size = num_bytes;
   1241		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
   1242					   min_alloc_size, 0, alloc_hint,
   1243					   &ins, 1, 1);
   1244		if (ret < 0)
   1245			goto out_unlock;
   1246		cur_alloc_size = ins.offset;
   1247		extent_reserved = true;
   1248
   1249		ram_size = ins.offset;
   1250		em = create_io_em(inode, start, ins.offset, /* len */
   1251				  start, /* orig_start */
   1252				  ins.objectid, /* block_start */
   1253				  ins.offset, /* block_len */
   1254				  ins.offset, /* orig_block_len */
   1255				  ram_size, /* ram_bytes */
   1256				  BTRFS_COMPRESS_NONE, /* compress_type */
   1257				  BTRFS_ORDERED_REGULAR /* type */);
   1258		if (IS_ERR(em)) {
   1259			ret = PTR_ERR(em);
   1260			goto out_reserve;
   1261		}
   1262		free_extent_map(em);
   1263
   1264		ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
   1265					       ins.objectid, cur_alloc_size, 0,
   1266					       1 << BTRFS_ORDERED_REGULAR,
   1267					       BTRFS_COMPRESS_NONE);
   1268		if (ret)
   1269			goto out_drop_extent_cache;
   1270
   1271		if (btrfs_is_data_reloc_root(root)) {
   1272			ret = btrfs_reloc_clone_csums(inode, start,
   1273						      cur_alloc_size);
   1274			/*
   1275			 * Only drop cache here, and process as normal.
   1276			 *
   1277			 * We must not allow extent_clear_unlock_delalloc()
   1278			 * at out_unlock label to free meta of this ordered
   1279			 * extent, as its meta should be freed by
   1280			 * btrfs_finish_ordered_io().
   1281			 *
   1282			 * So we must continue until @start is increased to
   1283			 * skip current ordered extent.
   1284			 */
   1285			if (ret)
   1286				btrfs_drop_extent_cache(inode, start,
   1287						start + ram_size - 1, 0);
   1288		}
   1289
   1290		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
   1291
   1292		/*
   1293		 * We're not doing compressed IO, don't unlock the first page
   1294		 * (which the caller expects to stay locked), don't clear any
   1295		 * dirty bits and don't set any writeback bits
   1296		 *
   1297		 * Do set the Ordered (Private2) bit so we know this page was
   1298		 * properly setup for writepage.
   1299		 */
   1300		page_ops = unlock ? PAGE_UNLOCK : 0;
   1301		page_ops |= PAGE_SET_ORDERED;
   1302
   1303		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
   1304					     locked_page,
   1305					     EXTENT_LOCKED | EXTENT_DELALLOC,
   1306					     page_ops);
   1307		if (num_bytes < cur_alloc_size)
   1308			num_bytes = 0;
   1309		else
   1310			num_bytes -= cur_alloc_size;
   1311		alloc_hint = ins.objectid + ins.offset;
   1312		start += cur_alloc_size;
   1313		extent_reserved = false;
   1314
   1315		/*
   1316		 * On btrfs_reloc_clone_csums() error: since start has been increased,
   1317		 * extent_clear_unlock_delalloc() at the out_unlock label won't
   1318		 * free the metadata of the current ordered extent, so we're OK to exit.
   1319		 */
   1320		if (ret)
   1321			goto out_unlock;
   1322	}
   1323out:
   1324	return ret;
   1325
   1326out_drop_extent_cache:
   1327	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
   1328out_reserve:
   1329	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
   1330	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
   1331out_unlock:
   1332	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
   1333		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
   1334	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
   1335	/*
   1336	 * If we reserved an extent for our delalloc range (or a subrange) and
   1337	 * failed to create the respective ordered extent, then it means that
   1338	 * when we reserved the extent we decremented the extent's size from
   1339	 * the data space_info's bytes_may_use counter and incremented the
   1340	 * space_info's bytes_reserved counter by the same amount. We must make
   1341	 * sure extent_clear_unlock_delalloc() does not try to decrement again
   1342	 * the data space_info's bytes_may_use counter, therefore we do not pass
   1343	 * it the flag EXTENT_CLEAR_DATA_RESV.
   1344	 */
   1345	if (extent_reserved) {
   1346		extent_clear_unlock_delalloc(inode, start,
   1347					     start + cur_alloc_size - 1,
   1348					     locked_page,
   1349					     clear_bits,
   1350					     page_ops);
   1351		start += cur_alloc_size;
   1352		if (start >= end)
   1353			goto out;
   1354	}
   1355	extent_clear_unlock_delalloc(inode, start, end, locked_page,
   1356				     clear_bits | EXTENT_CLEAR_DATA_RESV,
   1357				     page_ops);
   1358	goto out;
   1359}
   1360
   1361/*
   1362 * work queue callback to start compression on a file and pages
   1363 */
   1364static noinline void async_cow_start(struct btrfs_work *work)
   1365{
   1366	struct async_chunk *async_chunk;
   1367	int compressed_extents;
   1368
   1369	async_chunk = container_of(work, struct async_chunk, work);
   1370
   1371	compressed_extents = compress_file_range(async_chunk);
   1372	if (compressed_extents == 0) {
   1373		btrfs_add_delayed_iput(async_chunk->inode);
   1374		async_chunk->inode = NULL;
   1375	}
   1376}
   1377
   1378/*
   1379 * work queue callback to submit previously compressed pages
   1380 */
   1381static noinline void async_cow_submit(struct btrfs_work *work)
   1382{
   1383	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
   1384						     work);
   1385	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
   1386	unsigned long nr_pages;
   1387
   1388	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
   1389		PAGE_SHIFT;
   1390
   1391	/*
   1392	 * ->inode could be NULL if async_chunk_start has failed to compress,
   1393	 * in which case we don't have anything to submit, yet we need to
   1394	 * always adjust ->async_delalloc_pages as it's paired with the init
   1395	 * happening in cow_file_range_async
   1396	 */
   1397	if (async_chunk->inode)
   1398		submit_compressed_extents(async_chunk);
   1399
   1400	/* atomic_sub_return implies a barrier */
   1401	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
   1402	    5 * SZ_1M)
   1403		cond_wake_up_nomb(&fs_info->async_submit_wait);
   1404}
   1405
   1406static noinline void async_cow_free(struct btrfs_work *work)
   1407{
   1408	struct async_chunk *async_chunk;
   1409	struct async_cow *async_cow;
   1410
   1411	async_chunk = container_of(work, struct async_chunk, work);
   1412	if (async_chunk->inode)
   1413		btrfs_add_delayed_iput(async_chunk->inode);
   1414	if (async_chunk->blkcg_css)
   1415		css_put(async_chunk->blkcg_css);
   1416
   1417	async_cow = async_chunk->async_cow;
   1418	if (atomic_dec_and_test(&async_cow->num_chunks))
   1419		kvfree(async_cow);
   1420}
   1421
   1422static int cow_file_range_async(struct btrfs_inode *inode,
   1423				struct writeback_control *wbc,
   1424				struct page *locked_page,
   1425				u64 start, u64 end, int *page_started,
   1426				unsigned long *nr_written)
   1427{
   1428	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   1429	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
   1430	struct async_cow *ctx;
   1431	struct async_chunk *async_chunk;
   1432	unsigned long nr_pages;
   1433	u64 cur_end;
   1434	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
   1435	int i;
   1436	bool should_compress;
   1437	unsigned nofs_flag;
   1438	const unsigned int write_flags = wbc_to_write_flags(wbc);
   1439
   1440	unlock_extent(&inode->io_tree, start, end);
   1441
   1442	if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
   1443	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
   1444		num_chunks = 1;
   1445		should_compress = false;
   1446	} else {
   1447		should_compress = true;
   1448	}
   1449
   1450	nofs_flag = memalloc_nofs_save();
   1451	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
   1452	memalloc_nofs_restore(nofs_flag);
   1453
   1454	if (!ctx) {
   1455		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
   1456			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
   1457			EXTENT_DO_ACCOUNTING;
   1458		unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
   1459					 PAGE_END_WRITEBACK | PAGE_SET_ERROR;
   1460
   1461		extent_clear_unlock_delalloc(inode, start, end, locked_page,
   1462					     clear_bits, page_ops);
   1463		return -ENOMEM;
   1464	}
   1465
   1466	async_chunk = ctx->chunks;
   1467	atomic_set(&ctx->num_chunks, num_chunks);
   1468
   1469	for (i = 0; i < num_chunks; i++) {
   1470		if (should_compress)
   1471			cur_end = min(end, start + SZ_512K - 1);
   1472		else
   1473			cur_end = end;
   1474
   1475		/*
   1476		 * igrab is called higher up in the call chain, take only the
   1477		 * lightweight reference for the callback lifetime
   1478		 */
   1479		ihold(&inode->vfs_inode);
   1480		async_chunk[i].async_cow = ctx;
   1481		async_chunk[i].inode = &inode->vfs_inode;
   1482		async_chunk[i].start = start;
   1483		async_chunk[i].end = cur_end;
   1484		async_chunk[i].write_flags = write_flags;
   1485		INIT_LIST_HEAD(&async_chunk[i].extents);
   1486
   1487		/*
   1488		 * The locked_page comes all the way from writepage and it's
   1489		 * the original page we were actually given.  As we spread
   1490		 * this large delalloc region across multiple async_chunk
   1491		 * structs, only the first struct needs a pointer to locked_page
   1492		 *
   1493		 * This way we don't need racy decisions about who is supposed
   1494		 * to unlock it.
   1495		 */
   1496		if (locked_page) {
   1497			/*
   1498			 * Depending on the compressibility, the pages might or
   1499			 * might not go through async.  We want all of them to
   1500			 * be accounted against wbc once.  Let's do it here
   1501			 * before the paths diverge.  wbc accounting is used
   1502			 * only for foreign writeback detection and doesn't
   1503			 * need full accuracy.  Just account the whole thing
   1504			 * against the first page.
   1505			 */
   1506			wbc_account_cgroup_owner(wbc, locked_page,
   1507						 cur_end - start);
   1508			async_chunk[i].locked_page = locked_page;
   1509			locked_page = NULL;
   1510		} else {
   1511			async_chunk[i].locked_page = NULL;
   1512		}
   1513
   1514		if (blkcg_css != blkcg_root_css) {
   1515			css_get(blkcg_css);
   1516			async_chunk[i].blkcg_css = blkcg_css;
   1517		} else {
   1518			async_chunk[i].blkcg_css = NULL;
   1519		}
   1520
   1521		btrfs_init_work(&async_chunk[i].work, async_cow_start,
   1522				async_cow_submit, async_cow_free);
   1523
   1524		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
   1525		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
   1526
   1527		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
   1528
   1529		*nr_written += nr_pages;
   1530		start = cur_end + 1;
   1531	}
   1532	*page_started = 1;
   1533	return 0;
   1534}
   1535
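        /*
         * Zoned filesystems only do sequential, COW writes.  Allocate the
         * extents with cow_file_range() and, if the pages were not already
         * submitted, redirty the locked page and submit the whole range
         * synchronously with extent_write_locked_range().
         */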
   1536static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
   1537				       struct page *locked_page, u64 start,
   1538				       u64 end, int *page_started,
   1539				       unsigned long *nr_written)
   1540{
   1541	int ret;
   1542
   1543	ret = cow_file_range(inode, locked_page, start, end, page_started,
   1544			     nr_written, 0);
   1545	if (ret)
   1546		return ret;
   1547
   1548	if (*page_started)
   1549		return 0;
   1550
   1551	__set_page_dirty_nobuffers(locked_page);
   1552	account_page_redirty(locked_page);
   1553	extent_write_locked_range(&inode->vfs_inode, start, end);
   1554	*page_started = 1;
   1555
   1556	return 0;
   1557}
   1558
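        /*
         * Check whether any checksums exist for the disk byte range
         * [bytenr, bytenr + num_bytes).  Returns 1 if at least one csum was
         * found, 0 if none exist, or a negative errno on lookup failure.
         */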
   1559static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
   1560					u64 bytenr, u64 num_bytes)
   1561{
   1562	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
   1563	struct btrfs_ordered_sum *sums;
   1564	int ret;
   1565	LIST_HEAD(list);
   1566
   1567	ret = btrfs_lookup_csums_range(csum_root, bytenr,
   1568				       bytenr + num_bytes - 1, &list, 0);
   1569	if (ret == 0 && list_empty(&list))
   1570		return 0;
   1571
   1572	while (!list_empty(&list)) {
   1573		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
   1574		list_del(&sums->list);
   1575		kfree(sums);
   1576	}
   1577	if (ret < 0)
   1578		return ret;
   1579	return 1;
   1580}
   1581
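        /*
         * The NOCOW path decided a range must be COW'ed after all (e.g. the
         * block group went read-only).  Fix up the data space accounting as
         * described below and then run the regular COW path on the range.
         */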
   1582static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
   1583			   const u64 start, const u64 end,
   1584			   int *page_started, unsigned long *nr_written)
   1585{
   1586	const bool is_space_ino = btrfs_is_free_space_inode(inode);
   1587	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
   1588	const u64 range_bytes = end + 1 - start;
   1589	struct extent_io_tree *io_tree = &inode->io_tree;
   1590	u64 range_start = start;
   1591	u64 count;
   1592
   1593	/*
   1594	 * If EXTENT_NORESERVE is set it means that when the buffered write was
    1595	 * made we did not have enough available data space and therefore we did
    1596	 * not reserve data space for it, since we thought we could do NOCOW for
    1597	 * the respective file range (either there is a prealloc extent or the
    1598	 * inode has the NOCOW bit set).
   1599	 *
   1600	 * However when we need to fallback to COW mode (because for example the
   1601	 * block group for the corresponding extent was turned to RO mode by a
   1602	 * scrub or relocation) we need to do the following:
   1603	 *
   1604	 * 1) We increment the bytes_may_use counter of the data space info.
   1605	 *    If COW succeeds, it allocates a new data extent and after doing
   1606	 *    that it decrements the space info's bytes_may_use counter and
   1607	 *    increments its bytes_reserved counter by the same amount (we do
   1608	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
   1609	 *    bytes_may_use counter to compensate (when space is reserved at
   1610	 *    buffered write time, the bytes_may_use counter is incremented);
   1611	 *
   1612	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
   1613	 *    that if the COW path fails for any reason, it decrements (through
   1614	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
   1615	 *    data space info, which we incremented in the step above.
   1616	 *
   1617	 * If we need to fallback to cow and the inode corresponds to a free
   1618	 * space cache inode or an inode of the data relocation tree, we must
   1619	 * also increment bytes_may_use of the data space_info for the same
   1620	 * reason. Space caches and relocated data extents always get a prealloc
   1621	 * extent for them, however scrub or balance may have set the block
   1622	 * group that contains that extent to RO mode and therefore force COW
   1623	 * when starting writeback.
   1624	 */
   1625	count = count_range_bits(io_tree, &range_start, end, range_bytes,
   1626				 EXTENT_NORESERVE, 0);
   1627	if (count > 0 || is_space_ino || is_reloc_ino) {
   1628		u64 bytes = count;
   1629		struct btrfs_fs_info *fs_info = inode->root->fs_info;
   1630		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
   1631
   1632		if (is_space_ino || is_reloc_ino)
   1633			bytes = range_bytes;
   1634
   1635		spin_lock(&sinfo->lock);
   1636		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
   1637		spin_unlock(&sinfo->lock);
   1638
   1639		if (count > 0)
   1640			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
   1641					 0, 0, NULL);
   1642	}
   1643
   1644	return cow_file_range(inode, locked_page, start, end, page_started,
   1645			      nr_written, 1);
   1646}
   1647
   1648struct can_nocow_file_extent_args {
   1649	/* Input fields. */
   1650
   1651	/* Start file offset of the range we want to NOCOW. */
   1652	u64 start;
   1653	/* End file offset (inclusive) of the range we want to NOCOW. */
   1654	u64 end;
   1655	bool writeback_path;
   1656	bool strict;
   1657	/*
   1658	 * Free the path passed to can_nocow_file_extent() once it's not needed
   1659	 * anymore.
   1660	 */
   1661	bool free_path;
   1662
   1663	/* Output fields. Only set when can_nocow_file_extent() returns 1. */
   1664
   1665	u64 disk_bytenr;
   1666	u64 disk_num_bytes;
   1667	u64 extent_offset;
   1668	/* Number of bytes that can be written to in NOCOW mode. */
   1669	u64 num_bytes;
   1670};
   1671
   1672/*
   1673 * Check if we can NOCOW the file extent that the path points to.
   1674 * This function may return with the path released, so the caller should check
   1675 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
   1676 *
   1677 * Returns: < 0 on error
   1678 *            0 if we can not NOCOW
   1679 *            1 if we can NOCOW
   1680 */
   1681static int can_nocow_file_extent(struct btrfs_path *path,
   1682				 struct btrfs_key *key,
   1683				 struct btrfs_inode *inode,
   1684				 struct can_nocow_file_extent_args *args)
   1685{
   1686	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
   1687	struct extent_buffer *leaf = path->nodes[0];
   1688	struct btrfs_root *root = inode->root;
   1689	struct btrfs_file_extent_item *fi;
   1690	u64 extent_end;
   1691	u8 extent_type;
   1692	int can_nocow = 0;
   1693	int ret = 0;
   1694
   1695	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
   1696	extent_type = btrfs_file_extent_type(leaf, fi);
   1697
   1698	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
   1699		goto out;
   1700
   1701	/* Can't access these fields unless we know it's not an inline extent. */
   1702	args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
   1703	args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
   1704	args->extent_offset = btrfs_file_extent_offset(leaf, fi);
   1705
   1706	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
   1707	    extent_type == BTRFS_FILE_EXTENT_REG)
   1708		goto out;
   1709
   1710	/*
   1711	 * If the extent was created before the generation where the last snapshot
   1712	 * for its subvolume was created, then this implies the extent is shared,
   1713	 * hence we must COW.
   1714	 */
   1715	if (!args->strict &&
   1716	    btrfs_file_extent_generation(leaf, fi) <=
   1717	    btrfs_root_last_snapshot(&root->root_item))
   1718		goto out;
   1719
   1720	/* An explicit hole, must COW. */
   1721	if (args->disk_bytenr == 0)
   1722		goto out;
   1723
   1724	/* Compressed/encrypted/encoded extents must be COWed. */
   1725	if (btrfs_file_extent_compression(leaf, fi) ||
   1726	    btrfs_file_extent_encryption(leaf, fi) ||
   1727	    btrfs_file_extent_other_encoding(leaf, fi))
   1728		goto out;
   1729
   1730	extent_end = btrfs_file_extent_end(path);
   1731
   1732	/*
   1733	 * The following checks can be expensive, as they need to take other
   1734	 * locks and do btree or rbtree searches, so release the path to avoid
   1735	 * blocking other tasks for too long.
   1736	 */
   1737	btrfs_release_path(path);
   1738
   1739	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
   1740				    key->offset - args->extent_offset,
   1741				    args->disk_bytenr, false, path);
   1742	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
   1743	if (ret != 0)
   1744		goto out;
   1745
   1746	if (args->free_path) {
   1747		/*
   1748		 * We don't need the path anymore, plus through the
   1749		 * csum_exist_in_range() call below we will end up allocating
   1750		 * another path. So free the path to avoid unnecessary extra
   1751		 * memory usage.
   1752		 */
   1753		btrfs_free_path(path);
   1754		path = NULL;
   1755	}
   1756
   1757	/* If there are pending snapshots for this root, we must COW. */
   1758	if (args->writeback_path && !is_freespace_inode &&
   1759	    atomic_read(&root->snapshot_force_cow))
   1760		goto out;
   1761
   1762	args->disk_bytenr += args->extent_offset;
   1763	args->disk_bytenr += args->start - key->offset;
   1764	args->num_bytes = min(args->end + 1, extent_end) - args->start;
   1765
   1766	/*
   1767	 * Force COW if csums exist in the range. This ensures that csums for a
   1768	 * given extent are either valid or do not exist.
   1769	 */
   1770	ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes);
   1771	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
   1772	if (ret != 0)
   1773		goto out;
   1774
   1775	can_nocow = 1;
   1776 out:
   1777	if (args->free_path && path)
   1778		btrfs_free_path(path);
   1779
   1780	return ret < 0 ? ret : can_nocow;
   1781}
   1782
   1783/*
    1784 * NOCOW writeback callback.  This checks for snapshots or COW copies
    1785 * of the extents that exist in the file, and COWs the file as required.
    1786 *
    1787 * If no COW copies or snapshots exist, we write directly to the existing
    1788 * blocks on disk.
   1789 */
   1790static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
   1791				       struct page *locked_page,
   1792				       const u64 start, const u64 end,
   1793				       int *page_started,
   1794				       unsigned long *nr_written)
   1795{
   1796	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   1797	struct btrfs_root *root = inode->root;
   1798	struct btrfs_path *path;
   1799	u64 cow_start = (u64)-1;
   1800	u64 cur_offset = start;
   1801	int ret;
   1802	bool check_prev = true;
   1803	u64 ino = btrfs_ino(inode);
   1804	struct btrfs_block_group *bg;
   1805	bool nocow = false;
   1806	struct can_nocow_file_extent_args nocow_args = { 0 };
   1807
   1808	path = btrfs_alloc_path();
   1809	if (!path) {
   1810		extent_clear_unlock_delalloc(inode, start, end, locked_page,
   1811					     EXTENT_LOCKED | EXTENT_DELALLOC |
   1812					     EXTENT_DO_ACCOUNTING |
   1813					     EXTENT_DEFRAG, PAGE_UNLOCK |
   1814					     PAGE_START_WRITEBACK |
   1815					     PAGE_END_WRITEBACK);
   1816		return -ENOMEM;
   1817	}
   1818
   1819	nocow_args.end = end;
   1820	nocow_args.writeback_path = true;
   1821
   1822	while (1) {
   1823		struct btrfs_key found_key;
   1824		struct btrfs_file_extent_item *fi;
   1825		struct extent_buffer *leaf;
   1826		u64 extent_end;
   1827		u64 ram_bytes;
   1828		u64 nocow_end;
   1829		int extent_type;
   1830
   1831		nocow = false;
   1832
   1833		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
   1834					       cur_offset, 0);
   1835		if (ret < 0)
   1836			goto error;
   1837
   1838		/*
   1839		 * If there is no extent for our range when doing the initial
   1840		 * search, then go back to the previous slot as it will be the
   1841		 * one containing the search offset
   1842		 */
   1843		if (ret > 0 && path->slots[0] > 0 && check_prev) {
   1844			leaf = path->nodes[0];
   1845			btrfs_item_key_to_cpu(leaf, &found_key,
   1846					      path->slots[0] - 1);
   1847			if (found_key.objectid == ino &&
   1848			    found_key.type == BTRFS_EXTENT_DATA_KEY)
   1849				path->slots[0]--;
   1850		}
   1851		check_prev = false;
   1852next_slot:
   1853		/* Go to next leaf if we have exhausted the current one */
   1854		leaf = path->nodes[0];
   1855		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
   1856			ret = btrfs_next_leaf(root, path);
   1857			if (ret < 0) {
   1858				if (cow_start != (u64)-1)
   1859					cur_offset = cow_start;
   1860				goto error;
   1861			}
   1862			if (ret > 0)
   1863				break;
   1864			leaf = path->nodes[0];
   1865		}
   1866
   1867		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   1868
   1869		/* Didn't find anything for our INO */
   1870		if (found_key.objectid > ino)
   1871			break;
   1872		/*
   1873		 * Keep searching until we find an EXTENT_ITEM or there are no
   1874		 * more extents for this inode
   1875		 */
   1876		if (WARN_ON_ONCE(found_key.objectid < ino) ||
   1877		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
   1878			path->slots[0]++;
   1879			goto next_slot;
   1880		}
   1881
   1882		/* Found key is not EXTENT_DATA_KEY or starts after req range */
   1883		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
   1884		    found_key.offset > end)
   1885			break;
   1886
   1887		/*
   1888		 * If the found extent starts after requested offset, then
   1889		 * adjust extent_end to be right before this extent begins
   1890		 */
   1891		if (found_key.offset > cur_offset) {
   1892			extent_end = found_key.offset;
   1893			extent_type = 0;
   1894			goto out_check;
   1895		}
   1896
   1897		/*
   1898		 * Found extent which begins before our range and potentially
   1899		 * intersect it
   1900		 */
   1901		fi = btrfs_item_ptr(leaf, path->slots[0],
   1902				    struct btrfs_file_extent_item);
   1903		extent_type = btrfs_file_extent_type(leaf, fi);
   1904		/* If this is triggered then we have a memory corruption. */
   1905		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
   1906		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
   1907			ret = -EUCLEAN;
   1908			goto error;
   1909		}
   1910		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
   1911		extent_end = btrfs_file_extent_end(path);
   1912
   1913		/*
   1914		 * If the extent we got ends before our current offset, skip to
   1915		 * the next extent.
   1916		 */
   1917		if (extent_end <= cur_offset) {
   1918			path->slots[0]++;
   1919			goto next_slot;
   1920		}
   1921
   1922		nocow_args.start = cur_offset;
   1923		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
   1924		if (ret < 0) {
   1925			if (cow_start != (u64)-1)
   1926				cur_offset = cow_start;
   1927			goto error;
   1928		} else if (ret == 0) {
   1929			goto out_check;
   1930		}
   1931
   1932		ret = 0;
   1933		bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
   1934		if (bg)
   1935			nocow = true;
   1936out_check:
   1937		/*
   1938		 * If nocow is false then record the beginning of the range
   1939		 * that needs to be COWed
   1940		 */
   1941		if (!nocow) {
   1942			if (cow_start == (u64)-1)
   1943				cow_start = cur_offset;
   1944			cur_offset = extent_end;
   1945			if (cur_offset > end)
   1946				break;
   1947			if (!path->nodes[0])
   1948				continue;
   1949			path->slots[0]++;
   1950			goto next_slot;
   1951		}
   1952
   1953		/*
    1954		 * COW the range from cow_start to found_key.offset - 1, as the key
    1955		 * contains the start of the first extent that can be NOCOW'ed and
    1956		 * everything before it needs to be COW'ed.
   1957		 */
   1958		if (cow_start != (u64)-1) {
   1959			ret = fallback_to_cow(inode, locked_page,
   1960					      cow_start, found_key.offset - 1,
   1961					      page_started, nr_written);
   1962			if (ret)
   1963				goto error;
   1964			cow_start = (u64)-1;
   1965		}
   1966
   1967		nocow_end = cur_offset + nocow_args.num_bytes - 1;
   1968
   1969		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
   1970			u64 orig_start = found_key.offset - nocow_args.extent_offset;
   1971			struct extent_map *em;
   1972
   1973			em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
   1974					  orig_start,
   1975					  nocow_args.disk_bytenr, /* block_start */
   1976					  nocow_args.num_bytes, /* block_len */
   1977					  nocow_args.disk_num_bytes, /* orig_block_len */
   1978					  ram_bytes, BTRFS_COMPRESS_NONE,
   1979					  BTRFS_ORDERED_PREALLOC);
   1980			if (IS_ERR(em)) {
   1981				ret = PTR_ERR(em);
   1982				goto error;
   1983			}
   1984			free_extent_map(em);
   1985			ret = btrfs_add_ordered_extent(inode,
   1986					cur_offset, nocow_args.num_bytes,
   1987					nocow_args.num_bytes,
   1988					nocow_args.disk_bytenr,
   1989					nocow_args.num_bytes, 0,
   1990					1 << BTRFS_ORDERED_PREALLOC,
   1991					BTRFS_COMPRESS_NONE);
   1992			if (ret) {
   1993				btrfs_drop_extent_cache(inode, cur_offset,
   1994							nocow_end, 0);
   1995				goto error;
   1996			}
   1997		} else {
   1998			ret = btrfs_add_ordered_extent(inode, cur_offset,
   1999						       nocow_args.num_bytes,
   2000						       nocow_args.num_bytes,
   2001						       nocow_args.disk_bytenr,
   2002						       nocow_args.num_bytes,
   2003						       0,
   2004						       1 << BTRFS_ORDERED_NOCOW,
   2005						       BTRFS_COMPRESS_NONE);
   2006			if (ret)
   2007				goto error;
   2008		}
   2009
   2010		if (nocow) {
   2011			btrfs_dec_nocow_writers(bg);
   2012			nocow = false;
   2013		}
   2014
   2015		if (btrfs_is_data_reloc_root(root))
   2016			/*
   2017			 * Error handled later, as we must prevent
   2018			 * extent_clear_unlock_delalloc() in error handler
   2019			 * from freeing metadata of created ordered extent.
   2020			 */
   2021			ret = btrfs_reloc_clone_csums(inode, cur_offset,
   2022						      nocow_args.num_bytes);
   2023
   2024		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
   2025					     locked_page, EXTENT_LOCKED |
   2026					     EXTENT_DELALLOC |
   2027					     EXTENT_CLEAR_DATA_RESV,
   2028					     PAGE_UNLOCK | PAGE_SET_ORDERED);
   2029
   2030		cur_offset = extent_end;
   2031
   2032		/*
   2033		 * btrfs_reloc_clone_csums() error, now we're OK to call error
   2034		 * handler, as metadata for created ordered extent will only
   2035		 * be freed by btrfs_finish_ordered_io().
   2036		 */
   2037		if (ret)
   2038			goto error;
   2039		if (cur_offset > end)
   2040			break;
   2041	}
   2042	btrfs_release_path(path);
   2043
   2044	if (cur_offset <= end && cow_start == (u64)-1)
   2045		cow_start = cur_offset;
   2046
   2047	if (cow_start != (u64)-1) {
   2048		cur_offset = end;
   2049		ret = fallback_to_cow(inode, locked_page, cow_start, end,
   2050				      page_started, nr_written);
   2051		if (ret)
   2052			goto error;
   2053	}
   2054
   2055error:
   2056	if (nocow)
   2057		btrfs_dec_nocow_writers(bg);
   2058
   2059	if (ret && cur_offset < end)
   2060		extent_clear_unlock_delalloc(inode, cur_offset, end,
   2061					     locked_page, EXTENT_LOCKED |
   2062					     EXTENT_DELALLOC | EXTENT_DEFRAG |
   2063					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
   2064					     PAGE_START_WRITEBACK |
   2065					     PAGE_END_WRITEBACK);
   2066	btrfs_free_path(path);
   2067	return ret;
   2068}
   2069
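        /*
         * Decide if the NOCOW path should be attempted for this range: the
         * inode must have the NODATACOW or PREALLOC flag set and the range
         * must not be marked for defrag.
         */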
   2070static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
   2071{
   2072	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
   2073		if (inode->defrag_bytes &&
   2074		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
   2075				   0, NULL))
   2076			return false;
   2077		return true;
   2078	}
   2079	return false;
   2080}
   2081
   2082/*
   2083 * Function to process delayed allocation (create CoW) for ranges which are
   2084 * being touched for the first time.
   2085 */
   2086int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
   2087		u64 start, u64 end, int *page_started, unsigned long *nr_written,
   2088		struct writeback_control *wbc)
   2089{
   2090	int ret;
   2091	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
   2092
   2093	/*
   2094	 * The range must cover part of the @locked_page, or the returned
   2095	 * @page_started can confuse the caller.
   2096	 */
   2097	ASSERT(!(end <= page_offset(locked_page) ||
   2098		 start >= page_offset(locked_page) + PAGE_SIZE));
   2099
   2100	if (should_nocow(inode, start, end)) {
   2101		/*
   2102		 * Normally on a zoned device we're only doing COW writes, but
    2103		 * in case of relocation on a zoned filesystem we have taken the
    2104		 * precaution that we're only writing sequentially. It's safe
    2105		 * to use run_delalloc_nocow() here, as for regular
   2106		 * preallocated inodes.
   2107		 */
   2108		ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
   2109		ret = run_delalloc_nocow(inode, locked_page, start, end,
   2110					 page_started, nr_written);
   2111	} else if (!btrfs_inode_can_compress(inode) ||
   2112		   !inode_need_compress(inode, start, end)) {
   2113		if (zoned)
   2114			ret = run_delalloc_zoned(inode, locked_page, start, end,
   2115						 page_started, nr_written);
   2116		else
   2117			ret = cow_file_range(inode, locked_page, start, end,
   2118					     page_started, nr_written, 1);
   2119	} else {
   2120		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
   2121		ret = cow_file_range_async(inode, wbc, locked_page, start, end,
   2122					   page_started, nr_written);
   2123	}
   2124	ASSERT(ret <= 0);
   2125	if (ret)
   2126		btrfs_cleanup_ordered_extents(inode, locked_page, start,
   2127					      end - start + 1);
   2128	return ret;
   2129}
   2130
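        /*
         * A delalloc extent state is being split at @split.  Splitting may
         * require one more outstanding extent reservation, unless the extent
         * was already large enough that the per-MAX_EXTENT_SIZE accounting
         * covers both halves.
         */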
   2131void btrfs_split_delalloc_extent(struct inode *inode,
   2132				 struct extent_state *orig, u64 split)
   2133{
   2134	u64 size;
   2135
   2136	/* not delalloc, ignore it */
   2137	if (!(orig->state & EXTENT_DELALLOC))
   2138		return;
   2139
   2140	size = orig->end - orig->start + 1;
   2141	if (size > BTRFS_MAX_EXTENT_SIZE) {
   2142		u32 num_extents;
   2143		u64 new_size;
   2144
   2145		/*
   2146		 * See the explanation in btrfs_merge_delalloc_extent, the same
   2147		 * applies here, just in reverse.
   2148		 */
   2149		new_size = orig->end - split + 1;
   2150		num_extents = count_max_extents(new_size);
   2151		new_size = split - orig->start;
   2152		num_extents += count_max_extents(new_size);
   2153		if (count_max_extents(size) >= num_extents)
   2154			return;
   2155	}
   2156
   2157	spin_lock(&BTRFS_I(inode)->lock);
   2158	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
   2159	spin_unlock(&BTRFS_I(inode)->lock);
   2160}
   2161
   2162/*
   2163 * Handle merged delayed allocation extents so we can keep track of new extents
   2164 * that are just merged onto old extents, such as when we are doing sequential
   2165 * writes, so we can properly account for the metadata space we'll need.
   2166 */
   2167void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
   2168				 struct extent_state *other)
   2169{
   2170	u64 new_size, old_size;
   2171	u32 num_extents;
   2172
   2173	/* not delalloc, ignore it */
   2174	if (!(other->state & EXTENT_DELALLOC))
   2175		return;
   2176
   2177	if (new->start > other->start)
   2178		new_size = new->end - other->start + 1;
   2179	else
   2180		new_size = other->end - new->start + 1;
   2181
   2182	/* we're not bigger than the max, unreserve the space and go */
   2183	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
   2184		spin_lock(&BTRFS_I(inode)->lock);
   2185		btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
   2186		spin_unlock(&BTRFS_I(inode)->lock);
   2187		return;
   2188	}
   2189
   2190	/*
   2191	 * We have to add up either side to figure out how many extents were
   2192	 * accounted for before we merged into one big extent.  If the number of
   2193	 * extents we accounted for is <= the amount we need for the new range
   2194	 * then we can return, otherwise drop.  Think of it like this
   2195	 *
   2196	 * [ 4k][MAX_SIZE]
   2197	 *
   2198	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
   2199	 * need 2 outstanding extents, on one side we have 1 and the other side
   2200	 * we have 1 so they are == and we can return.  But in this case
   2201	 *
   2202	 * [MAX_SIZE+4k][MAX_SIZE+4k]
   2203	 *
   2204	 * Each range on their own accounts for 2 extents, but merged together
   2205	 * they are only 3 extents worth of accounting, so we need to drop in
   2206	 * this case.
   2207	 */
   2208	old_size = other->end - other->start + 1;
   2209	num_extents = count_max_extents(old_size);
   2210	old_size = new->end - new->start + 1;
   2211	num_extents += count_max_extents(old_size);
   2212	if (count_max_extents(new_size) >= num_extents)
   2213		return;
   2214
   2215	spin_lock(&BTRFS_I(inode)->lock);
   2216	btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
   2217	spin_unlock(&BTRFS_I(inode)->lock);
   2218}
   2219
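        /*
         * Add the inode to the root's list of inodes with pending delalloc
         * and, if it's the root's first such inode, add the root to the
         * fs-wide list of roots with delalloc inodes.
         */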
   2220static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
   2221				      struct inode *inode)
   2222{
   2223	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   2224
   2225	spin_lock(&root->delalloc_lock);
   2226	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
   2227		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
   2228			      &root->delalloc_inodes);
   2229		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
   2230			&BTRFS_I(inode)->runtime_flags);
   2231		root->nr_delalloc_inodes++;
   2232		if (root->nr_delalloc_inodes == 1) {
   2233			spin_lock(&fs_info->delalloc_root_lock);
   2234			BUG_ON(!list_empty(&root->delalloc_root));
   2235			list_add_tail(&root->delalloc_root,
   2236				      &fs_info->delalloc_roots);
   2237			spin_unlock(&fs_info->delalloc_root_lock);
   2238		}
   2239	}
   2240	spin_unlock(&root->delalloc_lock);
   2241}
   2242
   2243
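        /*
         * Remove the inode from the root's delalloc list.  If it was the last
         * one, also drop the root from the fs-wide delalloc roots list.
         * The caller must hold root->delalloc_lock.
         */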
   2244void __btrfs_del_delalloc_inode(struct btrfs_root *root,
   2245				struct btrfs_inode *inode)
   2246{
   2247	struct btrfs_fs_info *fs_info = root->fs_info;
   2248
   2249	if (!list_empty(&inode->delalloc_inodes)) {
   2250		list_del_init(&inode->delalloc_inodes);
   2251		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
   2252			  &inode->runtime_flags);
   2253		root->nr_delalloc_inodes--;
   2254		if (!root->nr_delalloc_inodes) {
   2255			ASSERT(list_empty(&root->delalloc_inodes));
   2256			spin_lock(&fs_info->delalloc_root_lock);
   2257			BUG_ON(list_empty(&root->delalloc_root));
   2258			list_del_init(&root->delalloc_root);
   2259			spin_unlock(&fs_info->delalloc_root_lock);
   2260		}
   2261	}
   2262}
   2263
   2264static void btrfs_del_delalloc_inode(struct btrfs_root *root,
   2265				     struct btrfs_inode *inode)
   2266{
   2267	spin_lock(&root->delalloc_lock);
   2268	__btrfs_del_delalloc_inode(root, inode);
   2269	spin_unlock(&root->delalloc_lock);
   2270}
   2271
   2272/*
    2273 * Properly track delayed allocation bytes in the inode and maintain the
   2274 * list of inodes that have pending delalloc work to be done.
   2275 */
   2276void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
   2277			       unsigned *bits)
   2278{
   2279	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   2280
   2281	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
   2282		WARN_ON(1);
   2283	/*
    2284	 * The set_bit and clear_bit hooks normally require _irqsave/restore
    2285	 * but in this case, we are only testing for the DELALLOC
    2286	 * bit, which is only set or cleared with irqs on.
   2287	 */
   2288	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
   2289		struct btrfs_root *root = BTRFS_I(inode)->root;
   2290		u64 len = state->end + 1 - state->start;
   2291		u32 num_extents = count_max_extents(len);
   2292		bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
   2293
   2294		spin_lock(&BTRFS_I(inode)->lock);
   2295		btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
   2296		spin_unlock(&BTRFS_I(inode)->lock);
   2297
   2298		/* For sanity tests */
   2299		if (btrfs_is_testing(fs_info))
   2300			return;
   2301
   2302		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
   2303					 fs_info->delalloc_batch);
   2304		spin_lock(&BTRFS_I(inode)->lock);
   2305		BTRFS_I(inode)->delalloc_bytes += len;
   2306		if (*bits & EXTENT_DEFRAG)
   2307			BTRFS_I(inode)->defrag_bytes += len;
   2308		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
   2309					 &BTRFS_I(inode)->runtime_flags))
   2310			btrfs_add_delalloc_inodes(root, inode);
   2311		spin_unlock(&BTRFS_I(inode)->lock);
   2312	}
   2313
   2314	if (!(state->state & EXTENT_DELALLOC_NEW) &&
   2315	    (*bits & EXTENT_DELALLOC_NEW)) {
   2316		spin_lock(&BTRFS_I(inode)->lock);
   2317		BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
   2318			state->start;
   2319		spin_unlock(&BTRFS_I(inode)->lock);
   2320	}
   2321}
   2322
   2323/*
   2324 * Once a range is no longer delalloc this function ensures that proper
   2325 * accounting happens.
   2326 */
   2327void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
   2328				 struct extent_state *state, unsigned *bits)
   2329{
   2330	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
   2331	struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
   2332	u64 len = state->end + 1 - state->start;
   2333	u32 num_extents = count_max_extents(len);
   2334
   2335	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
   2336		spin_lock(&inode->lock);
   2337		inode->defrag_bytes -= len;
   2338		spin_unlock(&inode->lock);
   2339	}
   2340
   2341	/*
    2342	 * The set_bit and clear_bit hooks normally require _irqsave/restore
    2343	 * but in this case, we are only testing for the DELALLOC
    2344	 * bit, which is only set or cleared with irqs on.
   2345	 */
   2346	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
   2347		struct btrfs_root *root = inode->root;
   2348		bool do_list = !btrfs_is_free_space_inode(inode);
   2349
   2350		spin_lock(&inode->lock);
   2351		btrfs_mod_outstanding_extents(inode, -num_extents);
   2352		spin_unlock(&inode->lock);
   2353
   2354		/*
   2355		 * We don't reserve metadata space for space cache inodes so we
   2356		 * don't need to call delalloc_release_metadata if there is an
   2357		 * error.
   2358		 */
   2359		if (*bits & EXTENT_CLEAR_META_RESV &&
   2360		    root != fs_info->tree_root)
   2361			btrfs_delalloc_release_metadata(inode, len, false);
   2362
   2363		/* For sanity tests. */
   2364		if (btrfs_is_testing(fs_info))
   2365			return;
   2366
   2367		if (!btrfs_is_data_reloc_root(root) &&
   2368		    do_list && !(state->state & EXTENT_NORESERVE) &&
   2369		    (*bits & EXTENT_CLEAR_DATA_RESV))
   2370			btrfs_free_reserved_data_space_noquota(fs_info, len);
   2371
   2372		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
   2373					 fs_info->delalloc_batch);
   2374		spin_lock(&inode->lock);
   2375		inode->delalloc_bytes -= len;
   2376		if (do_list && inode->delalloc_bytes == 0 &&
   2377		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
   2378					&inode->runtime_flags))
   2379			btrfs_del_delalloc_inode(root, inode);
   2380		spin_unlock(&inode->lock);
   2381	}
   2382
   2383	if ((state->state & EXTENT_DELALLOC_NEW) &&
   2384	    (*bits & EXTENT_DELALLOC_NEW)) {
   2385		spin_lock(&inode->lock);
   2386		ASSERT(inode->new_delalloc_bytes >= len);
   2387		inode->new_delalloc_bytes -= len;
   2388		if (*bits & EXTENT_ADD_INODE_BYTES)
   2389			inode_add_bytes(&inode->vfs_inode, len);
   2390		spin_unlock(&inode->lock);
   2391	}
   2392}
   2393
   2394/*
    2395 * In order to insert checksums into the metadata in large chunks,
    2396 * we wait until bio submission time.  All the pages in the bio are
    2397 * checksummed and sums are attached onto the ordered extent record.
    2398 *
    2399 * At IO completion time the csums attached to the ordered extent record
    2400 * are inserted into the btree.
   2401 */
   2402static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
   2403					   u64 dio_file_offset)
   2404{
   2405	return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
   2406}
   2407
   2408/*
   2409 * Split an extent_map at [start, start + len]
   2410 *
   2411 * This function is intended to be used only for extract_ordered_extent().
   2412 */
   2413static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
   2414			  u64 pre, u64 post)
   2415{
   2416	struct extent_map_tree *em_tree = &inode->extent_tree;
   2417	struct extent_map *em;
   2418	struct extent_map *split_pre = NULL;
   2419	struct extent_map *split_mid = NULL;
   2420	struct extent_map *split_post = NULL;
   2421	int ret = 0;
   2422	unsigned long flags;
   2423
   2424	/* Sanity check */
   2425	if (pre == 0 && post == 0)
   2426		return 0;
   2427
   2428	split_pre = alloc_extent_map();
   2429	if (pre)
   2430		split_mid = alloc_extent_map();
   2431	if (post)
   2432		split_post = alloc_extent_map();
   2433	if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
   2434		ret = -ENOMEM;
   2435		goto out;
   2436	}
   2437
   2438	ASSERT(pre + post < len);
   2439
   2440	lock_extent(&inode->io_tree, start, start + len - 1);
   2441	write_lock(&em_tree->lock);
   2442	em = lookup_extent_mapping(em_tree, start, len);
   2443	if (!em) {
   2444		ret = -EIO;
   2445		goto out_unlock;
   2446	}
   2447
   2448	ASSERT(em->len == len);
   2449	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
   2450	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
   2451	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
   2452	ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
   2453	ASSERT(!list_empty(&em->list));
   2454
   2455	flags = em->flags;
   2456	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
   2457
    2458	/* First, replace the em with a new extent_map starting from em->start */
   2459	split_pre->start = em->start;
   2460	split_pre->len = (pre ? pre : em->len - post);
   2461	split_pre->orig_start = split_pre->start;
   2462	split_pre->block_start = em->block_start;
   2463	split_pre->block_len = split_pre->len;
   2464	split_pre->orig_block_len = split_pre->block_len;
   2465	split_pre->ram_bytes = split_pre->len;
   2466	split_pre->flags = flags;
   2467	split_pre->compress_type = em->compress_type;
   2468	split_pre->generation = em->generation;
   2469
   2470	replace_extent_mapping(em_tree, em, split_pre, 1);
   2471
   2472	/*
   2473	 * Now we only have an extent_map at:
   2474	 *     [em->start, em->start + pre] if pre != 0
   2475	 *     [em->start, em->start + em->len - post] if pre == 0
   2476	 */
   2477
   2478	if (pre) {
   2479		/* Insert the middle extent_map */
   2480		split_mid->start = em->start + pre;
   2481		split_mid->len = em->len - pre - post;
   2482		split_mid->orig_start = split_mid->start;
   2483		split_mid->block_start = em->block_start + pre;
   2484		split_mid->block_len = split_mid->len;
   2485		split_mid->orig_block_len = split_mid->block_len;
   2486		split_mid->ram_bytes = split_mid->len;
   2487		split_mid->flags = flags;
   2488		split_mid->compress_type = em->compress_type;
   2489		split_mid->generation = em->generation;
   2490		add_extent_mapping(em_tree, split_mid, 1);
   2491	}
   2492
   2493	if (post) {
   2494		split_post->start = em->start + em->len - post;
   2495		split_post->len = post;
   2496		split_post->orig_start = split_post->start;
   2497		split_post->block_start = em->block_start + em->len - post;
   2498		split_post->block_len = split_post->len;
   2499		split_post->orig_block_len = split_post->block_len;
   2500		split_post->ram_bytes = split_post->len;
   2501		split_post->flags = flags;
   2502		split_post->compress_type = em->compress_type;
   2503		split_post->generation = em->generation;
   2504		add_extent_mapping(em_tree, split_post, 1);
   2505	}
   2506
   2507	/* Once for us */
   2508	free_extent_map(em);
   2509	/* Once for the tree */
   2510	free_extent_map(em);
   2511
   2512out_unlock:
   2513	write_unlock(&em_tree->lock);
   2514	unlock_extent(&inode->io_tree, start, start + len - 1);
   2515out:
   2516	free_extent_map(split_pre);
   2517	free_extent_map(split_mid);
   2518	free_extent_map(split_post);
   2519
   2520	return ret;
   2521}
   2522
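        /*
         * With zone append a bio may end up covering only part of an ordered
         * extent.  Split the ordered extent (and the matching pinned extent
         * map) so that this bio maps to exactly one ordered extent.
         */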
   2523static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
   2524					   struct bio *bio, loff_t file_offset)
   2525{
   2526	struct btrfs_ordered_extent *ordered;
   2527	u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
   2528	u64 file_len;
   2529	u64 len = bio->bi_iter.bi_size;
   2530	u64 end = start + len;
   2531	u64 ordered_end;
   2532	u64 pre, post;
   2533	int ret = 0;
   2534
   2535	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
   2536	if (WARN_ON_ONCE(!ordered))
   2537		return BLK_STS_IOERR;
   2538
   2539	/* No need to split */
   2540	if (ordered->disk_num_bytes == len)
   2541		goto out;
   2542
    2543	/* We cannot split an ordered extent once end_bio has run on it */
   2544	if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
   2545		ret = -EINVAL;
   2546		goto out;
   2547	}
   2548
   2549	/* We cannot split a compressed ordered extent */
   2550	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
   2551		ret = -EINVAL;
   2552		goto out;
   2553	}
   2554
   2555	ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
   2556	/* bio must be in one ordered extent */
   2557	if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
   2558		ret = -EINVAL;
   2559		goto out;
   2560	}
   2561
   2562	/* Checksum list should be empty */
   2563	if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
   2564		ret = -EINVAL;
   2565		goto out;
   2566	}
   2567
   2568	file_len = ordered->num_bytes;
   2569	pre = start - ordered->disk_bytenr;
   2570	post = ordered_end - end;
   2571
   2572	ret = btrfs_split_ordered_extent(ordered, pre, post);
   2573	if (ret)
   2574		goto out;
   2575	ret = split_zoned_em(inode, file_offset, file_len, pre, post);
   2576
   2577out:
   2578	btrfs_put_ordered_extent(ordered);
   2579
   2580	return errno_to_blk_status(ret);
   2581}
   2582
   2583/*
   2584 * extent_io.c submission hook. This does the right thing for csum calculation
   2585 * on write, or reading the csums from the tree before a read.
   2586 *
   2587 * Rules about async/sync submit,
   2588 * a) read:				sync submit
   2589 *
   2590 * b) write without checksum:		sync submit
   2591 *
   2592 * c) write with checksum:
   2593 *    c-1) if bio is issued by fsync:	sync submit
   2594 *         (sync_writers != 0)
   2595 *
   2596 *    c-2) if root is reloc root:	sync submit
   2597 *         (only in case of buffered IO)
   2598 *
   2599 *    c-3) otherwise:			async submit
   2600 */
   2601void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
   2602			   int mirror_num, enum btrfs_compression_type compress_type)
   2603{
   2604	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   2605	struct btrfs_root *root = BTRFS_I(inode)->root;
   2606	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
   2607	blk_status_t ret = 0;
   2608	int skip_sum;
   2609	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
   2610
   2611	skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
   2612		test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
   2613
   2614	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
   2615		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
   2616
   2617	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
   2618		struct page *page = bio_first_bvec_all(bio)->bv_page;
   2619		loff_t file_offset = page_offset(page);
   2620
   2621		ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
   2622		if (ret)
   2623			goto out;
   2624	}
   2625
   2626	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
   2627		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
   2628		if (ret)
   2629			goto out;
   2630
   2631		if (compress_type != BTRFS_COMPRESS_NONE) {
   2632			/*
   2633			 * btrfs_submit_compressed_read will handle completing
   2634			 * the bio if there were any errors, so just return
   2635			 * here.
   2636			 */
   2637			btrfs_submit_compressed_read(inode, bio, mirror_num);
   2638			return;
   2639		} else {
   2640			/*
   2641			 * Lookup bio sums does extra checks around whether we
   2642			 * need to csum or not, which is why we ignore skip_sum
   2643			 * here.
   2644			 */
   2645			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
   2646			if (ret)
   2647				goto out;
   2648		}
   2649		goto mapit;
   2650	} else if (async && !skip_sum) {
   2651		/* csum items have already been cloned */
   2652		if (btrfs_is_data_reloc_root(root))
   2653			goto mapit;
   2654		/* we're doing a write, do the async checksumming */
   2655		ret = btrfs_wq_submit_bio(inode, bio, mirror_num,
   2656					  0, btrfs_submit_bio_start);
   2657		goto out;
   2658	} else if (!skip_sum) {
   2659		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
   2660		if (ret)
   2661			goto out;
   2662	}
   2663
   2664mapit:
   2665	ret = btrfs_map_bio(fs_info, bio, mirror_num);
   2666
   2667out:
   2668	if (ret) {
   2669		bio->bi_status = ret;
   2670		bio_endio(bio);
   2671	}
   2672}
   2673
   2674/*
    2675 * Given a list of ordered sums, record them in the inode.  This happens
   2676 * at IO completion time based on sums calculated at bio submission time.
   2677 */
   2678static int add_pending_csums(struct btrfs_trans_handle *trans,
   2679			     struct list_head *list)
   2680{
   2681	struct btrfs_ordered_sum *sum;
   2682	struct btrfs_root *csum_root = NULL;
   2683	int ret;
   2684
   2685	list_for_each_entry(sum, list, list) {
   2686		trans->adding_csums = true;
   2687		if (!csum_root)
   2688			csum_root = btrfs_csum_root(trans->fs_info,
   2689						    sum->bytenr);
   2690		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
   2691		trans->adding_csums = false;
   2692		if (ret)
   2693			return ret;
   2694	}
   2695	return 0;
   2696}
   2697
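        /*
         * Walk the given file range and mark the parts that are currently
         * holes in the extent map with the EXTENT_DELALLOC_NEW bit, so that
         * only truly new bytes get accounted when the delalloc completes.
         */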
   2698static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
   2699					 const u64 start,
   2700					 const u64 len,
   2701					 struct extent_state **cached_state)
   2702{
   2703	u64 search_start = start;
   2704	const u64 end = start + len - 1;
   2705
   2706	while (search_start < end) {
   2707		const u64 search_len = end - search_start + 1;
   2708		struct extent_map *em;
   2709		u64 em_len;
   2710		int ret = 0;
   2711
   2712		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
   2713		if (IS_ERR(em))
   2714			return PTR_ERR(em);
   2715
   2716		if (em->block_start != EXTENT_MAP_HOLE)
   2717			goto next;
   2718
   2719		em_len = em->len;
   2720		if (em->start < search_start)
   2721			em_len -= search_start - em->start;
   2722		if (em_len > search_len)
   2723			em_len = search_len;
   2724
   2725		ret = set_extent_bit(&inode->io_tree, search_start,
   2726				     search_start + em_len - 1,
   2727				     EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
   2728				     GFP_NOFS, NULL);
   2729next:
   2730		search_start = extent_map_end(em);
   2731		free_extent_map(em);
   2732		if (ret)
   2733			return ret;
   2734	}
   2735	return 0;
   2736}
   2737
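        /*
         * Mark a file range as delalloc in the inode's io tree.  For ranges
         * starting at or beyond EOF (and no prealloc) the EXTENT_DELALLOC_NEW
         * bit is set directly; otherwise only the holes within the range get
         * it, via btrfs_find_new_delalloc_bytes().
         */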
   2738int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
   2739			      unsigned int extra_bits,
   2740			      struct extent_state **cached_state)
   2741{
   2742	WARN_ON(PAGE_ALIGNED(end));
   2743
   2744	if (start >= i_size_read(&inode->vfs_inode) &&
   2745	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
   2746		/*
   2747		 * There can't be any extents following eof in this case so just
   2748		 * set the delalloc new bit for the range directly.
   2749		 */
   2750		extra_bits |= EXTENT_DELALLOC_NEW;
   2751	} else {
   2752		int ret;
   2753
   2754		ret = btrfs_find_new_delalloc_bytes(inode, start,
   2755						    end + 1 - start,
   2756						    cached_state);
   2757		if (ret)
   2758			return ret;
   2759	}
   2760
   2761	return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
   2762				   cached_state);
   2763}
   2764
    2765/* see btrfs_writepage_cow_fixup() for details on why this is required */
   2766struct btrfs_writepage_fixup {
   2767	struct page *page;
   2768	struct inode *inode;
   2769	struct btrfs_work work;
   2770};
   2771
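        /*
         * Worker queued by btrfs_writepage_cow_fixup().  Reserve delalloc
         * space for the page, wait for any ordered extent covering it to
         * complete, and then mark the range delalloc so writeback handles it
         * through the normal COW path.
         */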
   2772static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
   2773{
   2774	struct btrfs_writepage_fixup *fixup;
   2775	struct btrfs_ordered_extent *ordered;
   2776	struct extent_state *cached_state = NULL;
   2777	struct extent_changeset *data_reserved = NULL;
   2778	struct page *page;
   2779	struct btrfs_inode *inode;
   2780	u64 page_start;
   2781	u64 page_end;
   2782	int ret = 0;
   2783	bool free_delalloc_space = true;
   2784
   2785	fixup = container_of(work, struct btrfs_writepage_fixup, work);
   2786	page = fixup->page;
   2787	inode = BTRFS_I(fixup->inode);
   2788	page_start = page_offset(page);
   2789	page_end = page_offset(page) + PAGE_SIZE - 1;
   2790
   2791	/*
   2792	 * This is similar to page_mkwrite, we need to reserve the space before
   2793	 * we take the page lock.
   2794	 */
   2795	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
   2796					   PAGE_SIZE);
   2797again:
   2798	lock_page(page);
   2799
   2800	/*
   2801	 * Before we queued this fixup, we took a reference on the page.
   2802	 * page->mapping may go NULL, but it shouldn't be moved to a different
   2803	 * address space.
   2804	 */
   2805	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
   2806		/*
   2807		 * Unfortunately this is a little tricky, either
   2808		 *
   2809		 * 1) We got here and our page had already been dealt with and
   2810		 *    we reserved our space, thus ret == 0, so we need to just
   2811		 *    drop our space reservation and bail.  This can happen the
   2812		 *    first time we come into the fixup worker, or could happen
   2813		 *    while waiting for the ordered extent.
   2814		 * 2) Our page was already dealt with, but we happened to get an
   2815		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
   2816		 *    this case we obviously don't have anything to release, but
   2817		 *    because the page was already dealt with we don't want to
   2818		 *    mark the page with an error, so make sure we're resetting
   2819		 *    ret to 0.  This is why we have this check _before_ the ret
   2820		 *    check, because we do not want to have a surprise ENOSPC
   2821		 *    when the page was already properly dealt with.
   2822		 */
   2823		if (!ret) {
   2824			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
   2825			btrfs_delalloc_release_space(inode, data_reserved,
   2826						     page_start, PAGE_SIZE,
   2827						     true);
   2828		}
   2829		ret = 0;
   2830		goto out_page;
   2831	}
   2832
   2833	/*
   2834	 * We can't mess with the page state unless it is locked, so now that
   2835	 * it is locked bail if we failed to make our space reservation.
   2836	 */
   2837	if (ret)
   2838		goto out_page;
   2839
   2840	lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
   2841
   2842	/* already ordered? We're done */
   2843	if (PageOrdered(page))
   2844		goto out_reserved;
   2845
   2846	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
   2847	if (ordered) {
   2848		unlock_extent_cached(&inode->io_tree, page_start, page_end,
   2849				     &cached_state);
   2850		unlock_page(page);
   2851		btrfs_start_ordered_extent(ordered, 1);
   2852		btrfs_put_ordered_extent(ordered);
   2853		goto again;
   2854	}
   2855
   2856	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
   2857					&cached_state);
   2858	if (ret)
   2859		goto out_reserved;
   2860
   2861	/*
   2862	 * Everything went as planned, we're now the owner of a dirty page with
   2863	 * delayed allocation bits set and space reserved for our COW
   2864	 * destination.
   2865	 *
   2866	 * The page was dirty when we started, nothing should have cleaned it.
   2867	 */
   2868	BUG_ON(!PageDirty(page));
   2869	free_delalloc_space = false;
   2870out_reserved:
   2871	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
   2872	if (free_delalloc_space)
   2873		btrfs_delalloc_release_space(inode, data_reserved, page_start,
   2874					     PAGE_SIZE, true);
   2875	unlock_extent_cached(&inode->io_tree, page_start, page_end,
   2876			     &cached_state);
   2877out_page:
   2878	if (ret) {
   2879		/*
   2880		 * We hit ENOSPC or other errors.  Update the mapping and page
   2881		 * to reflect the errors and clean the page.
   2882		 */
   2883		mapping_set_error(page->mapping, ret);
   2884		end_extent_writepage(page, ret, page_start, page_end);
   2885		clear_page_dirty_for_io(page);
   2886		SetPageError(page);
   2887	}
   2888	btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
   2889	unlock_page(page);
   2890	put_page(page);
   2891	kfree(fixup);
   2892	extent_changeset_free(data_reserved);
   2893	/*
   2894	 * As a precaution, do a delayed iput in case it would be the last iput
   2895	 * that could need flushing space. Recursing back to fixup worker would
   2896	 * deadlock.
   2897	 */
   2898	btrfs_add_delayed_iput(&inode->vfs_inode);
   2899}
   2900
   2901/*
   2902 * There are a few paths in the higher layers of the kernel that directly
   2903 * set the page dirty bit without asking the filesystem if it is a
   2904 * good idea.  This causes problems because we want to make sure COW
   2905 * properly happens and the data=ordered rules are followed.
   2906 *
   2907 * In our case any range that doesn't have the ORDERED bit set
   2908 * hasn't been properly setup for IO.  We kick off an async process
   2909 * to fix it up.  The async helper will wait for ordered extents, set
   2910 * the delalloc bit and make it safe to write the page.
   2911 */
   2912int btrfs_writepage_cow_fixup(struct page *page)
   2913{
   2914	struct inode *inode = page->mapping->host;
   2915	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   2916	struct btrfs_writepage_fixup *fixup;
   2917
   2918	/* This page has ordered extent covering it already */
   2919	if (PageOrdered(page))
   2920		return 0;
   2921
   2922	/*
    2923	 * PageChecked is set below when we create a fixup worker for this page;
    2924	 * don't try to create another one if we're already PageChecked().
   2925	 *
   2926	 * The extent_io writepage code will redirty the page if we send back
   2927	 * EAGAIN.
   2928	 */
   2929	if (PageChecked(page))
   2930		return -EAGAIN;
   2931
   2932	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
   2933	if (!fixup)
   2934		return -EAGAIN;
   2935
   2936	/*
   2937	 * We are already holding a reference to this inode from
   2938	 * write_cache_pages.  We need to hold it because the space reservation
   2939	 * takes place outside of the page lock, and we can't trust
   2940	 * page->mapping outside of the page lock.
   2941	 */
   2942	ihold(inode);
   2943	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
   2944	get_page(page);
   2945	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
   2946	fixup->page = page;
   2947	fixup->inode = inode;
   2948	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
   2949
   2950	return -EAGAIN;
   2951}
   2952
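        /*
         * Insert the file extent item for a completed write: drop any extents
         * it replaces in the range, insert the new item, update the inode's
         * byte counters and record the allocation of the reserved data extent.
         */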
   2953static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
   2954				       struct btrfs_inode *inode, u64 file_pos,
   2955				       struct btrfs_file_extent_item *stack_fi,
   2956				       const bool update_inode_bytes,
   2957				       u64 qgroup_reserved)
   2958{
   2959	struct btrfs_root *root = inode->root;
   2960	const u64 sectorsize = root->fs_info->sectorsize;
   2961	struct btrfs_path *path;
   2962	struct extent_buffer *leaf;
   2963	struct btrfs_key ins;
   2964	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
   2965	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
   2966	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
   2967	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
   2968	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
   2969	struct btrfs_drop_extents_args drop_args = { 0 };
   2970	int ret;
   2971
   2972	path = btrfs_alloc_path();
   2973	if (!path)
   2974		return -ENOMEM;
   2975
   2976	/*
   2977	 * we may be replacing one extent in the tree with another.
   2978	 * The new extent is pinned in the extent map, and we don't want
   2979	 * to drop it from the cache until it is completely in the btree.
   2980	 *
   2981	 * So, tell btrfs_drop_extents to leave this extent in the cache.
    2982	 * The caller is expected to unpin it and allow it to be merged
   2983	 * with the others.
   2984	 */
   2985	drop_args.path = path;
   2986	drop_args.start = file_pos;
   2987	drop_args.end = file_pos + num_bytes;
   2988	drop_args.replace_extent = true;
   2989	drop_args.extent_item_size = sizeof(*stack_fi);
   2990	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
   2991	if (ret)
   2992		goto out;
   2993
   2994	if (!drop_args.extent_inserted) {
   2995		ins.objectid = btrfs_ino(inode);
   2996		ins.offset = file_pos;
   2997		ins.type = BTRFS_EXTENT_DATA_KEY;
   2998
   2999		ret = btrfs_insert_empty_item(trans, root, path, &ins,
   3000					      sizeof(*stack_fi));
   3001		if (ret)
   3002			goto out;
   3003	}
   3004	leaf = path->nodes[0];
   3005	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
   3006	write_extent_buffer(leaf, stack_fi,
   3007			btrfs_item_ptr_offset(leaf, path->slots[0]),
   3008			sizeof(struct btrfs_file_extent_item));
   3009
   3010	btrfs_mark_buffer_dirty(leaf);
   3011	btrfs_release_path(path);
   3012
   3013	/*
    3014	 * If we dropped an inline extent here, we know the range where it was
    3015	 * located was not marked with the EXTENT_DELALLOC_NEW bit, so we update
    3016	 * the number of bytes only for the range containing the inline extent.
    3017	 * The remainder of the range will be processed when clearing the
    3018	 * EXTENT_DELALLOC bit through the ordered extent completion.
   3019	 */
   3020	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
   3021		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
   3022
   3023		inline_size = drop_args.bytes_found - inline_size;
   3024		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
   3025		drop_args.bytes_found -= inline_size;
   3026		num_bytes -= sectorsize;
   3027	}
   3028
   3029	if (update_inode_bytes)
   3030		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
   3031
   3032	ins.objectid = disk_bytenr;
   3033	ins.offset = disk_num_bytes;
   3034	ins.type = BTRFS_EXTENT_ITEM_KEY;
   3035
   3036	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
   3037	if (ret)
   3038		goto out;
   3039
   3040	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
   3041					       file_pos - offset,
   3042					       qgroup_reserved, &ins);
   3043out:
   3044	btrfs_free_path(path);
   3045
   3046	return ret;
   3047}
   3048
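        /*
         * Subtract @len from the delalloc_bytes counter of the block group
         * that contains @start.
         */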
   3049static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
   3050					 u64 start, u64 len)
   3051{
   3052	struct btrfs_block_group *cache;
   3053
   3054	cache = btrfs_lookup_block_group(fs_info, start);
   3055	ASSERT(cache);
   3056
   3057	spin_lock(&cache->lock);
   3058	cache->delalloc_bytes -= len;
   3059	spin_unlock(&cache->lock);
   3060
   3061	btrfs_put_block_group(cache);
   3062}
   3063
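        /*
         * Build the on-disk file extent item from a finished ordered extent
         * and insert it via insert_reserved_file_extent().
         */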
   3064static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
   3065					     struct btrfs_ordered_extent *oe)
   3066{
   3067	struct btrfs_file_extent_item stack_fi;
   3068	bool update_inode_bytes;
   3069	u64 num_bytes = oe->num_bytes;
   3070	u64 ram_bytes = oe->ram_bytes;
   3071
   3072	memset(&stack_fi, 0, sizeof(stack_fi));
   3073	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
   3074	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
   3075	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
   3076						   oe->disk_num_bytes);
   3077	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
   3078	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
   3079		num_bytes = ram_bytes = oe->truncated_len;
   3080	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
   3081	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
   3082	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
    3083	/* Encryption and other encoding are reserved and all 0 */
   3084
   3085	/*
   3086	 * For delalloc, when completing an ordered extent we update the inode's
   3087	 * bytes when clearing the range in the inode's io tree, so pass false
   3088	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
   3089	 * except if the ordered extent was truncated.
   3090	 */
   3091	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
   3092			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
   3093			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
   3094
   3095	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
   3096					   oe->file_offset, &stack_fi,
   3097					   update_inode_bytes, oe->qgroup_rsv);
   3098}
   3099
   3100/*
   3101 * As ordered data IO finishes, this gets called so we can finish
    3102 * an ordered extent if the range of bytes in the file it covers is
   3103 * fully written.
   3104 */
   3105static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
   3106{
   3107	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
   3108	struct btrfs_root *root = inode->root;
   3109	struct btrfs_fs_info *fs_info = root->fs_info;
   3110	struct btrfs_trans_handle *trans = NULL;
   3111	struct extent_io_tree *io_tree = &inode->io_tree;
   3112	struct extent_state *cached_state = NULL;
   3113	u64 start, end;
   3114	int compress_type = 0;
   3115	int ret = 0;
   3116	u64 logical_len = ordered_extent->num_bytes;
   3117	bool freespace_inode;
   3118	bool truncated = false;
   3119	bool clear_reserved_extent = true;
   3120	unsigned int clear_bits = EXTENT_DEFRAG;
   3121
   3122	start = ordered_extent->file_offset;
   3123	end = start + ordered_extent->num_bytes - 1;
   3124
   3125	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
   3126	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
   3127	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
   3128	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
   3129		clear_bits |= EXTENT_DELALLOC_NEW;
   3130
   3131	freespace_inode = btrfs_is_free_space_inode(inode);
   3132
   3133	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
   3134		ret = -EIO;
   3135		goto out;
   3136	}
   3137
   3138	/* A valid bdev implies a write on a sequential zone */
   3139	if (ordered_extent->bdev) {
   3140		btrfs_rewrite_logical_zoned(ordered_extent);
   3141		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
   3142					ordered_extent->disk_num_bytes);
   3143	}
   3144
   3145	btrfs_free_io_failure_record(inode, start, end);
   3146
   3147	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
   3148		truncated = true;
   3149		logical_len = ordered_extent->truncated_len;
   3150		/* Truncated the entire extent, don't bother adding */
   3151		if (!logical_len)
   3152			goto out;
   3153	}
   3154
   3155	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
   3156		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
   3157
   3158		btrfs_inode_safe_disk_i_size_write(inode, 0);
   3159		if (freespace_inode)
   3160			trans = btrfs_join_transaction_spacecache(root);
   3161		else
   3162			trans = btrfs_join_transaction(root);
   3163		if (IS_ERR(trans)) {
   3164			ret = PTR_ERR(trans);
   3165			trans = NULL;
   3166			goto out;
   3167		}
   3168		trans->block_rsv = &inode->block_rsv;
   3169		ret = btrfs_update_inode_fallback(trans, root, inode);
   3170		if (ret) /* -ENOMEM or corruption */
   3171			btrfs_abort_transaction(trans, ret);
   3172		goto out;
   3173	}
   3174
   3175	clear_bits |= EXTENT_LOCKED;
   3176	lock_extent_bits(io_tree, start, end, &cached_state);
   3177
   3178	if (freespace_inode)
   3179		trans = btrfs_join_transaction_spacecache(root);
   3180	else
   3181		trans = btrfs_join_transaction(root);
   3182	if (IS_ERR(trans)) {
   3183		ret = PTR_ERR(trans);
   3184		trans = NULL;
   3185		goto out;
   3186	}
   3187
   3188	trans->block_rsv = &inode->block_rsv;
   3189
   3190	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
   3191		compress_type = ordered_extent->compress_type;
   3192	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
   3193		BUG_ON(compress_type);
   3194		ret = btrfs_mark_extent_written(trans, inode,
   3195						ordered_extent->file_offset,
   3196						ordered_extent->file_offset +
   3197						logical_len);
   3198		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
   3199						  ordered_extent->disk_num_bytes);
   3200	} else {
   3201		BUG_ON(root == fs_info->tree_root);
   3202		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
   3203		if (!ret) {
   3204			clear_reserved_extent = false;
   3205			btrfs_release_delalloc_bytes(fs_info,
   3206						ordered_extent->disk_bytenr,
   3207						ordered_extent->disk_num_bytes);
   3208		}
   3209	}
   3210	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
   3211			   ordered_extent->num_bytes, trans->transid);
   3212	if (ret < 0) {
   3213		btrfs_abort_transaction(trans, ret);
   3214		goto out;
   3215	}
   3216
   3217	ret = add_pending_csums(trans, &ordered_extent->list);
   3218	if (ret) {
   3219		btrfs_abort_transaction(trans, ret);
   3220		goto out;
   3221	}
   3222
   3223	/*
   3224	 * If this is a new delalloc range, clear its new delalloc flag to
   3225	 * update the inode's number of bytes. This needs to be done first
   3226	 * before updating the inode item.
   3227	 */
   3228	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
   3229	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
   3230		clear_extent_bit(&inode->io_tree, start, end,
   3231				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
   3232				 0, 0, &cached_state);
   3233
   3234	btrfs_inode_safe_disk_i_size_write(inode, 0);
   3235	ret = btrfs_update_inode_fallback(trans, root, inode);
   3236	if (ret) { /* -ENOMEM or corruption */
   3237		btrfs_abort_transaction(trans, ret);
   3238		goto out;
   3239	}
   3240	ret = 0;
   3241out:
   3242	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
   3243			 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
   3244			 &cached_state);
   3245
   3246	if (trans)
   3247		btrfs_end_transaction(trans);
   3248
   3249	if (ret || truncated) {
   3250		u64 unwritten_start = start;
   3251
   3252		/*
   3253		 * If we failed to finish this ordered extent for any reason we
   3254		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
   3255		 * extent, and mark the inode with the error if it wasn't
   3256		 * already set.  Any error during writeback would have already
   3257		 * set the mapping error, so we need to set it if we're the ones
   3258		 * marking this ordered extent as failed.
   3259		 */
   3260		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
   3261					     &ordered_extent->flags))
   3262			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
   3263
   3264		if (truncated)
   3265			unwritten_start += logical_len;
   3266		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
   3267
   3268		/* Drop the cache for the part of the extent we didn't write. */
   3269		btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
   3270
   3271		/*
   3272		 * If the ordered extent had an IOERR or something else went
   3273		 * wrong we need to return the space for this ordered extent
   3274		 * back to the allocator.  We only free the extent in the
   3275		 * truncated case if we didn't write out the extent at all.
   3276		 *
   3277		 * If we made it past insert_reserved_file_extent before we
   3278		 * errored out then we don't need to do this as the accounting
   3279		 * has already been done.
   3280		 */
   3281		if ((ret || !logical_len) &&
   3282		    clear_reserved_extent &&
   3283		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
   3284		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
   3285			/*
   3286			 * Discard the range before returning it back to the
   3287			 * free space pool
   3288			 */
   3289			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
   3290				btrfs_discard_extent(fs_info,
   3291						ordered_extent->disk_bytenr,
   3292						ordered_extent->disk_num_bytes,
   3293						NULL);
   3294			btrfs_free_reserved_extent(fs_info,
   3295					ordered_extent->disk_bytenr,
   3296					ordered_extent->disk_num_bytes, 1);
   3297		}
   3298	}
   3299
   3300	/*
   3301	 * This needs to be done to make sure anybody waiting knows we are done
   3302	 * updating everything for this ordered extent.
   3303	 */
   3304	btrfs_remove_ordered_extent(inode, ordered_extent);
   3305
   3306	/* once for us */
   3307	btrfs_put_ordered_extent(ordered_extent);
   3308	/* once for the tree */
   3309	btrfs_put_ordered_extent(ordered_extent);
   3310
   3311	return ret;
   3312}
   3313
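        /*
         * In rough outline, btrfs_finish_ordered_io() above does the
         * following (a simplified sketch, with oe being the ordered extent;
         * error handling, truncation, NOCOW and zoned special cases are
         * omitted):
         *
         *   lock_extent_bits(io_tree, start, end, &cached_state);
         *   trans = btrfs_join_transaction(root);
         *   if (test_bit(BTRFS_ORDERED_PREALLOC, &oe->flags))
         *           btrfs_mark_extent_written(...);          // prealloc -> written
         *   else
         *           insert_ordered_extent_file_extent(...);  // new file extent item
         *   add_pending_csums(trans, &oe->list);
         *   btrfs_update_inode_fallback(trans, root, inode); // i_size and friends
         *   clear_extent_bit(io_tree, start, end, clear_bits, ...);
         *   btrfs_remove_ordered_extent(inode, oe);
         */
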
   3314static void finish_ordered_fn(struct btrfs_work *work)
   3315{
   3316	struct btrfs_ordered_extent *ordered_extent;
   3317	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
   3318	btrfs_finish_ordered_io(ordered_extent);
   3319}
   3320
   3321void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
   3322					  struct page *page, u64 start,
   3323					  u64 end, bool uptodate)
   3324{
   3325	trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
   3326
   3327	btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
   3328				       finish_ordered_fn, uptodate);
   3329}
   3330
   3331/*
   3332 * check_data_csum - verify checksum of one sector of uncompressed data
   3333 * @inode:	inode
    3334 * @bbio:	btrfs_bio which contains the csums
   3335 * @bio_offset:	offset to the beginning of the bio (in bytes)
    3336 * @page:	page holding the data to be verified
   3337 * @pgoff:	offset inside the page
   3338 * @start:	logical offset in the file
   3339 *
   3340 * The length of such check is always one sector size.
   3341 */
   3342static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
   3343			   u32 bio_offset, struct page *page, u32 pgoff,
   3344			   u64 start)
   3345{
   3346	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   3347	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
   3348	char *kaddr;
   3349	u32 len = fs_info->sectorsize;
   3350	const u32 csum_size = fs_info->csum_size;
   3351	unsigned int offset_sectors;
   3352	u8 *csum_expected;
   3353	u8 csum[BTRFS_CSUM_SIZE];
   3354
   3355	ASSERT(pgoff + len <= PAGE_SIZE);
   3356
   3357	offset_sectors = bio_offset >> fs_info->sectorsize_bits;
   3358	csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
   3359
   3360	kaddr = kmap_atomic(page);
   3361	shash->tfm = fs_info->csum_shash;
   3362
   3363	crypto_shash_digest(shash, kaddr + pgoff, len, csum);
   3364	kunmap_atomic(kaddr);
   3365
   3366	if (memcmp(csum, csum_expected, csum_size))
   3367		goto zeroit;
   3368
   3369	return 0;
   3370zeroit:
   3371	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
   3372				    bbio->mirror_num);
   3373	if (bbio->device)
   3374		btrfs_dev_stat_inc_and_print(bbio->device,
   3375					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
   3376	memzero_page(page, pgoff, len);
   3377	return -EIO;
   3378}
   3379
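        /*
         * A worked example of the csum lookup in check_data_csum() above,
         * assuming 4K sectors (sectorsize_bits == 12) and the default crc32c
         * checksum (csum_size == 4): the sector starting at bio_offset 8192
         * is sector 8192 >> 12 == 2 of the bio, so its expected checksum is
         * the 4 bytes at bbio->csum + 2 * 4.  The digest computed over the
         * 4096 bytes at page + pgoff is compared against that slot; on a
         * mismatch the sector is zeroed and -EIO is returned.
         */
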
   3380/*
   3381 * When reads are done, we need to check csums to verify the data is correct.
    3382 * If there's a match, we allow the bio to finish.  If not, the code in
   3383 * extent_io.c will try to find good copies for us.
   3384 *
   3385 * @bio_offset:	offset to the beginning of the bio (in bytes)
   3386 * @start:	file offset of the range start
   3387 * @end:	file offset of the range end (inclusive)
   3388 *
   3389 * Return a bitmap where bit set means a csum mismatch, and bit not set means
   3390 * csum match.
   3391 */
   3392unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
   3393				    u32 bio_offset, struct page *page,
   3394				    u64 start, u64 end)
   3395{
   3396	struct inode *inode = page->mapping->host;
   3397	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   3398	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
   3399	struct btrfs_root *root = BTRFS_I(inode)->root;
   3400	const u32 sectorsize = root->fs_info->sectorsize;
   3401	u32 pg_off;
   3402	unsigned int result = 0;
   3403
   3404	if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
   3405		btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
   3406		return 0;
   3407	}
   3408
   3409	/*
   3410	 * This only happens for NODATASUM or compressed read.
    3411	 * Normally this should be covered by the above check for compressed read
   3412	 * or the next check for NODATASUM.  Just do a quicker exit here.
   3413	 */
   3414	if (bbio->csum == NULL)
   3415		return 0;
   3416
   3417	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
   3418		return 0;
   3419
   3420	if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
   3421		return 0;
   3422
   3423	ASSERT(page_offset(page) <= start &&
   3424	       end <= page_offset(page) + PAGE_SIZE - 1);
   3425	for (pg_off = offset_in_page(start);
   3426	     pg_off < offset_in_page(end);
   3427	     pg_off += sectorsize, bio_offset += sectorsize) {
   3428		u64 file_offset = pg_off + page_offset(page);
   3429		int ret;
   3430
   3431		if (btrfs_is_data_reloc_root(root) &&
   3432		    test_range_bit(io_tree, file_offset,
   3433				   file_offset + sectorsize - 1,
   3434				   EXTENT_NODATASUM, 1, NULL)) {
   3435			/* Skip the range without csum for data reloc inode */
   3436			clear_extent_bits(io_tree, file_offset,
   3437					  file_offset + sectorsize - 1,
   3438					  EXTENT_NODATASUM);
   3439			continue;
   3440		}
   3441		ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
   3442				      page_offset(page) + pg_off);
   3443		if (ret < 0) {
   3444			const int nr_bit = (pg_off - offset_in_page(start)) >>
   3445				     root->fs_info->sectorsize_bits;
   3446
   3447			result |= (1U << nr_bit);
   3448		}
   3449	}
   3450	return result;
   3451}
   3452
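        /*
         * A short example of interpreting the bitmap returned by
         * btrfs_verify_data_csum() above: bit N corresponds to the Nth
         * sector of the [start, end] range.  With 4K sectors, a return value
         * of 0x5 (binary 101) means the sectors at start + 0 and start + 8K
         * failed verification while the sector at start + 4K was fine, and a
         * return value of 0 means every sector in the range passed.
         */
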
   3453/*
   3454 * btrfs_add_delayed_iput - perform a delayed iput on @inode
   3455 *
   3456 * @inode: The inode we want to perform iput on
   3457 *
   3458 * This function uses the generic vfs_inode::i_count to track whether we should
    3459 * just decrement it (in case it's > 1) or, if this is the last iput, link
   3460 * the inode to the delayed iput machinery. Delayed iputs are processed at
   3461 * transaction commit time/superblock commit/cleaner kthread.
   3462 */
   3463void btrfs_add_delayed_iput(struct inode *inode)
   3464{
   3465	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   3466	struct btrfs_inode *binode = BTRFS_I(inode);
   3467
   3468	if (atomic_add_unless(&inode->i_count, -1, 1))
   3469		return;
   3470
   3471	atomic_inc(&fs_info->nr_delayed_iputs);
   3472	spin_lock(&fs_info->delayed_iput_lock);
   3473	ASSERT(list_empty(&binode->delayed_iput));
   3474	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
   3475	spin_unlock(&fs_info->delayed_iput_lock);
   3476	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
   3477		wake_up_process(fs_info->cleaner_kthread);
   3478}
   3479
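        /*
         * A minimal sketch of the delayed iput life cycle implemented by the
         * helpers above and below (the contexts are illustrative):
         *
         *   // A context that cannot safely do the final iput() itself, e.g.
         *   // ordered extent completion, queues it instead:
         *   btrfs_add_delayed_iput(inode);
         *
         *   // The cleaner kthread and transaction commit later drain the
         *   // list and perform the real iput() calls:
         *   btrfs_run_delayed_iputs(fs_info);
         */
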
   3480static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
   3481				    struct btrfs_inode *inode)
   3482{
   3483	list_del_init(&inode->delayed_iput);
   3484	spin_unlock(&fs_info->delayed_iput_lock);
   3485	iput(&inode->vfs_inode);
   3486	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
   3487		wake_up(&fs_info->delayed_iputs_wait);
   3488	spin_lock(&fs_info->delayed_iput_lock);
   3489}
   3490
   3491static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
   3492				   struct btrfs_inode *inode)
   3493{
   3494	if (!list_empty(&inode->delayed_iput)) {
   3495		spin_lock(&fs_info->delayed_iput_lock);
   3496		if (!list_empty(&inode->delayed_iput))
   3497			run_delayed_iput_locked(fs_info, inode);
   3498		spin_unlock(&fs_info->delayed_iput_lock);
   3499	}
   3500}
   3501
   3502void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
   3503{
   3504
   3505	spin_lock(&fs_info->delayed_iput_lock);
   3506	while (!list_empty(&fs_info->delayed_iputs)) {
   3507		struct btrfs_inode *inode;
   3508
   3509		inode = list_first_entry(&fs_info->delayed_iputs,
   3510				struct btrfs_inode, delayed_iput);
   3511		run_delayed_iput_locked(fs_info, inode);
   3512		cond_resched_lock(&fs_info->delayed_iput_lock);
   3513	}
   3514	spin_unlock(&fs_info->delayed_iput_lock);
   3515}
   3516
   3517/**
    3518 * btrfs_wait_on_delayed_iputs - wait for all delayed iputs to finish
   3519 *
   3520 * @fs_info:  the filesystem
   3521 *
    3522 * This waits, in a killable manner, for any delayed iputs that are currently
    3523 * pending.  Once they are all done running we will return, unless we are
    3524 * killed, in which case we return -EINTR. This helps user operations like
    3525 * fallocate that might get blocked on the iputs.
   3526 *
    3527 * Return: -EINTR if we were killed, 0 once all pending delayed iputs are done
   3528 */
   3529int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
   3530{
   3531	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
   3532			atomic_read(&fs_info->nr_delayed_iputs) == 0);
   3533	if (ret)
   3534		return -EINTR;
   3535	return 0;
   3536}
   3537
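        /*
         * A rough usage sketch for the helper above: a caller that needs the
         * pending delayed iputs to be gone (for example before retrying an
         * allocation) can kick them off and then wait, bailing out if the
         * task is killed:
         *
         *   btrfs_run_delayed_iputs(fs_info);
         *   ret = btrfs_wait_on_delayed_iputs(fs_info);
         *   if (ret == -EINTR)
         *           return ret;
         */
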
   3538/*
   3539 * This creates an orphan entry for the given inode in case something goes wrong
   3540 * in the middle of an unlink.
   3541 */
   3542int btrfs_orphan_add(struct btrfs_trans_handle *trans,
   3543		     struct btrfs_inode *inode)
   3544{
   3545	int ret;
   3546
   3547	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
   3548	if (ret && ret != -EEXIST) {
   3549		btrfs_abort_transaction(trans, ret);
   3550		return ret;
   3551	}
   3552
   3553	return 0;
   3554}
   3555
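        /*
         * The orphan item inserted above is keyed as
         *
         *   (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, btrfs_ino(inode))
         *
         * i.e. the inode number goes into the key offset.  This is exactly
         * what btrfs_orphan_cleanup() below walks to find inodes whose
         * removal (or, on old kernels, truncate) did not complete before a
         * crash.
         */
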
   3556/*
   3557 * We have done the delete so we can go ahead and remove the orphan item for
   3558 * this particular inode.
   3559 */
   3560static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
   3561			    struct btrfs_inode *inode)
   3562{
   3563	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
   3564}
   3565
   3566/*
    3567 * This cleans up any orphans that may be left on the list from the last use
   3568 * of this root.
   3569 */
   3570int btrfs_orphan_cleanup(struct btrfs_root *root)
   3571{
   3572	struct btrfs_fs_info *fs_info = root->fs_info;
   3573	struct btrfs_path *path;
   3574	struct extent_buffer *leaf;
   3575	struct btrfs_key key, found_key;
   3576	struct btrfs_trans_handle *trans;
   3577	struct inode *inode;
   3578	u64 last_objectid = 0;
   3579	int ret = 0, nr_unlink = 0;
   3580
   3581	/* Bail out if the cleanup is already running. */
   3582	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
   3583		return 0;
   3584
   3585	path = btrfs_alloc_path();
   3586	if (!path) {
   3587		ret = -ENOMEM;
   3588		goto out;
   3589	}
   3590	path->reada = READA_BACK;
   3591
   3592	key.objectid = BTRFS_ORPHAN_OBJECTID;
   3593	key.type = BTRFS_ORPHAN_ITEM_KEY;
   3594	key.offset = (u64)-1;
   3595
   3596	while (1) {
   3597		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   3598		if (ret < 0)
   3599			goto out;
   3600
   3601		/*
    3602		 * ret == 0 means we found exactly what we were searching for,
    3603		 * which is weird but possible. Only adjust the path if we did not
    3604		 * find the key, then check whether the previous item matches.
   3605		 */
   3606		if (ret > 0) {
   3607			ret = 0;
   3608			if (path->slots[0] == 0)
   3609				break;
   3610			path->slots[0]--;
   3611		}
   3612
   3613		/* pull out the item */
   3614		leaf = path->nodes[0];
   3615		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   3616
   3617		/* make sure the item matches what we want */
   3618		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
   3619			break;
   3620		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
   3621			break;
   3622
   3623		/* release the path since we're done with it */
   3624		btrfs_release_path(path);
   3625
   3626		/*
    3627		 * This is basically btrfs_lookup, minus the crossing-root
    3628		 * part.  We store the inode number in the offset of the
    3629		 * orphan item.
   3630		 */
   3631
   3632		if (found_key.offset == last_objectid) {
   3633			btrfs_err(fs_info,
   3634				  "Error removing orphan entry, stopping orphan cleanup");
   3635			ret = -EINVAL;
   3636			goto out;
   3637		}
   3638
   3639		last_objectid = found_key.offset;
   3640
   3641		found_key.objectid = found_key.offset;
   3642		found_key.type = BTRFS_INODE_ITEM_KEY;
   3643		found_key.offset = 0;
   3644		inode = btrfs_iget(fs_info->sb, last_objectid, root);
   3645		ret = PTR_ERR_OR_ZERO(inode);
   3646		if (ret && ret != -ENOENT)
   3647			goto out;
   3648
   3649		if (ret == -ENOENT && root == fs_info->tree_root) {
   3650			struct btrfs_root *dead_root;
   3651			int is_dead_root = 0;
   3652
   3653			/*
   3654			 * This is an orphan in the tree root. Currently these
   3655			 * could come from 2 sources:
   3656			 *  a) a root (snapshot/subvolume) deletion in progress
   3657			 *  b) a free space cache inode
   3658			 * We need to distinguish those two, as the orphan item
   3659			 * for a root must not get deleted before the deletion
   3660			 * of the snapshot/subvolume's tree completes.
   3661			 *
   3662			 * btrfs_find_orphan_roots() ran before us, which has
   3663			 * found all deleted roots and loaded them into
   3664			 * fs_info->fs_roots. So here we can find if an
   3665			 * orphan item corresponds to a deleted root by looking
   3666			 * up the root from that xarray.
   3667			 */
   3668
   3669			spin_lock(&fs_info->fs_roots_lock);
   3670			dead_root = xa_load(&fs_info->fs_roots,
   3671					    (unsigned long)found_key.objectid);
   3672			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
   3673				is_dead_root = 1;
   3674			spin_unlock(&fs_info->fs_roots_lock);
   3675
   3676			if (is_dead_root) {
   3677				/* prevent this orphan from being found again */
   3678				key.offset = found_key.objectid - 1;
   3679				continue;
   3680			}
   3681
   3682		}
   3683
   3684		/*
   3685		 * If we have an inode with links, there are a couple of
   3686		 * possibilities:
   3687		 *
   3688		 * 1. We were halfway through creating fsverity metadata for the
   3689		 * file. In that case, the orphan item represents incomplete
   3690		 * fsverity metadata which must be cleaned up with
   3691		 * btrfs_drop_verity_items and deleting the orphan item.
    3692		 *
   3693		 * 2. Old kernels (before v3.12) used to create an
   3694		 * orphan item for truncate indicating that there were possibly
   3695		 * extent items past i_size that needed to be deleted. In v3.12,
   3696		 * truncate was changed to update i_size in sync with the extent
   3697		 * items, but the (useless) orphan item was still created. Since
   3698		 * v4.18, we don't create the orphan item for truncate at all.
   3699		 *
   3700		 * So, this item could mean that we need to do a truncate, but
   3701		 * only if this filesystem was last used on a pre-v3.12 kernel
   3702		 * and was not cleanly unmounted. The odds of that are quite
   3703		 * slim, and it's a pain to do the truncate now, so just delete
   3704		 * the orphan item.
   3705		 *
   3706		 * It's also possible that this orphan item was supposed to be
   3707		 * deleted but wasn't. The inode number may have been reused,
   3708		 * but either way, we can delete the orphan item.
   3709		 */
   3710		if (ret == -ENOENT || inode->i_nlink) {
   3711			if (!ret) {
   3712				ret = btrfs_drop_verity_items(BTRFS_I(inode));
   3713				iput(inode);
   3714				if (ret)
   3715					goto out;
   3716			}
   3717			trans = btrfs_start_transaction(root, 1);
   3718			if (IS_ERR(trans)) {
   3719				ret = PTR_ERR(trans);
   3720				goto out;
   3721			}
   3722			btrfs_debug(fs_info, "auto deleting %Lu",
   3723				    found_key.objectid);
   3724			ret = btrfs_del_orphan_item(trans, root,
   3725						    found_key.objectid);
   3726			btrfs_end_transaction(trans);
   3727			if (ret)
   3728				goto out;
   3729			continue;
   3730		}
   3731
   3732		nr_unlink++;
   3733
   3734		/* this will do delete_inode and everything for us */
   3735		iput(inode);
   3736	}
   3737	/* release the path since we're done with it */
   3738	btrfs_release_path(path);
   3739
   3740	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
   3741		trans = btrfs_join_transaction(root);
   3742		if (!IS_ERR(trans))
   3743			btrfs_end_transaction(trans);
   3744	}
   3745
   3746	if (nr_unlink)
   3747		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
   3748
   3749out:
   3750	if (ret)
   3751		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
   3752	btrfs_free_path(path);
   3753	return ret;
   3754}
   3755
   3756/*
   3757 * very simple check to peek ahead in the leaf looking for xattrs.  If we
   3758 * don't find any xattrs, we know there can't be any acls.
   3759 *
   3760 * slot is the slot the inode is in, objectid is the objectid of the inode
   3761 */
   3762static noinline int acls_after_inode_item(struct extent_buffer *leaf,
   3763					  int slot, u64 objectid,
   3764					  int *first_xattr_slot)
   3765{
   3766	u32 nritems = btrfs_header_nritems(leaf);
   3767	struct btrfs_key found_key;
   3768	static u64 xattr_access = 0;
   3769	static u64 xattr_default = 0;
   3770	int scanned = 0;
   3771
   3772	if (!xattr_access) {
   3773		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
   3774					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
   3775		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
   3776					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
   3777	}
   3778
   3779	slot++;
   3780	*first_xattr_slot = -1;
   3781	while (slot < nritems) {
   3782		btrfs_item_key_to_cpu(leaf, &found_key, slot);
   3783
   3784		/* we found a different objectid, there must not be acls */
   3785		if (found_key.objectid != objectid)
   3786			return 0;
   3787
   3788		/* we found an xattr, assume we've got an acl */
   3789		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
   3790			if (*first_xattr_slot == -1)
   3791				*first_xattr_slot = slot;
   3792			if (found_key.offset == xattr_access ||
   3793			    found_key.offset == xattr_default)
   3794				return 1;
   3795		}
   3796
   3797		/*
   3798		 * we found a key greater than an xattr key, there can't
   3799		 * be any acls later on
   3800		 */
   3801		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
   3802			return 0;
   3803
   3804		slot++;
   3805		scanned++;
   3806
   3807		/*
    3808		 * The items go inode, inode backrefs, xattrs, extents, so if
    3809		 * there are a ton of hard links to an inode there can be a lot
    3810		 * of backrefs.  Don't waste time searching too hard; this is
    3811		 * just an optimization.
   3812		 */
   3813		if (scanned >= 8)
   3814			break;
   3815	}
   3816	/* we hit the end of the leaf before we found an xattr or
   3817	 * something larger than an xattr.  We have to assume the inode
   3818	 * has acls
   3819	 */
   3820	if (*first_xattr_slot == -1)
   3821		*first_xattr_slot = slot;
   3822	return 1;
   3823}
   3824
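        /*
         * For reference, the items of one inode sit in the tree in key order
         * roughly like this (same objectid, increasing key type):
         *
         *   (ino, BTRFS_INODE_ITEM_KEY, 0)
         *   (ino, BTRFS_INODE_REF_KEY, parent)      // one per link
         *   (ino, BTRFS_XATTR_ITEM_KEY, name_hash)  // what we scan for here
         *   (ino, BTRFS_EXTENT_DATA_KEY, file_off)  // file extents
         *
         * which is why hitting a key type greater than BTRFS_XATTR_ITEM_KEY
         * above means there can be no ACL xattrs for this inode.
         */
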
   3825/*
   3826 * read an inode from the btree into the in-memory inode
   3827 */
   3828static int btrfs_read_locked_inode(struct inode *inode,
   3829				   struct btrfs_path *in_path)
   3830{
   3831	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   3832	struct btrfs_path *path = in_path;
   3833	struct extent_buffer *leaf;
   3834	struct btrfs_inode_item *inode_item;
   3835	struct btrfs_root *root = BTRFS_I(inode)->root;
   3836	struct btrfs_key location;
   3837	unsigned long ptr;
   3838	int maybe_acls;
   3839	u32 rdev;
   3840	int ret;
   3841	bool filled = false;
   3842	int first_xattr_slot;
   3843
   3844	ret = btrfs_fill_inode(inode, &rdev);
   3845	if (!ret)
   3846		filled = true;
   3847
   3848	if (!path) {
   3849		path = btrfs_alloc_path();
   3850		if (!path)
   3851			return -ENOMEM;
   3852	}
   3853
   3854	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
   3855
   3856	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
   3857	if (ret) {
   3858		if (path != in_path)
   3859			btrfs_free_path(path);
   3860		return ret;
   3861	}
   3862
   3863	leaf = path->nodes[0];
   3864
   3865	if (filled)
   3866		goto cache_index;
   3867
   3868	inode_item = btrfs_item_ptr(leaf, path->slots[0],
   3869				    struct btrfs_inode_item);
   3870	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
   3871	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
   3872	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
   3873	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
   3874	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
   3875	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
   3876			round_up(i_size_read(inode), fs_info->sectorsize));
   3877
   3878	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
   3879	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
   3880
   3881	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
   3882	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
   3883
   3884	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
   3885	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
   3886
   3887	BTRFS_I(inode)->i_otime.tv_sec =
   3888		btrfs_timespec_sec(leaf, &inode_item->otime);
   3889	BTRFS_I(inode)->i_otime.tv_nsec =
   3890		btrfs_timespec_nsec(leaf, &inode_item->otime);
   3891
   3892	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
   3893	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
   3894	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
   3895
   3896	inode_set_iversion_queried(inode,
   3897				   btrfs_inode_sequence(leaf, inode_item));
   3898	inode->i_generation = BTRFS_I(inode)->generation;
   3899	inode->i_rdev = 0;
   3900	rdev = btrfs_inode_rdev(leaf, inode_item);
   3901
   3902	BTRFS_I(inode)->index_cnt = (u64)-1;
   3903	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
   3904				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
   3905
   3906cache_index:
   3907	/*
   3908	 * If we were modified in the current generation and evicted from memory
   3909	 * and then re-read we need to do a full sync since we don't have any
   3910	 * idea about which extents were modified before we were evicted from
   3911	 * cache.
   3912	 *
   3913	 * This is required for both inode re-read from disk and delayed inode
   3914	 * in the delayed_nodes xarray.
   3915	 */
   3916	if (BTRFS_I(inode)->last_trans == fs_info->generation)
   3917		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
   3918			&BTRFS_I(inode)->runtime_flags);
   3919
   3920	/*
   3921	 * We don't persist the id of the transaction where an unlink operation
   3922	 * against the inode was last made. So here we assume the inode might
   3923	 * have been evicted, and therefore the exact value of last_unlink_trans
   3924	 * lost, and set it to last_trans to avoid metadata inconsistencies
    3925	 * was lost, and set it to last_trans to avoid metadata inconsistencies
   3926	 * replayed. For example, in the scenario:
   3927	 *
   3928	 * touch mydir/foo
   3929	 * ln mydir/foo mydir/bar
   3930	 * sync
   3931	 * unlink mydir/bar
   3932	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
   3933	 * xfs_io -c fsync mydir/foo
   3934	 * <power failure>
   3935	 * mount fs, triggers fsync log replay
   3936	 *
   3937	 * We must make sure that when we fsync our inode foo we also log its
   3938	 * parent inode, otherwise after log replay the parent still has the
   3939	 * dentry with the "bar" name but our inode foo has a link count of 1
   3940	 * and doesn't have an inode ref with the name "bar" anymore.
   3941	 *
   3942	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
   3943	 * but it guarantees correctness at the expense of occasional full
   3944	 * transaction commits on fsync if our inode is a directory, or if our
   3945	 * inode is not a directory, logging its parent unnecessarily.
   3946	 */
   3947	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
   3948
   3949	/*
   3950	 * Same logic as for last_unlink_trans. We don't persist the generation
   3951	 * of the last transaction where this inode was used for a reflink
   3952	 * operation, so after eviction and reloading the inode we must be
    3953	 * pessimistic and assume it was the last transaction that modified the inode.
   3954	 */
   3955	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
   3956
   3957	path->slots[0]++;
   3958	if (inode->i_nlink != 1 ||
   3959	    path->slots[0] >= btrfs_header_nritems(leaf))
   3960		goto cache_acl;
   3961
   3962	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
   3963	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
   3964		goto cache_acl;
   3965
   3966	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
   3967	if (location.type == BTRFS_INODE_REF_KEY) {
   3968		struct btrfs_inode_ref *ref;
   3969
   3970		ref = (struct btrfs_inode_ref *)ptr;
   3971		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
   3972	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
   3973		struct btrfs_inode_extref *extref;
   3974
   3975		extref = (struct btrfs_inode_extref *)ptr;
   3976		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
   3977								     extref);
   3978	}
   3979cache_acl:
   3980	/*
   3981	 * try to precache a NULL acl entry for files that don't have
   3982	 * any xattrs or acls
   3983	 */
   3984	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
   3985			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
   3986	if (first_xattr_slot != -1) {
   3987		path->slots[0] = first_xattr_slot;
   3988		ret = btrfs_load_inode_props(inode, path);
   3989		if (ret)
   3990			btrfs_err(fs_info,
   3991				  "error loading props for ino %llu (root %llu): %d",
   3992				  btrfs_ino(BTRFS_I(inode)),
   3993				  root->root_key.objectid, ret);
   3994	}
   3995	if (path != in_path)
   3996		btrfs_free_path(path);
   3997
   3998	if (!maybe_acls)
   3999		cache_no_acl(inode);
   4000
   4001	switch (inode->i_mode & S_IFMT) {
   4002	case S_IFREG:
   4003		inode->i_mapping->a_ops = &btrfs_aops;
   4004		inode->i_fop = &btrfs_file_operations;
   4005		inode->i_op = &btrfs_file_inode_operations;
   4006		break;
   4007	case S_IFDIR:
   4008		inode->i_fop = &btrfs_dir_file_operations;
   4009		inode->i_op = &btrfs_dir_inode_operations;
   4010		break;
   4011	case S_IFLNK:
   4012		inode->i_op = &btrfs_symlink_inode_operations;
   4013		inode_nohighmem(inode);
   4014		inode->i_mapping->a_ops = &btrfs_aops;
   4015		break;
   4016	default:
   4017		inode->i_op = &btrfs_special_inode_operations;
   4018		init_special_inode(inode, inode->i_mode, rdev);
   4019		break;
   4020	}
   4021
   4022	btrfs_sync_inode_flags_to_i_flags(inode);
   4023	return 0;
   4024}
   4025
   4026/*
   4027 * given a leaf and an inode, copy the inode fields into the leaf
   4028 */
   4029static void fill_inode_item(struct btrfs_trans_handle *trans,
   4030			    struct extent_buffer *leaf,
   4031			    struct btrfs_inode_item *item,
   4032			    struct inode *inode)
   4033{
   4034	struct btrfs_map_token token;
   4035	u64 flags;
   4036
   4037	btrfs_init_map_token(&token, leaf);
   4038
   4039	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
   4040	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
   4041	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
   4042	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
   4043	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
   4044
   4045	btrfs_set_token_timespec_sec(&token, &item->atime,
   4046				     inode->i_atime.tv_sec);
   4047	btrfs_set_token_timespec_nsec(&token, &item->atime,
   4048				      inode->i_atime.tv_nsec);
   4049
   4050	btrfs_set_token_timespec_sec(&token, &item->mtime,
   4051				     inode->i_mtime.tv_sec);
   4052	btrfs_set_token_timespec_nsec(&token, &item->mtime,
   4053				      inode->i_mtime.tv_nsec);
   4054
   4055	btrfs_set_token_timespec_sec(&token, &item->ctime,
   4056				     inode->i_ctime.tv_sec);
   4057	btrfs_set_token_timespec_nsec(&token, &item->ctime,
   4058				      inode->i_ctime.tv_nsec);
   4059
   4060	btrfs_set_token_timespec_sec(&token, &item->otime,
   4061				     BTRFS_I(inode)->i_otime.tv_sec);
   4062	btrfs_set_token_timespec_nsec(&token, &item->otime,
   4063				      BTRFS_I(inode)->i_otime.tv_nsec);
   4064
   4065	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
   4066	btrfs_set_token_inode_generation(&token, item,
   4067					 BTRFS_I(inode)->generation);
   4068	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
   4069	btrfs_set_token_inode_transid(&token, item, trans->transid);
   4070	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
   4071	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
   4072					  BTRFS_I(inode)->ro_flags);
   4073	btrfs_set_token_inode_flags(&token, item, flags);
   4074	btrfs_set_token_inode_block_group(&token, item, 0);
   4075}
   4076
   4077/*
   4078 * copy everything in the in-memory inode into the btree.
   4079 */
   4080static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
   4081				struct btrfs_root *root,
   4082				struct btrfs_inode *inode)
   4083{
   4084	struct btrfs_inode_item *inode_item;
   4085	struct btrfs_path *path;
   4086	struct extent_buffer *leaf;
   4087	int ret;
   4088
   4089	path = btrfs_alloc_path();
   4090	if (!path)
   4091		return -ENOMEM;
   4092
   4093	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
   4094	if (ret) {
   4095		if (ret > 0)
   4096			ret = -ENOENT;
   4097		goto failed;
   4098	}
   4099
   4100	leaf = path->nodes[0];
   4101	inode_item = btrfs_item_ptr(leaf, path->slots[0],
   4102				    struct btrfs_inode_item);
   4103
   4104	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
   4105	btrfs_mark_buffer_dirty(leaf);
   4106	btrfs_set_inode_last_trans(trans, inode);
   4107	ret = 0;
   4108failed:
   4109	btrfs_free_path(path);
   4110	return ret;
   4111}
   4112
   4113/*
   4114 * copy everything in the in-memory inode into the btree.
   4115 */
   4116noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
   4117				struct btrfs_root *root,
   4118				struct btrfs_inode *inode)
   4119{
   4120	struct btrfs_fs_info *fs_info = root->fs_info;
   4121	int ret;
   4122
   4123	/*
   4124	 * If the inode is a free space inode, we can deadlock during commit
   4125	 * if we put it into the delayed code.
   4126	 *
   4127	 * The data relocation inode should also be directly updated
   4128	 * without delay
   4129	 */
   4130	if (!btrfs_is_free_space_inode(inode)
   4131	    && !btrfs_is_data_reloc_root(root)
   4132	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
   4133		btrfs_update_root_times(trans, root);
   4134
   4135		ret = btrfs_delayed_update_inode(trans, root, inode);
   4136		if (!ret)
   4137			btrfs_set_inode_last_trans(trans, inode);
   4138		return ret;
   4139	}
   4140
   4141	return btrfs_update_inode_item(trans, root, inode);
   4142}
   4143
   4144int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
   4145				struct btrfs_root *root, struct btrfs_inode *inode)
   4146{
   4147	int ret;
   4148
   4149	ret = btrfs_update_inode(trans, root, inode);
   4150	if (ret == -ENOSPC)
   4151		return btrfs_update_inode_item(trans, root, inode);
   4152	return ret;
   4153}
   4154
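        /*
         * A condensed view of the two update paths above: btrfs_update_inode()
         * normally hands the copy to the delayed-inode machinery and only goes
         * straight to the btree (via btrfs_update_inode_item()) for free space
         * inodes, the data reloc root, or during log recovery, while
         * btrfs_update_inode_fallback() additionally falls back to the direct
         * btree update when the delayed path fails with -ENOSPC.
         */
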
   4155/*
   4156 * unlink helper that gets used here in inode.c and in the tree logging
    4157 * recovery code.  It removes a link in a directory with a given name, and
    4158 * also drops the back refs in the inode to the directory.
   4159 */
   4160static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
   4161				struct btrfs_inode *dir,
   4162				struct btrfs_inode *inode,
   4163				const char *name, int name_len,
   4164				struct btrfs_rename_ctx *rename_ctx)
   4165{
   4166	struct btrfs_root *root = dir->root;
   4167	struct btrfs_fs_info *fs_info = root->fs_info;
   4168	struct btrfs_path *path;
   4169	int ret = 0;
   4170	struct btrfs_dir_item *di;
   4171	u64 index;
   4172	u64 ino = btrfs_ino(inode);
   4173	u64 dir_ino = btrfs_ino(dir);
   4174
   4175	path = btrfs_alloc_path();
   4176	if (!path) {
   4177		ret = -ENOMEM;
   4178		goto out;
   4179	}
   4180
   4181	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
   4182				    name, name_len, -1);
   4183	if (IS_ERR_OR_NULL(di)) {
   4184		ret = di ? PTR_ERR(di) : -ENOENT;
   4185		goto err;
   4186	}
   4187	ret = btrfs_delete_one_dir_name(trans, root, path, di);
   4188	if (ret)
   4189		goto err;
   4190	btrfs_release_path(path);
   4191
   4192	/*
    4193	 * If we don't have a cached dir index, we have to get it by looking
    4194	 * up the inode ref. Since we have already read the inode ref then,
    4195	 * delete it directly; there is no point in a delayed deletion.
    4196	 *
    4197	 * But if we do have the dir index, we don't need to search the inode
    4198	 * ref at all. Since the inode ref is close to the inode item, it is
    4199	 * better to delay its deletion and do it together with the inode
    4200	 * item update.
   4201	 */
   4202	if (inode->dir_index) {
   4203		ret = btrfs_delayed_delete_inode_ref(inode);
   4204		if (!ret) {
   4205			index = inode->dir_index;
   4206			goto skip_backref;
   4207		}
   4208	}
   4209
   4210	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
   4211				  dir_ino, &index);
   4212	if (ret) {
   4213		btrfs_info(fs_info,
   4214			"failed to delete reference to %.*s, inode %llu parent %llu",
   4215			name_len, name, ino, dir_ino);
   4216		btrfs_abort_transaction(trans, ret);
   4217		goto err;
   4218	}
   4219skip_backref:
   4220	if (rename_ctx)
   4221		rename_ctx->index = index;
   4222
   4223	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
   4224	if (ret) {
   4225		btrfs_abort_transaction(trans, ret);
   4226		goto err;
   4227	}
   4228
   4229	/*
   4230	 * If we are in a rename context, we don't need to update anything in the
   4231	 * log. That will be done later during the rename by btrfs_log_new_name().
    4232	 * Besides that, doing it here would only cause extra unnecessary btree
   4233	 * operations on the log tree, increasing latency for applications.
   4234	 */
   4235	if (!rename_ctx) {
   4236		btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
   4237					   dir_ino);
   4238		btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
   4239					     index);
   4240	}
   4241
   4242	/*
   4243	 * If we have a pending delayed iput we could end up with the final iput
   4244	 * being run in btrfs-cleaner context.  If we have enough of these built
   4245	 * up we can end up burning a lot of time in btrfs-cleaner without any
   4246	 * way to throttle the unlinks.  Since we're currently holding a ref on
   4247	 * the inode we can run the delayed iput here without any issues as the
   4248	 * final iput won't be done until after we drop the ref we're currently
   4249	 * holding.
   4250	 */
   4251	btrfs_run_delayed_iput(fs_info, inode);
   4252err:
   4253	btrfs_free_path(path);
   4254	if (ret)
   4255		goto out;
   4256
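        	/*
        	 * A directory's i_size in btrfs is the sum of the name lengths of
        	 * its entries, with each name counted twice (once for the dir
        	 * item and once for the dir index item), hence the "name_len * 2"
        	 * below.
        	 */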
   4257	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
   4258	inode_inc_iversion(&inode->vfs_inode);
   4259	inode_inc_iversion(&dir->vfs_inode);
   4260	inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
   4261		dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
   4262	ret = btrfs_update_inode(trans, root, dir);
   4263out:
   4264	return ret;
   4265}
   4266
   4267int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
   4268		       struct btrfs_inode *dir, struct btrfs_inode *inode,
   4269		       const char *name, int name_len)
   4270{
   4271	int ret;
   4272	ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
   4273	if (!ret) {
   4274		drop_nlink(&inode->vfs_inode);
   4275		ret = btrfs_update_inode(trans, inode->root, inode);
   4276	}
   4277	return ret;
   4278}
   4279
   4280/*
   4281 * helper to start transaction for unlink and rmdir.
   4282 *
    4283 * unlink and rmdir are special in btrfs: they do not always free space, so
    4284 * if we cannot make our reservations the normal way, try and see if there is
    4285 * plenty of slack room in the global reserve to migrate from; otherwise we
    4286 * cannot allow the unlink to occur.
   4287 */
   4288static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
   4289{
   4290	struct btrfs_root *root = BTRFS_I(dir)->root;
   4291
   4292	/*
   4293	 * 1 for the possible orphan item
   4294	 * 1 for the dir item
   4295	 * 1 for the dir index
   4296	 * 1 for the inode ref
   4297	 * 1 for the inode
   4298	 * 1 for the parent inode
   4299	 */
   4300	return btrfs_start_transaction_fallback_global_rsv(root, 6);
   4301}
   4302
   4303static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
   4304{
   4305	struct btrfs_trans_handle *trans;
   4306	struct inode *inode = d_inode(dentry);
   4307	int ret;
   4308
   4309	trans = __unlink_start_trans(dir);
   4310	if (IS_ERR(trans))
   4311		return PTR_ERR(trans);
   4312
   4313	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
   4314			0);
   4315
   4316	ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
   4317			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
   4318			dentry->d_name.len);
   4319	if (ret)
   4320		goto out;
   4321
   4322	if (inode->i_nlink == 0) {
   4323		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
   4324		if (ret)
   4325			goto out;
   4326	}
   4327
   4328out:
   4329	btrfs_end_transaction(trans);
   4330	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
   4331	return ret;
   4332}
   4333
   4334static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
   4335			       struct inode *dir, struct dentry *dentry)
   4336{
   4337	struct btrfs_root *root = BTRFS_I(dir)->root;
   4338	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
   4339	struct btrfs_path *path;
   4340	struct extent_buffer *leaf;
   4341	struct btrfs_dir_item *di;
   4342	struct btrfs_key key;
   4343	const char *name = dentry->d_name.name;
   4344	int name_len = dentry->d_name.len;
   4345	u64 index;
   4346	int ret;
   4347	u64 objectid;
   4348	u64 dir_ino = btrfs_ino(BTRFS_I(dir));
   4349
   4350	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
   4351		objectid = inode->root->root_key.objectid;
   4352	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
   4353		objectid = inode->location.objectid;
   4354	} else {
   4355		WARN_ON(1);
   4356		return -EINVAL;
   4357	}
   4358
   4359	path = btrfs_alloc_path();
   4360	if (!path)
   4361		return -ENOMEM;
   4362
   4363	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
   4364				   name, name_len, -1);
   4365	if (IS_ERR_OR_NULL(di)) {
   4366		ret = di ? PTR_ERR(di) : -ENOENT;
   4367		goto out;
   4368	}
   4369
   4370	leaf = path->nodes[0];
   4371	btrfs_dir_item_key_to_cpu(leaf, di, &key);
   4372	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
   4373	ret = btrfs_delete_one_dir_name(trans, root, path, di);
   4374	if (ret) {
   4375		btrfs_abort_transaction(trans, ret);
   4376		goto out;
   4377	}
   4378	btrfs_release_path(path);
   4379
   4380	/*
   4381	 * This is a placeholder inode for a subvolume we didn't have a
   4382	 * reference to at the time of the snapshot creation.  In the meantime
   4383	 * we could have renamed the real subvol link into our snapshot, so
   4384	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
   4385	 * Instead simply lookup the dir_index_item for this entry so we can
   4386	 * remove it.  Otherwise we know we have a ref to the root and we can
   4387	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
   4388	 */
   4389	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
   4390		di = btrfs_search_dir_index_item(root, path, dir_ino,
   4391						 name, name_len);
   4392		if (IS_ERR_OR_NULL(di)) {
   4393			if (!di)
   4394				ret = -ENOENT;
   4395			else
   4396				ret = PTR_ERR(di);
   4397			btrfs_abort_transaction(trans, ret);
   4398			goto out;
   4399		}
   4400
   4401		leaf = path->nodes[0];
   4402		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   4403		index = key.offset;
   4404		btrfs_release_path(path);
   4405	} else {
   4406		ret = btrfs_del_root_ref(trans, objectid,
   4407					 root->root_key.objectid, dir_ino,
   4408					 &index, name, name_len);
   4409		if (ret) {
   4410			btrfs_abort_transaction(trans, ret);
   4411			goto out;
   4412		}
   4413	}
   4414
   4415	ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
   4416	if (ret) {
   4417		btrfs_abort_transaction(trans, ret);
   4418		goto out;
   4419	}
   4420
   4421	btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
   4422	inode_inc_iversion(dir);
   4423	dir->i_mtime = dir->i_ctime = current_time(dir);
   4424	ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
   4425	if (ret)
   4426		btrfs_abort_transaction(trans, ret);
   4427out:
   4428	btrfs_free_path(path);
   4429	return ret;
   4430}
   4431
   4432/*
   4433 * Helper to check if the subvolume references other subvolumes or if it's
    4434 * the default subvolume.
   4435 */
   4436static noinline int may_destroy_subvol(struct btrfs_root *root)
   4437{
   4438	struct btrfs_fs_info *fs_info = root->fs_info;
   4439	struct btrfs_path *path;
   4440	struct btrfs_dir_item *di;
   4441	struct btrfs_key key;
   4442	u64 dir_id;
   4443	int ret;
   4444
   4445	path = btrfs_alloc_path();
   4446	if (!path)
   4447		return -ENOMEM;
   4448
   4449	/* Make sure this root isn't set as the default subvol */
   4450	dir_id = btrfs_super_root_dir(fs_info->super_copy);
   4451	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
   4452				   dir_id, "default", 7, 0);
   4453	if (di && !IS_ERR(di)) {
   4454		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
   4455		if (key.objectid == root->root_key.objectid) {
   4456			ret = -EPERM;
   4457			btrfs_err(fs_info,
   4458				  "deleting default subvolume %llu is not allowed",
   4459				  key.objectid);
   4460			goto out;
   4461		}
   4462		btrfs_release_path(path);
   4463	}
   4464
   4465	key.objectid = root->root_key.objectid;
   4466	key.type = BTRFS_ROOT_REF_KEY;
   4467	key.offset = (u64)-1;
   4468
   4469	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
   4470	if (ret < 0)
   4471		goto out;
   4472	BUG_ON(ret == 0);
   4473
   4474	ret = 0;
   4475	if (path->slots[0] > 0) {
   4476		path->slots[0]--;
   4477		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   4478		if (key.objectid == root->root_key.objectid &&
   4479		    key.type == BTRFS_ROOT_REF_KEY)
   4480			ret = -ENOTEMPTY;
   4481	}
   4482out:
   4483	btrfs_free_path(path);
   4484	return ret;
   4485}
   4486
   4487/* Delete all dentries for inodes belonging to the root */
   4488static void btrfs_prune_dentries(struct btrfs_root *root)
   4489{
   4490	struct btrfs_fs_info *fs_info = root->fs_info;
   4491	struct rb_node *node;
   4492	struct rb_node *prev;
   4493	struct btrfs_inode *entry;
   4494	struct inode *inode;
   4495	u64 objectid = 0;
   4496
   4497	if (!BTRFS_FS_ERROR(fs_info))
   4498		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
   4499
   4500	spin_lock(&root->inode_lock);
   4501again:
   4502	node = root->inode_tree.rb_node;
   4503	prev = NULL;
   4504	while (node) {
   4505		prev = node;
   4506		entry = rb_entry(node, struct btrfs_inode, rb_node);
   4507
   4508		if (objectid < btrfs_ino(entry))
   4509			node = node->rb_left;
   4510		else if (objectid > btrfs_ino(entry))
   4511			node = node->rb_right;
   4512		else
   4513			break;
   4514	}
   4515	if (!node) {
   4516		while (prev) {
   4517			entry = rb_entry(prev, struct btrfs_inode, rb_node);
   4518			if (objectid <= btrfs_ino(entry)) {
   4519				node = prev;
   4520				break;
   4521			}
   4522			prev = rb_next(prev);
   4523		}
   4524	}
   4525	while (node) {
   4526		entry = rb_entry(node, struct btrfs_inode, rb_node);
   4527		objectid = btrfs_ino(entry) + 1;
   4528		inode = igrab(&entry->vfs_inode);
   4529		if (inode) {
   4530			spin_unlock(&root->inode_lock);
   4531			if (atomic_read(&inode->i_count) > 1)
   4532				d_prune_aliases(inode);
   4533			/*
   4534			 * btrfs_drop_inode will have it removed from the inode
   4535			 * cache when its usage count hits zero.
   4536			 */
   4537			iput(inode);
   4538			cond_resched();
   4539			spin_lock(&root->inode_lock);
   4540			goto again;
   4541		}
   4542
   4543		if (cond_resched_lock(&root->inode_lock))
   4544			goto again;
   4545
   4546		node = rb_next(node);
   4547	}
   4548	spin_unlock(&root->inode_lock);
   4549}
   4550
   4551int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
   4552{
   4553	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
   4554	struct btrfs_root *root = BTRFS_I(dir)->root;
   4555	struct inode *inode = d_inode(dentry);
   4556	struct btrfs_root *dest = BTRFS_I(inode)->root;
   4557	struct btrfs_trans_handle *trans;
   4558	struct btrfs_block_rsv block_rsv;
   4559	u64 root_flags;
   4560	int ret;
   4561
   4562	/*
    4563	 * Don't allow deleting a subvolume with send in progress. This is
   4564	 * inside the inode lock so the error handling that has to drop the bit
   4565	 * again is not run concurrently.
   4566	 */
   4567	spin_lock(&dest->root_item_lock);
   4568	if (dest->send_in_progress) {
   4569		spin_unlock(&dest->root_item_lock);
   4570		btrfs_warn(fs_info,
   4571			   "attempt to delete subvolume %llu during send",
   4572			   dest->root_key.objectid);
   4573		return -EPERM;
   4574	}
   4575	if (atomic_read(&dest->nr_swapfiles)) {
   4576		spin_unlock(&dest->root_item_lock);
   4577		btrfs_warn(fs_info,
   4578			   "attempt to delete subvolume %llu with active swapfile",
   4579			   root->root_key.objectid);
   4580		return -EPERM;
   4581	}
   4582	root_flags = btrfs_root_flags(&dest->root_item);
   4583	btrfs_set_root_flags(&dest->root_item,
   4584			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
   4585	spin_unlock(&dest->root_item_lock);
   4586
   4587	down_write(&fs_info->subvol_sem);
   4588
   4589	ret = may_destroy_subvol(dest);
   4590	if (ret)
   4591		goto out_up_write;
   4592
   4593	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
   4594	/*
   4595	 * One for dir inode,
   4596	 * two for dir entries,
   4597	 * two for root ref/backref.
   4598	 */
   4599	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
   4600	if (ret)
   4601		goto out_up_write;
   4602
   4603	trans = btrfs_start_transaction(root, 0);
   4604	if (IS_ERR(trans)) {
   4605		ret = PTR_ERR(trans);
   4606		goto out_release;
   4607	}
   4608	trans->block_rsv = &block_rsv;
   4609	trans->bytes_reserved = block_rsv.size;
   4610
   4611	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
   4612
   4613	ret = btrfs_unlink_subvol(trans, dir, dentry);
   4614	if (ret) {
   4615		btrfs_abort_transaction(trans, ret);
   4616		goto out_end_trans;
   4617	}
   4618
   4619	ret = btrfs_record_root_in_trans(trans, dest);
   4620	if (ret) {
   4621		btrfs_abort_transaction(trans, ret);
   4622		goto out_end_trans;
   4623	}
   4624
   4625	memset(&dest->root_item.drop_progress, 0,
   4626		sizeof(dest->root_item.drop_progress));
   4627	btrfs_set_root_drop_level(&dest->root_item, 0);
   4628	btrfs_set_root_refs(&dest->root_item, 0);
   4629
   4630	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
   4631		ret = btrfs_insert_orphan_item(trans,
   4632					fs_info->tree_root,
   4633					dest->root_key.objectid);
   4634		if (ret) {
   4635			btrfs_abort_transaction(trans, ret);
   4636			goto out_end_trans;
   4637		}
   4638	}
   4639
   4640	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
   4641				  BTRFS_UUID_KEY_SUBVOL,
   4642				  dest->root_key.objectid);
   4643	if (ret && ret != -ENOENT) {
   4644		btrfs_abort_transaction(trans, ret);
   4645		goto out_end_trans;
   4646	}
   4647	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
   4648		ret = btrfs_uuid_tree_remove(trans,
   4649					  dest->root_item.received_uuid,
   4650					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
   4651					  dest->root_key.objectid);
   4652		if (ret && ret != -ENOENT) {
   4653			btrfs_abort_transaction(trans, ret);
   4654			goto out_end_trans;
   4655		}
   4656	}
   4657
   4658	free_anon_bdev(dest->anon_dev);
   4659	dest->anon_dev = 0;
   4660out_end_trans:
   4661	trans->block_rsv = NULL;
   4662	trans->bytes_reserved = 0;
   4663	ret = btrfs_end_transaction(trans);
   4664	inode->i_flags |= S_DEAD;
   4665out_release:
   4666	btrfs_subvolume_release_metadata(root, &block_rsv);
   4667out_up_write:
   4668	up_write(&fs_info->subvol_sem);
   4669	if (ret) {
   4670		spin_lock(&dest->root_item_lock);
   4671		root_flags = btrfs_root_flags(&dest->root_item);
   4672		btrfs_set_root_flags(&dest->root_item,
   4673				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
   4674		spin_unlock(&dest->root_item_lock);
   4675	} else {
   4676		d_invalidate(dentry);
   4677		btrfs_prune_dentries(dest);
   4678		ASSERT(dest->send_in_progress == 0);
   4679	}
   4680
   4681	return ret;
   4682}
   4683
   4684static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
   4685{
   4686	struct inode *inode = d_inode(dentry);
   4687	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
   4688	int err = 0;
   4689	struct btrfs_trans_handle *trans;
   4690	u64 last_unlink_trans;
   4691
   4692	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
   4693		return -ENOTEMPTY;
   4694	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
   4695		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
   4696			btrfs_err(fs_info,
   4697			"extent tree v2 doesn't support snapshot deletion yet");
   4698			return -EOPNOTSUPP;
   4699		}
   4700		return btrfs_delete_subvolume(dir, dentry);
   4701	}
   4702
   4703	trans = __unlink_start_trans(dir);
   4704	if (IS_ERR(trans))
   4705		return PTR_ERR(trans);
   4706
   4707	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
   4708		err = btrfs_unlink_subvol(trans, dir, dentry);
   4709		goto out;
   4710	}
   4711
   4712	err = btrfs_orphan_add(trans, BTRFS_I(inode));
   4713	if (err)
   4714		goto out;
   4715
   4716	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
   4717
   4718	/* now the directory is empty */
   4719	err = btrfs_unlink_inode(trans, BTRFS_I(dir),
   4720			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
   4721			dentry->d_name.len);
   4722	if (!err) {
   4723		btrfs_i_size_write(BTRFS_I(inode), 0);
   4724		/*
   4725		 * Propagate the last_unlink_trans value of the deleted dir to
   4726		 * its parent directory. This is to prevent an unrecoverable
   4727		 * log tree in the case we do something like this:
   4728		 * 1) create dir foo
   4729		 * 2) create snapshot under dir foo
   4730		 * 3) delete the snapshot
   4731		 * 4) rmdir foo
   4732		 * 5) mkdir foo
   4733		 * 6) fsync foo or some file inside foo
   4734		 */
   4735		if (last_unlink_trans >= trans->transid)
   4736			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
   4737	}
   4738out:
   4739	btrfs_end_transaction(trans);
   4740	btrfs_btree_balance_dirty(fs_info);
   4741
   4742	return err;
   4743}
   4744
   4745/*
   4746 * btrfs_truncate_block - read, zero a chunk and write a block
   4747 * @inode - inode that we're zeroing
   4748 * @from - the offset to start zeroing
    4749 * @len - the length to zero, 0 to zero the entire range relative to the
   4750 *	offset
   4751 * @front - zero up to the offset instead of from the offset on
   4752 *
   4753 * This will find the block for the "from" offset and cow the block and zero the
   4754 * part we want to zero.  This is used with truncate and hole punching.
   4755 */
   4756int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
   4757			 int front)
   4758{
   4759	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   4760	struct address_space *mapping = inode->vfs_inode.i_mapping;
   4761	struct extent_io_tree *io_tree = &inode->io_tree;
   4762	struct btrfs_ordered_extent *ordered;
   4763	struct extent_state *cached_state = NULL;
   4764	struct extent_changeset *data_reserved = NULL;
   4765	bool only_release_metadata = false;
   4766	u32 blocksize = fs_info->sectorsize;
   4767	pgoff_t index = from >> PAGE_SHIFT;
   4768	unsigned offset = from & (blocksize - 1);
   4769	struct page *page;
   4770	gfp_t mask = btrfs_alloc_write_mask(mapping);
   4771	size_t write_bytes = blocksize;
   4772	int ret = 0;
   4773	u64 block_start;
   4774	u64 block_end;
   4775
   4776	if (IS_ALIGNED(offset, blocksize) &&
   4777	    (!len || IS_ALIGNED(len, blocksize)))
   4778		goto out;
   4779
   4780	block_start = round_down(from, blocksize);
   4781	block_end = block_start + blocksize - 1;
   4782
   4783	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
   4784					  blocksize);
   4785	if (ret < 0) {
   4786		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
   4787			/* For nocow case, no need to reserve data space */
   4788			only_release_metadata = true;
   4789		} else {
   4790			goto out;
   4791		}
   4792	}
   4793	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
   4794	if (ret < 0) {
   4795		if (!only_release_metadata)
   4796			btrfs_free_reserved_data_space(inode, data_reserved,
   4797						       block_start, blocksize);
   4798		goto out;
   4799	}
   4800again:
   4801	page = find_or_create_page(mapping, index, mask);
   4802	if (!page) {
   4803		btrfs_delalloc_release_space(inode, data_reserved, block_start,
   4804					     blocksize, true);
   4805		btrfs_delalloc_release_extents(inode, blocksize);
   4806		ret = -ENOMEM;
   4807		goto out;
   4808	}
   4809	ret = set_page_extent_mapped(page);
   4810	if (ret < 0)
   4811		goto out_unlock;
   4812
   4813	if (!PageUptodate(page)) {
   4814		ret = btrfs_read_folio(NULL, page_folio(page));
   4815		lock_page(page);
   4816		if (page->mapping != mapping) {
   4817			unlock_page(page);
   4818			put_page(page);
   4819			goto again;
   4820		}
   4821		if (!PageUptodate(page)) {
   4822			ret = -EIO;
   4823			goto out_unlock;
   4824		}
   4825	}
   4826	wait_on_page_writeback(page);
   4827
   4828	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
   4829
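	/*
	 * If an ordered extent is still in flight for this block, wait for it
	 * to complete and retry, so our zeroing does not race with the
	 * in-flight write-out of this block.
	 */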
   4830	ordered = btrfs_lookup_ordered_extent(inode, block_start);
   4831	if (ordered) {
   4832		unlock_extent_cached(io_tree, block_start, block_end,
   4833				     &cached_state);
   4834		unlock_page(page);
   4835		put_page(page);
   4836		btrfs_start_ordered_extent(ordered, 1);
   4837		btrfs_put_ordered_extent(ordered);
   4838		goto again;
   4839	}
   4840
   4841	clear_extent_bit(&inode->io_tree, block_start, block_end,
   4842			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
   4843			 0, 0, &cached_state);
   4844
   4845	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
   4846					&cached_state);
   4847	if (ret) {
   4848		unlock_extent_cached(io_tree, block_start, block_end,
   4849				     &cached_state);
   4850		goto out_unlock;
   4851	}
   4852
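	/*
	 * Zero the requested part of the block: with @front set, everything in
	 * the block before @from; otherwise @len bytes starting at @from.
	 */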
   4853	if (offset != blocksize) {
   4854		if (!len)
   4855			len = blocksize - offset;
   4856		if (front)
   4857			memzero_page(page, (block_start - page_offset(page)),
   4858				     offset);
   4859		else
   4860			memzero_page(page, (block_start - page_offset(page)) + offset,
   4861				     len);
   4862		flush_dcache_page(page);
   4863	}
   4864	btrfs_page_clear_checked(fs_info, page, block_start,
   4865				 block_end + 1 - block_start);
   4866	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
   4867	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
   4868
   4869	if (only_release_metadata)
   4870		set_extent_bit(&inode->io_tree, block_start, block_end,
   4871			       EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
   4872
   4873out_unlock:
   4874	if (ret) {
   4875		if (only_release_metadata)
   4876			btrfs_delalloc_release_metadata(inode, blocksize, true);
   4877		else
   4878			btrfs_delalloc_release_space(inode, data_reserved,
   4879					block_start, blocksize, true);
   4880	}
   4881	btrfs_delalloc_release_extents(inode, blocksize);
   4882	unlock_page(page);
   4883	put_page(page);
   4884out:
   4885	if (only_release_metadata)
   4886		btrfs_check_nocow_unlock(inode);
   4887	extent_changeset_free(data_reserved);
   4888	return ret;
   4889}
   4890
   4891static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
   4892			     u64 offset, u64 len)
   4893{
   4894	struct btrfs_fs_info *fs_info = root->fs_info;
   4895	struct btrfs_trans_handle *trans;
   4896	struct btrfs_drop_extents_args drop_args = { 0 };
   4897	int ret;
   4898
   4899	/*
   4900	 * If NO_HOLES is enabled, we don't need to do anything.
   4901	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
    4902	 * or btrfs_update_inode() will be called, which guarantees that the next
   4903	 * fsync will know this inode was changed and needs to be logged.
   4904	 */
   4905	if (btrfs_fs_incompat(fs_info, NO_HOLES))
   4906		return 0;
   4907
   4908	/*
   4909	 * 1 - for the one we're dropping
   4910	 * 1 - for the one we're adding
   4911	 * 1 - for updating the inode.
   4912	 */
   4913	trans = btrfs_start_transaction(root, 3);
   4914	if (IS_ERR(trans))
   4915		return PTR_ERR(trans);
   4916
   4917	drop_args.start = offset;
   4918	drop_args.end = offset + len;
   4919	drop_args.drop_cache = true;
   4920
   4921	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
   4922	if (ret) {
   4923		btrfs_abort_transaction(trans, ret);
   4924		btrfs_end_transaction(trans);
   4925		return ret;
   4926	}
   4927
   4928	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
   4929			offset, 0, 0, len, 0, len, 0, 0, 0);
   4930	if (ret) {
   4931		btrfs_abort_transaction(trans, ret);
   4932	} else {
   4933		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
   4934		btrfs_update_inode(trans, root, inode);
   4935	}
   4936	btrfs_end_transaction(trans);
   4937	return ret;
   4938}
   4939
   4940/*
   4941 * This function puts in dummy file extents for the area we're creating a hole
   4942 * for.  So if we are truncating this file to a larger size we need to insert
    4943 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
    4944 * for the range between oldsize and size.
   4945 */
   4946int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
   4947{
   4948	struct btrfs_root *root = inode->root;
   4949	struct btrfs_fs_info *fs_info = root->fs_info;
   4950	struct extent_io_tree *io_tree = &inode->io_tree;
   4951	struct extent_map *em = NULL;
   4952	struct extent_state *cached_state = NULL;
   4953	struct extent_map_tree *em_tree = &inode->extent_tree;
   4954	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
   4955	u64 block_end = ALIGN(size, fs_info->sectorsize);
   4956	u64 last_byte;
   4957	u64 cur_offset;
   4958	u64 hole_size;
   4959	int err = 0;
   4960
   4961	/*
   4962	 * If our size started in the middle of a block we need to zero out the
   4963	 * rest of the block before we expand the i_size, otherwise we could
   4964	 * expose stale data.
   4965	 */
   4966	err = btrfs_truncate_block(inode, oldsize, 0, 0);
   4967	if (err)
   4968		return err;
   4969
   4970	if (size <= hole_start)
   4971		return 0;
   4972
   4973	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
   4974					   &cached_state);
   4975	cur_offset = hole_start;
   4976	while (1) {
   4977		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
   4978				      block_end - cur_offset);
   4979		if (IS_ERR(em)) {
   4980			err = PTR_ERR(em);
   4981			em = NULL;
   4982			break;
   4983		}
   4984		last_byte = min(extent_map_end(em), block_end);
   4985		last_byte = ALIGN(last_byte, fs_info->sectorsize);
   4986		hole_size = last_byte - cur_offset;
   4987
   4988		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
   4989			struct extent_map *hole_em;
   4990
   4991			err = maybe_insert_hole(root, inode, cur_offset,
   4992						hole_size);
   4993			if (err)
   4994				break;
   4995
   4996			err = btrfs_inode_set_file_extent_range(inode,
   4997							cur_offset, hole_size);
   4998			if (err)
   4999				break;
   5000
   5001			btrfs_drop_extent_cache(inode, cur_offset,
   5002						cur_offset + hole_size - 1, 0);
   5003			hole_em = alloc_extent_map();
   5004			if (!hole_em) {
   5005				btrfs_set_inode_full_sync(inode);
   5006				goto next;
   5007			}
   5008			hole_em->start = cur_offset;
   5009			hole_em->len = hole_size;
   5010			hole_em->orig_start = cur_offset;
   5011
   5012			hole_em->block_start = EXTENT_MAP_HOLE;
   5013			hole_em->block_len = 0;
   5014			hole_em->orig_block_len = 0;
   5015			hole_em->ram_bytes = hole_size;
   5016			hole_em->compress_type = BTRFS_COMPRESS_NONE;
   5017			hole_em->generation = fs_info->generation;
   5018
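			/*
			 * If add_extent_mapping() finds an overlapping mapping
			 * that raced in, drop the cached range and try
			 * inserting our hole mapping again.
			 */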
   5019			while (1) {
   5020				write_lock(&em_tree->lock);
   5021				err = add_extent_mapping(em_tree, hole_em, 1);
   5022				write_unlock(&em_tree->lock);
   5023				if (err != -EEXIST)
   5024					break;
   5025				btrfs_drop_extent_cache(inode, cur_offset,
   5026							cur_offset +
   5027							hole_size - 1, 0);
   5028			}
   5029			free_extent_map(hole_em);
   5030		} else {
   5031			err = btrfs_inode_set_file_extent_range(inode,
   5032							cur_offset, hole_size);
   5033			if (err)
   5034				break;
   5035		}
   5036next:
   5037		free_extent_map(em);
   5038		em = NULL;
   5039		cur_offset = last_byte;
   5040		if (cur_offset >= block_end)
   5041			break;
   5042	}
   5043	free_extent_map(em);
   5044	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
   5045	return err;
   5046}
   5047
   5048static int btrfs_setsize(struct inode *inode, struct iattr *attr)
   5049{
   5050	struct btrfs_root *root = BTRFS_I(inode)->root;
   5051	struct btrfs_trans_handle *trans;
   5052	loff_t oldsize = i_size_read(inode);
   5053	loff_t newsize = attr->ia_size;
   5054	int mask = attr->ia_valid;
   5055	int ret;
   5056
   5057	/*
   5058	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
   5059	 * special case where we need to update the times despite not having
    5060	 * these flags set.  For all other operations the VFS sets these flags
   5061	 * explicitly if it wants a timestamp update.
   5062	 */
   5063	if (newsize != oldsize) {
   5064		inode_inc_iversion(inode);
   5065		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
   5066			inode->i_ctime = inode->i_mtime =
   5067				current_time(inode);
   5068	}
   5069
   5070	if (newsize > oldsize) {
   5071		/*
   5072		 * Don't do an expanding truncate while snapshotting is ongoing.
   5073		 * This is to ensure the snapshot captures a fully consistent
   5074		 * state of this file - if the snapshot captures this expanding
   5075		 * truncation, it must capture all writes that happened before
   5076		 * this truncation.
   5077		 */
   5078		btrfs_drew_write_lock(&root->snapshot_lock);
   5079		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
   5080		if (ret) {
   5081			btrfs_drew_write_unlock(&root->snapshot_lock);
   5082			return ret;
   5083		}
   5084
   5085		trans = btrfs_start_transaction(root, 1);
   5086		if (IS_ERR(trans)) {
   5087			btrfs_drew_write_unlock(&root->snapshot_lock);
   5088			return PTR_ERR(trans);
   5089		}
   5090
   5091		i_size_write(inode, newsize);
   5092		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
   5093		pagecache_isize_extended(inode, oldsize, newsize);
   5094		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   5095		btrfs_drew_write_unlock(&root->snapshot_lock);
   5096		btrfs_end_transaction(trans);
   5097	} else {
   5098		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   5099
   5100		if (btrfs_is_zoned(fs_info)) {
   5101			ret = btrfs_wait_ordered_range(inode,
   5102					ALIGN(newsize, fs_info->sectorsize),
   5103					(u64)-1);
   5104			if (ret)
   5105				return ret;
   5106		}
   5107
   5108		/*
   5109		 * We're truncating a file that used to have good data down to
   5110		 * zero. Make sure any new writes to the file get on disk
   5111		 * on close.
   5112		 */
   5113		if (newsize == 0)
   5114			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
   5115				&BTRFS_I(inode)->runtime_flags);
   5116
   5117		truncate_setsize(inode, newsize);
   5118
   5119		inode_dio_wait(inode);
   5120
   5121		ret = btrfs_truncate(inode, newsize == oldsize);
   5122		if (ret && inode->i_nlink) {
   5123			int err;
   5124
   5125			/*
   5126			 * Truncate failed, so fix up the in-memory size. We
   5127			 * adjusted disk_i_size down as we removed extents, so
   5128			 * wait for disk_i_size to be stable and then update the
   5129			 * in-memory size to match.
   5130			 */
   5131			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
   5132			if (err)
   5133				return err;
   5134			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
   5135		}
   5136	}
   5137
   5138	return ret;
   5139}
   5140
   5141static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
   5142			 struct iattr *attr)
   5143{
   5144	struct inode *inode = d_inode(dentry);
   5145	struct btrfs_root *root = BTRFS_I(inode)->root;
   5146	int err;
   5147
   5148	if (btrfs_root_readonly(root))
   5149		return -EROFS;
   5150
   5151	err = setattr_prepare(mnt_userns, dentry, attr);
   5152	if (err)
   5153		return err;
   5154
   5155	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
   5156		err = btrfs_setsize(inode, attr);
   5157		if (err)
   5158			return err;
   5159	}
   5160
   5161	if (attr->ia_valid) {
   5162		setattr_copy(mnt_userns, inode, attr);
   5163		inode_inc_iversion(inode);
   5164		err = btrfs_dirty_inode(inode);
   5165
   5166		if (!err && attr->ia_valid & ATTR_MODE)
   5167			err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
   5168	}
   5169
   5170	return err;
   5171}
   5172
   5173/*
   5174 * While truncating the inode pages during eviction, we get the VFS
   5175 * calling btrfs_invalidate_folio() against each folio of the inode. This
   5176 * is slow because the calls to btrfs_invalidate_folio() result in a
   5177 * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
   5178 * which keep merging and splitting extent_state structures over and over,
   5179 * wasting lots of time.
   5180 *
   5181 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
   5182 * skip all those expensive operations on a per folio basis and do only
   5183 * the ordered io finishing, while we release here the extent_map and
   5184 * extent_state structures, without the excessive merging and splitting.
   5185 */
   5186static void evict_inode_truncate_pages(struct inode *inode)
   5187{
   5188	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
   5189	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
   5190	struct rb_node *node;
   5191
   5192	ASSERT(inode->i_state & I_FREEING);
   5193	truncate_inode_pages_final(&inode->i_data);
   5194
   5195	write_lock(&map_tree->lock);
   5196	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
   5197		struct extent_map *em;
   5198
   5199		node = rb_first_cached(&map_tree->map);
   5200		em = rb_entry(node, struct extent_map, rb_node);
   5201		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
   5202		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
   5203		remove_extent_mapping(map_tree, em);
   5204		free_extent_map(em);
   5205		if (need_resched()) {
   5206			write_unlock(&map_tree->lock);
   5207			cond_resched();
   5208			write_lock(&map_tree->lock);
   5209		}
   5210	}
   5211	write_unlock(&map_tree->lock);
   5212
   5213	/*
   5214	 * Keep looping until we have no more ranges in the io tree.
   5215	 * We can have ongoing bios started by readahead that have
   5216	 * their endio callback (extent_io.c:end_bio_extent_readpage)
    5217	 * still in progress (they unlocked the pages in the bio but have not yet
    5218	 * unlocked the ranges in the io tree). Therefore this means some
   5219	 * ranges can still be locked and eviction started because before
   5220	 * submitting those bios, which are executed by a separate task (work
   5221	 * queue kthread), inode references (inode->i_count) were not taken
   5222	 * (which would be dropped in the end io callback of each bio).
   5223	 * Therefore here we effectively end up waiting for those bios and
   5224	 * anyone else holding locked ranges without having bumped the inode's
   5225	 * reference count - if we don't do it, when they access the inode's
    5226	 * io_tree to unlock a range it may be too late, leading to a
    5227	 * use-after-free issue.
   5228	 */
   5229	spin_lock(&io_tree->lock);
   5230	while (!RB_EMPTY_ROOT(&io_tree->state)) {
   5231		struct extent_state *state;
   5232		struct extent_state *cached_state = NULL;
   5233		u64 start;
   5234		u64 end;
   5235		unsigned state_flags;
   5236
   5237		node = rb_first(&io_tree->state);
   5238		state = rb_entry(node, struct extent_state, rb_node);
   5239		start = state->start;
   5240		end = state->end;
   5241		state_flags = state->state;
   5242		spin_unlock(&io_tree->lock);
   5243
   5244		lock_extent_bits(io_tree, start, end, &cached_state);
   5245
   5246		/*
   5247		 * If still has DELALLOC flag, the extent didn't reach disk,
   5248		 * and its reserved space won't be freed by delayed_ref.
   5249		 * So we need to free its reserved space here.
   5250		 * (Refer to comment in btrfs_invalidate_folio, case 2)
   5251		 *
   5252		 * Note, end is the bytenr of last byte, so we need + 1 here.
   5253		 */
   5254		if (state_flags & EXTENT_DELALLOC)
   5255			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
   5256					       end - start + 1);
   5257
   5258		clear_extent_bit(io_tree, start, end,
   5259				 EXTENT_LOCKED | EXTENT_DELALLOC |
   5260				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
   5261				 &cached_state);
   5262
   5263		cond_resched();
   5264		spin_lock(&io_tree->lock);
   5265	}
   5266	spin_unlock(&io_tree->lock);
   5267}
   5268
   5269static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
   5270							struct btrfs_block_rsv *rsv)
   5271{
   5272	struct btrfs_fs_info *fs_info = root->fs_info;
   5273	struct btrfs_trans_handle *trans;
   5274	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
   5275	int ret;
   5276
   5277	/*
   5278	 * Eviction should be taking place at some place safe because of our
   5279	 * delayed iputs.  However the normal flushing code will run delayed
   5280	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
   5281	 *
   5282	 * We reserve the delayed_refs_extra here again because we can't use
   5283	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
   5284	 * above.  We reserve our extra bit here because we generate a ton of
   5285	 * delayed refs activity by truncating.
   5286	 *
   5287	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
   5288	 * if we fail to make this reservation we can re-try without the
   5289	 * delayed_refs_extra so we can make some forward progress.
   5290	 */
   5291	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
   5292				     BTRFS_RESERVE_FLUSH_EVICT);
   5293	if (ret) {
   5294		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
   5295					     BTRFS_RESERVE_FLUSH_EVICT);
   5296		if (ret) {
   5297			btrfs_warn(fs_info,
   5298				   "could not allocate space for delete; will truncate on mount");
   5299			return ERR_PTR(-ENOSPC);
   5300		}
   5301		delayed_refs_extra = 0;
   5302	}
   5303
   5304	trans = btrfs_join_transaction(root);
   5305	if (IS_ERR(trans))
   5306		return trans;
   5307
   5308	if (delayed_refs_extra) {
   5309		trans->block_rsv = &fs_info->trans_block_rsv;
   5310		trans->bytes_reserved = delayed_refs_extra;
   5311		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
   5312					delayed_refs_extra, 1);
   5313	}
   5314	return trans;
   5315}
   5316
   5317void btrfs_evict_inode(struct inode *inode)
   5318{
   5319	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   5320	struct btrfs_trans_handle *trans;
   5321	struct btrfs_root *root = BTRFS_I(inode)->root;
   5322	struct btrfs_block_rsv *rsv;
   5323	int ret;
   5324
   5325	trace_btrfs_inode_evict(inode);
   5326
   5327	if (!root) {
   5328		fsverity_cleanup_inode(inode);
   5329		clear_inode(inode);
   5330		return;
   5331	}
   5332
   5333	evict_inode_truncate_pages(inode);
   5334
   5335	if (inode->i_nlink &&
   5336	    ((btrfs_root_refs(&root->root_item) != 0 &&
   5337	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
   5338	     btrfs_is_free_space_inode(BTRFS_I(inode))))
   5339		goto no_delete;
   5340
   5341	if (is_bad_inode(inode))
   5342		goto no_delete;
   5343
   5344	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
   5345
   5346	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
   5347		goto no_delete;
   5348
   5349	if (inode->i_nlink > 0) {
   5350		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
   5351		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
   5352		goto no_delete;
   5353	}
   5354
   5355	/*
    5356	 * This makes sure the inode item in the tree is uptodate and the space for
   5357	 * the inode update is released.
   5358	 */
   5359	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
   5360	if (ret)
   5361		goto no_delete;
   5362
   5363	/*
   5364	 * This drops any pending insert or delete operations we have for this
   5365	 * inode.  We could have a delayed dir index deletion queued up, but
   5366	 * we're removing the inode completely so that'll be taken care of in
   5367	 * the truncate.
   5368	 */
   5369	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
   5370
   5371	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
   5372	if (!rsv)
   5373		goto no_delete;
   5374	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
   5375	rsv->failfast = 1;
   5376
   5377	btrfs_i_size_write(BTRFS_I(inode), 0);
   5378
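	/*
	 * btrfs_truncate_inode_items() drops the inode's items in batches.
	 * -EAGAIN and -ENOSPC mean there is more work to do, so refill the
	 * reservation and keep looping; 0 means everything is gone.
	 */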
   5379	while (1) {
   5380		struct btrfs_truncate_control control = {
   5381			.inode = BTRFS_I(inode),
   5382			.ino = btrfs_ino(BTRFS_I(inode)),
   5383			.new_size = 0,
   5384			.min_type = 0,
   5385		};
   5386
   5387		trans = evict_refill_and_join(root, rsv);
   5388		if (IS_ERR(trans))
   5389			goto free_rsv;
   5390
   5391		trans->block_rsv = rsv;
   5392
   5393		ret = btrfs_truncate_inode_items(trans, root, &control);
   5394		trans->block_rsv = &fs_info->trans_block_rsv;
   5395		btrfs_end_transaction(trans);
   5396		btrfs_btree_balance_dirty(fs_info);
   5397		if (ret && ret != -ENOSPC && ret != -EAGAIN)
   5398			goto free_rsv;
   5399		else if (!ret)
   5400			break;
   5401	}
   5402
   5403	/*
   5404	 * Errors here aren't a big deal, it just means we leave orphan items in
   5405	 * the tree. They will be cleaned up on the next mount. If the inode
   5406	 * number gets reused, cleanup deletes the orphan item without doing
   5407	 * anything, and unlink reuses the existing orphan item.
   5408	 *
   5409	 * If it turns out that we are dropping too many of these, we might want
   5410	 * to add a mechanism for retrying these after a commit.
   5411	 */
   5412	trans = evict_refill_and_join(root, rsv);
   5413	if (!IS_ERR(trans)) {
   5414		trans->block_rsv = rsv;
   5415		btrfs_orphan_del(trans, BTRFS_I(inode));
   5416		trans->block_rsv = &fs_info->trans_block_rsv;
   5417		btrfs_end_transaction(trans);
   5418	}
   5419
   5420free_rsv:
   5421	btrfs_free_block_rsv(fs_info, rsv);
   5422no_delete:
   5423	/*
   5424	 * If we didn't successfully delete, the orphan item will still be in
   5425	 * the tree and we'll retry on the next mount. Again, we might also want
   5426	 * to retry these periodically in the future.
   5427	 */
   5428	btrfs_remove_delayed_node(BTRFS_I(inode));
   5429	fsverity_cleanup_inode(inode);
   5430	clear_inode(inode);
   5431}
   5432
   5433/*
   5434 * Return the key found in the dir entry in the location pointer, fill @type
   5435 * with BTRFS_FT_*, and return 0.
   5436 *
   5437 * If no dir entries were found, returns -ENOENT.
   5438 * If found a corrupted location in dir entry, returns -EUCLEAN.
   5439 */
   5440static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
   5441			       struct btrfs_key *location, u8 *type)
   5442{
   5443	const char *name = dentry->d_name.name;
   5444	int namelen = dentry->d_name.len;
   5445	struct btrfs_dir_item *di;
   5446	struct btrfs_path *path;
   5447	struct btrfs_root *root = BTRFS_I(dir)->root;
   5448	int ret = 0;
   5449
   5450	path = btrfs_alloc_path();
   5451	if (!path)
   5452		return -ENOMEM;
   5453
   5454	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
   5455			name, namelen, 0);
   5456	if (IS_ERR_OR_NULL(di)) {
   5457		ret = di ? PTR_ERR(di) : -ENOENT;
   5458		goto out;
   5459	}
   5460
   5461	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
   5462	if (location->type != BTRFS_INODE_ITEM_KEY &&
   5463	    location->type != BTRFS_ROOT_ITEM_KEY) {
   5464		ret = -EUCLEAN;
   5465		btrfs_warn(root->fs_info,
   5466"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
   5467			   __func__, name, btrfs_ino(BTRFS_I(dir)),
   5468			   location->objectid, location->type, location->offset);
   5469	}
   5470	if (!ret)
   5471		*type = btrfs_dir_type(path->nodes[0], di);
   5472out:
   5473	btrfs_free_path(path);
   5474	return ret;
   5475}
   5476
   5477/*
   5478 * when we hit a tree root in a directory, the btrfs part of the inode
   5479 * needs to be changed to reflect the root directory of the tree root.  This
   5480 * is kind of like crossing a mount point.
   5481 */
   5482static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
   5483				    struct inode *dir,
   5484				    struct dentry *dentry,
   5485				    struct btrfs_key *location,
   5486				    struct btrfs_root **sub_root)
   5487{
   5488	struct btrfs_path *path;
   5489	struct btrfs_root *new_root;
   5490	struct btrfs_root_ref *ref;
   5491	struct extent_buffer *leaf;
   5492	struct btrfs_key key;
   5493	int ret;
   5494	int err = 0;
   5495
   5496	path = btrfs_alloc_path();
   5497	if (!path) {
   5498		err = -ENOMEM;
   5499		goto out;
   5500	}
   5501
   5502	err = -ENOENT;
   5503	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
   5504	key.type = BTRFS_ROOT_REF_KEY;
   5505	key.offset = location->objectid;
   5506
   5507	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
   5508	if (ret) {
   5509		if (ret < 0)
   5510			err = ret;
   5511		goto out;
   5512	}
   5513
   5514	leaf = path->nodes[0];
   5515	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
   5516	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
   5517	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
   5518		goto out;
   5519
   5520	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
   5521				   (unsigned long)(ref + 1),
   5522				   dentry->d_name.len);
   5523	if (ret)
   5524		goto out;
   5525
   5526	btrfs_release_path(path);
   5527
   5528	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
   5529	if (IS_ERR(new_root)) {
   5530		err = PTR_ERR(new_root);
   5531		goto out;
   5532	}
   5533
   5534	*sub_root = new_root;
   5535	location->objectid = btrfs_root_dirid(&new_root->root_item);
   5536	location->type = BTRFS_INODE_ITEM_KEY;
   5537	location->offset = 0;
   5538	err = 0;
   5539out:
   5540	btrfs_free_path(path);
   5541	return err;
   5542}
   5543
   5544static void inode_tree_add(struct inode *inode)
   5545{
   5546	struct btrfs_root *root = BTRFS_I(inode)->root;
   5547	struct btrfs_inode *entry;
   5548	struct rb_node **p;
   5549	struct rb_node *parent;
   5550	struct rb_node *new = &BTRFS_I(inode)->rb_node;
   5551	u64 ino = btrfs_ino(BTRFS_I(inode));
   5552
   5553	if (inode_unhashed(inode))
   5554		return;
   5555	parent = NULL;
   5556	spin_lock(&root->inode_lock);
   5557	p = &root->inode_tree.rb_node;
   5558	while (*p) {
   5559		parent = *p;
   5560		entry = rb_entry(parent, struct btrfs_inode, rb_node);
   5561
   5562		if (ino < btrfs_ino(entry))
   5563			p = &parent->rb_left;
   5564		else if (ino > btrfs_ino(entry))
   5565			p = &parent->rb_right;
   5566		else {
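			/*
			 * An inode with the same number is still in the tree
			 * but is on its way out (I_WILL_FREE/I_FREEING), so
			 * take over its slot.
			 */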
   5567			WARN_ON(!(entry->vfs_inode.i_state &
   5568				  (I_WILL_FREE | I_FREEING)));
   5569			rb_replace_node(parent, new, &root->inode_tree);
   5570			RB_CLEAR_NODE(parent);
   5571			spin_unlock(&root->inode_lock);
   5572			return;
   5573		}
   5574	}
   5575	rb_link_node(new, parent, p);
   5576	rb_insert_color(new, &root->inode_tree);
   5577	spin_unlock(&root->inode_lock);
   5578}
   5579
   5580static void inode_tree_del(struct btrfs_inode *inode)
   5581{
   5582	struct btrfs_root *root = inode->root;
   5583	int empty = 0;
   5584
   5585	spin_lock(&root->inode_lock);
   5586	if (!RB_EMPTY_NODE(&inode->rb_node)) {
   5587		rb_erase(&inode->rb_node, &root->inode_tree);
   5588		RB_CLEAR_NODE(&inode->rb_node);
   5589		empty = RB_EMPTY_ROOT(&root->inode_tree);
   5590	}
   5591	spin_unlock(&root->inode_lock);
   5592
   5593	if (empty && btrfs_root_refs(&root->root_item) == 0) {
   5594		spin_lock(&root->inode_lock);
   5595		empty = RB_EMPTY_ROOT(&root->inode_tree);
   5596		spin_unlock(&root->inode_lock);
   5597		if (empty)
   5598			btrfs_add_dead_root(root);
   5599	}
   5600}
   5601
   5602
   5603static int btrfs_init_locked_inode(struct inode *inode, void *p)
   5604{
   5605	struct btrfs_iget_args *args = p;
   5606
   5607	inode->i_ino = args->ino;
   5608	BTRFS_I(inode)->location.objectid = args->ino;
   5609	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
   5610	BTRFS_I(inode)->location.offset = 0;
   5611	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
   5612	BUG_ON(args->root && !BTRFS_I(inode)->root);
   5613	return 0;
   5614}
   5615
   5616static int btrfs_find_actor(struct inode *inode, void *opaque)
   5617{
   5618	struct btrfs_iget_args *args = opaque;
   5619
   5620	return args->ino == BTRFS_I(inode)->location.objectid &&
   5621		args->root == BTRFS_I(inode)->root;
   5622}
   5623
   5624static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
   5625				       struct btrfs_root *root)
   5626{
   5627	struct inode *inode;
   5628	struct btrfs_iget_args args;
   5629	unsigned long hashval = btrfs_inode_hash(ino, root);
   5630
   5631	args.ino = ino;
   5632	args.root = root;
   5633
   5634	inode = iget5_locked(s, hashval, btrfs_find_actor,
   5635			     btrfs_init_locked_inode,
   5636			     (void *)&args);
   5637	return inode;
   5638}
   5639
   5640/*
   5641 * Get an inode object given its inode number and corresponding root.
   5642 * Path can be preallocated to prevent recursing back to iget through
   5643 * allocator. NULL is also valid but may require an additional allocation
   5644 * later.
   5645 */
   5646struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
   5647			      struct btrfs_root *root, struct btrfs_path *path)
   5648{
   5649	struct inode *inode;
   5650
   5651	inode = btrfs_iget_locked(s, ino, root);
   5652	if (!inode)
   5653		return ERR_PTR(-ENOMEM);
   5654
   5655	if (inode->i_state & I_NEW) {
   5656		int ret;
   5657
   5658		ret = btrfs_read_locked_inode(inode, path);
   5659		if (!ret) {
   5660			inode_tree_add(inode);
   5661			unlock_new_inode(inode);
   5662		} else {
   5663			iget_failed(inode);
   5664			/*
   5665			 * ret > 0 can come from btrfs_search_slot called by
    5666			 * btrfs_read_locked_inode; this means the inode item
   5667			 * was not found.
   5668			 */
   5669			if (ret > 0)
   5670				ret = -ENOENT;
   5671			inode = ERR_PTR(ret);
   5672		}
   5673	}
   5674
   5675	return inode;
   5676}
   5677
   5678struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
   5679{
   5680	return btrfs_iget_path(s, ino, root, NULL);
   5681}
   5682
   5683static struct inode *new_simple_dir(struct super_block *s,
   5684				    struct btrfs_key *key,
   5685				    struct btrfs_root *root)
   5686{
   5687	struct inode *inode = new_inode(s);
   5688
   5689	if (!inode)
   5690		return ERR_PTR(-ENOMEM);
   5691
   5692	BTRFS_I(inode)->root = btrfs_grab_root(root);
   5693	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
   5694	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
   5695
   5696	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
   5697	/*
   5698	 * We only need lookup, the rest is read-only and there's no inode
   5699	 * associated with the dentry
   5700	 */
   5701	inode->i_op = &simple_dir_inode_operations;
   5702	inode->i_opflags &= ~IOP_XATTR;
   5703	inode->i_fop = &simple_dir_operations;
   5704	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
   5705	inode->i_mtime = current_time(inode);
   5706	inode->i_atime = inode->i_mtime;
   5707	inode->i_ctime = inode->i_mtime;
   5708	BTRFS_I(inode)->i_otime = inode->i_mtime;
   5709
   5710	return inode;
   5711}
   5712
   5713static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
   5714static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
   5715static_assert(BTRFS_FT_DIR == FT_DIR);
   5716static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
   5717static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
   5718static_assert(BTRFS_FT_FIFO == FT_FIFO);
   5719static_assert(BTRFS_FT_SOCK == FT_SOCK);
   5720static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
   5721
   5722static inline u8 btrfs_inode_type(struct inode *inode)
   5723{
   5724	return fs_umode_to_ftype(inode->i_mode);
   5725}
   5726
   5727struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
   5728{
   5729	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
   5730	struct inode *inode;
   5731	struct btrfs_root *root = BTRFS_I(dir)->root;
   5732	struct btrfs_root *sub_root = root;
   5733	struct btrfs_key location;
   5734	u8 di_type = 0;
   5735	int ret = 0;
   5736
   5737	if (dentry->d_name.len > BTRFS_NAME_LEN)
   5738		return ERR_PTR(-ENAMETOOLONG);
   5739
   5740	ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
   5741	if (ret < 0)
   5742		return ERR_PTR(ret);
   5743
   5744	if (location.type == BTRFS_INODE_ITEM_KEY) {
   5745		inode = btrfs_iget(dir->i_sb, location.objectid, root);
   5746		if (IS_ERR(inode))
   5747			return inode;
   5748
   5749		/* Do extra check against inode mode with di_type */
   5750		if (btrfs_inode_type(inode) != di_type) {
   5751			btrfs_crit(fs_info,
   5752"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
   5753				  inode->i_mode, btrfs_inode_type(inode),
   5754				  di_type);
   5755			iput(inode);
   5756			return ERR_PTR(-EUCLEAN);
   5757		}
   5758		return inode;
   5759	}
   5760
   5761	ret = fixup_tree_root_location(fs_info, dir, dentry,
   5762				       &location, &sub_root);
   5763	if (ret < 0) {
   5764		if (ret != -ENOENT)
   5765			inode = ERR_PTR(ret);
   5766		else
   5767			inode = new_simple_dir(dir->i_sb, &location, sub_root);
   5768	} else {
   5769		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
   5770	}
   5771	if (root != sub_root)
   5772		btrfs_put_root(sub_root);
   5773
   5774	if (!IS_ERR(inode) && root != sub_root) {
   5775		down_read(&fs_info->cleanup_work_sem);
   5776		if (!sb_rdonly(inode->i_sb))
   5777			ret = btrfs_orphan_cleanup(sub_root);
   5778		up_read(&fs_info->cleanup_work_sem);
   5779		if (ret) {
   5780			iput(inode);
   5781			inode = ERR_PTR(ret);
   5782		}
   5783	}
   5784
   5785	return inode;
   5786}
   5787
   5788static int btrfs_dentry_delete(const struct dentry *dentry)
   5789{
   5790	struct btrfs_root *root;
   5791	struct inode *inode = d_inode(dentry);
   5792
   5793	if (!inode && !IS_ROOT(dentry))
   5794		inode = d_inode(dentry->d_parent);
   5795
   5796	if (inode) {
   5797		root = BTRFS_I(inode)->root;
   5798		if (btrfs_root_refs(&root->root_item) == 0)
   5799			return 1;
   5800
   5801		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
   5802			return 1;
   5803	}
   5804	return 0;
   5805}
   5806
   5807static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
   5808				   unsigned int flags)
   5809{
   5810	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
   5811
   5812	if (inode == ERR_PTR(-ENOENT))
   5813		inode = NULL;
   5814	return d_splice_alias(inode, dentry);
   5815}
   5816
   5817/*
   5818 * All this infrastructure exists because dir_emit can fault, and we are holding
   5819 * the tree lock when doing readdir.  For now just allocate a buffer and copy
   5820 * our information into that, and then dir_emit from the buffer.  This is
   5821 * similar to what NFS does, only we don't keep the buffer around in pagecache
   5822 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
   5823 * copy_to_user_inatomic so we don't have to worry about page faulting under the
   5824 * tree lock.
   5825 */
   5826static int btrfs_opendir(struct inode *inode, struct file *file)
   5827{
   5828	struct btrfs_file_private *private;
   5829
   5830	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
   5831	if (!private)
   5832		return -ENOMEM;
   5833	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
   5834	if (!private->filldir_buf) {
   5835		kfree(private);
   5836		return -ENOMEM;
   5837	}
   5838	file->private_data = private;
   5839	return 0;
   5840}
   5841
   5842struct dir_entry {
   5843	u64 ino;
   5844	u64 offset;
   5845	unsigned type;
   5846	int name_len;
   5847};
   5848
   5849static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
   5850{
   5851	while (entries--) {
   5852		struct dir_entry *entry = addr;
   5853		char *name = (char *)(entry + 1);
   5854
   5855		ctx->pos = get_unaligned(&entry->offset);
   5856		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
   5857					 get_unaligned(&entry->ino),
   5858					 get_unaligned(&entry->type)))
   5859			return 1;
   5860		addr += sizeof(struct dir_entry) +
   5861			get_unaligned(&entry->name_len);
   5862		ctx->pos++;
   5863	}
   5864	return 0;
   5865}
   5866
   5867static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
   5868{
   5869	struct inode *inode = file_inode(file);
   5870	struct btrfs_root *root = BTRFS_I(inode)->root;
   5871	struct btrfs_file_private *private = file->private_data;
   5872	struct btrfs_dir_item *di;
   5873	struct btrfs_key key;
   5874	struct btrfs_key found_key;
   5875	struct btrfs_path *path;
   5876	void *addr;
   5877	struct list_head ins_list;
   5878	struct list_head del_list;
   5879	int ret;
   5880	char *name_ptr;
   5881	int name_len;
   5882	int entries = 0;
   5883	int total_len = 0;
   5884	bool put = false;
   5885	struct btrfs_key location;
   5886
   5887	if (!dir_emit_dots(file, ctx))
   5888		return 0;
   5889
   5890	path = btrfs_alloc_path();
   5891	if (!path)
   5892		return -ENOMEM;
   5893
   5894	addr = private->filldir_buf;
   5895	path->reada = READA_FORWARD;
   5896
   5897	INIT_LIST_HEAD(&ins_list);
   5898	INIT_LIST_HEAD(&del_list);
   5899	put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
   5900
   5901again:
   5902	key.type = BTRFS_DIR_INDEX_KEY;
   5903	key.offset = ctx->pos;
   5904	key.objectid = btrfs_ino(BTRFS_I(inode));
   5905
   5906	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
   5907		struct dir_entry *entry;
   5908		struct extent_buffer *leaf = path->nodes[0];
   5909
   5910		if (found_key.objectid != key.objectid)
   5911			break;
   5912		if (found_key.type != BTRFS_DIR_INDEX_KEY)
   5913			break;
   5914		if (found_key.offset < ctx->pos)
   5915			continue;
   5916		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
   5917			continue;
   5918		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
   5919		name_len = btrfs_dir_name_len(leaf, di);
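		/*
		 * The filldir buffer is one page. If this entry won't fit,
		 * drop the path (and with it the tree locks) and flush what we
		 * have to dir_emit before continuing.
		 */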
   5920		if ((total_len + sizeof(struct dir_entry) + name_len) >=
   5921		    PAGE_SIZE) {
   5922			btrfs_release_path(path);
   5923			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
   5924			if (ret)
   5925				goto nopos;
   5926			addr = private->filldir_buf;
   5927			entries = 0;
   5928			total_len = 0;
   5929			goto again;
   5930		}
   5931
   5932		entry = addr;
   5933		put_unaligned(name_len, &entry->name_len);
   5934		name_ptr = (char *)(entry + 1);
   5935		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
   5936				   name_len);
   5937		put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
   5938				&entry->type);
   5939		btrfs_dir_item_key_to_cpu(leaf, di, &location);
   5940		put_unaligned(location.objectid, &entry->ino);
   5941		put_unaligned(found_key.offset, &entry->offset);
   5942		entries++;
   5943		addr += sizeof(struct dir_entry) + name_len;
   5944		total_len += sizeof(struct dir_entry) + name_len;
   5945	}
   5946	/* Catch error encountered during iteration */
   5947	if (ret < 0)
   5948		goto err;
   5949
   5950	btrfs_release_path(path);
   5951
   5952	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
   5953	if (ret)
   5954		goto nopos;
   5955
   5956	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
   5957	if (ret)
   5958		goto nopos;
   5959
   5960	/*
   5961	 * Stop new entries from being returned after we return the last
   5962	 * entry.
   5963	 *
   5964	 * New directory entries are assigned a strictly increasing
   5965	 * offset.  This means that new entries created during readdir
   5966	 * are *guaranteed* to be seen in the future by that readdir.
   5967	 * This has broken buggy programs which operate on names as
   5968	 * they're returned by readdir.  Until we re-use freed offsets
   5969	 * we have this hack to stop new entries from being returned
   5970	 * under the assumption that they'll never reach this huge
   5971	 * offset.
   5972	 *
   5973	 * This is being careful not to overflow 32bit loff_t unless the
   5974	 * last entry requires it because doing so has broken 32bit apps
   5975	 * in the past.
   5976	 */
   5977	if (ctx->pos >= INT_MAX)
   5978		ctx->pos = LLONG_MAX;
   5979	else
   5980		ctx->pos = INT_MAX;
   5981nopos:
   5982	ret = 0;
   5983err:
   5984	if (put)
   5985		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
   5986	btrfs_free_path(path);
   5987	return ret;
   5988}
   5989
   5990/*
   5991 * This is somewhat expensive, updating the tree every time the
   5992 * inode changes.  But, it is most likely to find the inode in cache.
   5993 * FIXME, needs more benchmarking...there are no reasons other than performance
   5994 * to keep or drop this code.
   5995 */
   5996static int btrfs_dirty_inode(struct inode *inode)
   5997{
   5998	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   5999	struct btrfs_root *root = BTRFS_I(inode)->root;
   6000	struct btrfs_trans_handle *trans;
   6001	int ret;
   6002
   6003	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
   6004		return 0;
   6005
   6006	trans = btrfs_join_transaction(root);
   6007	if (IS_ERR(trans))
   6008		return PTR_ERR(trans);
   6009
   6010	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   6011	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
   6012		/* whoops, lets try again with the full transaction */
   6013		btrfs_end_transaction(trans);
   6014		trans = btrfs_start_transaction(root, 1);
   6015		if (IS_ERR(trans))
   6016			return PTR_ERR(trans);
   6017
   6018		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   6019	}
   6020	btrfs_end_transaction(trans);
   6021	if (BTRFS_I(inode)->delayed_node)
   6022		btrfs_balance_delayed_items(fs_info);
   6023
   6024	return ret;
   6025}
   6026
   6027/*
   6028 * This is a copy of file_update_time.  We need this so we can return error on
   6029 * ENOSPC for updating the inode in the case of file write and mmap writes.
   6030 */
   6031static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
   6032			     int flags)
   6033{
   6034	struct btrfs_root *root = BTRFS_I(inode)->root;
   6035	bool dirty = flags & ~S_VERSION;
   6036
   6037	if (btrfs_root_readonly(root))
   6038		return -EROFS;
   6039
   6040	if (flags & S_VERSION)
   6041		dirty |= inode_maybe_inc_iversion(inode, dirty);
   6042	if (flags & S_CTIME)
   6043		inode->i_ctime = *now;
   6044	if (flags & S_MTIME)
   6045		inode->i_mtime = *now;
   6046	if (flags & S_ATIME)
   6047		inode->i_atime = *now;
   6048	return dirty ? btrfs_dirty_inode(inode) : 0;
   6049}
   6050
   6051/*
   6052 * find the highest existing sequence number in a directory
   6053 * and then set the in-memory index_cnt variable to reflect
   6054 * free sequence numbers
   6055 */
   6056static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
   6057{
   6058	struct btrfs_root *root = inode->root;
   6059	struct btrfs_key key, found_key;
   6060	struct btrfs_path *path;
   6061	struct extent_buffer *leaf;
   6062	int ret;
   6063
   6064	key.objectid = btrfs_ino(inode);
   6065	key.type = BTRFS_DIR_INDEX_KEY;
   6066	key.offset = (u64)-1;
   6067
   6068	path = btrfs_alloc_path();
   6069	if (!path)
   6070		return -ENOMEM;
   6071
   6072	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   6073	if (ret < 0)
   6074		goto out;
   6075	/* FIXME: we should be able to handle this */
   6076	if (ret == 0)
   6077		goto out;
   6078	ret = 0;
   6079
   6080	if (path->slots[0] == 0) {
   6081		inode->index_cnt = BTRFS_DIR_START_INDEX;
   6082		goto out;
   6083	}
   6084
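	/*
	 * The search for (ino, DIR_INDEX, (u64)-1) landed just past the last
	 * existing key, so step back one slot to look at it.
	 */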
   6085	path->slots[0]--;
   6086
   6087	leaf = path->nodes[0];
   6088	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   6089
   6090	if (found_key.objectid != btrfs_ino(inode) ||
   6091	    found_key.type != BTRFS_DIR_INDEX_KEY) {
   6092		inode->index_cnt = BTRFS_DIR_START_INDEX;
   6093		goto out;
   6094	}
   6095
   6096	inode->index_cnt = found_key.offset + 1;
   6097out:
   6098	btrfs_free_path(path);
   6099	return ret;
   6100}
   6101
   6102/*
    6103 * helper to find a free sequence number in a given directory.  This current
    6104 * code is very simple; later versions will do smarter things in the btree.
   6105 */
   6106int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
   6107{
   6108	int ret = 0;
   6109
   6110	if (dir->index_cnt == (u64)-1) {
   6111		ret = btrfs_inode_delayed_dir_index_count(dir);
   6112		if (ret) {
   6113			ret = btrfs_set_inode_index_count(dir);
   6114			if (ret)
   6115				return ret;
   6116		}
   6117	}
   6118
   6119	*index = dir->index_cnt;
   6120	dir->index_cnt++;
   6121
   6122	return ret;
   6123}
   6124
   6125static int btrfs_insert_inode_locked(struct inode *inode)
   6126{
   6127	struct btrfs_iget_args args;
   6128
   6129	args.ino = BTRFS_I(inode)->location.objectid;
   6130	args.root = BTRFS_I(inode)->root;
   6131
   6132	return insert_inode_locked4(inode,
   6133		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
   6134		   btrfs_find_actor, &args);
   6135}
   6136
   6137int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
   6138			    unsigned int *trans_num_items)
   6139{
   6140	struct inode *dir = args->dir;
   6141	struct inode *inode = args->inode;
   6142	int ret;
   6143
   6144	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
   6145	if (ret)
   6146		return ret;
   6147
   6148	/* 1 to add inode item */
   6149	*trans_num_items = 1;
   6150	/* 1 to add compression property */
   6151	if (BTRFS_I(dir)->prop_compress)
   6152		(*trans_num_items)++;
   6153	/* 1 to add default ACL xattr */
   6154	if (args->default_acl)
   6155		(*trans_num_items)++;
   6156	/* 1 to add access ACL xattr */
   6157	if (args->acl)
   6158		(*trans_num_items)++;
   6159#ifdef CONFIG_SECURITY
   6160	/* 1 to add LSM xattr */
   6161	if (dir->i_security)
   6162		(*trans_num_items)++;
   6163#endif
   6164	if (args->orphan) {
   6165		/* 1 to add orphan item */
   6166		(*trans_num_items)++;
   6167	} else {
   6168		/*
   6169		 * 1 to add dir item
   6170		 * 1 to add dir index
   6171		 * 1 to update parent inode item
   6172		 *
   6173		 * No need for 1 unit for the inode ref item because it is
   6174		 * inserted in a batch together with the inode item at
   6175		 * btrfs_create_new_inode().
   6176		 */
   6177		*trans_num_items += 3;
   6178	}
   6179	return 0;
   6180}
   6181
   6182void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
   6183{
   6184	posix_acl_release(args->acl);
   6185	posix_acl_release(args->default_acl);
   6186}
   6187
   6188/*
   6189 * Inherit flags from the parent inode.
   6190 *
   6191 * Currently only the compression flags and the cow flags are inherited.
   6192 */
   6193static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
   6194{
   6195	unsigned int flags;
   6196
   6197	flags = BTRFS_I(dir)->flags;
   6198
   6199	if (flags & BTRFS_INODE_NOCOMPRESS) {
   6200		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
   6201		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
   6202	} else if (flags & BTRFS_INODE_COMPRESS) {
   6203		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
   6204		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
   6205	}
   6206
   6207	if (flags & BTRFS_INODE_NODATACOW) {
   6208		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
   6209		if (S_ISREG(inode->i_mode))
   6210			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
   6211	}
   6212
   6213	btrfs_sync_inode_flags_to_i_flags(inode);
   6214}
   6215
   6216int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
   6217			   struct btrfs_new_inode_args *args)
   6218{
   6219	struct inode *dir = args->dir;
   6220	struct inode *inode = args->inode;
   6221	const char *name = args->orphan ? NULL : args->dentry->d_name.name;
   6222	int name_len = args->orphan ? 0 : args->dentry->d_name.len;
   6223	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
   6224	struct btrfs_root *root;
   6225	struct btrfs_inode_item *inode_item;
   6226	struct btrfs_key *location;
   6227	struct btrfs_path *path;
   6228	u64 objectid;
   6229	struct btrfs_inode_ref *ref;
   6230	struct btrfs_key key[2];
   6231	u32 sizes[2];
   6232	struct btrfs_item_batch batch;
   6233	unsigned long ptr;
   6234	int ret;
   6235
   6236	path = btrfs_alloc_path();
   6237	if (!path)
   6238		return -ENOMEM;
   6239
   6240	if (!args->subvol)
   6241		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
   6242	root = BTRFS_I(inode)->root;
   6243
   6244	ret = btrfs_get_free_objectid(root, &objectid);
   6245	if (ret)
   6246		goto out;
   6247	inode->i_ino = objectid;
   6248
   6249	if (args->orphan) {
   6250		/*
   6251		 * O_TMPFILE, set link count to 0, so that after this point, we
   6252		 * fill in an inode item with the correct link count.
   6253		 */
   6254		set_nlink(inode, 0);
   6255	} else {
   6256		trace_btrfs_inode_request(dir);
   6257
   6258		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
   6259		if (ret)
   6260			goto out;
   6261	}
   6262	/* index_cnt is ignored for everything but a dir. */
   6263	BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
   6264	BTRFS_I(inode)->generation = trans->transid;
   6265	inode->i_generation = BTRFS_I(inode)->generation;
   6266
   6267	/*
   6268	 * Subvolumes don't inherit flags from their parent directory.
   6269	 * Originally this was probably by accident, but we probably can't
   6270	 * change it now without compatibility issues.
   6271	 */
   6272	if (!args->subvol)
   6273		btrfs_inherit_iflags(inode, dir);
   6274
   6275	if (S_ISREG(inode->i_mode)) {
   6276		if (btrfs_test_opt(fs_info, NODATASUM))
   6277			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
   6278		if (btrfs_test_opt(fs_info, NODATACOW))
   6279			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
   6280				BTRFS_INODE_NODATASUM;
   6281	}
   6282
   6283	location = &BTRFS_I(inode)->location;
   6284	location->objectid = objectid;
   6285	location->offset = 0;
   6286	location->type = BTRFS_INODE_ITEM_KEY;
   6287
   6288	ret = btrfs_insert_inode_locked(inode);
   6289	if (ret < 0) {
   6290		if (!args->orphan)
   6291			BTRFS_I(dir)->index_cnt--;
   6292		goto out;
   6293	}
   6294
   6295	/*
   6296	 * We could have gotten an inode number from somebody who was fsynced
   6297	 * and then removed in this same transaction, so let's just set full
   6298	 * sync since it will be a full sync anyway and this will blow away the
   6299	 * old info in the log.
   6300	 */
   6301	btrfs_set_inode_full_sync(BTRFS_I(inode));
   6302
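        	/*
        	 * Prepare the keys and item sizes for the batch insert below:
        	 * always the inode item, plus the initial inode ref unless this
        	 * is an orphan (O_TMPFILE) inode.
        	 */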
   6303	key[0].objectid = objectid;
   6304	key[0].type = BTRFS_INODE_ITEM_KEY;
   6305	key[0].offset = 0;
   6306
   6307	sizes[0] = sizeof(struct btrfs_inode_item);
   6308
   6309	if (!args->orphan) {
   6310		/*
   6311		 * Start new inodes with an inode_ref. This is slightly more
   6312		 * efficient for small numbers of hard links since they will
   6313		 * be packed into one item. Extended refs will kick in if we
   6314		 * add more hard links than can fit in the ref item.
   6315		 */
   6316		key[1].objectid = objectid;
   6317		key[1].type = BTRFS_INODE_REF_KEY;
   6318		if (args->subvol) {
   6319			key[1].offset = objectid;
   6320			sizes[1] = 2 + sizeof(*ref);
   6321		} else {
   6322			key[1].offset = btrfs_ino(BTRFS_I(dir));
   6323			sizes[1] = name_len + sizeof(*ref);
   6324		}
   6325	}
   6326
   6327	batch.keys = &key[0];
   6328	batch.data_sizes = &sizes[0];
   6329	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
   6330	batch.nr = args->orphan ? 1 : 2;
   6331	ret = btrfs_insert_empty_items(trans, root, path, &batch);
   6332	if (ret != 0) {
   6333		btrfs_abort_transaction(trans, ret);
   6334		goto discard;
   6335	}
   6336
   6337	inode->i_mtime = current_time(inode);
   6338	inode->i_atime = inode->i_mtime;
   6339	inode->i_ctime = inode->i_mtime;
   6340	BTRFS_I(inode)->i_otime = inode->i_mtime;
   6341
   6342	/*
   6343	 * We're going to fill the inode item now, so at this point the inode
   6344	 * must be fully initialized.
   6345	 */
   6346
   6347	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
   6348				  struct btrfs_inode_item);
   6349	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
   6350			     sizeof(*inode_item));
   6351	fill_inode_item(trans, path->nodes[0], inode_item, inode);
   6352
   6353	if (!args->orphan) {
   6354		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
   6355				     struct btrfs_inode_ref);
   6356		ptr = (unsigned long)(ref + 1);
   6357		if (args->subvol) {
   6358			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
   6359			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
   6360			write_extent_buffer(path->nodes[0], "..", ptr, 2);
   6361		} else {
   6362			btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
   6363			btrfs_set_inode_ref_index(path->nodes[0], ref,
   6364						  BTRFS_I(inode)->dir_index);
   6365			write_extent_buffer(path->nodes[0], name, ptr, name_len);
   6366		}
   6367	}
   6368
   6369	btrfs_mark_buffer_dirty(path->nodes[0]);
   6370	btrfs_release_path(path);
   6371
   6372	if (args->subvol) {
   6373		struct inode *parent;
   6374
   6375		/*
   6376		 * Subvolumes inherit properties from their parent subvolume,
   6377		 * not the directory they were created in.
   6378		 */
   6379		parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
   6380				    BTRFS_I(dir)->root);
   6381		if (IS_ERR(parent)) {
   6382			ret = PTR_ERR(parent);
   6383		} else {
   6384			ret = btrfs_inode_inherit_props(trans, inode, parent);
   6385			iput(parent);
   6386		}
   6387	} else {
   6388		ret = btrfs_inode_inherit_props(trans, inode, dir);
   6389	}
   6390	if (ret) {
   6391		btrfs_err(fs_info,
   6392			  "error inheriting props for ino %llu (root %llu): %d",
   6393			  btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
   6394			  ret);
   6395	}
   6396
   6397	/*
   6398	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
   6399	 * probably a bug.
   6400	 */
   6401	if (!args->subvol) {
   6402		ret = btrfs_init_inode_security(trans, args);
   6403		if (ret) {
   6404			btrfs_abort_transaction(trans, ret);
   6405			goto discard;
   6406		}
   6407	}
   6408
   6409	inode_tree_add(inode);
   6410
   6411	trace_btrfs_inode_new(inode);
   6412	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
   6413
   6414	btrfs_update_root_times(trans, root);
   6415
   6416	if (args->orphan) {
   6417		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
   6418	} else {
   6419		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
   6420				     name_len, 0, BTRFS_I(inode)->dir_index);
   6421	}
   6422	if (ret) {
   6423		btrfs_abort_transaction(trans, ret);
   6424		goto discard;
   6425	}
   6426
   6427	ret = 0;
   6428	goto out;
   6429
   6430discard:
   6431	/*
   6432	 * discard_new_inode() calls iput(), but the caller owns the reference
   6433	 * to the inode.
   6434	 */
   6435	ihold(inode);
   6436	discard_new_inode(inode);
   6437out:
   6438	btrfs_free_path(path);
   6439	return ret;
   6440}
   6441
   6442/*
    6443 * Utility function to add 'inode' into 'parent_inode' with
    6444 * a given name and a given sequence number.
    6445 * If 'add_backref' is true, also insert a backref from the
    6446 * inode to the parent directory.
   6447 */
   6448int btrfs_add_link(struct btrfs_trans_handle *trans,
   6449		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
   6450		   const char *name, int name_len, int add_backref, u64 index)
   6451{
   6452	int ret = 0;
   6453	struct btrfs_key key;
   6454	struct btrfs_root *root = parent_inode->root;
   6455	u64 ino = btrfs_ino(inode);
   6456	u64 parent_ino = btrfs_ino(parent_inode);
   6457
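        	/*
        	 * When linking a subvolume root (ino == BTRFS_FIRST_FREE_OBJECTID),
        	 * the directory entry must point at the subvolume's root key
        	 * rather than at an inode item.
        	 */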
   6458	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
   6459		memcpy(&key, &inode->root->root_key, sizeof(key));
   6460	} else {
   6461		key.objectid = ino;
   6462		key.type = BTRFS_INODE_ITEM_KEY;
   6463		key.offset = 0;
   6464	}
   6465
   6466	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
   6467		ret = btrfs_add_root_ref(trans, key.objectid,
   6468					 root->root_key.objectid, parent_ino,
   6469					 index, name, name_len);
   6470	} else if (add_backref) {
   6471		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
   6472					     parent_ino, index);
   6473	}
   6474
   6475	/* Nothing to clean up yet */
   6476	if (ret)
   6477		return ret;
   6478
   6479	ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
   6480				    btrfs_inode_type(&inode->vfs_inode), index);
   6481	if (ret == -EEXIST || ret == -EOVERFLOW)
   6482		goto fail_dir_item;
   6483	else if (ret) {
   6484		btrfs_abort_transaction(trans, ret);
   6485		return ret;
   6486	}
   6487
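        	/*
        	 * A directory's i_size counts each name twice, once for the dir
        	 * item and once for the dir index item, hence name_len * 2.
        	 */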
   6488	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
   6489			   name_len * 2);
   6490	inode_inc_iversion(&parent_inode->vfs_inode);
   6491	/*
   6492	 * If we are replaying a log tree, we do not want to update the mtime
   6493	 * and ctime of the parent directory with the current time, since the
   6494	 * log replay procedure is responsible for setting them to their correct
   6495	 * values (the ones it had when the fsync was done).
   6496	 */
   6497	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
   6498		struct timespec64 now = current_time(&parent_inode->vfs_inode);
   6499
   6500		parent_inode->vfs_inode.i_mtime = now;
   6501		parent_inode->vfs_inode.i_ctime = now;
   6502	}
   6503	ret = btrfs_update_inode(trans, root, parent_inode);
   6504	if (ret)
   6505		btrfs_abort_transaction(trans, ret);
   6506	return ret;
   6507
   6508fail_dir_item:
   6509	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
   6510		u64 local_index;
   6511		int err;
   6512		err = btrfs_del_root_ref(trans, key.objectid,
   6513					 root->root_key.objectid, parent_ino,
   6514					 &local_index, name, name_len);
   6515		if (err)
   6516			btrfs_abort_transaction(trans, err);
   6517	} else if (add_backref) {
   6518		u64 local_index;
   6519		int err;
   6520
   6521		err = btrfs_del_inode_ref(trans, root, name, name_len,
   6522					  ino, parent_ino, &local_index);
   6523		if (err)
   6524			btrfs_abort_transaction(trans, err);
   6525	}
   6526
   6527	/* Return the original error code */
   6528	return ret;
   6529}
   6530
   6531static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
   6532			       struct inode *inode)
   6533{
   6534	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
   6535	struct btrfs_root *root = BTRFS_I(dir)->root;
   6536	struct btrfs_new_inode_args new_inode_args = {
   6537		.dir = dir,
   6538		.dentry = dentry,
   6539		.inode = inode,
   6540	};
   6541	unsigned int trans_num_items;
   6542	struct btrfs_trans_handle *trans;
   6543	int err;
   6544
   6545	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
   6546	if (err)
   6547		goto out_inode;
   6548
   6549	trans = btrfs_start_transaction(root, trans_num_items);
   6550	if (IS_ERR(trans)) {
   6551		err = PTR_ERR(trans);
   6552		goto out_new_inode_args;
   6553	}
   6554
   6555	err = btrfs_create_new_inode(trans, &new_inode_args);
   6556	if (!err)
   6557		d_instantiate_new(dentry, inode);
   6558
   6559	btrfs_end_transaction(trans);
   6560	btrfs_btree_balance_dirty(fs_info);
   6561out_new_inode_args:
   6562	btrfs_new_inode_args_destroy(&new_inode_args);
   6563out_inode:
   6564	if (err)
   6565		iput(inode);
   6566	return err;
   6567}
   6568
   6569static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
   6570		       struct dentry *dentry, umode_t mode, dev_t rdev)
   6571{
   6572	struct inode *inode;
   6573
   6574	inode = new_inode(dir->i_sb);
   6575	if (!inode)
   6576		return -ENOMEM;
   6577	inode_init_owner(mnt_userns, inode, dir, mode);
   6578	inode->i_op = &btrfs_special_inode_operations;
   6579	init_special_inode(inode, inode->i_mode, rdev);
   6580	return btrfs_create_common(dir, dentry, inode);
   6581}
   6582
   6583static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
   6584			struct dentry *dentry, umode_t mode, bool excl)
   6585{
   6586	struct inode *inode;
   6587
   6588	inode = new_inode(dir->i_sb);
   6589	if (!inode)
   6590		return -ENOMEM;
   6591	inode_init_owner(mnt_userns, inode, dir, mode);
   6592	inode->i_fop = &btrfs_file_operations;
   6593	inode->i_op = &btrfs_file_inode_operations;
   6594	inode->i_mapping->a_ops = &btrfs_aops;
   6595	return btrfs_create_common(dir, dentry, inode);
   6596}
   6597
   6598static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
   6599		      struct dentry *dentry)
   6600{
   6601	struct btrfs_trans_handle *trans = NULL;
   6602	struct btrfs_root *root = BTRFS_I(dir)->root;
   6603	struct inode *inode = d_inode(old_dentry);
   6604	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   6605	u64 index;
   6606	int err;
   6607	int drop_inode = 0;
   6608
    6609	/* Do not allow hard links (sys_link) across subvolumes of the same device. */
   6610	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
   6611		return -EXDEV;
   6612
   6613	if (inode->i_nlink >= BTRFS_LINK_MAX)
   6614		return -EMLINK;
   6615
   6616	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
   6617	if (err)
   6618		goto fail;
   6619
   6620	/*
   6621	 * 2 items for inode and inode ref
   6622	 * 2 items for dir items
   6623	 * 1 item for parent inode
   6624	 * 1 item for orphan item deletion if O_TMPFILE
   6625	 */
   6626	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
   6627	if (IS_ERR(trans)) {
   6628		err = PTR_ERR(trans);
   6629		trans = NULL;
   6630		goto fail;
   6631	}
   6632
   6633	/* There are several dir indexes for this inode, clear the cache. */
   6634	BTRFS_I(inode)->dir_index = 0ULL;
   6635	inc_nlink(inode);
   6636	inode_inc_iversion(inode);
   6637	inode->i_ctime = current_time(inode);
   6638	ihold(inode);
   6639	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
   6640
   6641	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
   6642			     dentry->d_name.name, dentry->d_name.len, 1, index);
   6643
   6644	if (err) {
   6645		drop_inode = 1;
   6646	} else {
   6647		struct dentry *parent = dentry->d_parent;
   6648
   6649		err = btrfs_update_inode(trans, root, BTRFS_I(inode));
   6650		if (err)
   6651			goto fail;
   6652		if (inode->i_nlink == 1) {
   6653			/*
   6654			 * If new hard link count is 1, it's a file created
   6655			 * with open(2) O_TMPFILE flag.
   6656			 */
   6657			err = btrfs_orphan_del(trans, BTRFS_I(inode));
   6658			if (err)
   6659				goto fail;
   6660		}
   6661		d_instantiate(dentry, inode);
   6662		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
   6663	}
   6664
   6665fail:
   6666	if (trans)
   6667		btrfs_end_transaction(trans);
   6668	if (drop_inode) {
   6669		inode_dec_link_count(inode);
   6670		iput(inode);
   6671	}
   6672	btrfs_btree_balance_dirty(fs_info);
   6673	return err;
   6674}
   6675
   6676static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
   6677		       struct dentry *dentry, umode_t mode)
   6678{
   6679	struct inode *inode;
   6680
   6681	inode = new_inode(dir->i_sb);
   6682	if (!inode)
   6683		return -ENOMEM;
   6684	inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode);
   6685	inode->i_op = &btrfs_dir_inode_operations;
   6686	inode->i_fop = &btrfs_dir_file_operations;
   6687	return btrfs_create_common(dir, dentry, inode);
   6688}
   6689
   6690static noinline int uncompress_inline(struct btrfs_path *path,
   6691				      struct page *page,
   6692				      size_t pg_offset, u64 extent_offset,
   6693				      struct btrfs_file_extent_item *item)
   6694{
   6695	int ret;
   6696	struct extent_buffer *leaf = path->nodes[0];
   6697	char *tmp;
   6698	size_t max_size;
   6699	unsigned long inline_size;
   6700	unsigned long ptr;
   6701	int compress_type;
   6702
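
        	/* Inline extent data starts at file offset 0, hence pg_offset must be 0. */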
   6703	WARN_ON(pg_offset != 0);
   6704	compress_type = btrfs_file_extent_compression(leaf, item);
   6705	max_size = btrfs_file_extent_ram_bytes(leaf, item);
   6706	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
   6707	tmp = kmalloc(inline_size, GFP_NOFS);
   6708	if (!tmp)
   6709		return -ENOMEM;
   6710	ptr = btrfs_file_extent_inline_start(item);
   6711
   6712	read_extent_buffer(leaf, tmp, ptr, inline_size);
   6713
   6714	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
   6715	ret = btrfs_decompress(compress_type, tmp, page,
   6716			       extent_offset, inline_size, max_size);
   6717
   6718	/*
   6719	 * decompression code contains a memset to fill in any space between the end
   6720	 * of the uncompressed data and the end of max_size in case the decompressed
   6721	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
   6722	 * the end of an inline extent and the beginning of the next block, so we
   6723	 * cover that region here.
   6724	 */
   6725
   6726	if (max_size + pg_offset < PAGE_SIZE)
   6727		memzero_page(page,  pg_offset + max_size,
   6728			     PAGE_SIZE - max_size - pg_offset);
   6729	kfree(tmp);
   6730	return ret;
   6731}
   6732
   6733/**
   6734 * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
   6735 * @inode:	file to search in
   6736 * @page:	page to read extent data into if the extent is inline
   6737 * @pg_offset:	offset into @page to copy to
   6738 * @start:	file offset
   6739 * @len:	length of range starting at @start
   6740 *
   6741 * This returns the first &struct extent_map which overlaps with the given
   6742 * range, reading it from the B-tree and caching it if necessary. Note that
   6743 * there may be more extents which overlap the given range after the returned
   6744 * extent_map.
   6745 *
   6746 * If @page is not NULL and the extent is inline, this also reads the extent
   6747 * data directly into the page and marks the extent up to date in the io_tree.
   6748 *
   6749 * Return: ERR_PTR on error, non-NULL extent_map on success.
   6750 */
   6751struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
   6752				    struct page *page, size_t pg_offset,
   6753				    u64 start, u64 len)
   6754{
   6755	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   6756	int ret = 0;
   6757	u64 extent_start = 0;
   6758	u64 extent_end = 0;
   6759	u64 objectid = btrfs_ino(inode);
   6760	int extent_type = -1;
   6761	struct btrfs_path *path = NULL;
   6762	struct btrfs_root *root = inode->root;
   6763	struct btrfs_file_extent_item *item;
   6764	struct extent_buffer *leaf;
   6765	struct btrfs_key found_key;
   6766	struct extent_map *em = NULL;
   6767	struct extent_map_tree *em_tree = &inode->extent_tree;
   6768	struct extent_io_tree *io_tree = &inode->io_tree;
   6769
   6770	read_lock(&em_tree->lock);
   6771	em = lookup_extent_mapping(em_tree, start, len);
   6772	read_unlock(&em_tree->lock);
   6773
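        	/*
        	 * A cached extent map is only reusable if it actually covers
        	 * @start and, when the caller supplied a page, if it is not an
        	 * inline extent whose data still has to be copied into the page.
        	 */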
   6774	if (em) {
   6775		if (em->start > start || em->start + em->len <= start)
   6776			free_extent_map(em);
   6777		else if (em->block_start == EXTENT_MAP_INLINE && page)
   6778			free_extent_map(em);
   6779		else
   6780			goto out;
   6781	}
   6782	em = alloc_extent_map();
   6783	if (!em) {
   6784		ret = -ENOMEM;
   6785		goto out;
   6786	}
   6787	em->start = EXTENT_MAP_HOLE;
   6788	em->orig_start = EXTENT_MAP_HOLE;
   6789	em->len = (u64)-1;
   6790	em->block_len = (u64)-1;
   6791
   6792	path = btrfs_alloc_path();
   6793	if (!path) {
   6794		ret = -ENOMEM;
   6795		goto out;
   6796	}
   6797
   6798	/* Chances are we'll be called again, so go ahead and do readahead */
   6799	path->reada = READA_FORWARD;
   6800
   6801	/*
    6802	 * The same explanation in load_free_space_cache applies here as well:
   6803	 * we only read when we're loading the free space cache, and at that
   6804	 * point the commit_root has everything we need.
   6805	 */
   6806	if (btrfs_is_free_space_inode(inode)) {
   6807		path->search_commit_root = 1;
   6808		path->skip_locking = 1;
   6809	}
   6810
   6811	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
   6812	if (ret < 0) {
   6813		goto out;
   6814	} else if (ret > 0) {
   6815		if (path->slots[0] == 0)
   6816			goto not_found;
   6817		path->slots[0]--;
   6818		ret = 0;
   6819	}
   6820
   6821	leaf = path->nodes[0];
   6822	item = btrfs_item_ptr(leaf, path->slots[0],
   6823			      struct btrfs_file_extent_item);
   6824	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   6825	if (found_key.objectid != objectid ||
   6826	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
   6827		/*
    6828		 * If we back up past the first extent, we want to move forward
    6829		 * and see if there is an extent in front of us; otherwise we'll
    6830		 * say there is a hole for our whole search range, which can
    6831		 * cause problems.
   6832		 */
   6833		extent_end = start;
   6834		goto next;
   6835	}
   6836
   6837	extent_type = btrfs_file_extent_type(leaf, item);
   6838	extent_start = found_key.offset;
   6839	extent_end = btrfs_file_extent_end(path);
   6840	if (extent_type == BTRFS_FILE_EXTENT_REG ||
   6841	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
    6842		/* Only a regular file can have regular/prealloc extents */
   6843		if (!S_ISREG(inode->vfs_inode.i_mode)) {
   6844			ret = -EUCLEAN;
   6845			btrfs_crit(fs_info,
   6846		"regular/prealloc extent found for non-regular inode %llu",
   6847				   btrfs_ino(inode));
   6848			goto out;
   6849		}
   6850		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
   6851						       extent_start);
   6852	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
   6853		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
   6854						      path->slots[0],
   6855						      extent_start);
   6856	}
   6857next:
   6858	if (start >= extent_end) {
   6859		path->slots[0]++;
   6860		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
   6861			ret = btrfs_next_leaf(root, path);
   6862			if (ret < 0)
   6863				goto out;
   6864			else if (ret > 0)
   6865				goto not_found;
   6866
   6867			leaf = path->nodes[0];
   6868		}
   6869		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   6870		if (found_key.objectid != objectid ||
   6871		    found_key.type != BTRFS_EXTENT_DATA_KEY)
   6872			goto not_found;
   6873		if (start + len <= found_key.offset)
   6874			goto not_found;
   6875		if (start > found_key.offset)
   6876			goto next;
   6877
   6878		/* New extent overlaps with existing one */
   6879		em->start = start;
   6880		em->orig_start = start;
   6881		em->len = found_key.offset - start;
   6882		em->block_start = EXTENT_MAP_HOLE;
   6883		goto insert;
   6884	}
   6885
   6886	btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
   6887
   6888	if (extent_type == BTRFS_FILE_EXTENT_REG ||
   6889	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
   6890		goto insert;
   6891	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
   6892		unsigned long ptr;
   6893		char *map;
   6894		size_t size;
   6895		size_t extent_offset;
   6896		size_t copy_size;
   6897
   6898		if (!page)
   6899			goto out;
   6900
   6901		size = btrfs_file_extent_ram_bytes(leaf, item);
   6902		extent_offset = page_offset(page) + pg_offset - extent_start;
   6903		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
   6904				  size - extent_offset);
   6905		em->start = extent_start + extent_offset;
   6906		em->len = ALIGN(copy_size, fs_info->sectorsize);
   6907		em->orig_block_len = em->len;
   6908		em->orig_start = em->start;
   6909		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
   6910
   6911		if (!PageUptodate(page)) {
   6912			if (btrfs_file_extent_compression(leaf, item) !=
   6913			    BTRFS_COMPRESS_NONE) {
   6914				ret = uncompress_inline(path, page, pg_offset,
   6915							extent_offset, item);
   6916				if (ret)
   6917					goto out;
   6918			} else {
   6919				map = kmap_local_page(page);
   6920				read_extent_buffer(leaf, map + pg_offset, ptr,
   6921						   copy_size);
   6922				if (pg_offset + copy_size < PAGE_SIZE) {
   6923					memset(map + pg_offset + copy_size, 0,
   6924					       PAGE_SIZE - pg_offset -
   6925					       copy_size);
   6926				}
   6927				kunmap_local(map);
   6928			}
   6929			flush_dcache_page(page);
   6930		}
   6931		set_extent_uptodate(io_tree, em->start,
   6932				    extent_map_end(em) - 1, NULL, GFP_NOFS);
   6933		goto insert;
   6934	}
   6935not_found:
   6936	em->start = start;
   6937	em->orig_start = start;
   6938	em->len = len;
   6939	em->block_start = EXTENT_MAP_HOLE;
   6940insert:
   6941	ret = 0;
   6942	btrfs_release_path(path);
   6943	if (em->start > start || extent_map_end(em) <= start) {
   6944		btrfs_err(fs_info,
   6945			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
   6946			  em->start, em->len, start, len);
   6947		ret = -EIO;
   6948		goto out;
   6949	}
   6950
   6951	write_lock(&em_tree->lock);
   6952	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
   6953	write_unlock(&em_tree->lock);
   6954out:
   6955	btrfs_free_path(path);
   6956
   6957	trace_btrfs_get_extent(root, inode, em);
   6958
   6959	if (ret) {
   6960		free_extent_map(em);
   6961		return ERR_PTR(ret);
   6962	}
   6963	return em;
   6964}
   6965
   6966struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
   6967					   u64 start, u64 len)
   6968{
   6969	struct extent_map *em;
   6970	struct extent_map *hole_em = NULL;
   6971	u64 delalloc_start = start;
   6972	u64 end;
   6973	u64 delalloc_len;
   6974	u64 delalloc_end;
   6975	int err = 0;
   6976
   6977	em = btrfs_get_extent(inode, NULL, 0, start, len);
   6978	if (IS_ERR(em))
   6979		return em;
   6980	/*
   6981	 * If our em maps to:
   6982	 * - a hole or
   6983	 * - a pre-alloc extent,
   6984	 * there might actually be delalloc bytes behind it.
   6985	 */
   6986	if (em->block_start != EXTENT_MAP_HOLE &&
   6987	    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
   6988		return em;
   6989	else
   6990		hole_em = em;
   6991
   6992	/* check to see if we've wrapped (len == -1 or similar) */
   6993	end = start + len;
   6994	if (end < start)
   6995		end = (u64)-1;
   6996	else
   6997		end -= 1;
   6998
   6999	em = NULL;
   7000
    7001	/* OK, we didn't find anything, let's look for delalloc */
   7002	delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
   7003				 end, len, EXTENT_DELALLOC, 1);
   7004	delalloc_end = delalloc_start + delalloc_len;
   7005	if (delalloc_end < delalloc_start)
   7006		delalloc_end = (u64)-1;
   7007
   7008	/*
   7009	 * We didn't find anything useful, return the original results from
   7010	 * get_extent()
   7011	 */
   7012	if (delalloc_start > end || delalloc_end <= start) {
   7013		em = hole_em;
   7014		hole_em = NULL;
   7015		goto out;
   7016	}
   7017
   7018	/*
   7019	 * Adjust the delalloc_start to make sure it doesn't go backwards from
   7020	 * the start they passed in
   7021	 */
   7022	delalloc_start = max(start, delalloc_start);
   7023	delalloc_len = delalloc_end - delalloc_start;
   7024
   7025	if (delalloc_len > 0) {
   7026		u64 hole_start;
   7027		u64 hole_len;
   7028		const u64 hole_end = extent_map_end(hole_em);
   7029
   7030		em = alloc_extent_map();
   7031		if (!em) {
   7032			err = -ENOMEM;
   7033			goto out;
   7034		}
   7035
   7036		ASSERT(hole_em);
   7037		/*
    7038		 * When btrfs_get_extent() can't find anything it returns one
    7039		 * huge hole.
    7040		 *
    7041		 * Make sure what it found really fits our range, and adjust it
    7042		 * so that it is based on the start from the caller.
   7043		 */
   7044		if (hole_end <= start || hole_em->start > end) {
   7045		       free_extent_map(hole_em);
   7046		       hole_em = NULL;
   7047		} else {
   7048		       hole_start = max(hole_em->start, start);
   7049		       hole_len = hole_end - hole_start;
   7050		}
   7051
   7052		if (hole_em && delalloc_start > hole_start) {
   7053			/*
   7054			 * Our hole starts before our delalloc, so we have to
   7055			 * return just the parts of the hole that go until the
   7056			 * delalloc starts
   7057			 */
   7058			em->len = min(hole_len, delalloc_start - hole_start);
   7059			em->start = hole_start;
   7060			em->orig_start = hole_start;
   7061			/*
   7062			 * Don't adjust block start at all, it is fixed at
   7063			 * EXTENT_MAP_HOLE
   7064			 */
   7065			em->block_start = hole_em->block_start;
   7066			em->block_len = hole_len;
   7067			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
   7068				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
   7069		} else {
   7070			/*
   7071			 * Hole is out of passed range or it starts after
   7072			 * delalloc range
   7073			 */
   7074			em->start = delalloc_start;
   7075			em->len = delalloc_len;
   7076			em->orig_start = delalloc_start;
   7077			em->block_start = EXTENT_MAP_DELALLOC;
   7078			em->block_len = delalloc_len;
   7079		}
   7080	} else {
   7081		return hole_em;
   7082	}
   7083out:
   7084
   7085	free_extent_map(hole_em);
   7086	if (err) {
   7087		free_extent_map(em);
   7088		return ERR_PTR(err);
   7089	}
   7090	return em;
   7091}
   7092
   7093static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
   7094						  const u64 start,
   7095						  const u64 len,
   7096						  const u64 orig_start,
   7097						  const u64 block_start,
   7098						  const u64 block_len,
   7099						  const u64 orig_block_len,
   7100						  const u64 ram_bytes,
   7101						  const int type)
   7102{
   7103	struct extent_map *em = NULL;
   7104	int ret;
   7105
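        	/*
        	 * A NOCOW write reuses an existing file extent, so it only needs
        	 * an ordered extent; PREALLOC/REGULAR writes also get a pinned
        	 * extent map for the new range.
        	 */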
   7106	if (type != BTRFS_ORDERED_NOCOW) {
   7107		em = create_io_em(inode, start, len, orig_start, block_start,
   7108				  block_len, orig_block_len, ram_bytes,
   7109				  BTRFS_COMPRESS_NONE, /* compress_type */
   7110				  type);
   7111		if (IS_ERR(em))
   7112			goto out;
   7113	}
   7114	ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
   7115				       block_len, 0,
   7116				       (1 << type) |
   7117				       (1 << BTRFS_ORDERED_DIRECT),
   7118				       BTRFS_COMPRESS_NONE);
   7119	if (ret) {
   7120		if (em) {
   7121			free_extent_map(em);
   7122			btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
   7123		}
   7124		em = ERR_PTR(ret);
   7125	}
   7126 out:
   7127
   7128	return em;
   7129}
   7130
   7131static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
   7132						  u64 start, u64 len)
   7133{
   7134	struct btrfs_root *root = inode->root;
   7135	struct btrfs_fs_info *fs_info = root->fs_info;
   7136	struct extent_map *em;
   7137	struct btrfs_key ins;
   7138	u64 alloc_hint;
   7139	int ret;
   7140
   7141	alloc_hint = get_extent_allocation_hint(inode, start, len);
   7142	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
   7143				   0, alloc_hint, &ins, 1, 1);
   7144	if (ret)
   7145		return ERR_PTR(ret);
   7146
   7147	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
   7148				     ins.objectid, ins.offset, ins.offset,
   7149				     ins.offset, BTRFS_ORDERED_REGULAR);
   7150	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
   7151	if (IS_ERR(em))
   7152		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
   7153					   1);
   7154
   7155	return em;
   7156}
   7157
   7158static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
   7159{
   7160	struct btrfs_block_group *block_group;
   7161	bool readonly = false;
   7162
   7163	block_group = btrfs_lookup_block_group(fs_info, bytenr);
   7164	if (!block_group || block_group->ro)
   7165		readonly = true;
   7166	if (block_group)
   7167		btrfs_put_block_group(block_group);
   7168	return readonly;
   7169}
   7170
   7171/*
   7172 * Check if we can do nocow write into the range [@offset, @offset + @len)
   7173 *
   7174 * @offset:	File offset
   7175 * @len:	The length to write, will be updated to the nocow writeable
   7176 *		range
   7177 * @orig_start:	(optional) Return the original file offset of the file extent
    7178 * @orig_block_len: (optional) Return the original on-disk length of the file extent
   7179 * @ram_bytes:	(optional) Return the ram_bytes of the file extent
   7180 * @strict:	if true, omit optimizations that might force us into unnecessary
   7181 *		cow. e.g., don't trust generation number.
   7182 *
   7183 * Return:
   7184 * >0	and update @len if we can do nocow write
   7185 *  0	if we can't do nocow write
   7186 * <0	if error happened
   7187 *
    7188 * NOTE: This only checks the file extents; the caller is responsible for
    7189 *	 waiting for any ordered extents.
   7190 */
   7191noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
   7192			      u64 *orig_start, u64 *orig_block_len,
   7193			      u64 *ram_bytes, bool strict)
   7194{
   7195	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   7196	struct can_nocow_file_extent_args nocow_args = { 0 };
   7197	struct btrfs_path *path;
   7198	int ret;
   7199	struct extent_buffer *leaf;
   7200	struct btrfs_root *root = BTRFS_I(inode)->root;
   7201	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
   7202	struct btrfs_file_extent_item *fi;
   7203	struct btrfs_key key;
   7204	int found_type;
   7205
   7206	path = btrfs_alloc_path();
   7207	if (!path)
   7208		return -ENOMEM;
   7209
   7210	ret = btrfs_lookup_file_extent(NULL, root, path,
   7211			btrfs_ino(BTRFS_I(inode)), offset, 0);
   7212	if (ret < 0)
   7213		goto out;
   7214
   7215	if (ret == 1) {
   7216		if (path->slots[0] == 0) {
   7217			/* can't find the item, must cow */
   7218			ret = 0;
   7219			goto out;
   7220		}
   7221		path->slots[0]--;
   7222	}
   7223	ret = 0;
   7224	leaf = path->nodes[0];
   7225	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   7226	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
   7227	    key.type != BTRFS_EXTENT_DATA_KEY) {
   7228		/* not our file or wrong item type, must cow */
   7229		goto out;
   7230	}
   7231
   7232	if (key.offset > offset) {
   7233		/* Wrong offset, must cow */
   7234		goto out;
   7235	}
   7236
   7237	if (btrfs_file_extent_end(path) <= offset)
   7238		goto out;
   7239
   7240	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
   7241	found_type = btrfs_file_extent_type(leaf, fi);
   7242	if (ram_bytes)
   7243		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
   7244
   7245	nocow_args.start = offset;
   7246	nocow_args.end = offset + *len - 1;
   7247	nocow_args.strict = strict;
   7248	nocow_args.free_path = true;
   7249
   7250	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
   7251	/* can_nocow_file_extent() has freed the path. */
   7252	path = NULL;
   7253
   7254	if (ret != 1) {
   7255		/* Treat errors as not being able to NOCOW. */
   7256		ret = 0;
   7257		goto out;
   7258	}
   7259
   7260	ret = 0;
   7261	if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
   7262		goto out;
   7263
   7264	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
   7265	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
   7266		u64 range_end;
   7267
   7268		range_end = round_up(offset + nocow_args.num_bytes,
   7269				     root->fs_info->sectorsize) - 1;
   7270		ret = test_range_bit(io_tree, offset, range_end,
   7271				     EXTENT_DELALLOC, 0, NULL);
   7272		if (ret) {
   7273			ret = -EAGAIN;
   7274			goto out;
   7275		}
   7276	}
   7277
   7278	if (orig_start)
   7279		*orig_start = key.offset - nocow_args.extent_offset;
   7280	if (orig_block_len)
   7281		*orig_block_len = nocow_args.disk_num_bytes;
   7282
   7283	*len = nocow_args.num_bytes;
   7284	ret = 1;
   7285out:
   7286	btrfs_free_path(path);
   7287	return ret;
   7288}
   7289
   7290static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
   7291			      struct extent_state **cached_state,
   7292			      unsigned int iomap_flags)
   7293{
   7294	const bool writing = (iomap_flags & IOMAP_WRITE);
   7295	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
   7296	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
   7297	struct btrfs_ordered_extent *ordered;
   7298	int ret = 0;
   7299
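        	/*
        	 * Keep retrying until we hold the extent range locked with no
        	 * ordered extents in it and, for writes, no pages left in the
        	 * page cache for the range.
        	 */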
   7300	while (1) {
   7301		if (nowait) {
   7302			if (!try_lock_extent(io_tree, lockstart, lockend))
   7303				return -EAGAIN;
   7304		} else {
   7305			lock_extent_bits(io_tree, lockstart, lockend, cached_state);
   7306		}
   7307		/*
   7308		 * We're concerned with the entire range that we're going to be
   7309		 * doing DIO to, so we need to make sure there's no ordered
    7310		 * doing DIO to, so we need to make sure there are no ordered
   7311		 */
   7312		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
   7313						     lockend - lockstart + 1);
   7314
   7315		/*
   7316		 * We need to make sure there are no buffered pages in this
   7317		 * range either, we could have raced between the invalidate in
   7318		 * generic_file_direct_write and locking the extent.  The
   7319		 * invalidate needs to happen so that reads after a write do not
   7320		 * get stale data.
   7321		 */
   7322		if (!ordered &&
   7323		    (!writing || !filemap_range_has_page(inode->i_mapping,
   7324							 lockstart, lockend)))
   7325			break;
   7326
   7327		unlock_extent_cached(io_tree, lockstart, lockend, cached_state);
   7328
   7329		if (ordered) {
   7330			if (nowait) {
   7331				btrfs_put_ordered_extent(ordered);
   7332				ret = -EAGAIN;
   7333				break;
   7334			}
   7335			/*
   7336			 * If we are doing a DIO read and the ordered extent we
   7337			 * found is for a buffered write, we can not wait for it
   7338			 * to complete and retry, because if we do so we can
   7339			 * deadlock with concurrent buffered writes on page
   7340			 * locks. This happens only if our DIO read covers more
    7341			 * than one extent map, if at this point we have already
   7342			 * created an ordered extent for a previous extent map
   7343			 * and locked its range in the inode's io tree, and a
   7344			 * concurrent write against that previous extent map's
   7345			 * range and this range started (we unlock the ranges
   7346			 * in the io tree only when the bios complete and
   7347			 * buffered writes always lock pages before attempting
    7348			 * to lock the range in the io tree).
   7349			 */
   7350			if (writing ||
   7351			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
   7352				btrfs_start_ordered_extent(ordered, 1);
   7353			else
   7354				ret = nowait ? -EAGAIN : -ENOTBLK;
   7355			btrfs_put_ordered_extent(ordered);
   7356		} else {
   7357			/*
   7358			 * We could trigger writeback for this range (and wait
   7359			 * for it to complete) and then invalidate the pages for
   7360			 * this range (through invalidate_inode_pages2_range()),
   7361			 * but that can lead us to a deadlock with a concurrent
   7362			 * call to readahead (a buffered read or a defrag call
   7363			 * triggered a readahead) on a page lock due to an
    7364			 * ordered dio extent we created before but for which we
    7365			 * have not yet submitted a corresponding bio (so it can
    7366			 * not complete), which makes readahead wait for that
   7367			 * ordered extent to complete while holding a lock on
   7368			 * that page.
   7369			 */
   7370			ret = nowait ? -EAGAIN : -ENOTBLK;
   7371		}
   7372
   7373		if (ret)
   7374			break;
   7375
   7376		cond_resched();
   7377	}
   7378
   7379	return ret;
   7380}
   7381
   7382/* The callers of this must take lock_extent() */
   7383static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
   7384				       u64 len, u64 orig_start, u64 block_start,
   7385				       u64 block_len, u64 orig_block_len,
   7386				       u64 ram_bytes, int compress_type,
   7387				       int type)
   7388{
   7389	struct extent_map_tree *em_tree;
   7390	struct extent_map *em;
   7391	int ret;
   7392
   7393	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
   7394	       type == BTRFS_ORDERED_COMPRESSED ||
   7395	       type == BTRFS_ORDERED_NOCOW ||
   7396	       type == BTRFS_ORDERED_REGULAR);
   7397
   7398	em_tree = &inode->extent_tree;
   7399	em = alloc_extent_map();
   7400	if (!em)
   7401		return ERR_PTR(-ENOMEM);
   7402
   7403	em->start = start;
   7404	em->orig_start = orig_start;
   7405	em->len = len;
   7406	em->block_len = block_len;
   7407	em->block_start = block_start;
   7408	em->orig_block_len = orig_block_len;
   7409	em->ram_bytes = ram_bytes;
   7410	em->generation = -1;
   7411	set_bit(EXTENT_FLAG_PINNED, &em->flags);
   7412	if (type == BTRFS_ORDERED_PREALLOC) {
   7413		set_bit(EXTENT_FLAG_FILLING, &em->flags);
   7414	} else if (type == BTRFS_ORDERED_COMPRESSED) {
   7415		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
   7416		em->compress_type = compress_type;
   7417	}
   7418
   7419	do {
   7420		btrfs_drop_extent_cache(inode, em->start,
   7421					em->start + em->len - 1, 0);
   7422		write_lock(&em_tree->lock);
   7423		ret = add_extent_mapping(em_tree, em, 1);
   7424		write_unlock(&em_tree->lock);
   7425		/*
    7426		 * The caller has taken lock_extent(), so who else could race
    7427		 * with us to add the em?
   7428		 */
   7429	} while (ret == -EEXIST);
   7430
   7431	if (ret) {
   7432		free_extent_map(em);
   7433		return ERR_PTR(ret);
   7434	}
   7435
    7436	/* The em now has two refs; the caller needs to do free_extent_map() once. */
   7437	return em;
   7438}
   7439
   7440
   7441static int btrfs_get_blocks_direct_write(struct extent_map **map,
   7442					 struct inode *inode,
   7443					 struct btrfs_dio_data *dio_data,
   7444					 u64 start, u64 len,
   7445					 unsigned int iomap_flags)
   7446{
   7447	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
   7448	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   7449	struct extent_map *em = *map;
   7450	int type;
   7451	u64 block_start, orig_start, orig_block_len, ram_bytes;
   7452	struct btrfs_block_group *bg;
   7453	bool can_nocow = false;
   7454	bool space_reserved = false;
   7455	u64 prev_len;
   7456	int ret = 0;
   7457
   7458	/*
   7459	 * We don't allocate a new extent in the following cases
   7460	 *
   7461	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
   7462	 * existing extent.
   7463	 * 2) The extent is marked as PREALLOC. We're good to go here and can
   7464	 * just use the extent.
   7465	 *
   7466	 */
   7467	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
   7468	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
   7469	     em->block_start != EXTENT_MAP_HOLE)) {
   7470		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
   7471			type = BTRFS_ORDERED_PREALLOC;
   7472		else
   7473			type = BTRFS_ORDERED_NOCOW;
   7474		len = min(len, em->len - (start - em->start));
   7475		block_start = em->block_start + (start - em->start);
   7476
   7477		if (can_nocow_extent(inode, start, &len, &orig_start,
   7478				     &orig_block_len, &ram_bytes, false) == 1) {
   7479			bg = btrfs_inc_nocow_writers(fs_info, block_start);
   7480			if (bg)
   7481				can_nocow = true;
   7482		}
   7483	}
   7484
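        	/*
        	 * Remember the originally requested length so that any excess
        	 * reservation can be released below if the extent we end up
        	 * with turns out to be shorter.
        	 */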
   7485	prev_len = len;
   7486	if (can_nocow) {
   7487		struct extent_map *em2;
   7488
   7489		/* We can NOCOW, so only need to reserve metadata space. */
   7490		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
   7491						      nowait);
   7492		if (ret < 0) {
   7493			/* Our caller expects us to free the input extent map. */
   7494			free_extent_map(em);
   7495			*map = NULL;
   7496			btrfs_dec_nocow_writers(bg);
   7497			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
   7498				ret = -EAGAIN;
   7499			goto out;
   7500		}
   7501		space_reserved = true;
   7502
   7503		em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
   7504					      orig_start, block_start,
   7505					      len, orig_block_len,
   7506					      ram_bytes, type);
   7507		btrfs_dec_nocow_writers(bg);
   7508		if (type == BTRFS_ORDERED_PREALLOC) {
   7509			free_extent_map(em);
   7510			*map = em = em2;
   7511		}
   7512
   7513		if (IS_ERR(em2)) {
   7514			ret = PTR_ERR(em2);
   7515			goto out;
   7516		}
   7517
   7518		dio_data->nocow_done = true;
   7519	} else {
   7520		/* Our caller expects us to free the input extent map. */
   7521		free_extent_map(em);
   7522		*map = NULL;
   7523
   7524		if (nowait)
   7525			return -EAGAIN;
   7526
   7527		/*
   7528		 * If we could not allocate data space before locking the file
   7529		 * range and we can't do a NOCOW write, then we have to fail.
   7530		 */
   7531		if (!dio_data->data_space_reserved)
   7532			return -ENOSPC;
   7533
   7534		/*
   7535		 * We have to COW and we have already reserved data space before,
   7536		 * so now we reserve only metadata.
   7537		 */
   7538		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
   7539						      false);
   7540		if (ret < 0)
   7541			goto out;
   7542		space_reserved = true;
   7543
   7544		em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
   7545		if (IS_ERR(em)) {
   7546			ret = PTR_ERR(em);
   7547			goto out;
   7548		}
   7549		*map = em;
   7550		len = min(len, em->len - (start - em->start));
   7551		if (len < prev_len)
   7552			btrfs_delalloc_release_metadata(BTRFS_I(inode),
   7553							prev_len - len, true);
   7554	}
   7555
   7556	/*
   7557	 * We have created our ordered extent, so we can now release our reservation
   7558	 * for an outstanding extent.
   7559	 */
   7560	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
   7561
   7562	/*
   7563	 * Need to update the i_size under the extent lock so buffered
   7564	 * readers will get the updated i_size when we unlock.
   7565	 */
   7566	if (start + len > i_size_read(inode))
   7567		i_size_write(inode, start + len);
   7568out:
   7569	if (ret && space_reserved) {
   7570		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
   7571		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
   7572	}
   7573	return ret;
   7574}
   7575
   7576static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
   7577		loff_t length, unsigned int flags, struct iomap *iomap,
   7578		struct iomap *srcmap)
   7579{
   7580	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
   7581	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   7582	struct extent_map *em;
   7583	struct extent_state *cached_state = NULL;
   7584	struct btrfs_dio_data *dio_data = iter->private;
   7585	u64 lockstart, lockend;
   7586	const bool write = !!(flags & IOMAP_WRITE);
   7587	int ret = 0;
   7588	u64 len = length;
   7589	const u64 data_alloc_len = length;
   7590	bool unlock_extents = false;
   7591
   7592	if (!write)
   7593		len = min_t(u64, len, fs_info->sectorsize);
   7594
   7595	lockstart = start;
   7596	lockend = start + len - 1;
   7597
   7598	/*
   7599	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
   7600	 * enough if we've written compressed pages to this area, so we need to
   7601	 * flush the dirty pages again to make absolutely sure that any
   7602	 * outstanding dirty pages are on disk - the first flush only starts
   7603	 * compression on the data, while keeping the pages locked, so by the
   7604	 * time the second flush returns we know bios for the compressed pages
    7605	 * were submitted and finished, and the pages are no longer under writeback.
   7606	 *
   7607	 * If we have a NOWAIT request and we have any pages in the range that
   7608	 * are locked, likely due to compression still in progress, we don't want
   7609	 * to block on page locks. We also don't want to block on pages marked as
   7610	 * dirty or under writeback (same as for the non-compression case).
   7611	 * iomap_dio_rw() did the same check, but after that and before we got
   7612	 * here, mmap'ed writes may have happened or buffered reads started
   7613	 * (readpage() and readahead(), which lock pages), as we haven't locked
   7614	 * the file range yet.
   7615	 */
   7616	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
   7617		     &BTRFS_I(inode)->runtime_flags)) {
   7618		if (flags & IOMAP_NOWAIT) {
   7619			if (filemap_range_needs_writeback(inode->i_mapping,
   7620							  lockstart, lockend))
   7621				return -EAGAIN;
   7622		} else {
   7623			ret = filemap_fdatawrite_range(inode->i_mapping, start,
   7624						       start + length - 1);
   7625			if (ret)
   7626				return ret;
   7627		}
   7628	}
   7629
   7630	memset(dio_data, 0, sizeof(*dio_data));
   7631
   7632	/*
   7633	 * We always try to allocate data space and must do it before locking
   7634	 * the file range, to avoid deadlocks with concurrent writes to the same
   7635	 * range if the range has several extents and the writes don't expand the
   7636	 * current i_size (the inode lock is taken in shared mode). If we fail to
   7637	 * allocate data space here we continue and later, after locking the
   7638	 * file range, we fail with ENOSPC only if we figure out we can not do a
   7639	 * NOCOW write.
   7640	 */
   7641	if (write && !(flags & IOMAP_NOWAIT)) {
   7642		ret = btrfs_check_data_free_space(BTRFS_I(inode),
   7643						  &dio_data->data_reserved,
   7644						  start, data_alloc_len);
   7645		if (!ret)
   7646			dio_data->data_space_reserved = true;
   7647		else if (ret && !(BTRFS_I(inode)->flags &
   7648				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
   7649			goto err;
   7650	}
   7651
   7652	/*
   7653	 * If this errors out it's because we couldn't invalidate pagecache for
    7654	 * this range and we need to fall back to buffered IO, or we are doing a
   7655	 * NOWAIT read/write and we need to block.
   7656	 */
   7657	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
   7658	if (ret < 0)
   7659		goto err;
   7660
   7661	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
   7662	if (IS_ERR(em)) {
   7663		ret = PTR_ERR(em);
   7664		goto unlock_err;
   7665	}
   7666
   7667	/*
    7668	 * OK, for INLINE and COMPRESSED extents we need to fall back to buffered
    7669	 * IO.  INLINE is special, and we could probably kludge it in here, but
    7670	 * it's still buffered so for safety let's just fall back to the generic
   7671	 * buffered path.
   7672	 *
   7673	 * For COMPRESSED we _have_ to read the entire extent in so we can
   7674	 * decompress it, so there will be buffering required no matter what we
   7675	 * do, so go ahead and fallback to buffered.
   7676	 *
   7677	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
   7678	 * to buffered IO.  Don't blame me, this is the price we pay for using
   7679	 * the generic code.
   7680	 */
   7681	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
   7682	    em->block_start == EXTENT_MAP_INLINE) {
   7683		free_extent_map(em);
   7684		ret = -ENOTBLK;
   7685		goto unlock_err;
   7686	}
   7687
   7688	len = min(len, em->len - (start - em->start));
   7689
   7690	/*
   7691	 * If we have a NOWAIT request and the range contains multiple extents
   7692	 * (or a mix of extents and holes), then we return -EAGAIN to make the
   7693	 * caller fallback to a context where it can do a blocking (without
   7694	 * NOWAIT) request. This way we avoid doing partial IO and returning
   7695	 * success to the caller, which is not optimal for writes and for reads
   7696	 * it can result in unexpected behaviour for an application.
   7697	 *
   7698	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
   7699	 * iomap_dio_rw(), we can end up returning less data then what the caller
    7700	 * iomap_dio_rw(), we can end up returning less data than what the caller
   7701	 * That is, the caller asked to read N bytes and we return less than that,
   7702	 * which is wrong unless we are crossing EOF. This happens if we get a
   7703	 * page fault error when trying to fault in pages for the buffer that is
   7704	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
   7705	 * have previously submitted bios for other extents in the range, in
   7706	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
   7707	 * those bios have completed by the time we get the page fault error,
   7708	 * which we return back to our caller - we should only return EIOCBQUEUED
   7709	 * after we have submitted bios for all the extents in the range.
   7710	 */
   7711	if ((flags & IOMAP_NOWAIT) && len < length) {
   7712		free_extent_map(em);
   7713		ret = -EAGAIN;
   7714		goto unlock_err;
   7715	}
   7716
   7717	if (write) {
   7718		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
   7719						    start, len, flags);
   7720		if (ret < 0)
   7721			goto unlock_err;
   7722		unlock_extents = true;
   7723		/* Recalc len in case the new em is smaller than requested */
   7724		len = min(len, em->len - (start - em->start));
   7725		if (dio_data->data_space_reserved) {
   7726			u64 release_offset;
   7727			u64 release_len = 0;
   7728
   7729			if (dio_data->nocow_done) {
   7730				release_offset = start;
   7731				release_len = data_alloc_len;
   7732			} else if (len < data_alloc_len) {
   7733				release_offset = start + len;
   7734				release_len = data_alloc_len - len;
   7735			}
   7736
   7737			if (release_len > 0)
   7738				btrfs_free_reserved_data_space(BTRFS_I(inode),
   7739							       dio_data->data_reserved,
   7740							       release_offset,
   7741							       release_len);
   7742		}
   7743	} else {
   7744		/*
   7745		 * We need to unlock only the end area that we aren't using.
   7746		 * The rest is going to be unlocked by the endio routine.
   7747		 */
   7748		lockstart = start + len;
   7749		if (lockstart < lockend)
   7750			unlock_extents = true;
   7751	}
   7752
   7753	if (unlock_extents)
   7754		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
   7755				     lockstart, lockend, &cached_state);
   7756	else
   7757		free_extent_state(cached_state);
   7758
   7759	/*
   7760	 * Translate extent map information to iomap.
   7761	 * We trim the extents (and move the addr) even though iomap code does
   7762	 * that, since we have locked only the parts we are performing I/O in.
   7763	 */
   7764	if ((em->block_start == EXTENT_MAP_HOLE) ||
   7765	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
   7766		iomap->addr = IOMAP_NULL_ADDR;
   7767		iomap->type = IOMAP_HOLE;
   7768	} else {
   7769		iomap->addr = em->block_start + (start - em->start);
   7770		iomap->type = IOMAP_MAPPED;
   7771	}
   7772	iomap->offset = start;
   7773	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
   7774	iomap->length = len;
   7775
   7776	if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
   7777		iomap->flags |= IOMAP_F_ZONE_APPEND;
   7778
   7779	free_extent_map(em);
   7780
   7781	return 0;
   7782
   7783unlock_err:
   7784	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
   7785			     &cached_state);
   7786err:
   7787	if (dio_data->data_space_reserved) {
   7788		btrfs_free_reserved_data_space(BTRFS_I(inode),
   7789					       dio_data->data_reserved,
   7790					       start, data_alloc_len);
   7791		extent_changeset_free(dio_data->data_reserved);
   7792	}
   7793
   7794	return ret;
   7795}
   7796
   7797static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
   7798		ssize_t written, unsigned int flags, struct iomap *iomap)
   7799{
   7800	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
   7801	struct btrfs_dio_data *dio_data = iter->private;
   7802	size_t submitted = dio_data->submitted;
   7803	const bool write = !!(flags & IOMAP_WRITE);
   7804	int ret = 0;
   7805
   7806	if (!write && (iomap->type == IOMAP_HOLE)) {
   7807		/* If reading from a hole, unlock and return */
   7808		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
   7809		return 0;
   7810	}
   7811
   7812	if (submitted < length) {
   7813		pos += submitted;
   7814		length -= submitted;
   7815		if (write)
   7816			__endio_write_update_ordered(BTRFS_I(inode), pos,
   7817					length, false);
   7818		else
   7819			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
   7820				      pos + length - 1);
   7821		ret = -ENOTBLK;
   7822	}
   7823
   7824	if (write)
   7825		extent_changeset_free(dio_data->data_reserved);
   7826	return ret;
   7827}
   7828
   7829static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
   7830{
   7831	/*
   7832	 * This implies a barrier so that stores to dio_bio->bi_status before
   7833	 * this and loads of dio_bio->bi_status after this are fully ordered.
   7834	 */
   7835	if (!refcount_dec_and_test(&dip->refs))
   7836		return;
   7837
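        	/*
        	 * Last reference dropped: finish the ordered extent for writes,
        	 * or unlock the file range for reads, then end the original bio.
        	 */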
   7838	if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
   7839		__endio_write_update_ordered(BTRFS_I(dip->inode),
   7840					     dip->file_offset,
   7841					     dip->bytes,
   7842					     !dip->bio.bi_status);
   7843	} else {
   7844		unlock_extent(&BTRFS_I(dip->inode)->io_tree,
   7845			      dip->file_offset,
   7846			      dip->file_offset + dip->bytes - 1);
   7847	}
   7848
   7849	kfree(dip->csums);
   7850	bio_endio(&dip->bio);
   7851}
   7852
   7853static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
   7854				  int mirror_num,
   7855				  enum btrfs_compression_type compress_type)
   7856{
   7857	struct btrfs_dio_private *dip = bio->bi_private;
   7858	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   7859
   7860	BUG_ON(bio_op(bio) == REQ_OP_WRITE);
   7861
   7862	if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
   7863		return;
   7864
   7865	refcount_inc(&dip->refs);
   7866	if (btrfs_map_bio(fs_info, bio, mirror_num))
   7867		refcount_dec(&dip->refs);
   7868}
   7869
   7870static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
   7871					     struct btrfs_bio *bbio,
   7872					     const bool uptodate)
   7873{
   7874	struct inode *inode = dip->inode;
   7875	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
   7876	const u32 sectorsize = fs_info->sectorsize;
   7877	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
   7878	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
   7879	const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
   7880	struct bio_vec bvec;
   7881	struct bvec_iter iter;
   7882	u32 bio_offset = 0;
   7883	blk_status_t err = BLK_STS_OK;
   7884
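        	/*
        	 * Walk the completed read bio one sector at a time: if a sector's
        	 * checksum verifies (or checksums are disabled), clear any recorded
        	 * IO failure for it; otherwise submit a repair read from another
        	 * mirror via submit_dio_repair_bio().
        	 */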
   7885	__bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
   7886		unsigned int i, nr_sectors, pgoff;
   7887
   7888		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
   7889		pgoff = bvec.bv_offset;
   7890		for (i = 0; i < nr_sectors; i++) {
   7891			u64 start = bbio->file_offset + bio_offset;
   7892
   7893			ASSERT(pgoff < PAGE_SIZE);
   7894			if (uptodate &&
   7895			    (!csum || !check_data_csum(inode, bbio,
   7896						       bio_offset, bvec.bv_page,
   7897						       pgoff, start))) {
   7898				clean_io_failure(fs_info, failure_tree, io_tree,
   7899						 start, bvec.bv_page,
   7900						 btrfs_ino(BTRFS_I(inode)),
   7901						 pgoff);
   7902			} else {
   7903				int ret;
   7904
   7905				ret = btrfs_repair_one_sector(inode, &bbio->bio,
   7906						bio_offset, bvec.bv_page, pgoff,
   7907						start, bbio->mirror_num,
   7908						submit_dio_repair_bio);
   7909				if (ret)
   7910					err = errno_to_blk_status(ret);
   7911			}
   7912			ASSERT(bio_offset + sectorsize > bio_offset);
   7913			bio_offset += sectorsize;
   7914			pgoff += sectorsize;
   7915		}
   7916	}
   7917	return err;
   7918}
   7919
   7920static void __endio_write_update_ordered(struct btrfs_inode *inode,
   7921					 const u64 offset, const u64 bytes,
   7922					 const bool uptodate)
   7923{
   7924	btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
   7925				       finish_ordered_fn, uptodate);
   7926}
   7927
   7928static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
   7929						     struct bio *bio,
   7930						     u64 dio_file_offset)
   7931{
   7932	return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
   7933}
   7934
   7935static void btrfs_end_dio_bio(struct bio *bio)
   7936{
   7937	struct btrfs_dio_private *dip = bio->bi_private;
   7938	struct btrfs_bio *bbio = btrfs_bio(bio);
   7939	blk_status_t err = bio->bi_status;
   7940
   7941	if (err)
   7942		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
   7943			   "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
   7944			   btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
   7945			   bio->bi_opf, bio->bi_iter.bi_sector,
   7946			   bio->bi_iter.bi_size, err);
   7947
   7948	if (bio_op(bio) == REQ_OP_READ)
   7949		err = btrfs_check_read_dio_bio(dip, bbio, !err);
   7950
   7951	if (err)
   7952		dip->bio.bi_status = err;
   7953
   7954	btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
   7955
   7956	bio_put(bio);
   7957	btrfs_dio_private_put(dip);
   7958}
   7959
   7960static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
   7961		struct inode *inode, u64 file_offset, int async_submit)
   7962{
   7963	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   7964	struct btrfs_dio_private *dip = bio->bi_private;
   7965	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
   7966	blk_status_t ret;
   7967
   7968	/* Check btrfs_submit_bio_hook() for rules about async submit. */
   7969	if (async_submit)
   7970		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
   7971
   7972	if (!write) {
   7973		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
   7974		if (ret)
   7975			goto err;
   7976	}
   7977
   7978	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
   7979		goto map;
   7980
   7981	if (write && async_submit) {
   7982		ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset,
   7983					  btrfs_submit_bio_start_direct_io);
   7984		goto err;
   7985	} else if (write) {
   7986		/*
   7987		 * If we aren't doing async submit, calculate the csum of the
   7988		 * bio now.
   7989		 */
   7990		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
   7991		if (ret)
   7992			goto err;
   7993	} else {
   7994		u64 csum_offset;
   7995
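        		/*
        		 * For reads the csums for the whole dio were loaded up front
        		 * into dip->csums; point this clone's csum pointer at the
        		 * entry for its first sector.  E.g. with 4KiB sectors and
        		 * 4-byte crc32c csums, a clone starting 8KiB into the dio
        		 * uses dip->csums + (8192 >> 12) * 4 = dip->csums + 8.
        		 */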
   7996		csum_offset = file_offset - dip->file_offset;
   7997		csum_offset >>= fs_info->sectorsize_bits;
   7998		csum_offset *= fs_info->csum_size;
   7999		btrfs_bio(bio)->csum = dip->csums + csum_offset;
   8000	}
   8001map:
   8002	ret = btrfs_map_bio(fs_info, bio, 0);
   8003err:
   8004	return ret;
   8005}
   8006
   8007static void btrfs_submit_direct(const struct iomap_iter *iter,
   8008		struct bio *dio_bio, loff_t file_offset)
   8009{
   8010	struct btrfs_dio_private *dip =
   8011		container_of(dio_bio, struct btrfs_dio_private, bio);
   8012	struct inode *inode = iter->inode;
   8013	const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
   8014	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   8015	const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
   8016			     BTRFS_BLOCK_GROUP_RAID56_MASK);
   8017	struct bio *bio;
   8018	u64 start_sector;
   8019	int async_submit = 0;
   8020	u64 submit_len;
   8021	u64 clone_offset = 0;
   8022	u64 clone_len;
   8023	u64 logical;
   8024	int ret;
   8025	blk_status_t status;
   8026	struct btrfs_io_geometry geom;
   8027	struct btrfs_dio_data *dio_data = iter->private;
   8028	struct extent_map *em = NULL;
   8029
   8030	dip->inode = inode;
   8031	dip->file_offset = file_offset;
   8032	dip->bytes = dio_bio->bi_iter.bi_size;
   8033	refcount_set(&dip->refs, 1);
   8034	dip->csums = NULL;
   8035
   8036	if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
   8037		unsigned int nr_sectors =
   8038			(dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
   8039
   8040		/*
   8041		 * Load the csums up front to reduce csum tree searches and
   8042		 * contention when submitting bios.
   8043		 */
   8044		status = BLK_STS_RESOURCE;
   8045		dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
    8046		if (!dip->csums)
   8047			goto out_err;
   8048
   8049		status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
   8050		if (status != BLK_STS_OK)
   8051			goto out_err;
   8052	}
   8053
   8054	start_sector = dio_bio->bi_iter.bi_sector;
   8055	submit_len = dio_bio->bi_iter.bi_size;
   8056
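        	/*
        	 * Split the incoming dio_bio into clones that each stay within a
        	 * single stripe (as reported by btrfs_get_io_geometry()) and submit
        	 * them one by one.  The initial dip reference is handed to the last
        	 * clone; every earlier clone takes its own reference.
        	 */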
   8057	do {
   8058		logical = start_sector << 9;
   8059		em = btrfs_get_chunk_map(fs_info, logical, submit_len);
   8060		if (IS_ERR(em)) {
   8061			status = errno_to_blk_status(PTR_ERR(em));
   8062			em = NULL;
   8063			goto out_err_em;
   8064		}
   8065		ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
   8066					    logical, &geom);
   8067		if (ret) {
   8068			status = errno_to_blk_status(ret);
   8069			goto out_err_em;
   8070		}
   8071
   8072		clone_len = min(submit_len, geom.len);
   8073		ASSERT(clone_len <= UINT_MAX);
   8074
   8075		/*
    8076		 * This will never fail as it's passing GFP_NOFS and
   8077		 * the allocation is backed by btrfs_bioset.
   8078		 */
   8079		bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
   8080		bio->bi_private = dip;
   8081		bio->bi_end_io = btrfs_end_dio_bio;
   8082		btrfs_bio(bio)->file_offset = file_offset;
   8083
   8084		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
   8085			status = extract_ordered_extent(BTRFS_I(inode), bio,
   8086							file_offset);
   8087			if (status) {
   8088				bio_put(bio);
   8089				goto out_err;
   8090			}
   8091		}
   8092
   8093		ASSERT(submit_len >= clone_len);
   8094		submit_len -= clone_len;
   8095
   8096		/*
    8097		 * Increase the refcount before we submit the bio so the end
    8098		 * IO handler cannot run and drop the last reference while
    8099		 * we're still setting up the dip; otherwise the dip might get
    8100		 * freed before we're done with it.
   8101		 *
   8102		 * We transfer the initial reference to the last bio, so we
   8103		 * don't need to increment the reference count for the last one.
   8104		 */
   8105		if (submit_len > 0) {
   8106			refcount_inc(&dip->refs);
   8107			/*
   8108			 * If we are submitting more than one bio, submit them
   8109			 * all asynchronously. The exception is RAID 5 or 6, as
   8110			 * asynchronous checksums make it difficult to collect
   8111			 * full stripe writes.
   8112			 */
   8113			if (!raid56)
   8114				async_submit = 1;
   8115		}
   8116
   8117		status = btrfs_submit_dio_bio(bio, inode, file_offset,
   8118						async_submit);
   8119		if (status) {
   8120			bio_put(bio);
   8121			if (submit_len > 0)
   8122				refcount_dec(&dip->refs);
   8123			goto out_err_em;
   8124		}
   8125
   8126		dio_data->submitted += clone_len;
   8127		clone_offset += clone_len;
   8128		start_sector += clone_len >> 9;
   8129		file_offset += clone_len;
   8130
   8131		free_extent_map(em);
   8132	} while (submit_len > 0);
   8133	return;
   8134
   8135out_err_em:
   8136	free_extent_map(em);
   8137out_err:
   8138	dio_bio->bi_status = status;
   8139	btrfs_dio_private_put(dip);
   8140}
   8141
   8142static const struct iomap_ops btrfs_dio_iomap_ops = {
   8143	.iomap_begin            = btrfs_dio_iomap_begin,
   8144	.iomap_end              = btrfs_dio_iomap_end,
   8145};
   8146
   8147static const struct iomap_dio_ops btrfs_dio_ops = {
   8148	.submit_io		= btrfs_submit_direct,
   8149	.bio_set		= &btrfs_dio_bioset,
   8150};
   8151
   8152ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
   8153{
   8154	struct btrfs_dio_data data;
   8155
   8156	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
   8157			    IOMAP_DIO_PARTIAL, &data, done_before);
   8158}
   8159
   8160static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
   8161			u64 start, u64 len)
   8162{
   8163	int	ret;
   8164
   8165	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
   8166	if (ret)
   8167		return ret;
   8168
   8169	return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
   8170}
   8171
   8172static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
   8173{
   8174	struct inode *inode = page->mapping->host;
   8175	int ret;
   8176
   8177	if (current->flags & PF_MEMALLOC) {
   8178		redirty_page_for_writepage(wbc, page);
   8179		unlock_page(page);
   8180		return 0;
   8181	}
   8182
   8183	/*
    8184	 * If we are under memory pressure the VM may call this directly, so we
    8185	 * need to make sure we have the inode referenced for the ordered
    8186	 * extent.  If not, just return as if we didn't do anything.
   8187	 */
   8188	if (!igrab(inode)) {
   8189		redirty_page_for_writepage(wbc, page);
   8190		return AOP_WRITEPAGE_ACTIVATE;
   8191	}
   8192	ret = extent_write_full_page(page, wbc);
   8193	btrfs_add_delayed_iput(inode);
   8194	return ret;
   8195}
   8196
   8197static int btrfs_writepages(struct address_space *mapping,
   8198			    struct writeback_control *wbc)
   8199{
   8200	return extent_writepages(mapping, wbc);
   8201}
   8202
   8203static void btrfs_readahead(struct readahead_control *rac)
   8204{
   8205	extent_readahead(rac);
   8206}
   8207
   8208/*
   8209 * For release_folio() and invalidate_folio() we have a race window where
   8210 * folio_end_writeback() is called but the subpage spinlock is not yet released.
    8211 * If we continue to release/invalidate the page, we could cause a
    8212 * use-after-free of the subpage spinlock.  So this function spins until the
    8213 * subpage spinlock has been released.
   8214 */
   8215static void wait_subpage_spinlock(struct page *page)
   8216{
   8217	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
   8218	struct btrfs_subpage *subpage;
   8219
   8220	if (!btrfs_is_subpage(fs_info, page))
   8221		return;
   8222
   8223	ASSERT(PagePrivate(page) && page->private);
   8224	subpage = (struct btrfs_subpage *)page->private;
   8225
   8226	/*
   8227	 * This may look insane as we just acquire the spinlock and release it,
   8228	 * without doing anything.  But we just want to make sure no one is
   8229	 * still holding the subpage spinlock.
    8230	 * And since the page is neither dirty nor under writeback, and we have
    8231	 * the page locked, the only possible holder of the spinlock is the endio
    8232	 * function clearing page writeback.
   8233	 *
   8234	 * Here we just acquire the spinlock so that all existing callers
   8235	 * should exit and we're safe to release/invalidate the page.
   8236	 */
   8237	spin_lock_irq(&subpage->lock);
   8238	spin_unlock_irq(&subpage->lock);
   8239}
   8240
   8241static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
   8242{
   8243	int ret = try_release_extent_mapping(&folio->page, gfp_flags);
   8244
   8245	if (ret == 1) {
   8246		wait_subpage_spinlock(&folio->page);
   8247		clear_page_extent_mapped(&folio->page);
   8248	}
   8249	return ret;
   8250}
   8251
   8252static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
   8253{
   8254	if (folio_test_writeback(folio) || folio_test_dirty(folio))
   8255		return false;
   8256	return __btrfs_release_folio(folio, gfp_flags);
   8257}
   8258
   8259#ifdef CONFIG_MIGRATION
   8260static int btrfs_migratepage(struct address_space *mapping,
   8261			     struct page *newpage, struct page *page,
   8262			     enum migrate_mode mode)
   8263{
   8264	int ret;
   8265
   8266	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
   8267	if (ret != MIGRATEPAGE_SUCCESS)
   8268		return ret;
   8269
   8270	if (page_has_private(page))
   8271		attach_page_private(newpage, detach_page_private(page));
   8272
   8273	if (PageOrdered(page)) {
   8274		ClearPageOrdered(page);
   8275		SetPageOrdered(newpage);
   8276	}
   8277
   8278	if (mode != MIGRATE_SYNC_NO_COPY)
   8279		migrate_page_copy(newpage, page);
   8280	else
   8281		migrate_page_states(newpage, page);
   8282	return MIGRATEPAGE_SUCCESS;
   8283}
   8284#endif
   8285
   8286static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
   8287				 size_t length)
   8288{
   8289	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
   8290	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   8291	struct extent_io_tree *tree = &inode->io_tree;
   8292	struct extent_state *cached_state = NULL;
   8293	u64 page_start = folio_pos(folio);
   8294	u64 page_end = page_start + folio_size(folio) - 1;
   8295	u64 cur;
   8296	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
   8297
   8298	/*
    8299	 * We have the folio locked, so no new ordered extent can be created on
    8300	 * it, nor can any bio be submitted for it.
    8301	 *
    8302	 * But an already submitted bio can still be finished on this folio.
    8303	 * Furthermore, the endio function won't skip a folio that has Ordered
    8304	 * (Private2) already cleared, so it's possible for endio and
    8305	 * invalidate_folio to do the same ordered extent accounting twice
    8306	 * on one folio.
   8307	 *
   8308	 * So here we wait for any submitted bios to finish, so that we won't
   8309	 * do double ordered extent accounting on the same folio.
   8310	 */
   8311	folio_wait_writeback(folio);
   8312	wait_subpage_spinlock(&folio->page);
   8313
   8314	/*
    8315	 * For the subpage case, we have call sites like
    8316	 * btrfs_punch_hole_lock_range() which pass a range not aligned to
    8317	 * sectorsize.
    8318	 * If the range doesn't cover the full folio, we don't need to and
    8319	 * shouldn't clear page extent mapped, as folio->private can still
    8320	 * record subpage dirty bits for other parts of the range.
    8321	 *
    8322	 * For call sites that invalidate the full folio even though the range
    8323	 * doesn't cover the full folio, like invalidating the last folio, we're
    8324	 * still safe to wait for the ordered extent to finish.
   8325	 */
   8326	if (!(offset == 0 && length == folio_size(folio))) {
   8327		btrfs_release_folio(folio, GFP_NOFS);
   8328		return;
   8329	}
   8330
   8331	if (!inode_evicting)
   8332		lock_extent_bits(tree, page_start, page_end, &cached_state);
   8333
   8334	cur = page_start;
   8335	while (cur < page_end) {
   8336		struct btrfs_ordered_extent *ordered;
   8337		bool delete_states;
   8338		u64 range_end;
   8339		u32 range_len;
   8340
   8341		ordered = btrfs_lookup_first_ordered_range(inode, cur,
   8342							   page_end + 1 - cur);
   8343		if (!ordered) {
   8344			range_end = page_end;
   8345			/*
   8346			 * No ordered extent covering this range, we are safe
   8347			 * to delete all extent states in the range.
   8348			 */
   8349			delete_states = true;
   8350			goto next;
   8351		}
   8352		if (ordered->file_offset > cur) {
   8353			/*
   8354			 * There is a range between [cur, oe->file_offset) not
   8355			 * covered by any ordered extent.
   8356			 * We are safe to delete all extent states, and handle
   8357			 * the ordered extent in the next iteration.
   8358			 */
   8359			range_end = ordered->file_offset - 1;
   8360			delete_states = true;
   8361			goto next;
   8362		}
   8363
   8364		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
   8365				page_end);
   8366		ASSERT(range_end + 1 - cur < U32_MAX);
   8367		range_len = range_end + 1 - cur;
   8368		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
   8369			/*
   8370			 * If Ordered (Private2) is cleared, it means endio has
   8371			 * already been executed for the range.
   8372			 * We can't delete the extent states as
   8373			 * btrfs_finish_ordered_io() may still use some of them.
   8374			 */
   8375			delete_states = false;
   8376			goto next;
   8377		}
   8378		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
   8379
   8380		/*
   8381		 * IO on this page will never be started, so we need to account
   8382		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
   8383		 * here, must leave that up for the ordered extent completion.
   8384		 *
   8385		 * This will also unlock the range for incoming
   8386		 * btrfs_finish_ordered_io().
   8387		 */
   8388		if (!inode_evicting)
   8389			clear_extent_bit(tree, cur, range_end,
   8390					 EXTENT_DELALLOC |
   8391					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
   8392					 EXTENT_DEFRAG, 1, 0, &cached_state);
   8393
   8394		spin_lock_irq(&inode->ordered_tree.lock);
   8395		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
   8396		ordered->truncated_len = min(ordered->truncated_len,
   8397					     cur - ordered->file_offset);
   8398		spin_unlock_irq(&inode->ordered_tree.lock);
   8399
   8400		if (btrfs_dec_test_ordered_pending(inode, &ordered,
   8401						   cur, range_end + 1 - cur)) {
   8402			btrfs_finish_ordered_io(ordered);
   8403			/*
   8404			 * The ordered extent has finished, now we're again
   8405			 * safe to delete all extent states of the range.
   8406			 */
   8407			delete_states = true;
   8408		} else {
   8409			/*
   8410			 * btrfs_finish_ordered_io() will get executed by endio
   8411			 * of other pages, thus we can't delete extent states
   8412			 * anymore
   8413			 */
   8414			delete_states = false;
   8415		}
   8416next:
   8417		if (ordered)
   8418			btrfs_put_ordered_extent(ordered);
   8419		/*
   8420		 * Qgroup reserved space handler
   8421		 * Sector(s) here will be either:
   8422		 *
   8423		 * 1) Already written to disk or bio already finished
   8424		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
   8425		 *    Qgroup will be handled by its qgroup_record then.
   8426		 *    btrfs_qgroup_free_data() call will do nothing here.
   8427		 *
   8428		 * 2) Not written to disk yet
   8429		 *    Then btrfs_qgroup_free_data() call will clear the
   8430		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
   8431		 *    reserved data space.
   8432		 *    Since the IO will never happen for this page.
   8433		 */
   8434		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
   8435		if (!inode_evicting) {
   8436			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
   8437				 EXTENT_DELALLOC | EXTENT_UPTODATE |
   8438				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
   8439				 delete_states, &cached_state);
   8440		}
   8441		cur = range_end + 1;
   8442	}
   8443	/*
   8444	 * We have iterated through all ordered extents of the page, the page
   8445	 * should not have Ordered (Private2) anymore, or the above iteration
   8446	 * did something wrong.
   8447	 */
   8448	ASSERT(!folio_test_ordered(folio));
   8449	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
   8450	if (!inode_evicting)
   8451		__btrfs_release_folio(folio, GFP_NOFS);
   8452	clear_page_extent_mapped(&folio->page);
   8453}
   8454
   8455/*
   8456 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
   8457 * called from a page fault handler when a page is first dirtied. Hence we must
   8458 * be careful to check for EOF conditions here. We set the page up correctly
   8459 * for a written page which means we get ENOSPC checking when writing into
   8460 * holes and correct delalloc and unwritten extent mapping on filesystems that
   8461 * support these features.
   8462 *
   8463 * We are not allowed to take the i_mutex here so we have to play games to
   8464 * protect against truncate races as the page could now be beyond EOF.  Because
   8465 * truncate_setsize() writes the inode size before removing pages, once we have
   8466 * the page lock we can determine safely if the page is beyond EOF. If it is not
   8467 * beyond EOF, then the page is guaranteed safe against truncation until we
   8468 * unlock the page.
   8469 */
   8470vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
   8471{
   8472	struct page *page = vmf->page;
   8473	struct inode *inode = file_inode(vmf->vma->vm_file);
   8474	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   8475	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
   8476	struct btrfs_ordered_extent *ordered;
   8477	struct extent_state *cached_state = NULL;
   8478	struct extent_changeset *data_reserved = NULL;
   8479	unsigned long zero_start;
   8480	loff_t size;
   8481	vm_fault_t ret;
   8482	int ret2;
   8483	int reserved = 0;
   8484	u64 reserved_space;
   8485	u64 page_start;
   8486	u64 page_end;
   8487	u64 end;
   8488
   8489	reserved_space = PAGE_SIZE;
   8490
   8491	sb_start_pagefault(inode->i_sb);
   8492	page_start = page_offset(page);
   8493	page_end = page_start + PAGE_SIZE - 1;
   8494	end = page_end;
   8495
   8496	/*
   8497	 * Reserving delalloc space after obtaining the page lock can lead to
   8498	 * deadlock. For example, if a dirty page is locked by this function
   8499	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
   8500	 * dirty page write out, then the btrfs_writepage() function could
   8501	 * end up waiting indefinitely to get a lock on the page currently
    8502	 * being processed by the btrfs_page_mkwrite() function.
   8503	 */
   8504	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
   8505					    page_start, reserved_space);
   8506	if (!ret2) {
   8507		ret2 = file_update_time(vmf->vma->vm_file);
   8508		reserved = 1;
   8509	}
   8510	if (ret2) {
   8511		ret = vmf_error(ret2);
   8512		if (reserved)
   8513			goto out;
   8514		goto out_noreserve;
   8515	}
   8516
   8517	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
   8518again:
   8519	down_read(&BTRFS_I(inode)->i_mmap_lock);
   8520	lock_page(page);
   8521	size = i_size_read(inode);
   8522
   8523	if ((page->mapping != inode->i_mapping) ||
   8524	    (page_start >= size)) {
   8525		/* page got truncated out from underneath us */
   8526		goto out_unlock;
   8527	}
   8528	wait_on_page_writeback(page);
   8529
   8530	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
   8531	ret2 = set_page_extent_mapped(page);
   8532	if (ret2 < 0) {
   8533		ret = vmf_error(ret2);
   8534		unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
   8535		goto out_unlock;
   8536	}
   8537
   8538	/*
   8539	 * we can't set the delalloc bits if there are pending ordered
   8540	 * extents.  Drop our locks and wait for them to finish
   8541	 */
   8542	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
   8543			PAGE_SIZE);
   8544	if (ordered) {
   8545		unlock_extent_cached(io_tree, page_start, page_end,
   8546				     &cached_state);
   8547		unlock_page(page);
   8548		up_read(&BTRFS_I(inode)->i_mmap_lock);
   8549		btrfs_start_ordered_extent(ordered, 1);
   8550		btrfs_put_ordered_extent(ordered);
   8551		goto again;
   8552	}
   8553
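        	/*
        	 * If this is the last page of the file and it straddles i_size, only
        	 * the part up to the sector-aligned end of the file needs a delalloc
        	 * reservation; hand back the reservation for the rest of the page.
        	 */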
   8554	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
   8555		reserved_space = round_up(size - page_start,
   8556					  fs_info->sectorsize);
   8557		if (reserved_space < PAGE_SIZE) {
   8558			end = page_start + reserved_space - 1;
   8559			btrfs_delalloc_release_space(BTRFS_I(inode),
   8560					data_reserved, page_start,
   8561					PAGE_SIZE - reserved_space, true);
   8562		}
   8563	}
   8564
   8565	/*
    8566	 * page_mkwrite gets called when the page is first dirtied after it's
    8567	 * faulted in, but write(2) could also dirty a page and set delalloc
    8568	 * bits, so for space accounting reasons we still need to clear any
    8569	 * delalloc bits within this page range, since we had to reserve data
    8570	 * and metadata space before lock_page() (see above comments).
   8571	 */
   8572	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
   8573			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
   8574			  EXTENT_DEFRAG, 0, 0, &cached_state);
   8575
   8576	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
   8577					&cached_state);
   8578	if (ret2) {
   8579		unlock_extent_cached(io_tree, page_start, page_end,
   8580				     &cached_state);
   8581		ret = VM_FAULT_SIGBUS;
   8582		goto out_unlock;
   8583	}
   8584
   8585	/* page is wholly or partially inside EOF */
   8586	if (page_start + PAGE_SIZE > size)
   8587		zero_start = offset_in_page(size);
   8588	else
   8589		zero_start = PAGE_SIZE;
   8590
   8591	if (zero_start != PAGE_SIZE) {
   8592		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
   8593		flush_dcache_page(page);
   8594	}
   8595	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
   8596	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
   8597	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
   8598
   8599	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
   8600
   8601	unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
   8602	up_read(&BTRFS_I(inode)->i_mmap_lock);
   8603
   8604	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
   8605	sb_end_pagefault(inode->i_sb);
   8606	extent_changeset_free(data_reserved);
   8607	return VM_FAULT_LOCKED;
   8608
   8609out_unlock:
   8610	unlock_page(page);
   8611	up_read(&BTRFS_I(inode)->i_mmap_lock);
   8612out:
   8613	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
   8614	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
   8615				     reserved_space, (ret != 0));
   8616out_noreserve:
   8617	sb_end_pagefault(inode->i_sb);
   8618	extent_changeset_free(data_reserved);
   8619	return ret;
   8620}
   8621
   8622static int btrfs_truncate(struct inode *inode, bool skip_writeback)
   8623{
   8624	struct btrfs_truncate_control control = {
   8625		.inode = BTRFS_I(inode),
   8626		.ino = btrfs_ino(BTRFS_I(inode)),
   8627		.min_type = BTRFS_EXTENT_DATA_KEY,
   8628		.clear_extent_range = true,
   8629	};
   8630	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   8631	struct btrfs_root *root = BTRFS_I(inode)->root;
   8632	struct btrfs_block_rsv *rsv;
   8633	int ret;
   8634	struct btrfs_trans_handle *trans;
   8635	u64 mask = fs_info->sectorsize - 1;
   8636	u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
   8637
   8638	if (!skip_writeback) {
   8639		ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
   8640					       (u64)-1);
   8641		if (ret)
   8642			return ret;
   8643	}
   8644
   8645	/*
   8646	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
   8647	 * things going on here:
   8648	 *
   8649	 * 1) We need to reserve space to update our inode.
   8650	 *
   8651	 * 2) We need to have something to cache all the space that is going to
    8652	 * be freed up by the truncate operation, but also have some slack
   8653	 * space reserved in case it uses space during the truncate (thank you
   8654	 * very much snapshotting).
   8655	 *
   8656	 * And we need these to be separate.  The fact is we can use a lot of
   8657	 * space doing the truncate, and we have no earthly idea how much space
   8658	 * we will use, so we need the truncate reservation to be separate so it
   8659	 * doesn't end up using space reserved for updating the inode.  We also
   8660	 * need to be able to stop the transaction and start a new one, which
   8661	 * means we need to be able to update the inode several times, and we
    8662	 * have no way of knowing how many times that will be, so we can't just
   8663	 * reserve 1 item for the entirety of the operation, so that has to be
   8664	 * done separately as well.
   8665	 *
   8666	 * So that leaves us with
   8667	 *
   8668	 * 1) rsv - for the truncate reservation, which we will steal from the
   8669	 * transaction reservation.
    8670	 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
   8671	 * updating the inode.
   8672	 */
   8673	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
   8674	if (!rsv)
   8675		return -ENOMEM;
   8676	rsv->size = min_size;
   8677	rsv->failfast = 1;
   8678
   8679	/*
   8680	 * 1 for the truncate slack space
   8681	 * 1 for updating the inode.
   8682	 */
   8683	trans = btrfs_start_transaction(root, 2);
   8684	if (IS_ERR(trans)) {
   8685		ret = PTR_ERR(trans);
   8686		goto out;
   8687	}
   8688
   8689	/* Migrate the slack space for the truncate to our reserve */
   8690	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
   8691				      min_size, false);
   8692	BUG_ON(ret);
   8693
   8694	trans->block_rsv = rsv;
   8695
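        	/*
        	 * Truncate in steps: btrfs_truncate_inode_items() bails out with
        	 * -ENOSPC or -EAGAIN when the transaction gets too full, in which
        	 * case we update the inode, end the transaction and start a fresh
        	 * one with a refilled block reservation before retrying.
        	 */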
   8696	while (1) {
   8697		struct extent_state *cached_state = NULL;
   8698		const u64 new_size = inode->i_size;
   8699		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
   8700
   8701		control.new_size = new_size;
   8702		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
   8703				 &cached_state);
   8704		/*
   8705		 * We want to drop from the next block forward in case this new
   8706		 * size is not block aligned since we will be keeping the last
   8707		 * block of the extent just the way it is.
   8708		 */
   8709		btrfs_drop_extent_cache(BTRFS_I(inode),
   8710					ALIGN(new_size, fs_info->sectorsize),
   8711					(u64)-1, 0);
   8712
   8713		ret = btrfs_truncate_inode_items(trans, root, &control);
   8714
   8715		inode_sub_bytes(inode, control.sub_bytes);
   8716		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
   8717
   8718		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
   8719				     (u64)-1, &cached_state);
   8720
   8721		trans->block_rsv = &fs_info->trans_block_rsv;
   8722		if (ret != -ENOSPC && ret != -EAGAIN)
   8723			break;
   8724
   8725		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   8726		if (ret)
   8727			break;
   8728
   8729		btrfs_end_transaction(trans);
   8730		btrfs_btree_balance_dirty(fs_info);
   8731
   8732		trans = btrfs_start_transaction(root, 2);
   8733		if (IS_ERR(trans)) {
   8734			ret = PTR_ERR(trans);
   8735			trans = NULL;
   8736			break;
   8737		}
   8738
   8739		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
   8740		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
   8741					      rsv, min_size, false);
   8742		BUG_ON(ret);	/* shouldn't happen */
   8743		trans->block_rsv = rsv;
   8744	}
   8745
   8746	/*
   8747	 * We can't call btrfs_truncate_block inside a trans handle as we could
   8748	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
   8749	 * know we've truncated everything except the last little bit, and can
   8750	 * do btrfs_truncate_block and then update the disk_i_size.
   8751	 */
   8752	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
   8753		btrfs_end_transaction(trans);
   8754		btrfs_btree_balance_dirty(fs_info);
   8755
   8756		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
   8757		if (ret)
   8758			goto out;
   8759		trans = btrfs_start_transaction(root, 1);
   8760		if (IS_ERR(trans)) {
   8761			ret = PTR_ERR(trans);
   8762			goto out;
   8763		}
   8764		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
   8765	}
   8766
   8767	if (trans) {
   8768		int ret2;
   8769
   8770		trans->block_rsv = &fs_info->trans_block_rsv;
   8771		ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
   8772		if (ret2 && !ret)
   8773			ret = ret2;
   8774
   8775		ret2 = btrfs_end_transaction(trans);
   8776		if (ret2 && !ret)
   8777			ret = ret2;
   8778		btrfs_btree_balance_dirty(fs_info);
   8779	}
   8780out:
   8781	btrfs_free_block_rsv(fs_info, rsv);
   8782	/*
   8783	 * So if we truncate and then write and fsync we normally would just
   8784	 * write the extents that changed, which is a problem if we need to
   8785	 * first truncate that entire inode.  So set this flag so we write out
   8786	 * all of the extents in the inode to the sync log so we're completely
   8787	 * safe.
   8788	 *
   8789	 * If no extents were dropped or trimmed we don't need to force the next
   8790	 * fsync to truncate all the inode's items from the log and re-log them
   8791	 * all. This means the truncate operation did not change the file size,
   8792	 * or changed it to a smaller size but there was only an implicit hole
   8793	 * between the old i_size and the new i_size, and there were no prealloc
   8794	 * extents beyond i_size to drop.
   8795	 */
   8796	if (control.extents_found > 0)
   8797		btrfs_set_inode_full_sync(BTRFS_I(inode));
   8798
   8799	return ret;
   8800}
   8801
   8802struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
   8803				     struct inode *dir)
   8804{
   8805	struct inode *inode;
   8806
   8807	inode = new_inode(dir->i_sb);
   8808	if (inode) {
   8809		/*
   8810		 * Subvolumes don't inherit the sgid bit or the parent's gid if
   8811		 * the parent's sgid bit is set. This is probably a bug.
   8812		 */
   8813		inode_init_owner(mnt_userns, inode, NULL,
   8814				 S_IFDIR | (~current_umask() & S_IRWXUGO));
   8815		inode->i_op = &btrfs_dir_inode_operations;
   8816		inode->i_fop = &btrfs_dir_file_operations;
   8817	}
   8818	return inode;
   8819}
   8820
   8821struct inode *btrfs_alloc_inode(struct super_block *sb)
   8822{
   8823	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
   8824	struct btrfs_inode *ei;
   8825	struct inode *inode;
   8826
   8827	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
   8828	if (!ei)
   8829		return NULL;
   8830
   8831	ei->root = NULL;
   8832	ei->generation = 0;
   8833	ei->last_trans = 0;
   8834	ei->last_sub_trans = 0;
   8835	ei->logged_trans = 0;
   8836	ei->delalloc_bytes = 0;
   8837	ei->new_delalloc_bytes = 0;
   8838	ei->defrag_bytes = 0;
   8839	ei->disk_i_size = 0;
   8840	ei->flags = 0;
   8841	ei->ro_flags = 0;
   8842	ei->csum_bytes = 0;
   8843	ei->index_cnt = (u64)-1;
   8844	ei->dir_index = 0;
   8845	ei->last_unlink_trans = 0;
   8846	ei->last_reflink_trans = 0;
   8847	ei->last_log_commit = 0;
   8848
   8849	spin_lock_init(&ei->lock);
   8850	ei->outstanding_extents = 0;
   8851	if (sb->s_magic != BTRFS_TEST_MAGIC)
   8852		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
   8853					      BTRFS_BLOCK_RSV_DELALLOC);
   8854	ei->runtime_flags = 0;
   8855	ei->prop_compress = BTRFS_COMPRESS_NONE;
   8856	ei->defrag_compress = BTRFS_COMPRESS_NONE;
   8857
   8858	ei->delayed_node = NULL;
   8859
   8860	ei->i_otime.tv_sec = 0;
   8861	ei->i_otime.tv_nsec = 0;
   8862
   8863	inode = &ei->vfs_inode;
   8864	extent_map_tree_init(&ei->extent_tree);
   8865	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
   8866	extent_io_tree_init(fs_info, &ei->io_failure_tree,
   8867			    IO_TREE_INODE_IO_FAILURE, inode);
   8868	extent_io_tree_init(fs_info, &ei->file_extent_tree,
   8869			    IO_TREE_INODE_FILE_EXTENT, inode);
   8870	ei->io_tree.track_uptodate = true;
   8871	ei->io_failure_tree.track_uptodate = true;
   8872	atomic_set(&ei->sync_writers, 0);
   8873	mutex_init(&ei->log_mutex);
   8874	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
   8875	INIT_LIST_HEAD(&ei->delalloc_inodes);
   8876	INIT_LIST_HEAD(&ei->delayed_iput);
   8877	RB_CLEAR_NODE(&ei->rb_node);
   8878	init_rwsem(&ei->i_mmap_lock);
   8879
   8880	return inode;
   8881}
   8882
   8883#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
   8884void btrfs_test_destroy_inode(struct inode *inode)
   8885{
   8886	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
   8887	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
   8888}
   8889#endif
   8890
   8891void btrfs_free_inode(struct inode *inode)
   8892{
   8893	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
   8894}
   8895
   8896void btrfs_destroy_inode(struct inode *vfs_inode)
   8897{
   8898	struct btrfs_ordered_extent *ordered;
   8899	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
   8900	struct btrfs_root *root = inode->root;
   8901
   8902	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
   8903	WARN_ON(vfs_inode->i_data.nrpages);
   8904	WARN_ON(inode->block_rsv.reserved);
   8905	WARN_ON(inode->block_rsv.size);
   8906	WARN_ON(inode->outstanding_extents);
   8907	if (!S_ISDIR(vfs_inode->i_mode)) {
   8908		WARN_ON(inode->delalloc_bytes);
   8909		WARN_ON(inode->new_delalloc_bytes);
   8910	}
   8911	WARN_ON(inode->csum_bytes);
   8912	WARN_ON(inode->defrag_bytes);
   8913
   8914	/*
    8915	 * This can happen when we create an inode, but somebody else also
   8916	 * created the same inode and we need to destroy the one we already
   8917	 * created.
   8918	 */
   8919	if (!root)
   8920		return;
   8921
   8922	while (1) {
   8923		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
   8924		if (!ordered)
   8925			break;
   8926		else {
   8927			btrfs_err(root->fs_info,
   8928				  "found ordered extent %llu %llu on inode cleanup",
   8929				  ordered->file_offset, ordered->num_bytes);
   8930			btrfs_remove_ordered_extent(inode, ordered);
   8931			btrfs_put_ordered_extent(ordered);
   8932			btrfs_put_ordered_extent(ordered);
   8933		}
   8934	}
   8935	btrfs_qgroup_check_reserved_leak(inode);
   8936	inode_tree_del(inode);
   8937	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
   8938	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
   8939	btrfs_put_root(inode->root);
   8940}
   8941
   8942int btrfs_drop_inode(struct inode *inode)
   8943{
   8944	struct btrfs_root *root = BTRFS_I(inode)->root;
   8945
   8946	if (root == NULL)
   8947		return 1;
   8948
    8949	/* the snapshot/subvolume tree is being deleted */
   8950	if (btrfs_root_refs(&root->root_item) == 0)
   8951		return 1;
   8952	else
   8953		return generic_drop_inode(inode);
   8954}
   8955
   8956static void init_once(void *foo)
   8957{
   8958	struct btrfs_inode *ei = foo;
   8959
   8960	inode_init_once(&ei->vfs_inode);
   8961}
   8962
   8963void __cold btrfs_destroy_cachep(void)
   8964{
   8965	/*
   8966	 * Make sure all delayed rcu free inodes are flushed before we
   8967	 * destroy cache.
   8968	 */
   8969	rcu_barrier();
   8970	bioset_exit(&btrfs_dio_bioset);
   8971	kmem_cache_destroy(btrfs_inode_cachep);
   8972	kmem_cache_destroy(btrfs_trans_handle_cachep);
   8973	kmem_cache_destroy(btrfs_path_cachep);
   8974	kmem_cache_destroy(btrfs_free_space_cachep);
   8975	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
   8976}
   8977
   8978int __init btrfs_init_cachep(void)
   8979{
   8980	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
   8981			sizeof(struct btrfs_inode), 0,
   8982			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
   8983			init_once);
   8984	if (!btrfs_inode_cachep)
   8985		goto fail;
   8986
   8987	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
   8988			sizeof(struct btrfs_trans_handle), 0,
   8989			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
   8990	if (!btrfs_trans_handle_cachep)
   8991		goto fail;
   8992
   8993	btrfs_path_cachep = kmem_cache_create("btrfs_path",
   8994			sizeof(struct btrfs_path), 0,
   8995			SLAB_MEM_SPREAD, NULL);
   8996	if (!btrfs_path_cachep)
   8997		goto fail;
   8998
   8999	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
   9000			sizeof(struct btrfs_free_space), 0,
   9001			SLAB_MEM_SPREAD, NULL);
   9002	if (!btrfs_free_space_cachep)
   9003		goto fail;
   9004
   9005	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
   9006							PAGE_SIZE, PAGE_SIZE,
   9007							SLAB_MEM_SPREAD, NULL);
   9008	if (!btrfs_free_space_bitmap_cachep)
   9009		goto fail;
   9010
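        	/*
        	 * Direct IO bios are allocated from this bioset with enough front
        	 * padding for a struct btrfs_dio_private, so btrfs_submit_direct()
        	 * can reach the containing dip with container_of().
        	 */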
   9011	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
   9012			offsetof(struct btrfs_dio_private, bio),
   9013			BIOSET_NEED_BVECS))
   9014		goto fail;
   9015
   9016	return 0;
   9017fail:
   9018	btrfs_destroy_cachep();
   9019	return -ENOMEM;
   9020}
   9021
   9022static int btrfs_getattr(struct user_namespace *mnt_userns,
   9023			 const struct path *path, struct kstat *stat,
   9024			 u32 request_mask, unsigned int flags)
   9025{
   9026	u64 delalloc_bytes;
   9027	u64 inode_bytes;
   9028	struct inode *inode = d_inode(path->dentry);
   9029	u32 blocksize = inode->i_sb->s_blocksize;
   9030	u32 bi_flags = BTRFS_I(inode)->flags;
   9031	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
   9032
   9033	stat->result_mask |= STATX_BTIME;
   9034	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
   9035	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
   9036	if (bi_flags & BTRFS_INODE_APPEND)
   9037		stat->attributes |= STATX_ATTR_APPEND;
   9038	if (bi_flags & BTRFS_INODE_COMPRESS)
   9039		stat->attributes |= STATX_ATTR_COMPRESSED;
   9040	if (bi_flags & BTRFS_INODE_IMMUTABLE)
   9041		stat->attributes |= STATX_ATTR_IMMUTABLE;
   9042	if (bi_flags & BTRFS_INODE_NODUMP)
   9043		stat->attributes |= STATX_ATTR_NODUMP;
   9044	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
   9045		stat->attributes |= STATX_ATTR_VERITY;
   9046
   9047	stat->attributes_mask |= (STATX_ATTR_APPEND |
   9048				  STATX_ATTR_COMPRESSED |
   9049				  STATX_ATTR_IMMUTABLE |
   9050				  STATX_ATTR_NODUMP);
   9051
   9052	generic_fillattr(mnt_userns, inode, stat);
   9053	stat->dev = BTRFS_I(inode)->root->anon_dev;
   9054
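        	/*
        	 * Report st_blocks in 512-byte units, counting both the bytes
        	 * already accounted on disk and delalloc bytes that will consume
        	 * new disk space once flushed, each rounded up to the block size.
        	 */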
   9055	spin_lock(&BTRFS_I(inode)->lock);
   9056	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
   9057	inode_bytes = inode_get_bytes(inode);
   9058	spin_unlock(&BTRFS_I(inode)->lock);
   9059	stat->blocks = (ALIGN(inode_bytes, blocksize) +
   9060			ALIGN(delalloc_bytes, blocksize)) >> 9;
   9061	return 0;
   9062}
   9063
   9064static int btrfs_rename_exchange(struct inode *old_dir,
   9065			      struct dentry *old_dentry,
   9066			      struct inode *new_dir,
   9067			      struct dentry *new_dentry)
   9068{
   9069	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
   9070	struct btrfs_trans_handle *trans;
   9071	unsigned int trans_num_items;
   9072	struct btrfs_root *root = BTRFS_I(old_dir)->root;
   9073	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
   9074	struct inode *new_inode = new_dentry->d_inode;
   9075	struct inode *old_inode = old_dentry->d_inode;
   9076	struct timespec64 ctime = current_time(old_inode);
   9077	struct btrfs_rename_ctx old_rename_ctx;
   9078	struct btrfs_rename_ctx new_rename_ctx;
   9079	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
   9080	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
   9081	u64 old_idx = 0;
   9082	u64 new_idx = 0;
   9083	int ret;
   9084	int ret2;
   9085	bool need_abort = false;
   9086
   9087	/*
   9088	 * For non-subvolumes allow exchange only within one subvolume, in the
    9089	 * same inode namespace. Two subvolumes (represented as directories) can
    9090	 * be exchanged as they're logical links and have a fixed inode number.
   9091	 */
   9092	if (root != dest &&
   9093	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
   9094	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
   9095		return -EXDEV;
   9096
   9097	/* close the race window with snapshot create/destroy ioctl */
   9098	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
   9099	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
   9100		down_read(&fs_info->subvol_sem);
   9101
   9102	/*
   9103	 * For each inode:
   9104	 * 1 to remove old dir item
   9105	 * 1 to remove old dir index
   9106	 * 1 to add new dir item
   9107	 * 1 to add new dir index
   9108	 * 1 to update parent inode
   9109	 *
   9110	 * If the parents are the same, we only need to account for one
    9111	 * If the parents are the same, we only need to account for one parent update
   9112	trans_num_items = (old_dir == new_dir ? 9 : 10);
   9113	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
   9114		/*
   9115		 * 1 to remove old root ref
   9116		 * 1 to remove old root backref
   9117		 * 1 to add new root ref
   9118		 * 1 to add new root backref
   9119		 */
   9120		trans_num_items += 4;
   9121	} else {
   9122		/*
   9123		 * 1 to update inode item
   9124		 * 1 to remove old inode ref
   9125		 * 1 to add new inode ref
   9126		 */
   9127		trans_num_items += 3;
   9128	}
   9129	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
   9130		trans_num_items += 4;
   9131	else
   9132		trans_num_items += 3;
   9133	trans = btrfs_start_transaction(root, trans_num_items);
   9134	if (IS_ERR(trans)) {
   9135		ret = PTR_ERR(trans);
   9136		goto out_notrans;
   9137	}
   9138
   9139	if (dest != root) {
   9140		ret = btrfs_record_root_in_trans(trans, dest);
   9141		if (ret)
   9142			goto out_fail;
   9143	}
   9144
   9145	/*
   9146	 * We need to find a free sequence number both in the source and
   9147	 * in the destination directory for the exchange.
   9148	 */
   9149	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
   9150	if (ret)
   9151		goto out_fail;
   9152	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
   9153	if (ret)
   9154		goto out_fail;
   9155
   9156	BTRFS_I(old_inode)->dir_index = 0ULL;
   9157	BTRFS_I(new_inode)->dir_index = 0ULL;
   9158
   9159	/* Reference for the source. */
   9160	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
   9161		/* force full log commit if subvolume involved. */
   9162		btrfs_set_log_full_commit(trans);
   9163	} else {
   9164		ret = btrfs_insert_inode_ref(trans, dest,
   9165					     new_dentry->d_name.name,
   9166					     new_dentry->d_name.len,
   9167					     old_ino,
   9168					     btrfs_ino(BTRFS_I(new_dir)),
   9169					     old_idx);
   9170		if (ret)
   9171			goto out_fail;
   9172		need_abort = true;
   9173	}
   9174
   9175	/* And now for the dest. */
   9176	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
   9177		/* force full log commit if subvolume involved. */
   9178		btrfs_set_log_full_commit(trans);
   9179	} else {
   9180		ret = btrfs_insert_inode_ref(trans, root,
   9181					     old_dentry->d_name.name,
   9182					     old_dentry->d_name.len,
   9183					     new_ino,
   9184					     btrfs_ino(BTRFS_I(old_dir)),
   9185					     new_idx);
   9186		if (ret) {
   9187			if (need_abort)
   9188				btrfs_abort_transaction(trans, ret);
   9189			goto out_fail;
   9190		}
   9191	}
   9192
   9193	/* Update inode version and ctime/mtime. */
   9194	inode_inc_iversion(old_dir);
   9195	inode_inc_iversion(new_dir);
   9196	inode_inc_iversion(old_inode);
   9197	inode_inc_iversion(new_inode);
   9198	old_dir->i_ctime = old_dir->i_mtime = ctime;
   9199	new_dir->i_ctime = new_dir->i_mtime = ctime;
   9200	old_inode->i_ctime = ctime;
   9201	new_inode->i_ctime = ctime;
   9202
   9203	if (old_dentry->d_parent != new_dentry->d_parent) {
   9204		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
   9205				BTRFS_I(old_inode), 1);
   9206		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
   9207				BTRFS_I(new_inode), 1);
   9208	}
   9209
   9210	/* src is a subvolume */
   9211	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
   9212		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
   9213	} else { /* src is an inode */
   9214		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
   9215					   BTRFS_I(old_dentry->d_inode),
   9216					   old_dentry->d_name.name,
   9217					   old_dentry->d_name.len,
   9218					   &old_rename_ctx);
   9219		if (!ret)
   9220			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
   9221	}
   9222	if (ret) {
   9223		btrfs_abort_transaction(trans, ret);
   9224		goto out_fail;
   9225	}
   9226
   9227	/* dest is a subvolume */
   9228	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
   9229		ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
   9230	} else { /* dest is an inode */
   9231		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
   9232					   BTRFS_I(new_dentry->d_inode),
   9233					   new_dentry->d_name.name,
   9234					   new_dentry->d_name.len,
   9235					   &new_rename_ctx);
   9236		if (!ret)
   9237			ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
   9238	}
   9239	if (ret) {
   9240		btrfs_abort_transaction(trans, ret);
   9241		goto out_fail;
   9242	}
   9243
   9244	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
   9245			     new_dentry->d_name.name,
   9246			     new_dentry->d_name.len, 0, old_idx);
   9247	if (ret) {
   9248		btrfs_abort_transaction(trans, ret);
   9249		goto out_fail;
   9250	}
   9251
   9252	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
   9253			     old_dentry->d_name.name,
   9254			     old_dentry->d_name.len, 0, new_idx);
   9255	if (ret) {
   9256		btrfs_abort_transaction(trans, ret);
   9257		goto out_fail;
   9258	}
   9259
   9260	if (old_inode->i_nlink == 1)
   9261		BTRFS_I(old_inode)->dir_index = old_idx;
   9262	if (new_inode->i_nlink == 1)
   9263		BTRFS_I(new_inode)->dir_index = new_idx;
   9264
   9265	/*
   9266	 * Now pin the logs of the roots. We do it to ensure that no other task
   9267	 * can sync the logs while we are in progress with the rename, because
   9268	 * that could result in an inconsistency in case any of the inodes that
   9269	 * are part of this rename operation were logged before.
   9270	 */
   9271	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
   9272		btrfs_pin_log_trans(root);
   9273	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
   9274		btrfs_pin_log_trans(dest);
   9275
   9276	/* Do the log updates for all inodes. */
   9277	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
   9278		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
   9279				   old_rename_ctx.index, new_dentry->d_parent);
   9280	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
   9281		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
   9282				   new_rename_ctx.index, old_dentry->d_parent);
   9283
   9284	/* Now unpin the logs. */
   9285	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
   9286		btrfs_end_log_trans(root);
   9287	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
   9288		btrfs_end_log_trans(dest);
   9289out_fail:
   9290	ret2 = btrfs_end_transaction(trans);
   9291	ret = ret ? ret : ret2;
   9292out_notrans:
   9293	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
   9294	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
   9295		up_read(&fs_info->subvol_sem);
   9296
   9297	return ret;
   9298}
   9299
   9300static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns,
   9301					struct inode *dir)
   9302{
   9303	struct inode *inode;
   9304
   9305	inode = new_inode(dir->i_sb);
   9306	if (inode) {
   9307		inode_init_owner(mnt_userns, inode, dir,
   9308				 S_IFCHR | WHITEOUT_MODE);
   9309		inode->i_op = &btrfs_special_inode_operations;
   9310		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
   9311	}
   9312	return inode;
   9313}
   9314
   9315static int btrfs_rename(struct user_namespace *mnt_userns,
   9316			struct inode *old_dir, struct dentry *old_dentry,
   9317			struct inode *new_dir, struct dentry *new_dentry,
   9318			unsigned int flags)
   9319{
   9320	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
   9321	struct btrfs_new_inode_args whiteout_args = {
   9322		.dir = old_dir,
   9323		.dentry = old_dentry,
   9324	};
   9325	struct btrfs_trans_handle *trans;
   9326	unsigned int trans_num_items;
   9327	struct btrfs_root *root = BTRFS_I(old_dir)->root;
   9328	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
   9329	struct inode *new_inode = d_inode(new_dentry);
   9330	struct inode *old_inode = d_inode(old_dentry);
   9331	struct btrfs_rename_ctx rename_ctx;
   9332	u64 index = 0;
   9333	int ret;
   9334	int ret2;
   9335	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
   9336
   9337	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
   9338		return -EPERM;
   9339
    9340	/* we only allow renaming subvolume links between subvolumes */
   9341	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
   9342		return -EXDEV;
   9343
   9344	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
   9345	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
   9346		return -ENOTEMPTY;
   9347
   9348	if (S_ISDIR(old_inode->i_mode) && new_inode &&
   9349	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
   9350		return -ENOTEMPTY;
   9351
   9352
    9353	/* check for collisions, even if the name isn't there */
   9354	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
   9355			     new_dentry->d_name.name,
   9356			     new_dentry->d_name.len);
   9357
   9358	if (ret) {
   9359		if (ret == -EEXIST) {
    9360			/* we shouldn't get
    9361			 * -EEXIST without a new_inode */
   9362			if (WARN_ON(!new_inode)) {
   9363				return ret;
   9364			}
   9365		} else {
   9366			/* maybe -EOVERFLOW */
   9367			return ret;
   9368		}
   9369	}
   9370	ret = 0;
   9371
   9372	/*
    9373	 * We're using rename to replace one file with another.  Start IO on it
    9374	 * now so we don't add too much work to the end of the transaction.
   9375	 */
   9376	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
   9377		filemap_flush(old_inode->i_mapping);
   9378
   9379	if (flags & RENAME_WHITEOUT) {
   9380		whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir);
   9381		if (!whiteout_args.inode)
   9382			return -ENOMEM;
   9383		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
   9384		if (ret)
   9385			goto out_whiteout_inode;
   9386	} else {
   9387		/* 1 to update the old parent inode. */
   9388		trans_num_items = 1;
   9389	}
   9390
   9391	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
   9392		/* Close the race window with snapshot create/destroy ioctl */
   9393		down_read(&fs_info->subvol_sem);
   9394		/*
   9395		 * 1 to remove old root ref
   9396		 * 1 to remove old root backref
   9397		 * 1 to add new root ref
   9398		 * 1 to add new root backref
   9399		 */
   9400		trans_num_items += 4;
   9401	} else {
   9402		/*
   9403		 * 1 to update inode
   9404		 * 1 to remove old inode ref
   9405		 * 1 to add new inode ref
   9406		 */
   9407		trans_num_items += 3;
   9408	}
   9409	/*
   9410	 * 1 to remove old dir item
   9411	 * 1 to remove old dir index
   9412	 * 1 to add new dir item
   9413	 * 1 to add new dir index
   9414	 */
   9415	trans_num_items += 4;
   9416	/* 1 to update new parent inode if it's not the same as the old parent */
   9417	if (new_dir != old_dir)
   9418		trans_num_items++;
   9419	if (new_inode) {
   9420		/*
   9421		 * 1 to update inode
   9422		 * 1 to remove inode ref
   9423		 * 1 to remove dir item
   9424		 * 1 to remove dir index
   9425		 * 1 to possibly add orphan item
   9426		 */
   9427		trans_num_items += 5;
   9428	}
   9429	trans = btrfs_start_transaction(root, trans_num_items);
   9430	if (IS_ERR(trans)) {
   9431		ret = PTR_ERR(trans);
   9432		goto out_notrans;
   9433	}
   9434
   9435	if (dest != root) {
   9436		ret = btrfs_record_root_in_trans(trans, dest);
   9437		if (ret)
   9438			goto out_fail;
   9439	}
   9440
   9441	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
   9442	if (ret)
   9443		goto out_fail;
   9444
   9445	BTRFS_I(old_inode)->dir_index = 0ULL;
   9446	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
   9447		/* force full log commit if subvolume involved. */
   9448		btrfs_set_log_full_commit(trans);
   9449	} else {
   9450		ret = btrfs_insert_inode_ref(trans, dest,
   9451					     new_dentry->d_name.name,
   9452					     new_dentry->d_name.len,
   9453					     old_ino,
   9454					     btrfs_ino(BTRFS_I(new_dir)), index);
   9455		if (ret)
   9456			goto out_fail;
   9457	}
   9458
   9459	inode_inc_iversion(old_dir);
   9460	inode_inc_iversion(new_dir);
   9461	inode_inc_iversion(old_inode);
   9462	old_dir->i_ctime = old_dir->i_mtime =
   9463	new_dir->i_ctime = new_dir->i_mtime =
   9464	old_inode->i_ctime = current_time(old_dir);
   9465
   9466	if (old_dentry->d_parent != new_dentry->d_parent)
   9467		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
   9468				BTRFS_I(old_inode), 1);
   9469
   9470	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
   9471		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
   9472	} else {
   9473		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
   9474					BTRFS_I(d_inode(old_dentry)),
   9475					old_dentry->d_name.name,
   9476					old_dentry->d_name.len,
   9477					&rename_ctx);
   9478		if (!ret)
   9479			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
   9480	}
   9481	if (ret) {
   9482		btrfs_abort_transaction(trans, ret);
   9483		goto out_fail;
   9484	}
   9485
   9486	if (new_inode) {
   9487		inode_inc_iversion(new_inode);
   9488		new_inode->i_ctime = current_time(new_inode);
   9489		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
   9490			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
   9491			ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
   9492			BUG_ON(new_inode->i_nlink == 0);
   9493		} else {
   9494			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
   9495						 BTRFS_I(d_inode(new_dentry)),
   9496						 new_dentry->d_name.name,
   9497						 new_dentry->d_name.len);
   9498		}
   9499		if (!ret && new_inode->i_nlink == 0)
   9500			ret = btrfs_orphan_add(trans,
   9501					BTRFS_I(d_inode(new_dentry)));
   9502		if (ret) {
   9503			btrfs_abort_transaction(trans, ret);
   9504			goto out_fail;
   9505		}
   9506	}
   9507
   9508	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
   9509			     new_dentry->d_name.name,
   9510			     new_dentry->d_name.len, 0, index);
   9511	if (ret) {
   9512		btrfs_abort_transaction(trans, ret);
   9513		goto out_fail;
   9514	}
   9515
   9516	if (old_inode->i_nlink == 1)
   9517		BTRFS_I(old_inode)->dir_index = index;
   9518
   9519	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
   9520		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
   9521				   rename_ctx.index, new_dentry->d_parent);
   9522
   9523	if (flags & RENAME_WHITEOUT) {
   9524		ret = btrfs_create_new_inode(trans, &whiteout_args);
   9525		if (ret) {
   9526			btrfs_abort_transaction(trans, ret);
   9527			goto out_fail;
   9528		} else {
   9529			unlock_new_inode(whiteout_args.inode);
   9530			iput(whiteout_args.inode);
   9531			whiteout_args.inode = NULL;
   9532		}
   9533	}
   9534out_fail:
   9535	ret2 = btrfs_end_transaction(trans);
   9536	ret = ret ? ret : ret2;
   9537out_notrans:
   9538	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
   9539		up_read(&fs_info->subvol_sem);
   9540	if (flags & RENAME_WHITEOUT)
   9541		btrfs_new_inode_args_destroy(&whiteout_args);
   9542out_whiteout_inode:
   9543	if (flags & RENAME_WHITEOUT)
   9544		iput(whiteout_args.inode);
   9545	return ret;
   9546}
   9547
   9548static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
   9549			 struct dentry *old_dentry, struct inode *new_dir,
   9550			 struct dentry *new_dentry, unsigned int flags)
   9551{
   9552	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
   9553		return -EINVAL;
   9554
   9555	if (flags & RENAME_EXCHANGE)
   9556		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
   9557					  new_dentry);
   9558
   9559	return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
   9560			    new_dentry, flags);
   9561}
   9562
   9563struct btrfs_delalloc_work {
   9564	struct inode *inode;
   9565	struct completion completion;
   9566	struct list_head list;
   9567	struct btrfs_work work;
   9568};
   9569
   9570static void btrfs_run_delalloc_work(struct btrfs_work *work)
   9571{
   9572	struct btrfs_delalloc_work *delalloc_work;
   9573	struct inode *inode;
   9574
   9575	delalloc_work = container_of(work, struct btrfs_delalloc_work,
   9576				     work);
   9577	inode = delalloc_work->inode;
   9578	filemap_flush(inode->i_mapping);
   9579	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
   9580				&BTRFS_I(inode)->runtime_flags))
   9581		filemap_flush(inode->i_mapping);
   9582
   9583	iput(inode);
   9584	complete(&delalloc_work->completion);
   9585}
   9586
   9587static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
   9588{
   9589	struct btrfs_delalloc_work *work;
   9590
   9591	work = kmalloc(sizeof(*work), GFP_NOFS);
   9592	if (!work)
   9593		return NULL;
   9594
   9595	init_completion(&work->completion);
   9596	INIT_LIST_HEAD(&work->list);
   9597	work->inode = inode;
   9598	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
   9599
   9600	return work;
   9601}
   9602
   9603/*
   9604 * some fairly slow code that needs optimization. This walks the list
   9605 * of all the inodes with pending delalloc and forces them to disk.
   9606 */
   9607static int start_delalloc_inodes(struct btrfs_root *root,
   9608				 struct writeback_control *wbc, bool snapshot,
   9609				 bool in_reclaim_context)
   9610{
   9611	struct btrfs_inode *binode;
   9612	struct inode *inode;
   9613	struct btrfs_delalloc_work *work, *next;
   9614	struct list_head works;
   9615	struct list_head splice;
   9616	int ret = 0;
   9617	bool full_flush = wbc->nr_to_write == LONG_MAX;
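        	/*
        	 * nr_to_write == LONG_MAX means flush everything: queue one async
        	 * work item per inode.  Otherwise write back inline below until
        	 * nr_to_write is used up.
        	 */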
   9618
   9619	INIT_LIST_HEAD(&works);
   9620	INIT_LIST_HEAD(&splice);
   9621
   9622	mutex_lock(&root->delalloc_mutex);
   9623	spin_lock(&root->delalloc_lock);
   9624	list_splice_init(&root->delalloc_inodes, &splice);
   9625	while (!list_empty(&splice)) {
   9626		binode = list_entry(splice.next, struct btrfs_inode,
   9627				    delalloc_inodes);
   9628
   9629		list_move_tail(&binode->delalloc_inodes,
   9630			       &root->delalloc_inodes);
   9631
   9632		if (in_reclaim_context &&
   9633		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
   9634			continue;
   9635
   9636		inode = igrab(&binode->vfs_inode);
   9637		if (!inode) {
   9638			cond_resched_lock(&root->delalloc_lock);
   9639			continue;
   9640		}
   9641		spin_unlock(&root->delalloc_lock);
   9642
   9643		if (snapshot)
   9644			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
   9645				&binode->runtime_flags);
   9646		if (full_flush) {
   9647			work = btrfs_alloc_delalloc_work(inode);
   9648			if (!work) {
   9649				iput(inode);
   9650				ret = -ENOMEM;
   9651				goto out;
   9652			}
   9653			list_add_tail(&work->list, &works);
   9654			btrfs_queue_work(root->fs_info->flush_workers,
   9655					 &work->work);
   9656		} else {
   9657			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
   9658			btrfs_add_delayed_iput(inode);
   9659			if (ret || wbc->nr_to_write <= 0)
   9660				goto out;
   9661		}
   9662		cond_resched();
   9663		spin_lock(&root->delalloc_lock);
   9664	}
   9665	spin_unlock(&root->delalloc_lock);
   9666
   9667out:
   9668	list_for_each_entry_safe(work, next, &works, list) {
   9669		list_del_init(&work->list);
   9670		wait_for_completion(&work->completion);
   9671		kfree(work);
   9672	}
   9673
   9674	if (!list_empty(&splice)) {
   9675		spin_lock(&root->delalloc_lock);
   9676		list_splice_tail(&splice, &root->delalloc_inodes);
   9677		spin_unlock(&root->delalloc_lock);
   9678	}
   9679	mutex_unlock(&root->delalloc_mutex);
   9680	return ret;
   9681}
   9682
   9683int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
   9684{
   9685	struct writeback_control wbc = {
   9686		.nr_to_write = LONG_MAX,
   9687		.sync_mode = WB_SYNC_NONE,
   9688		.range_start = 0,
   9689		.range_end = LLONG_MAX,
   9690	};
   9691	struct btrfs_fs_info *fs_info = root->fs_info;
   9692
   9693	if (BTRFS_FS_ERROR(fs_info))
   9694		return -EROFS;
   9695
   9696	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
   9697}
   9698
   9699int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
   9700			       bool in_reclaim_context)
   9701{
   9702	struct writeback_control wbc = {
   9703		.nr_to_write = nr,
   9704		.sync_mode = WB_SYNC_NONE,
   9705		.range_start = 0,
   9706		.range_end = LLONG_MAX,
   9707	};
   9708	struct btrfs_root *root;
   9709	struct list_head splice;
   9710	int ret;
   9711
   9712	if (BTRFS_FS_ERROR(fs_info))
   9713		return -EROFS;
   9714
   9715	INIT_LIST_HEAD(&splice);
   9716
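        	/*
        	 * Splice the delalloc roots onto a private list so the lock can
        	 * be dropped while each root is flushed; anything still on the
        	 * splice list when we bail out is put back at the end.
        	 */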
   9717	mutex_lock(&fs_info->delalloc_root_mutex);
   9718	spin_lock(&fs_info->delalloc_root_lock);
   9719	list_splice_init(&fs_info->delalloc_roots, &splice);
   9720	while (!list_empty(&splice)) {
   9721		/*
   9722		 * Reset nr_to_write here so we know that we're doing a full
   9723		 * flush.
   9724		 */
   9725		if (nr == LONG_MAX)
   9726			wbc.nr_to_write = LONG_MAX;
   9727
   9728		root = list_first_entry(&splice, struct btrfs_root,
   9729					delalloc_root);
   9730		root = btrfs_grab_root(root);
   9731		BUG_ON(!root);
   9732		list_move_tail(&root->delalloc_root,
   9733			       &fs_info->delalloc_roots);
   9734		spin_unlock(&fs_info->delalloc_root_lock);
   9735
   9736		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
   9737		btrfs_put_root(root);
   9738		if (ret < 0 || wbc.nr_to_write <= 0)
   9739			goto out;
   9740		spin_lock(&fs_info->delalloc_root_lock);
   9741	}
   9742	spin_unlock(&fs_info->delalloc_root_lock);
   9743
   9744	ret = 0;
   9745out:
   9746	if (!list_empty(&splice)) {
   9747		spin_lock(&fs_info->delalloc_root_lock);
   9748		list_splice_tail(&splice, &fs_info->delalloc_roots);
   9749		spin_unlock(&fs_info->delalloc_root_lock);
   9750	}
   9751	mutex_unlock(&fs_info->delalloc_root_mutex);
   9752	return ret;
   9753}
   9754
   9755static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
   9756			 struct dentry *dentry, const char *symname)
   9757{
   9758	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
   9759	struct btrfs_trans_handle *trans;
   9760	struct btrfs_root *root = BTRFS_I(dir)->root;
   9761	struct btrfs_path *path;
   9762	struct btrfs_key key;
   9763	struct inode *inode;
   9764	struct btrfs_new_inode_args new_inode_args = {
   9765		.dir = dir,
   9766		.dentry = dentry,
   9767	};
   9768	unsigned int trans_num_items;
   9769	int err;
   9770	int name_len;
   9771	int datasize;
   9772	unsigned long ptr;
   9773	struct btrfs_file_extent_item *ei;
   9774	struct extent_buffer *leaf;
   9775
   9776	name_len = strlen(symname);
   9777	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
   9778		return -ENAMETOOLONG;
   9779
   9780	inode = new_inode(dir->i_sb);
   9781	if (!inode)
   9782		return -ENOMEM;
   9783	inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO);
   9784	inode->i_op = &btrfs_symlink_inode_operations;
   9785	inode_nohighmem(inode);
   9786	inode->i_mapping->a_ops = &btrfs_aops;
   9787	btrfs_i_size_write(BTRFS_I(inode), name_len);
   9788	inode_set_bytes(inode, name_len);
   9789
   9790	new_inode_args.inode = inode;
   9791	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
   9792	if (err)
   9793		goto out_inode;
   9794	/* 1 additional item for the inline extent */
   9795	trans_num_items++;
   9796
   9797	trans = btrfs_start_transaction(root, trans_num_items);
   9798	if (IS_ERR(trans)) {
   9799		err = PTR_ERR(trans);
   9800		goto out_new_inode_args;
   9801	}
   9802
   9803	err = btrfs_create_new_inode(trans, &new_inode_args);
   9804	if (err)
   9805		goto out;
   9806
   9807	path = btrfs_alloc_path();
   9808	if (!path) {
   9809		err = -ENOMEM;
   9810		btrfs_abort_transaction(trans, err);
   9811		discard_new_inode(inode);
   9812		inode = NULL;
   9813		goto out;
   9814	}
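        	/*
        	 * The link target is stored as an inline file extent at key
        	 * offset 0 whose data is simply the symlink string.
        	 */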
   9815	key.objectid = btrfs_ino(BTRFS_I(inode));
   9816	key.offset = 0;
   9817	key.type = BTRFS_EXTENT_DATA_KEY;
   9818	datasize = btrfs_file_extent_calc_inline_size(name_len);
   9819	err = btrfs_insert_empty_item(trans, root, path, &key,
   9820				      datasize);
   9821	if (err) {
   9822		btrfs_abort_transaction(trans, err);
   9823		btrfs_free_path(path);
   9824		discard_new_inode(inode);
   9825		inode = NULL;
   9826		goto out;
   9827	}
   9828	leaf = path->nodes[0];
   9829	ei = btrfs_item_ptr(leaf, path->slots[0],
   9830			    struct btrfs_file_extent_item);
   9831	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
   9832	btrfs_set_file_extent_type(leaf, ei,
   9833				   BTRFS_FILE_EXTENT_INLINE);
   9834	btrfs_set_file_extent_encryption(leaf, ei, 0);
   9835	btrfs_set_file_extent_compression(leaf, ei, 0);
   9836	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
   9837	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
   9838
   9839	ptr = btrfs_file_extent_inline_start(ei);
   9840	write_extent_buffer(leaf, symname, ptr, name_len);
   9841	btrfs_mark_buffer_dirty(leaf);
   9842	btrfs_free_path(path);
   9843
   9844	d_instantiate_new(dentry, inode);
   9845	err = 0;
   9846out:
   9847	btrfs_end_transaction(trans);
   9848	btrfs_btree_balance_dirty(fs_info);
   9849out_new_inode_args:
   9850	btrfs_new_inode_args_destroy(&new_inode_args);
   9851out_inode:
   9852	if (err)
   9853		iput(inode);
   9854	return err;
   9855}
   9856
   9857static struct btrfs_trans_handle *insert_prealloc_file_extent(
   9858				       struct btrfs_trans_handle *trans_in,
   9859				       struct btrfs_inode *inode,
   9860				       struct btrfs_key *ins,
   9861				       u64 file_offset)
   9862{
   9863	struct btrfs_file_extent_item stack_fi;
   9864	struct btrfs_replace_extent_info extent_info;
   9865	struct btrfs_trans_handle *trans = trans_in;
   9866	struct btrfs_path *path;
   9867	u64 start = ins->objectid;
   9868	u64 len = ins->offset;
   9869	int qgroup_released;
   9870	int ret;
   9871
   9872	memset(&stack_fi, 0, sizeof(stack_fi));
   9873
   9874	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
   9875	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
   9876	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
   9877	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
   9878	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
   9879	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
   9880	/* Encryption and other encoding is reserved and all 0 */
   9881
   9882	qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
   9883	if (qgroup_released < 0)
   9884		return ERR_PTR(qgroup_released);
   9885
   9886	if (trans) {
   9887		ret = insert_reserved_file_extent(trans, inode,
   9888						  file_offset, &stack_fi,
   9889						  true, qgroup_released);
   9890		if (ret)
   9891			goto free_qgroup;
   9892		return trans;
   9893	}
   9894
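        	/*
        	 * No transaction handle was passed in, so let
        	 * btrfs_replace_file_extents() start one and hand it back to the
        	 * caller through @trans.
        	 */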
   9895	extent_info.disk_offset = start;
   9896	extent_info.disk_len = len;
   9897	extent_info.data_offset = 0;
   9898	extent_info.data_len = len;
   9899	extent_info.file_offset = file_offset;
   9900	extent_info.extent_buf = (char *)&stack_fi;
   9901	extent_info.is_new_extent = true;
   9902	extent_info.update_times = true;
   9903	extent_info.qgroup_reserved = qgroup_released;
   9904	extent_info.insertions = 0;
   9905
   9906	path = btrfs_alloc_path();
   9907	if (!path) {
   9908		ret = -ENOMEM;
   9909		goto free_qgroup;
   9910	}
   9911
   9912	ret = btrfs_replace_file_extents(inode, path, file_offset,
   9913				     file_offset + len - 1, &extent_info,
   9914				     &trans);
   9915	btrfs_free_path(path);
   9916	if (ret)
   9917		goto free_qgroup;
   9918	return trans;
   9919
   9920free_qgroup:
   9921	/*
   9922	 * We have released qgroup data range at the beginning of the function,
   9923	 * and normally qgroup_released bytes will be freed when committing
   9924	 * transaction.
   9925	 * But if we error out early, we have to free what we have released
   9926	 * or we leak qgroup data reservation.
   9927	 */
   9928	btrfs_qgroup_free_refroot(inode->root->fs_info,
   9929			inode->root->root_key.objectid, qgroup_released,
   9930			BTRFS_QGROUP_RSV_DATA);
   9931	return ERR_PTR(ret);
   9932}
   9933
   9934static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
   9935				       u64 start, u64 num_bytes, u64 min_size,
   9936				       loff_t actual_len, u64 *alloc_hint,
   9937				       struct btrfs_trans_handle *trans)
   9938{
   9939	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   9940	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
   9941	struct extent_map *em;
   9942	struct btrfs_root *root = BTRFS_I(inode)->root;
   9943	struct btrfs_key ins;
   9944	u64 cur_offset = start;
   9945	u64 clear_offset = start;
   9946	u64 i_size;
   9947	u64 cur_bytes;
   9948	u64 last_alloc = (u64)-1;
   9949	int ret = 0;
   9950	bool own_trans = true;
   9951	u64 end = start + num_bytes - 1;
   9952
   9953	if (trans)
   9954		own_trans = false;
   9955	while (num_bytes > 0) {
   9956		cur_bytes = min_t(u64, num_bytes, SZ_256M);
   9957		cur_bytes = max(cur_bytes, min_size);
   9958		/*
   9959		 * If we are severely fragmented we could end up with really
   9960		 * small allocations, so if the allocator is returning small
    9961		 * chunks, let's make its job easier by only searching for those
   9962		 * sized chunks.
   9963		 */
   9964		cur_bytes = min(cur_bytes, last_alloc);
   9965		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
   9966				min_size, 0, *alloc_hint, &ins, 1, 0);
   9967		if (ret)
   9968			break;
   9969
   9970		/*
   9971		 * We've reserved this space, and thus converted it from
   9972		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
   9973		 * from here on out we will only need to clear our reservation
   9974		 * for the remaining unreserved area, so advance our
   9975		 * clear_offset by our extent size.
   9976		 */
   9977		clear_offset += ins.offset;
   9978
   9979		last_alloc = ins.offset;
   9980		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
   9981						    &ins, cur_offset);
   9982		/*
   9983		 * Now that we inserted the prealloc extent we can finally
   9984		 * decrement the number of reservations in the block group.
   9985		 * If we did it before, we could race with relocation and have
   9986		 * relocation miss the reserved extent, making it fail later.
   9987		 */
   9988		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
   9989		if (IS_ERR(trans)) {
   9990			ret = PTR_ERR(trans);
   9991			btrfs_free_reserved_extent(fs_info, ins.objectid,
   9992						   ins.offset, 0);
   9993			break;
   9994		}
   9995
   9996		btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
    9997					cur_offset + ins.offset - 1, 0);
   9998
   9999		em = alloc_extent_map();
  10000		if (!em) {
  10001			btrfs_set_inode_full_sync(BTRFS_I(inode));
  10002			goto next;
  10003		}
  10004
  10005		em->start = cur_offset;
  10006		em->orig_start = cur_offset;
  10007		em->len = ins.offset;
  10008		em->block_start = ins.objectid;
  10009		em->block_len = ins.offset;
  10010		em->orig_block_len = ins.offset;
  10011		em->ram_bytes = ins.offset;
  10012		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
  10013		em->generation = trans->transid;
  10014
  10015		while (1) {
  10016			write_lock(&em_tree->lock);
  10017			ret = add_extent_mapping(em_tree, em, 1);
  10018			write_unlock(&em_tree->lock);
  10019			if (ret != -EEXIST)
  10020				break;
  10021			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
  10022						cur_offset + ins.offset - 1,
  10023						0);
  10024		}
  10025		free_extent_map(em);
  10026next:
  10027		num_bytes -= ins.offset;
  10028		cur_offset += ins.offset;
  10029		*alloc_hint = ins.objectid + ins.offset;
  10030
  10031		inode_inc_iversion(inode);
  10032		inode->i_ctime = current_time(inode);
  10033		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
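        		/*
        		 * Unless FALLOC_FL_KEEP_SIZE was requested, move i_size
        		 * forward as extents are allocated, but never past the
        		 * requested actual_len.
        		 */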
  10034		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
  10035		    (actual_len > inode->i_size) &&
  10036		    (cur_offset > inode->i_size)) {
  10037			if (cur_offset > actual_len)
  10038				i_size = actual_len;
  10039			else
  10040				i_size = cur_offset;
  10041			i_size_write(inode, i_size);
  10042			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
  10043		}
  10044
  10045		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
  10046
  10047		if (ret) {
  10048			btrfs_abort_transaction(trans, ret);
  10049			if (own_trans)
  10050				btrfs_end_transaction(trans);
  10051			break;
  10052		}
  10053
  10054		if (own_trans) {
  10055			btrfs_end_transaction(trans);
  10056			trans = NULL;
  10057		}
  10058	}
  10059	if (clear_offset < end)
  10060		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
  10061			end - clear_offset + 1);
  10062	return ret;
  10063}
  10064
  10065int btrfs_prealloc_file_range(struct inode *inode, int mode,
  10066			      u64 start, u64 num_bytes, u64 min_size,
  10067			      loff_t actual_len, u64 *alloc_hint)
  10068{
  10069	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
  10070					   min_size, actual_len, alloc_hint,
  10071					   NULL);
  10072}
  10073
  10074int btrfs_prealloc_file_range_trans(struct inode *inode,
  10075				    struct btrfs_trans_handle *trans, int mode,
  10076				    u64 start, u64 num_bytes, u64 min_size,
  10077				    loff_t actual_len, u64 *alloc_hint)
  10078{
  10079	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
  10080					   min_size, actual_len, alloc_hint, trans);
  10081}
  10082
  10083static int btrfs_permission(struct user_namespace *mnt_userns,
  10084			    struct inode *inode, int mask)
  10085{
  10086	struct btrfs_root *root = BTRFS_I(inode)->root;
  10087	umode_t mode = inode->i_mode;
  10088
  10089	if (mask & MAY_WRITE &&
  10090	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
  10091		if (btrfs_root_readonly(root))
  10092			return -EROFS;
  10093		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
  10094			return -EACCES;
  10095	}
  10096	return generic_permission(mnt_userns, inode, mask);
  10097}
  10098
  10099static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
  10100			 struct dentry *dentry, umode_t mode)
  10101{
  10102	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
  10103	struct btrfs_trans_handle *trans;
  10104	struct btrfs_root *root = BTRFS_I(dir)->root;
  10105	struct inode *inode;
  10106	struct btrfs_new_inode_args new_inode_args = {
  10107		.dir = dir,
  10108		.dentry = dentry,
  10109		.orphan = true,
  10110	};
  10111	unsigned int trans_num_items;
  10112	int ret;
  10113
  10114	inode = new_inode(dir->i_sb);
  10115	if (!inode)
  10116		return -ENOMEM;
  10117	inode_init_owner(mnt_userns, inode, dir, mode);
  10118	inode->i_fop = &btrfs_file_operations;
  10119	inode->i_op = &btrfs_file_inode_operations;
  10120	inode->i_mapping->a_ops = &btrfs_aops;
  10121
  10122	new_inode_args.inode = inode;
  10123	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
  10124	if (ret)
  10125		goto out_inode;
  10126
  10127	trans = btrfs_start_transaction(root, trans_num_items);
  10128	if (IS_ERR(trans)) {
  10129		ret = PTR_ERR(trans);
  10130		goto out_new_inode_args;
  10131	}
  10132
  10133	ret = btrfs_create_new_inode(trans, &new_inode_args);
  10134
  10135	/*
   10136	 * We set the number of links to 0 in btrfs_create_new_inode(), and here we
  10137	 * set it to 1 because d_tmpfile() will issue a warning if the count is
  10138	 * 0, through:
  10139	 *
  10140	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
  10141	 */
  10142	set_nlink(inode, 1);
  10143
  10144	if (!ret) {
  10145		d_tmpfile(dentry, inode);
  10146		unlock_new_inode(inode);
  10147		mark_inode_dirty(inode);
  10148	}
  10149
  10150	btrfs_end_transaction(trans);
  10151	btrfs_btree_balance_dirty(fs_info);
  10152out_new_inode_args:
  10153	btrfs_new_inode_args_destroy(&new_inode_args);
  10154out_inode:
  10155	if (ret)
  10156		iput(inode);
  10157	return ret;
  10158}
  10159
  10160void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
  10161{
  10162	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  10163	unsigned long index = start >> PAGE_SHIFT;
  10164	unsigned long end_index = end >> PAGE_SHIFT;
  10165	struct page *page;
  10166	u32 len;
  10167
  10168	ASSERT(end + 1 - start <= U32_MAX);
  10169	len = end + 1 - start;
  10170	while (index <= end_index) {
  10171		page = find_get_page(inode->vfs_inode.i_mapping, index);
  10172		ASSERT(page); /* Pages should be in the extent_io_tree */
  10173
  10174		btrfs_page_set_writeback(fs_info, page, start, len);
  10175		put_page(page);
  10176		index++;
  10177	}
  10178}
  10179
  10180static int btrfs_encoded_io_compression_from_extent(
  10181				struct btrfs_fs_info *fs_info,
  10182				int compress_type)
  10183{
  10184	switch (compress_type) {
  10185	case BTRFS_COMPRESS_NONE:
  10186		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
  10187	case BTRFS_COMPRESS_ZLIB:
  10188		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
  10189	case BTRFS_COMPRESS_LZO:
  10190		/*
  10191		 * The LZO format depends on the sector size. 64K is the maximum
  10192		 * sector size that we support.
  10193		 */
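        		/*
        		 * sectorsize_bits is log2(sectorsize), so a 4K sector size
        		 * (sectorsize_bits == 12) maps to
        		 * BTRFS_ENCODED_IO_COMPRESSION_LZO_4K and a 64K sector size
        		 * (sectorsize_bits == 16) maps to LZO_4K + 4, i.e. LZO_64K.
        		 */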
  10194		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
  10195			return -EINVAL;
  10196		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
  10197		       (fs_info->sectorsize_bits - 12);
  10198	case BTRFS_COMPRESS_ZSTD:
  10199		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
  10200	default:
  10201		return -EUCLEAN;
  10202	}
  10203}
  10204
  10205static ssize_t btrfs_encoded_read_inline(
  10206				struct kiocb *iocb,
  10207				struct iov_iter *iter, u64 start,
  10208				u64 lockend,
  10209				struct extent_state **cached_state,
  10210				u64 extent_start, size_t count,
  10211				struct btrfs_ioctl_encoded_io_args *encoded,
  10212				bool *unlocked)
  10213{
  10214	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
  10215	struct btrfs_root *root = inode->root;
  10216	struct btrfs_fs_info *fs_info = root->fs_info;
  10217	struct extent_io_tree *io_tree = &inode->io_tree;
  10218	struct btrfs_path *path;
  10219	struct extent_buffer *leaf;
  10220	struct btrfs_file_extent_item *item;
  10221	u64 ram_bytes;
  10222	unsigned long ptr;
  10223	void *tmp;
  10224	ssize_t ret;
  10225
  10226	path = btrfs_alloc_path();
  10227	if (!path) {
  10228		ret = -ENOMEM;
  10229		goto out;
  10230	}
  10231	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
  10232				       extent_start, 0);
  10233	if (ret) {
  10234		if (ret > 0) {
  10235			/* The extent item disappeared? */
  10236			ret = -EIO;
  10237		}
  10238		goto out;
  10239	}
  10240	leaf = path->nodes[0];
  10241	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
  10242
  10243	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
  10244	ptr = btrfs_file_extent_inline_start(item);
  10245
  10246	encoded->len = min_t(u64, extent_start + ram_bytes,
  10247			     inode->vfs_inode.i_size) - iocb->ki_pos;
  10248	ret = btrfs_encoded_io_compression_from_extent(fs_info,
  10249				 btrfs_file_extent_compression(leaf, item));
  10250	if (ret < 0)
  10251		goto out;
  10252	encoded->compression = ret;
  10253	if (encoded->compression) {
  10254		size_t inline_size;
  10255
  10256		inline_size = btrfs_file_extent_inline_item_len(leaf,
  10257								path->slots[0]);
  10258		if (inline_size > count) {
  10259			ret = -ENOBUFS;
  10260			goto out;
  10261		}
  10262		count = inline_size;
  10263		encoded->unencoded_len = ram_bytes;
  10264		encoded->unencoded_offset = iocb->ki_pos - extent_start;
  10265	} else {
  10266		count = min_t(u64, count, encoded->len);
  10267		encoded->len = count;
  10268		encoded->unencoded_len = count;
  10269		ptr += iocb->ki_pos - extent_start;
  10270	}
  10271
  10272	tmp = kmalloc(count, GFP_NOFS);
  10273	if (!tmp) {
  10274		ret = -ENOMEM;
  10275		goto out;
  10276	}
  10277	read_extent_buffer(leaf, tmp, ptr, count);
  10278	btrfs_release_path(path);
  10279	unlock_extent_cached(io_tree, start, lockend, cached_state);
  10280	btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
  10281	*unlocked = true;
  10282
  10283	ret = copy_to_iter(tmp, count, iter);
  10284	if (ret != count)
  10285		ret = -EFAULT;
  10286	kfree(tmp);
  10287out:
  10288	btrfs_free_path(path);
  10289	return ret;
  10290}
  10291
  10292struct btrfs_encoded_read_private {
  10293	struct btrfs_inode *inode;
  10294	u64 file_offset;
  10295	wait_queue_head_t wait;
  10296	atomic_t pending;
  10297	blk_status_t status;
  10298	bool skip_csum;
  10299};
  10300
  10301static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
  10302					    struct bio *bio, int mirror_num)
  10303{
  10304	struct btrfs_encoded_read_private *priv = bio->bi_private;
  10305	struct btrfs_bio *bbio = btrfs_bio(bio);
  10306	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  10307	blk_status_t ret;
  10308
  10309	if (!priv->skip_csum) {
  10310		ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
  10311		if (ret)
  10312			return ret;
  10313	}
  10314
  10315	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
  10316	if (ret) {
  10317		btrfs_bio_free_csum(bbio);
  10318		return ret;
  10319	}
  10320
  10321	atomic_inc(&priv->pending);
  10322	ret = btrfs_map_bio(fs_info, bio, mirror_num);
  10323	if (ret) {
  10324		atomic_dec(&priv->pending);
  10325		btrfs_bio_free_csum(bbio);
  10326	}
  10327	return ret;
  10328}
  10329
  10330static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
  10331{
  10332	const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
  10333	struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
  10334	struct btrfs_inode *inode = priv->inode;
  10335	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  10336	u32 sectorsize = fs_info->sectorsize;
  10337	struct bio_vec *bvec;
  10338	struct bvec_iter_all iter_all;
  10339	u64 start = priv->file_offset;
  10340	u32 bio_offset = 0;
  10341
  10342	if (priv->skip_csum || !uptodate)
  10343		return bbio->bio.bi_status;
  10344
  10345	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
  10346		unsigned int i, nr_sectors, pgoff;
  10347
  10348		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
  10349		pgoff = bvec->bv_offset;
  10350		for (i = 0; i < nr_sectors; i++) {
  10351			ASSERT(pgoff < PAGE_SIZE);
  10352			if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
  10353					    bvec->bv_page, pgoff, start))
  10354				return BLK_STS_IOERR;
  10355			start += sectorsize;
  10356			bio_offset += sectorsize;
  10357			pgoff += sectorsize;
  10358		}
  10359	}
  10360	return BLK_STS_OK;
  10361}
  10362
  10363static void btrfs_encoded_read_endio(struct bio *bio)
  10364{
  10365	struct btrfs_encoded_read_private *priv = bio->bi_private;
  10366	struct btrfs_bio *bbio = btrfs_bio(bio);
  10367	blk_status_t status;
  10368
  10369	status = btrfs_encoded_read_verify_csum(bbio);
  10370	if (status) {
  10371		/*
  10372		 * The memory barrier implied by the atomic_dec_return() here
  10373		 * pairs with the memory barrier implied by the
  10374		 * atomic_dec_return() or io_wait_event() in
  10375		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
  10376		 * write is observed before the load of status in
  10377		 * btrfs_encoded_read_regular_fill_pages().
  10378		 */
  10379		WRITE_ONCE(priv->status, status);
  10380	}
  10381	if (!atomic_dec_return(&priv->pending))
  10382		wake_up(&priv->wait);
  10383	btrfs_bio_free_csum(bbio);
  10384	bio_put(bio);
  10385}
  10386
  10387static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
  10388						 u64 file_offset,
  10389						 u64 disk_bytenr,
  10390						 u64 disk_io_size,
  10391						 struct page **pages)
  10392{
  10393	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  10394	struct btrfs_encoded_read_private priv = {
  10395		.inode = inode,
  10396		.file_offset = file_offset,
  10397		.pending = ATOMIC_INIT(1),
  10398		.skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
  10399	};
  10400	unsigned long i = 0;
  10401	u64 cur = 0;
  10402	int ret;
  10403
  10404	init_waitqueue_head(&priv.wait);
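        	/*
        	 * priv.pending starts at 1 as the submitter's own reference; each
        	 * submitted bio takes another, and the final atomic_dec_return()
        	 * below drops ours before waiting for in-flight bios to finish.
        	 */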
  10405	/*
  10406	 * Submit bios for the extent, splitting due to bio or stripe limits as
  10407	 * necessary.
  10408	 */
  10409	while (cur < disk_io_size) {
  10410		struct extent_map *em;
  10411		struct btrfs_io_geometry geom;
  10412		struct bio *bio = NULL;
  10413		u64 remaining;
  10414
  10415		em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
  10416					 disk_io_size - cur);
  10417		if (IS_ERR(em)) {
  10418			ret = PTR_ERR(em);
  10419		} else {
  10420			ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
  10421						    disk_bytenr + cur, &geom);
  10422			free_extent_map(em);
  10423		}
  10424		if (ret) {
  10425			WRITE_ONCE(priv.status, errno_to_blk_status(ret));
  10426			break;
  10427		}
  10428		remaining = min(geom.len, disk_io_size - cur);
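        		/*
        		 * Add pages to the bio until it is full or this stripe is
        		 * exhausted (bytes == 0), then submit it and start a new one.
        		 */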
  10429		while (bio || remaining) {
  10430			size_t bytes = min_t(u64, remaining, PAGE_SIZE);
  10431
  10432			if (!bio) {
  10433				bio = btrfs_bio_alloc(BIO_MAX_VECS);
  10434				bio->bi_iter.bi_sector =
  10435					(disk_bytenr + cur) >> SECTOR_SHIFT;
  10436				bio->bi_end_io = btrfs_encoded_read_endio;
  10437				bio->bi_private = &priv;
  10438				bio->bi_opf = REQ_OP_READ;
  10439			}
  10440
  10441			if (!bytes ||
  10442			    bio_add_page(bio, pages[i], bytes, 0) < bytes) {
  10443				blk_status_t status;
  10444
  10445				status = submit_encoded_read_bio(inode, bio, 0);
  10446				if (status) {
  10447					WRITE_ONCE(priv.status, status);
  10448					bio_put(bio);
  10449					goto out;
  10450				}
  10451				bio = NULL;
  10452				continue;
  10453			}
  10454
  10455			i++;
  10456			cur += bytes;
  10457			remaining -= bytes;
  10458		}
  10459	}
  10460
  10461out:
  10462	if (atomic_dec_return(&priv.pending))
  10463		io_wait_event(priv.wait, !atomic_read(&priv.pending));
  10464	/* See btrfs_encoded_read_endio() for ordering. */
  10465	return blk_status_to_errno(READ_ONCE(priv.status));
  10466}
  10467
  10468static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
  10469					  struct iov_iter *iter,
  10470					  u64 start, u64 lockend,
  10471					  struct extent_state **cached_state,
  10472					  u64 disk_bytenr, u64 disk_io_size,
  10473					  size_t count, bool compressed,
  10474					  bool *unlocked)
  10475{
  10476	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
  10477	struct extent_io_tree *io_tree = &inode->io_tree;
  10478	struct page **pages;
  10479	unsigned long nr_pages, i;
  10480	u64 cur;
  10481	size_t page_offset;
  10482	ssize_t ret;
  10483
  10484	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
  10485	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
  10486	if (!pages)
  10487		return -ENOMEM;
  10488	ret = btrfs_alloc_page_array(nr_pages, pages);
  10489	if (ret) {
  10490		ret = -ENOMEM;
  10491		goto out;
   10492	}
  10493
  10494	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
  10495						    disk_io_size, pages);
  10496	if (ret)
  10497		goto out;
  10498
  10499	unlock_extent_cached(io_tree, start, lockend, cached_state);
  10500	btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
  10501	*unlocked = true;
  10502
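        	/*
        	 * A compressed extent is returned as the raw on-disk bytes
        	 * starting at page 0; otherwise start copying at the offset of
        	 * iocb->ki_pos within the range we read.
        	 */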
  10503	if (compressed) {
  10504		i = 0;
  10505		page_offset = 0;
  10506	} else {
  10507		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
  10508		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
  10509	}
  10510	cur = 0;
  10511	while (cur < count) {
  10512		size_t bytes = min_t(size_t, count - cur,
  10513				     PAGE_SIZE - page_offset);
  10514
  10515		if (copy_page_to_iter(pages[i], page_offset, bytes,
  10516				      iter) != bytes) {
  10517			ret = -EFAULT;
  10518			goto out;
  10519		}
  10520		i++;
  10521		cur += bytes;
  10522		page_offset = 0;
  10523	}
  10524	ret = count;
  10525out:
  10526	for (i = 0; i < nr_pages; i++) {
  10527		if (pages[i])
  10528			__free_page(pages[i]);
  10529	}
  10530	kfree(pages);
  10531	return ret;
  10532}
  10533
  10534ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
  10535			   struct btrfs_ioctl_encoded_io_args *encoded)
  10536{
  10537	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
  10538	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  10539	struct extent_io_tree *io_tree = &inode->io_tree;
  10540	ssize_t ret;
  10541	size_t count = iov_iter_count(iter);
  10542	u64 start, lockend, disk_bytenr, disk_io_size;
  10543	struct extent_state *cached_state = NULL;
  10544	struct extent_map *em;
  10545	bool unlocked = false;
  10546
  10547	file_accessed(iocb->ki_filp);
  10548
  10549	btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
  10550
  10551	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
  10552		btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
  10553		return 0;
  10554	}
  10555	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
  10556	/*
  10557	 * We don't know how long the extent containing iocb->ki_pos is, but if
  10558	 * it's compressed we know that it won't be longer than this.
  10559	 */
  10560	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
  10561
  10562	for (;;) {
  10563		struct btrfs_ordered_extent *ordered;
  10564
  10565		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
  10566					       lockend - start + 1);
  10567		if (ret)
  10568			goto out_unlock_inode;
  10569		lock_extent_bits(io_tree, start, lockend, &cached_state);
  10570		ordered = btrfs_lookup_ordered_range(inode, start,
  10571						     lockend - start + 1);
  10572		if (!ordered)
  10573			break;
  10574		btrfs_put_ordered_extent(ordered);
  10575		unlock_extent_cached(io_tree, start, lockend, &cached_state);
  10576		cond_resched();
  10577	}
  10578
  10579	em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
  10580	if (IS_ERR(em)) {
  10581		ret = PTR_ERR(em);
  10582		goto out_unlock_extent;
  10583	}
  10584
  10585	if (em->block_start == EXTENT_MAP_INLINE) {
  10586		u64 extent_start = em->start;
  10587
  10588		/*
  10589		 * For inline extents we get everything we need out of the
  10590		 * extent item.
  10591		 */
  10592		free_extent_map(em);
  10593		em = NULL;
  10594		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
  10595						&cached_state, extent_start,
  10596						count, encoded, &unlocked);
  10597		goto out;
  10598	}
  10599
  10600	/*
  10601	 * We only want to return up to EOF even if the extent extends beyond
  10602	 * that.
  10603	 */
  10604	encoded->len = min_t(u64, extent_map_end(em),
  10605			     inode->vfs_inode.i_size) - iocb->ki_pos;
  10606	if (em->block_start == EXTENT_MAP_HOLE ||
  10607	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
  10608		disk_bytenr = EXTENT_MAP_HOLE;
  10609		count = min_t(u64, count, encoded->len);
  10610		encoded->len = count;
  10611		encoded->unencoded_len = count;
  10612	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
  10613		disk_bytenr = em->block_start;
  10614		/*
  10615		 * Bail if the buffer isn't large enough to return the whole
  10616		 * compressed extent.
  10617		 */
  10618		if (em->block_len > count) {
  10619			ret = -ENOBUFS;
  10620			goto out_em;
  10621		}
  10622		disk_io_size = count = em->block_len;
  10623		encoded->unencoded_len = em->ram_bytes;
  10624		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
  10625		ret = btrfs_encoded_io_compression_from_extent(fs_info,
  10626							     em->compress_type);
  10627		if (ret < 0)
  10628			goto out_em;
  10629		encoded->compression = ret;
  10630	} else {
  10631		disk_bytenr = em->block_start + (start - em->start);
  10632		if (encoded->len > count)
  10633			encoded->len = count;
  10634		/*
  10635		 * Don't read beyond what we locked. This also limits the page
  10636		 * allocations that we'll do.
  10637		 */
  10638		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
  10639		count = start + disk_io_size - iocb->ki_pos;
  10640		encoded->len = count;
  10641		encoded->unencoded_len = count;
  10642		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
  10643	}
  10644	free_extent_map(em);
  10645	em = NULL;
  10646
  10647	if (disk_bytenr == EXTENT_MAP_HOLE) {
  10648		unlock_extent_cached(io_tree, start, lockend, &cached_state);
  10649		btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
  10650		unlocked = true;
  10651		ret = iov_iter_zero(count, iter);
  10652		if (ret != count)
  10653			ret = -EFAULT;
  10654	} else {
  10655		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
  10656						 &cached_state, disk_bytenr,
  10657						 disk_io_size, count,
  10658						 encoded->compression,
  10659						 &unlocked);
  10660	}
  10661
  10662out:
  10663	if (ret >= 0)
  10664		iocb->ki_pos += encoded->len;
  10665out_em:
  10666	free_extent_map(em);
  10667out_unlock_extent:
  10668	if (!unlocked)
  10669		unlock_extent_cached(io_tree, start, lockend, &cached_state);
  10670out_unlock_inode:
  10671	if (!unlocked)
  10672		btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
  10673	return ret;
  10674}
  10675
  10676ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
  10677			       const struct btrfs_ioctl_encoded_io_args *encoded)
  10678{
  10679	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
  10680	struct btrfs_root *root = inode->root;
  10681	struct btrfs_fs_info *fs_info = root->fs_info;
  10682	struct extent_io_tree *io_tree = &inode->io_tree;
  10683	struct extent_changeset *data_reserved = NULL;
  10684	struct extent_state *cached_state = NULL;
  10685	int compression;
  10686	size_t orig_count;
  10687	u64 start, end;
  10688	u64 num_bytes, ram_bytes, disk_num_bytes;
  10689	unsigned long nr_pages, i;
  10690	struct page **pages;
  10691	struct btrfs_key ins;
  10692	bool extent_reserved = false;
  10693	struct extent_map *em;
  10694	ssize_t ret;
  10695
  10696	switch (encoded->compression) {
  10697	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
  10698		compression = BTRFS_COMPRESS_ZLIB;
  10699		break;
  10700	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
  10701		compression = BTRFS_COMPRESS_ZSTD;
  10702		break;
  10703	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
  10704	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
  10705	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
  10706	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
  10707	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
  10708		/* The sector size must match for LZO. */
  10709		if (encoded->compression -
  10710		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
  10711		    fs_info->sectorsize_bits)
  10712			return -EINVAL;
  10713		compression = BTRFS_COMPRESS_LZO;
  10714		break;
  10715	default:
  10716		return -EINVAL;
  10717	}
  10718	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
  10719		return -EINVAL;
  10720
  10721	orig_count = iov_iter_count(from);
  10722
  10723	/* The extent size must be sane. */
  10724	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
  10725	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
  10726		return -EINVAL;
  10727
  10728	/*
  10729	 * The compressed data must be smaller than the decompressed data.
  10730	 *
  10731	 * It's of course possible for data to compress to larger or the same
  10732	 * size, but the buffered I/O path falls back to no compression for such
  10733	 * data, and we don't want to break any assumptions by creating these
  10734	 * extents.
  10735	 *
  10736	 * Note that this is less strict than the current check we have that the
  10737	 * compressed data must be at least one sector smaller than the
  10738	 * decompressed data. We only want to enforce the weaker requirement
  10739	 * from old kernels that it is at least one byte smaller.
  10740	 */
  10741	if (orig_count >= encoded->unencoded_len)
  10742		return -EINVAL;
  10743
  10744	/* The extent must start on a sector boundary. */
  10745	start = iocb->ki_pos;
  10746	if (!IS_ALIGNED(start, fs_info->sectorsize))
  10747		return -EINVAL;
  10748
  10749	/*
  10750	 * The extent must end on a sector boundary. However, we allow a write
  10751	 * which ends at or extends i_size to have an unaligned length; we round
  10752	 * up the extent size and set i_size to the unaligned end.
  10753	 */
  10754	if (start + encoded->len < inode->vfs_inode.i_size &&
  10755	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
  10756		return -EINVAL;
  10757
  10758	/* Finally, the offset in the unencoded data must be sector-aligned. */
  10759	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
  10760		return -EINVAL;
  10761
  10762	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
  10763	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
  10764	end = start + num_bytes - 1;
  10765
  10766	/*
  10767	 * If the extent cannot be inline, the compressed data on disk must be
  10768	 * sector-aligned. For convenience, we extend it with zeroes if it
  10769	 * isn't.
  10770	 */
  10771	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
  10772	nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
  10773	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
  10774	if (!pages)
  10775		return -ENOMEM;
  10776	for (i = 0; i < nr_pages; i++) {
  10777		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
  10778		char *kaddr;
  10779
  10780		pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
  10781		if (!pages[i]) {
  10782			ret = -ENOMEM;
  10783			goto out_pages;
  10784		}
  10785		kaddr = kmap(pages[i]);
  10786		if (copy_from_iter(kaddr, bytes, from) != bytes) {
  10787			kunmap(pages[i]);
  10788			ret = -EFAULT;
  10789			goto out_pages;
  10790		}
  10791		if (bytes < PAGE_SIZE)
  10792			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
  10793		kunmap(pages[i]);
  10794	}
  10795
  10796	for (;;) {
  10797		struct btrfs_ordered_extent *ordered;
  10798
  10799		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
  10800		if (ret)
  10801			goto out_pages;
  10802		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
  10803						    start >> PAGE_SHIFT,
  10804						    end >> PAGE_SHIFT);
  10805		if (ret)
  10806			goto out_pages;
  10807		lock_extent_bits(io_tree, start, end, &cached_state);
  10808		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
  10809		if (!ordered &&
  10810		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
  10811			break;
  10812		if (ordered)
  10813			btrfs_put_ordered_extent(ordered);
  10814		unlock_extent_cached(io_tree, start, end, &cached_state);
  10815		cond_resched();
  10816	}
  10817
  10818	/*
  10819	 * We don't use the higher-level delalloc space functions because our
  10820	 * num_bytes and disk_num_bytes are different.
  10821	 */
  10822	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
  10823	if (ret)
  10824		goto out_unlock;
  10825	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
  10826	if (ret)
  10827		goto out_free_data_space;
  10828	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
  10829					      false);
  10830	if (ret)
  10831		goto out_qgroup_free_data;
  10832
  10833	/* Try an inline extent first. */
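        	/*
        	 * cow_file_range_inline() returns 0 if it stored the data inline,
        	 * a negative errno on error, and > 0 if the data cannot be
        	 * inlined, in which case we fall through and write a regular
        	 * compressed extent.
        	 */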
  10834	if (start == 0 && encoded->unencoded_len == encoded->len &&
  10835	    encoded->unencoded_offset == 0) {
  10836		ret = cow_file_range_inline(inode, encoded->len, orig_count,
  10837					    compression, pages, true);
  10838		if (ret <= 0) {
  10839			if (ret == 0)
  10840				ret = orig_count;
  10841			goto out_delalloc_release;
  10842		}
  10843	}
  10844
  10845	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
  10846				   disk_num_bytes, 0, 0, &ins, 1, 1);
  10847	if (ret)
  10848		goto out_delalloc_release;
  10849	extent_reserved = true;
  10850
  10851	em = create_io_em(inode, start, num_bytes,
  10852			  start - encoded->unencoded_offset, ins.objectid,
  10853			  ins.offset, ins.offset, ram_bytes, compression,
  10854			  BTRFS_ORDERED_COMPRESSED);
  10855	if (IS_ERR(em)) {
  10856		ret = PTR_ERR(em);
  10857		goto out_free_reserved;
  10858	}
  10859	free_extent_map(em);
  10860
  10861	ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
  10862				       ins.objectid, ins.offset,
  10863				       encoded->unencoded_offset,
  10864				       (1 << BTRFS_ORDERED_ENCODED) |
  10865				       (1 << BTRFS_ORDERED_COMPRESSED),
  10866				       compression);
  10867	if (ret) {
  10868		btrfs_drop_extent_cache(inode, start, end, 0);
  10869		goto out_free_reserved;
  10870	}
  10871	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
  10872
  10873	if (start + encoded->len > inode->vfs_inode.i_size)
  10874		i_size_write(&inode->vfs_inode, start + encoded->len);
  10875
  10876	unlock_extent_cached(io_tree, start, end, &cached_state);
  10877
  10878	btrfs_delalloc_release_extents(inode, num_bytes);
  10879
  10880	if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
  10881					  ins.offset, pages, nr_pages, 0, NULL,
  10882					  false)) {
  10883		btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
  10884		ret = -EIO;
  10885		goto out_pages;
  10886	}
  10887	ret = orig_count;
  10888	goto out;
  10889
  10890out_free_reserved:
  10891	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
  10892	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
  10893out_delalloc_release:
  10894	btrfs_delalloc_release_extents(inode, num_bytes);
  10895	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
  10896out_qgroup_free_data:
  10897	if (ret < 0)
  10898		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
  10899out_free_data_space:
  10900	/*
  10901	 * If btrfs_reserve_extent() succeeded, then we already decremented
  10902	 * bytes_may_use.
  10903	 */
  10904	if (!extent_reserved)
  10905		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
  10906out_unlock:
  10907	unlock_extent_cached(io_tree, start, end, &cached_state);
  10908out_pages:
  10909	for (i = 0; i < nr_pages; i++) {
  10910		if (pages[i])
  10911			__free_page(pages[i]);
  10912	}
  10913	kvfree(pages);
  10914out:
  10915	if (ret >= 0)
  10916		iocb->ki_pos += encoded->len;
  10917	return ret;
  10918}
  10919
  10920#ifdef CONFIG_SWAP
  10921/*
  10922 * Add an entry indicating a block group or device which is pinned by a
  10923 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
  10924 * negative errno on failure.
  10925 */
  10926static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
  10927				  bool is_block_group)
  10928{
  10929	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
  10930	struct btrfs_swapfile_pin *sp, *entry;
  10931	struct rb_node **p;
  10932	struct rb_node *parent = NULL;
  10933
  10934	sp = kmalloc(sizeof(*sp), GFP_NOFS);
  10935	if (!sp)
  10936		return -ENOMEM;
  10937	sp->ptr = ptr;
  10938	sp->inode = inode;
  10939	sp->is_block_group = is_block_group;
  10940	sp->bg_extent_count = 1;
  10941
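        	/*
        	 * The tree is keyed by (ptr, inode).  An existing entry is not
        	 * duplicated: a block group entry just gets bg_extent_count
        	 * bumped and 1 is returned.
        	 */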
  10942	spin_lock(&fs_info->swapfile_pins_lock);
  10943	p = &fs_info->swapfile_pins.rb_node;
  10944	while (*p) {
  10945		parent = *p;
  10946		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
  10947		if (sp->ptr < entry->ptr ||
  10948		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
  10949			p = &(*p)->rb_left;
  10950		} else if (sp->ptr > entry->ptr ||
  10951			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
  10952			p = &(*p)->rb_right;
  10953		} else {
  10954			if (is_block_group)
  10955				entry->bg_extent_count++;
  10956			spin_unlock(&fs_info->swapfile_pins_lock);
  10957			kfree(sp);
  10958			return 1;
  10959		}
  10960	}
  10961	rb_link_node(&sp->node, parent, p);
  10962	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
  10963	spin_unlock(&fs_info->swapfile_pins_lock);
  10964	return 0;
  10965}
  10966
  10967/* Free all of the entries pinned by this swapfile. */
  10968static void btrfs_free_swapfile_pins(struct inode *inode)
  10969{
  10970	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
  10971	struct btrfs_swapfile_pin *sp;
  10972	struct rb_node *node, *next;
  10973
  10974	spin_lock(&fs_info->swapfile_pins_lock);
  10975	node = rb_first(&fs_info->swapfile_pins);
  10976	while (node) {
  10977		next = rb_next(node);
  10978		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
  10979		if (sp->inode == inode) {
  10980			rb_erase(&sp->node, &fs_info->swapfile_pins);
  10981			if (sp->is_block_group) {
  10982				btrfs_dec_block_group_swap_extents(sp->ptr,
  10983							   sp->bg_extent_count);
  10984				btrfs_put_block_group(sp->ptr);
  10985			}
  10986			kfree(sp);
  10987		}
  10988		node = next;
  10989	}
  10990	spin_unlock(&fs_info->swapfile_pins_lock);
  10991}
  10992
  10993struct btrfs_swap_info {
  10994	u64 start;
  10995	u64 block_start;
  10996	u64 block_len;
  10997	u64 lowest_ppage;
  10998	u64 highest_ppage;
  10999	unsigned long nr_pages;
  11000	int nr_extents;
  11001};
  11002
  11003static int btrfs_add_swap_extent(struct swap_info_struct *sis,
  11004				 struct btrfs_swap_info *bsi)
  11005{
  11006	unsigned long nr_pages;
  11007	unsigned long max_pages;
  11008	u64 first_ppage, first_ppage_reported, next_ppage;
  11009	int ret;
  11010
  11011	/*
  11012	 * Our swapfile may have had its size extended after the swap header was
  11013	 * written. In that case activating the swapfile should not go beyond
  11014	 * the max size set in the swap header.
  11015	 */
  11016	if (bsi->nr_pages >= sis->max)
  11017		return 0;
  11018
  11019	max_pages = sis->max - bsi->nr_pages;
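        	/*
        	 * Round the extent start up and the end down to page boundaries
        	 * so only pages fully contained in the extent are handed to the
        	 * swap layer.
        	 */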
  11020	first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
  11021	next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
  11022				PAGE_SIZE) >> PAGE_SHIFT;
  11023
  11024	if (first_ppage >= next_ppage)
  11025		return 0;
  11026	nr_pages = next_ppage - first_ppage;
  11027	nr_pages = min(nr_pages, max_pages);
  11028
  11029	first_ppage_reported = first_ppage;
  11030	if (bsi->start == 0)
  11031		first_ppage_reported++;
  11032	if (bsi->lowest_ppage > first_ppage_reported)
  11033		bsi->lowest_ppage = first_ppage_reported;
  11034	if (bsi->highest_ppage < (next_ppage - 1))
  11035		bsi->highest_ppage = next_ppage - 1;
  11036
  11037	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
  11038	if (ret < 0)
  11039		return ret;
  11040	bsi->nr_extents += ret;
  11041	bsi->nr_pages += nr_pages;
  11042	return 0;
  11043}
  11044
  11045static void btrfs_swap_deactivate(struct file *file)
  11046{
  11047	struct inode *inode = file_inode(file);
  11048
  11049	btrfs_free_swapfile_pins(inode);
  11050	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
  11051}
  11052
  11053static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
  11054			       sector_t *span)
  11055{
  11056	struct inode *inode = file_inode(file);
  11057	struct btrfs_root *root = BTRFS_I(inode)->root;
  11058	struct btrfs_fs_info *fs_info = root->fs_info;
  11059	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
  11060	struct extent_state *cached_state = NULL;
  11061	struct extent_map *em = NULL;
  11062	struct btrfs_device *device = NULL;
  11063	struct btrfs_swap_info bsi = {
  11064		.lowest_ppage = (sector_t)-1ULL,
  11065	};
  11066	int ret = 0;
  11067	u64 isize;
  11068	u64 start;
  11069
  11070	/*
  11071	 * If the swap file was just created, make sure delalloc is done. If the
  11072	 * file changes again after this, the user is doing something stupid and
  11073	 * we don't really care.
  11074	 */
  11075	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
  11076	if (ret)
  11077		return ret;
  11078
  11079	/*
  11080	 * The inode is locked, so these flags won't change after we check them.
  11081	 */
  11082	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
  11083		btrfs_warn(fs_info, "swapfile must not be compressed");
  11084		return -EINVAL;
  11085	}
  11086	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
  11087		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
  11088		return -EINVAL;
  11089	}
  11090	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
  11091		btrfs_warn(fs_info, "swapfile must not be checksummed");
  11092		return -EINVAL;
  11093	}
  11094
  11095	/*
  11096	 * Balance or device remove/replace/resize can move stuff around from
  11097	 * under us. The exclop protection makes sure they aren't running/won't
  11098	 * run concurrently while we are mapping the swap extents, and
  11099	 * fs_info->swapfile_pins prevents them from running while the swap
  11100	 * file is active and moving the extents. Note that this also prevents
  11101	 * a concurrent device add, which isn't actually necessary, but it's not
  11102	 * really worth the trouble to allow it.
  11103	 */
  11104	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
  11105		btrfs_warn(fs_info,
  11106	   "cannot activate swapfile while exclusive operation is running");
  11107		return -EBUSY;
  11108	}
  11109
  11110	/*
  11111	 * Prevent snapshot creation while we are activating the swap file.
  11112	 * We do not want to race with snapshot creation. If snapshot creation
  11113	 * already started before we bumped nr_swapfiles from 0 to 1 and
  11114	 * completes before the first write into the swap file after it is
  11115	 * activated, then that write would fall back to COW.
  11116	 */
  11117	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
  11118		btrfs_exclop_finish(fs_info);
  11119		btrfs_warn(fs_info,
  11120	   "cannot activate swapfile because snapshot creation is in progress");
  11121		return -EINVAL;
  11122	}
  11123	/*
  11124	 * Snapshots can create extents which require COW even if NODATACOW is
  11125	 * set. We use this counter to prevent snapshots. We must increment it
  11126	 * before walking the extents because we don't want a concurrent
  11127	 * snapshot to run after we've already checked the extents.
  11128	 *
  11129	 * It is possible that the subvolume is marked for deletion but not yet
  11130	 * removed. To prevent this race, we check the root status before
  11131	 * activating the swapfile.
  11132	 */
  11133	spin_lock(&root->root_item_lock);
  11134	if (btrfs_root_dead(root)) {
  11135		spin_unlock(&root->root_item_lock);
  11136
  11137		btrfs_exclop_finish(fs_info);
  11138		btrfs_warn(fs_info,
  11139		"cannot activate swapfile because subvolume %llu is being deleted",
  11140			root->root_key.objectid);
  11141		return -EPERM;
  11142	}
  11143	atomic_inc(&root->nr_swapfiles);
  11144	spin_unlock(&root->root_item_lock);
  11145
  11146	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
  11147
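       	/*
       	 * Walk the file's extents under the extent lock, checking that each
       	 * one is plain, uncompressed NOCOW data on a single device, and
       	 * coalesce physically contiguous extents into swap extents as we go.
       	 */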
  11148	lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
  11149	start = 0;
  11150	while (start < isize) {
  11151		u64 logical_block_start, physical_block_start;
  11152		struct btrfs_block_group *bg;
  11153		u64 len = isize - start;
  11154
  11155		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
  11156		if (IS_ERR(em)) {
  11157			ret = PTR_ERR(em);
  11158			goto out;
  11159		}
  11160
  11161		if (em->block_start == EXTENT_MAP_HOLE) {
  11162			btrfs_warn(fs_info, "swapfile must not have holes");
  11163			ret = -EINVAL;
  11164			goto out;
  11165		}
  11166		if (em->block_start == EXTENT_MAP_INLINE) {
  11167			/*
  11168			 * It's unlikely we'll ever actually find ourselves
  11169			 * here, as a file small enough to fit inline won't be
  11170			 * big enough to store more than the swap header, but in
  11171			 * case something changes in the future, let's catch it
  11172			 * here rather than later.
  11173			 */
  11174			btrfs_warn(fs_info, "swapfile must not be inline");
  11175			ret = -EINVAL;
  11176			goto out;
  11177		}
  11178		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
  11179			btrfs_warn(fs_info, "swapfile must not be compressed");
  11180			ret = -EINVAL;
  11181			goto out;
  11182		}
  11183
  11184		logical_block_start = em->block_start + (start - em->start);
  11185		len = min(len, em->len - (start - em->start));
  11186		free_extent_map(em);
  11187		em = NULL;
  11188
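		/*
		 * can_nocow_extent() returns > 0 when writes to this range can
		 * go to the existing extent in place; 0 means a write would
		 * have to fall back to COW (e.g. the extent is shared), which
		 * is not acceptable for a swap file.
		 */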
  11189		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
  11190		if (ret < 0) {
  11191			goto out;
  11192		} else if (ret) {
  11193			ret = 0;
  11194		} else {
  11195			btrfs_warn(fs_info,
  11196				   "swapfile must not be copy-on-write");
  11197			ret = -EINVAL;
  11198			goto out;
  11199		}
  11200
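		/*
		 * Swap I/O bypasses the filesystem and goes straight to the
		 * block device, so the data must map 1:1 to a single device:
		 * no RAID/DUP profile and every extent on the same device.
		 */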
  11201		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
  11202		if (IS_ERR(em)) {
  11203			ret = PTR_ERR(em);
  11204			goto out;
  11205		}
  11206
  11207		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
  11208			btrfs_warn(fs_info,
  11209				   "swapfile must have single data profile");
  11210			ret = -EINVAL;
  11211			goto out;
  11212		}
  11213
  11214		if (device == NULL) {
  11215			device = em->map_lookup->stripes[0].dev;
  11216			ret = btrfs_add_swapfile_pin(inode, device, false);
  11217			if (ret == 1)
  11218				ret = 0;
  11219			else if (ret)
  11220				goto out;
  11221		} else if (device != em->map_lookup->stripes[0].dev) {
  11222			btrfs_warn(fs_info, "swapfile must be on one device");
  11223			ret = -EINVAL;
  11224			goto out;
  11225		}
  11226
  11227		physical_block_start = (em->map_lookup->stripes[0].physical +
  11228					(logical_block_start - em->start));
  11229		len = min(len, em->len - (logical_block_start - em->start));
  11230		free_extent_map(em);
  11231		em = NULL;
  11232
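		/*
		 * Pin the block group so that balance and friends can't
		 * relocate or remove the chunk backing this part of the swap
		 * file while it is active (see fs_info->swapfile_pins above).
		 */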
  11233		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
  11234		if (!bg) {
  11235			btrfs_warn(fs_info,
  11236			   "could not find block group containing swapfile");
  11237			ret = -EINVAL;
  11238			goto out;
  11239		}
  11240
  11241		if (!btrfs_inc_block_group_swap_extents(bg)) {
  11242			btrfs_warn(fs_info,
  11243			   "block group for swapfile at %llu is read-only%s",
  11244			   bg->start,
  11245			   atomic_read(&fs_info->scrubs_running) ?
  11246				       " (scrub running)" : "");
  11247			btrfs_put_block_group(bg);
  11248			ret = -EINVAL;
  11249			goto out;
  11250		}
  11251
  11252		ret = btrfs_add_swapfile_pin(inode, bg, true);
  11253		if (ret) {
  11254			btrfs_put_block_group(bg);
  11255			if (ret == 1)
  11256				ret = 0;
  11257			else
  11258				goto out;
  11259		}
  11260
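		/*
		 * If this chunk of the file is physically contiguous with the
		 * run we're building, just extend the run; otherwise flush the
		 * finished run to the swap core and start a new one here.
		 */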
  11261		if (bsi.block_len &&
  11262		    bsi.block_start + bsi.block_len == physical_block_start) {
  11263			bsi.block_len += len;
  11264		} else {
  11265			if (bsi.block_len) {
  11266				ret = btrfs_add_swap_extent(sis, &bsi);
  11267				if (ret)
  11268					goto out;
  11269			}
  11270			bsi.start = start;
  11271			bsi.block_start = physical_block_start;
  11272			bsi.block_len = len;
  11273		}
  11274
  11275		start += len;
  11276	}
  11277
  11278	if (bsi.block_len)
  11279		ret = btrfs_add_swap_extent(sis, &bsi);
  11280
  11281out:
  11282	if (!IS_ERR_OR_NULL(em))
  11283		free_extent_map(em);
  11284
  11285	unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
  11286
  11287	if (ret)
  11288		btrfs_swap_deactivate(file);
  11289
  11290	btrfs_drew_write_unlock(&root->snapshot_lock);
  11291
  11292	btrfs_exclop_finish(fs_info);
  11293
  11294	if (ret)
  11295		return ret;
  11296
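       	/*
       	 * Report the final mapping to the swap code: the backing device, the
       	 * span of physical pages used, the total number of pages and the
       	 * usable count, which excludes page 0 since it holds the swap header.
       	 */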
  11297	if (device)
  11298		sis->bdev = device->bdev;
  11299	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
  11300	sis->max = bsi.nr_pages;
  11301	sis->pages = bsi.nr_pages - 1;
  11302	sis->highest_bit = bsi.nr_pages - 1;
  11303	return bsi.nr_extents;
  11304}
  11305#else
  11306static void btrfs_swap_deactivate(struct file *file)
  11307{
  11308}
  11309
  11310static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
  11311			       sector_t *span)
  11312{
  11313	return -EOPNOTSUPP;
  11314}
  11315#endif
  11316
  11317/*
  11318 * Update the number of bytes used in the VFS' inode. When we replace extents in
  11319 * a range (clone, dedupe, fallocate's zero range), we must update the number of
  11320 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
  11321 * always get a correct value.
  11322 */
  11323void btrfs_update_inode_bytes(struct btrfs_inode *inode,
  11324			      const u64 add_bytes,
  11325			      const u64 del_bytes)
  11326{
  11327	if (add_bytes == del_bytes)
  11328		return;
  11329
  11330	spin_lock(&inode->lock);
  11331	if (del_bytes > 0)
  11332		inode_sub_bytes(&inode->vfs_inode, del_bytes);
  11333	if (add_bytes > 0)
  11334		inode_add_bytes(&inode->vfs_inode, add_bytes);
  11335	spin_unlock(&inode->lock);
  11336}
  11337
  11338/**
  11339 * Verify that there are no ordered extents for a given file range.
  11340 *
  11341 * @inode:   The target inode.
  11342 * @start:   Start offset of the file range, should be sector size aligned.
  11343 * @end:     End offset (inclusive) of the file range, its value +1 should be
  11344 *           sector size aligned.
  11345 *
  11346 * This should typically be used for cases where we have locked the inode's
  11347 * VFS lock in exclusive mode, locked the inode's i_mmap_lock in exclusive
  11348 * mode, flushed all delalloc in the range, waited for all ordered extents in
  11349 * the range to complete, and finally locked the file range in the inode's
  11350 * io_tree.
  11351 */
  11352void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
  11353{
  11354	struct btrfs_root *root = inode->root;
  11355	struct btrfs_ordered_extent *ordered;
  11356
  11357	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
  11358		return;
  11359
  11360	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
  11361	if (ordered) {
  11362		btrfs_err(root->fs_info,
  11363"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
  11364			  start, end, btrfs_ino(inode), root->root_key.objectid,
  11365			  ordered->file_offset,
  11366			  ordered->file_offset + ordered->num_bytes - 1);
  11367		btrfs_put_ordered_extent(ordered);
  11368	}
  11369
  11370	ASSERT(ordered == NULL);
  11371}
  11372
  11373static const struct inode_operations btrfs_dir_inode_operations = {
  11374	.getattr	= btrfs_getattr,
  11375	.lookup		= btrfs_lookup,
  11376	.create		= btrfs_create,
  11377	.unlink		= btrfs_unlink,
  11378	.link		= btrfs_link,
  11379	.mkdir		= btrfs_mkdir,
  11380	.rmdir		= btrfs_rmdir,
  11381	.rename		= btrfs_rename2,
  11382	.symlink	= btrfs_symlink,
  11383	.setattr	= btrfs_setattr,
  11384	.mknod		= btrfs_mknod,
  11385	.listxattr	= btrfs_listxattr,
  11386	.permission	= btrfs_permission,
  11387	.get_acl	= btrfs_get_acl,
  11388	.set_acl	= btrfs_set_acl,
  11389	.update_time	= btrfs_update_time,
  11390	.tmpfile        = btrfs_tmpfile,
  11391	.fileattr_get	= btrfs_fileattr_get,
  11392	.fileattr_set	= btrfs_fileattr_set,
  11393};
  11394
  11395static const struct file_operations btrfs_dir_file_operations = {
  11396	.llseek		= generic_file_llseek,
  11397	.read		= generic_read_dir,
  11398	.iterate_shared	= btrfs_real_readdir,
  11399	.open		= btrfs_opendir,
  11400	.unlocked_ioctl	= btrfs_ioctl,
  11401#ifdef CONFIG_COMPAT
  11402	.compat_ioctl	= btrfs_compat_ioctl,
  11403#endif
  11404	.release        = btrfs_release_file,
  11405	.fsync		= btrfs_sync_file,
  11406};
  11407
  11408/*
  11409 * btrfs doesn't support the bmap operation because swapfiles
  11410 * use bmap to make a mapping of extents in the file.  They assume
  11411 * these extents won't change over the life of the file and they
  11412 * use the bmap result to do IO directly to the drive.
  11413 *
  11414 * The btrfs bmap call would return logical addresses that aren't
  11415 * suitable for IO and they also will change frequently as COW
  11416 * operations happen.  So, swapfile + btrfs == corruption.
  11417 *
  11418 * For now we're avoiding this by dropping bmap.
  11419 */
  11420static const struct address_space_operations btrfs_aops = {
  11421	.read_folio	= btrfs_read_folio,
  11422	.writepage	= btrfs_writepage,
  11423	.writepages	= btrfs_writepages,
  11424	.readahead	= btrfs_readahead,
  11425	.direct_IO	= noop_direct_IO,
  11426	.invalidate_folio = btrfs_invalidate_folio,
  11427	.release_folio	= btrfs_release_folio,
  11428#ifdef CONFIG_MIGRATION
  11429	.migratepage	= btrfs_migratepage,
  11430#endif
  11431	.dirty_folio	= filemap_dirty_folio,
  11432	.error_remove_page = generic_error_remove_page,
  11433	.swap_activate	= btrfs_swap_activate,
  11434	.swap_deactivate = btrfs_swap_deactivate,
  11435};
  11436
  11437static const struct inode_operations btrfs_file_inode_operations = {
  11438	.getattr	= btrfs_getattr,
  11439	.setattr	= btrfs_setattr,
  11440	.listxattr      = btrfs_listxattr,
  11441	.permission	= btrfs_permission,
  11442	.fiemap		= btrfs_fiemap,
  11443	.get_acl	= btrfs_get_acl,
  11444	.set_acl	= btrfs_set_acl,
  11445	.update_time	= btrfs_update_time,
  11446	.fileattr_get	= btrfs_fileattr_get,
  11447	.fileattr_set	= btrfs_fileattr_set,
  11448};
  11449static const struct inode_operations btrfs_special_inode_operations = {
  11450	.getattr	= btrfs_getattr,
  11451	.setattr	= btrfs_setattr,
  11452	.permission	= btrfs_permission,
  11453	.listxattr	= btrfs_listxattr,
  11454	.get_acl	= btrfs_get_acl,
  11455	.set_acl	= btrfs_set_acl,
  11456	.update_time	= btrfs_update_time,
  11457};
  11458static const struct inode_operations btrfs_symlink_inode_operations = {
  11459	.get_link	= page_get_link,
  11460	.getattr	= btrfs_getattr,
  11461	.setattr	= btrfs_setattr,
  11462	.permission	= btrfs_permission,
  11463	.listxattr	= btrfs_listxattr,
  11464	.update_time	= btrfs_update_time,
  11465};
  11466
  11467const struct dentry_operations btrfs_dentry_operations = {
  11468	.d_delete	= btrfs_dentry_delete,
  11469};