cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

file.c (107947B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2007 Oracle.  All rights reserved.
      4 */
      5
      6#include <linux/fs.h>
      7#include <linux/pagemap.h>
      8#include <linux/time.h>
      9#include <linux/init.h>
     10#include <linux/string.h>
     11#include <linux/backing-dev.h>
     12#include <linux/falloc.h>
     13#include <linux/writeback.h>
     14#include <linux/compat.h>
     15#include <linux/slab.h>
     16#include <linux/btrfs.h>
     17#include <linux/uio.h>
     18#include <linux/iversion.h>
     19#include <linux/fsverity.h>
     20#include "ctree.h"
     21#include "disk-io.h"
     22#include "transaction.h"
     23#include "btrfs_inode.h"
     24#include "print-tree.h"
     25#include "tree-log.h"
     26#include "locking.h"
     27#include "volumes.h"
     28#include "qgroup.h"
     29#include "compression.h"
     30#include "delalloc-space.h"
     31#include "reflink.h"
     32#include "subpage.h"
     33
     34static struct kmem_cache *btrfs_inode_defrag_cachep;
     35/*
     36 * When auto defrag is enabled we
     37 * queue up these defrag structs to remember which
     38 * inodes need defragging passes.
     39 */
     40struct inode_defrag {
     41	struct rb_node rb_node;
     42	/* objectid */
     43	u64 ino;
     44	/*
     45	 * transid where the defrag was added, we search for
     46	 * extents newer than this
     47	 */
     48	u64 transid;
     49
     50	/* root objectid */
     51	u64 root;
     52
     53	/*
     54	 * The extent size threshold for autodefrag.
     55	 *
     56	 * This value is different for compressed/non-compressed extents,
     57	 * thus needs to be passed from higher layer.
     58	 * (aka, inode_should_defrag())
     59	 */
     60	u32 extent_thresh;
     61};
     62
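       /*
        * Order two defrag records by root objectid first, then inode number.
        * This is the ordering used for the fs_info->defrag_inodes rbtree.
        */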
     63static int __compare_inode_defrag(struct inode_defrag *defrag1,
     64				  struct inode_defrag *defrag2)
     65{
     66	if (defrag1->root > defrag2->root)
     67		return 1;
     68	else if (defrag1->root < defrag2->root)
     69		return -1;
     70	else if (defrag1->ino > defrag2->ino)
     71		return 1;
     72	else if (defrag1->ino < defrag2->ino)
     73		return -1;
     74	else
     75		return 0;
     76}
     77
     78/* Insert a record for an inode into the defrag tree.  The lock
     79 * must already be held.
     80 *
     81 * If you're inserting a record for an older transid than an
     82 * existing record, the transid already in the tree is lowered.
     83 *
     84 * If an existing record is found, -EEXIST is returned and the
     85 * caller must free the defrag item passed in.
     86 */
     87static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
     88				    struct inode_defrag *defrag)
     89{
     90	struct btrfs_fs_info *fs_info = inode->root->fs_info;
     91	struct inode_defrag *entry;
     92	struct rb_node **p;
     93	struct rb_node *parent = NULL;
     94	int ret;
     95
     96	p = &fs_info->defrag_inodes.rb_node;
     97	while (*p) {
     98		parent = *p;
     99		entry = rb_entry(parent, struct inode_defrag, rb_node);
    100
    101		ret = __compare_inode_defrag(defrag, entry);
    102		if (ret < 0)
    103			p = &parent->rb_left;
    104		else if (ret > 0)
    105			p = &parent->rb_right;
    106		else {
    107			/* if we're reinserting an entry for
    108			 * an old defrag run, make sure to
    109			 * lower the transid of our existing record
    110			 */
    111			if (defrag->transid < entry->transid)
    112				entry->transid = defrag->transid;
    113			entry->extent_thresh = min(defrag->extent_thresh,
    114						   entry->extent_thresh);
    115			return -EEXIST;
    116		}
    117	}
    118	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
    119	rb_link_node(&defrag->rb_node, parent, p);
    120	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
    121	return 0;
    122}
    123
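       /*
        * Autodefrag only runs while the autodefrag mount option is set and
        * the filesystem is not in the process of being closed.
        */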
    124static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
    125{
    126	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
    127		return 0;
    128
    129	if (btrfs_fs_closing(fs_info))
    130		return 0;
    131
    132	return 1;
    133}
    134
    135/*
    136 * insert a defrag record for this inode if auto defrag is
    137 * enabled
    138 */
    139int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
    140			   struct btrfs_inode *inode, u32 extent_thresh)
    141{
    142	struct btrfs_root *root = inode->root;
    143	struct btrfs_fs_info *fs_info = root->fs_info;
    144	struct inode_defrag *defrag;
    145	u64 transid;
    146	int ret;
    147
    148	if (!__need_auto_defrag(fs_info))
    149		return 0;
    150
    151	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
    152		return 0;
    153
    154	if (trans)
    155		transid = trans->transid;
    156	else
    157		transid = inode->root->last_trans;
    158
    159	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
    160	if (!defrag)
    161		return -ENOMEM;
    162
    163	defrag->ino = btrfs_ino(inode);
    164	defrag->transid = transid;
    165	defrag->root = root->root_key.objectid;
    166	defrag->extent_thresh = extent_thresh;
    167
    168	spin_lock(&fs_info->defrag_inodes_lock);
    169	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
    170		/*
    171		 * If we set IN_DEFRAG flag and evict the inode from memory,
    172		 * and then re-read this inode, the new in-memory inode won't have
    173		 * the IN_DEFRAG flag. In that case, we may find an existing defrag record.
    174		 */
    175		ret = __btrfs_add_inode_defrag(inode, defrag);
    176		if (ret)
    177			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
    178	} else {
    179		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
    180	}
    181	spin_unlock(&fs_info->defrag_inodes_lock);
    182	return 0;
    183}
    184
    185/*
    186 * Pick the defraggable inode that we want; if it doesn't exist, we will get
    187 * the next one.
    188 */
    189static struct inode_defrag *
    190btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
    191{
    192	struct inode_defrag *entry = NULL;
    193	struct inode_defrag tmp;
    194	struct rb_node *p;
    195	struct rb_node *parent = NULL;
    196	int ret;
    197
    198	tmp.ino = ino;
    199	tmp.root = root;
    200
    201	spin_lock(&fs_info->defrag_inodes_lock);
    202	p = fs_info->defrag_inodes.rb_node;
    203	while (p) {
    204		parent = p;
    205		entry = rb_entry(parent, struct inode_defrag, rb_node);
    206
    207		ret = __compare_inode_defrag(&tmp, entry);
    208		if (ret < 0)
    209			p = parent->rb_left;
    210		else if (ret > 0)
    211			p = parent->rb_right;
    212		else
    213			goto out;
    214	}
    215
    216	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
    217		parent = rb_next(parent);
    218		if (parent)
    219			entry = rb_entry(parent, struct inode_defrag, rb_node);
    220		else
    221			entry = NULL;
    222	}
    223out:
    224	if (entry)
    225		rb_erase(parent, &fs_info->defrag_inodes);
    226	spin_unlock(&fs_info->defrag_inodes_lock);
    227	return entry;
    228}
    229
    230void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
    231{
    232	struct inode_defrag *defrag;
    233	struct rb_node *node;
    234
    235	spin_lock(&fs_info->defrag_inodes_lock);
    236	node = rb_first(&fs_info->defrag_inodes);
    237	while (node) {
    238		rb_erase(node, &fs_info->defrag_inodes);
    239		defrag = rb_entry(node, struct inode_defrag, rb_node);
    240		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
    241
    242		cond_resched_lock(&fs_info->defrag_inodes_lock);
    243
    244		node = rb_first(&fs_info->defrag_inodes);
    245	}
    246	spin_unlock(&fs_info->defrag_inodes_lock);
    247}
    248
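       /*
        * Bound on how much a single btrfs_defrag_file() call processes on
        * behalf of the autodefrag loop below (passed as its last argument).
        */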
    249#define BTRFS_DEFRAG_BATCH	1024
    250
    251static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
    252				    struct inode_defrag *defrag)
    253{
    254	struct btrfs_root *inode_root;
    255	struct inode *inode;
    256	struct btrfs_ioctl_defrag_range_args range;
    257	int ret = 0;
    258	u64 cur = 0;
    259
    260again:
    261	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
    262		goto cleanup;
    263	if (!__need_auto_defrag(fs_info))
    264		goto cleanup;
    265
    266	/* get the inode */
    267	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
    268	if (IS_ERR(inode_root)) {
    269		ret = PTR_ERR(inode_root);
    270		goto cleanup;
    271	}
    272
    273	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
    274	btrfs_put_root(inode_root);
    275	if (IS_ERR(inode)) {
    276		ret = PTR_ERR(inode);
    277		goto cleanup;
    278	}
    279
    280	if (cur >= i_size_read(inode)) {
    281		iput(inode);
    282		goto cleanup;
    283	}
    284
    285	/* do a chunk of defrag */
    286	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
    287	memset(&range, 0, sizeof(range));
    288	range.len = (u64)-1;
    289	range.start = cur;
    290	range.extent_thresh = defrag->extent_thresh;
    291
    292	sb_start_write(fs_info->sb);
    293	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
    294				       BTRFS_DEFRAG_BATCH);
    295	sb_end_write(fs_info->sb);
    296	iput(inode);
    297
    298	if (ret < 0)
    299		goto cleanup;
    300
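       	/*
       	 * Advance to the next chunk: at least one sector past the previous
       	 * start, or to wherever range.start points after the defrag call,
       	 * whichever is further.
       	 */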
    301	cur = max(cur + fs_info->sectorsize, range.start);
    302	goto again;
    303
    304cleanup:
    305	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
    306	return ret;
    307}
    308
    309/*
    310 * run through the list of inodes in the FS that need
    311 * defragging
    312 */
    313int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
    314{
    315	struct inode_defrag *defrag;
    316	u64 first_ino = 0;
    317	u64 root_objectid = 0;
    318
    319	atomic_inc(&fs_info->defrag_running);
    320	while (1) {
    321		/* Pause the auto defragger. */
    322		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
    323			     &fs_info->fs_state))
    324			break;
    325
    326		if (!__need_auto_defrag(fs_info))
    327			break;
    328
    329		/* find an inode to defrag */
    330		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
    331						 first_ino);
    332		if (!defrag) {
    333			if (root_objectid || first_ino) {
    334				root_objectid = 0;
    335				first_ino = 0;
    336				continue;
    337			} else {
    338				break;
    339			}
    340		}
    341
    342		first_ino = defrag->ino + 1;
    343		root_objectid = defrag->root;
    344
    345		__btrfs_run_defrag_inode(fs_info, defrag);
    346	}
    347	atomic_dec(&fs_info->defrag_running);
    348
    349	/*
    350	 * during unmount, we use the transaction_wait queue to
    351	 * wait for the defragger to stop
    352	 */
    353	wake_up(&fs_info->transaction_wait);
    354	return 0;
    355}
    356
    357/* simple helper to fault in pages and copy.  This should go away
    358 * and be replaced with calls into generic code.
    359 */
    360static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
    361					 struct page **prepared_pages,
    362					 struct iov_iter *i)
    363{
    364	size_t copied = 0;
    365	size_t total_copied = 0;
    366	int pg = 0;
    367	int offset = offset_in_page(pos);
    368
    369	while (write_bytes > 0) {
    370		size_t count = min_t(size_t,
    371				     PAGE_SIZE - offset, write_bytes);
    372		struct page *page = prepared_pages[pg];
    373		/*
    374		 * Copy data from userspace to the current page
    375		 */
    376		copied = copy_page_from_iter_atomic(page, offset, count, i);
    377
    378		/* Flush processor's dcache for this page */
    379		flush_dcache_page(page);
    380
    381		/*
    382		 * if we get a partial write, we can end up with
    383		 * partially up to date pages.  These add
    384		 * a lot of complexity, so make sure they don't
    385		 * happen by forcing this copy to be retried.
    386		 *
    387		 * The rest of the btrfs_file_write code will fall
    388		 * back to page at a time copies after we return 0.
    389		 */
    390		if (unlikely(copied < count)) {
    391			if (!PageUptodate(page)) {
    392				iov_iter_revert(i, copied);
    393				copied = 0;
    394			}
    395			if (!copied)
    396				break;
    397		}
    398
    399		write_bytes -= copied;
    400		total_copied += copied;
    401		offset += copied;
    402		if (offset == PAGE_SIZE) {
    403			pg++;
    404			offset = 0;
    405		}
    406	}
    407	return total_copied;
    408}
    409
    410/*
    411 * unlocks pages after btrfs_file_write is done with them
    412 */
    413static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
    414			     struct page **pages, size_t num_pages,
    415			     u64 pos, u64 copied)
    416{
    417	size_t i;
    418	u64 block_start = round_down(pos, fs_info->sectorsize);
    419	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
    420
    421	ASSERT(block_len <= U32_MAX);
    422	for (i = 0; i < num_pages; i++) {
    423		/* Page checked is some magic around finding pages that
    424		 * have been modified without going through btrfs_set_page_dirty;
    425		 * clear it here. There should be no need to mark the pages
    426		 * accessed, as prepare_pages() should have already marked them
    427		 * accessed via find_or_create_page().
    428		 */
    429		btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
    430					       block_len);
    431		unlock_page(pages[i]);
    432		put_page(pages[i]);
    433	}
    434}
    435
    436/*
    437 * After btrfs_copy_from_user(), update the following things for delalloc:
    438 * - Mark newly dirtied pages as DELALLOC in the io tree.
    439 *   Used to advise which range is to be written back.
    440 * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
    441 * - Update inode size for past EOF write
    442 */
    443int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
    444		      size_t num_pages, loff_t pos, size_t write_bytes,
    445		      struct extent_state **cached, bool noreserve)
    446{
    447	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    448	int err = 0;
    449	int i;
    450	u64 num_bytes;
    451	u64 start_pos;
    452	u64 end_of_last_block;
    453	u64 end_pos = pos + write_bytes;
    454	loff_t isize = i_size_read(&inode->vfs_inode);
    455	unsigned int extra_bits = 0;
    456
    457	if (write_bytes == 0)
    458		return 0;
    459
    460	if (noreserve)
    461		extra_bits |= EXTENT_NORESERVE;
    462
    463	start_pos = round_down(pos, fs_info->sectorsize);
    464	num_bytes = round_up(write_bytes + pos - start_pos,
    465			     fs_info->sectorsize);
    466	ASSERT(num_bytes <= U32_MAX);
    467
    468	end_of_last_block = start_pos + num_bytes - 1;
    469
    470	/*
    471	 * The pages may have already been dirty, clear out old accounting so
    472	 * we can set things up properly
    473	 */
    474	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
    475			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
    476			 0, 0, cached);
    477
    478	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
    479					extra_bits, cached);
    480	if (err)
    481		return err;
    482
    483	for (i = 0; i < num_pages; i++) {
    484		struct page *p = pages[i];
    485
    486		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
    487		btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
    488		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
    489	}
    490
    491	/*
    492	 * we've only changed i_size in ram, and we haven't updated
    493	 * the disk i_size.  There is no need to log the inode
    494	 * at this time.
    495	 */
    496	if (end_pos > isize)
    497		i_size_write(&inode->vfs_inode, end_pos);
    498	return 0;
    499}
    500
    501/*
    502 * this drops all the extents in the cache that intersect the range
    503 * [start, end].  Existing extents are split as required.
    504 */
    505void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
    506			     int skip_pinned)
    507{
    508	struct extent_map *em;
    509	struct extent_map *split = NULL;
    510	struct extent_map *split2 = NULL;
    511	struct extent_map_tree *em_tree = &inode->extent_tree;
    512	u64 len = end - start + 1;
    513	u64 gen;
    514	int ret;
    515	int testend = 1;
    516	unsigned long flags;
    517	int compressed = 0;
    518	bool modified;
    519
    520	WARN_ON(end < start);
    521	if (end == (u64)-1) {
    522		len = (u64)-1;
    523		testend = 0;
    524	}
    525	while (1) {
    526		int no_splits = 0;
    527
    528		modified = false;
    529		if (!split)
    530			split = alloc_extent_map();
    531		if (!split2)
    532			split2 = alloc_extent_map();
    533		if (!split || !split2)
    534			no_splits = 1;
    535
    536		write_lock(&em_tree->lock);
    537		em = lookup_extent_mapping(em_tree, start, len);
    538		if (!em) {
    539			write_unlock(&em_tree->lock);
    540			break;
    541		}
    542		flags = em->flags;
    543		gen = em->generation;
    544		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
    545			if (testend && em->start + em->len >= start + len) {
    546				free_extent_map(em);
    547				write_unlock(&em_tree->lock);
    548				break;
    549			}
    550			start = em->start + em->len;
    551			if (testend)
    552				len = start + len - (em->start + em->len);
    553			free_extent_map(em);
    554			write_unlock(&em_tree->lock);
    555			continue;
    556		}
    557		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
    558		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
    559		clear_bit(EXTENT_FLAG_LOGGING, &flags);
    560		modified = !list_empty(&em->list);
    561		if (no_splits)
    562			goto next;
    563
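       		/*
       		 * The extent map starts before the drop range: keep the front
       		 * part [em->start, start) by inserting it as a split mapping
       		 * in place of em.
       		 */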
    564		if (em->start < start) {
    565			split->start = em->start;
    566			split->len = start - em->start;
    567
    568			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
    569				split->orig_start = em->orig_start;
    570				split->block_start = em->block_start;
    571
    572				if (compressed)
    573					split->block_len = em->block_len;
    574				else
    575					split->block_len = split->len;
    576				split->orig_block_len = max(split->block_len,
    577						em->orig_block_len);
    578				split->ram_bytes = em->ram_bytes;
    579			} else {
    580				split->orig_start = split->start;
    581				split->block_len = 0;
    582				split->block_start = em->block_start;
    583				split->orig_block_len = 0;
    584				split->ram_bytes = split->len;
    585			}
    586
    587			split->generation = gen;
    588			split->flags = flags;
    589			split->compress_type = em->compress_type;
    590			replace_extent_mapping(em_tree, em, split, modified);
    591			free_extent_map(split);
    592			split = split2;
    593			split2 = NULL;
    594		}
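       		/*
       		 * The extent map extends past the end of the drop range: keep
       		 * the tail part starting at start + len as another split mapping.
       		 */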
    595		if (testend && em->start + em->len > start + len) {
    596			u64 diff = start + len - em->start;
    597
    598			split->start = start + len;
    599			split->len = em->start + em->len - (start + len);
    600			split->flags = flags;
    601			split->compress_type = em->compress_type;
    602			split->generation = gen;
    603
    604			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
    605				split->orig_block_len = max(em->block_len,
    606						    em->orig_block_len);
    607
    608				split->ram_bytes = em->ram_bytes;
    609				if (compressed) {
    610					split->block_len = em->block_len;
    611					split->block_start = em->block_start;
    612					split->orig_start = em->orig_start;
    613				} else {
    614					split->block_len = split->len;
    615					split->block_start = em->block_start
    616						+ diff;
    617					split->orig_start = em->orig_start;
    618				}
    619			} else {
    620				split->ram_bytes = split->len;
    621				split->orig_start = split->start;
    622				split->block_len = 0;
    623				split->block_start = em->block_start;
    624				split->orig_block_len = 0;
    625			}
    626
    627			if (extent_map_in_tree(em)) {
    628				replace_extent_mapping(em_tree, em, split,
    629						       modified);
    630			} else {
    631				ret = add_extent_mapping(em_tree, split,
    632							 modified);
    633				ASSERT(ret == 0); /* Logic error */
    634			}
    635			free_extent_map(split);
    636			split = NULL;
    637		}
    638next:
    639		if (extent_map_in_tree(em))
    640			remove_extent_mapping(em_tree, em);
    641		write_unlock(&em_tree->lock);
    642
    643		/* once for us */
    644		free_extent_map(em);
    645		/* once for the tree */
    646		free_extent_map(em);
    647	}
    648	if (split)
    649		free_extent_map(split);
    650	if (split2)
    651		free_extent_map(split2);
    652}
    653
    654/*
    655 * This is very complex, but the basic idea is to drop all extents
    656 * in the range args->start - args->end; the range and other options
    657 * are passed in via the args structure.
    658 *
    659 * If an extent intersects the range but is not entirely inside the range
    660 * it is either truncated or split.  Anything entirely inside the range
    661 * is deleted from the tree.
    662 *
    663 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
    664 * to deal with that. We set the field 'bytes_found' of the arguments structure
    665 * with the number of allocated bytes found in the target range, so that the
    666 * caller can update the inode's number of bytes in an atomic way when
    667 * replacing extents in a range to avoid races with stat(2).
    668 */
    669int btrfs_drop_extents(struct btrfs_trans_handle *trans,
    670		       struct btrfs_root *root, struct btrfs_inode *inode,
    671		       struct btrfs_drop_extents_args *args)
    672{
    673	struct btrfs_fs_info *fs_info = root->fs_info;
    674	struct extent_buffer *leaf;
    675	struct btrfs_file_extent_item *fi;
    676	struct btrfs_ref ref = { 0 };
    677	struct btrfs_key key;
    678	struct btrfs_key new_key;
    679	u64 ino = btrfs_ino(inode);
    680	u64 search_start = args->start;
    681	u64 disk_bytenr = 0;
    682	u64 num_bytes = 0;
    683	u64 extent_offset = 0;
    684	u64 extent_end = 0;
    685	u64 last_end = args->start;
    686	int del_nr = 0;
    687	int del_slot = 0;
    688	int extent_type;
    689	int recow;
    690	int ret;
    691	int modify_tree = -1;
    692	int update_refs;
    693	int found = 0;
    694	struct btrfs_path *path = args->path;
    695
    696	args->bytes_found = 0;
    697	args->extent_inserted = false;
    698
    699	/* Must always have a path if ->replace_extent is true */
    700	ASSERT(!(args->replace_extent && !args->path));
    701
    702	if (!path) {
    703		path = btrfs_alloc_path();
    704		if (!path) {
    705			ret = -ENOMEM;
    706			goto out;
    707		}
    708	}
    709
    710	if (args->drop_cache)
    711		btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
    712
    713	if (args->start >= inode->disk_i_size && !args->replace_extent)
    714		modify_tree = 0;
    715
    716	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
    717	while (1) {
    718		recow = 0;
    719		ret = btrfs_lookup_file_extent(trans, root, path, ino,
    720					       search_start, modify_tree);
    721		if (ret < 0)
    722			break;
    723		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
    724			leaf = path->nodes[0];
    725			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
    726			if (key.objectid == ino &&
    727			    key.type == BTRFS_EXTENT_DATA_KEY)
    728				path->slots[0]--;
    729		}
    730		ret = 0;
    731next_slot:
    732		leaf = path->nodes[0];
    733		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
    734			BUG_ON(del_nr > 0);
    735			ret = btrfs_next_leaf(root, path);
    736			if (ret < 0)
    737				break;
    738			if (ret > 0) {
    739				ret = 0;
    740				break;
    741			}
    742			leaf = path->nodes[0];
    743			recow = 1;
    744		}
    745
    746		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    747
    748		if (key.objectid > ino)
    749			break;
    750		if (WARN_ON_ONCE(key.objectid < ino) ||
    751		    key.type < BTRFS_EXTENT_DATA_KEY) {
    752			ASSERT(del_nr == 0);
    753			path->slots[0]++;
    754			goto next_slot;
    755		}
    756		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
    757			break;
    758
    759		fi = btrfs_item_ptr(leaf, path->slots[0],
    760				    struct btrfs_file_extent_item);
    761		extent_type = btrfs_file_extent_type(leaf, fi);
    762
    763		if (extent_type == BTRFS_FILE_EXTENT_REG ||
    764		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
    765			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
    766			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
    767			extent_offset = btrfs_file_extent_offset(leaf, fi);
    768			extent_end = key.offset +
    769				btrfs_file_extent_num_bytes(leaf, fi);
    770		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
    771			extent_end = key.offset +
    772				btrfs_file_extent_ram_bytes(leaf, fi);
    773		} else {
    774			/* can't happen */
    775			BUG();
    776		}
    777
    778		/*
    779		 * Don't skip extent items representing 0 byte lengths. They
    780		 * used to be created (due to a bug) when we hit an -ENOSPC
    781		 * condition while punching holes. So if we find one here, just
    782		 * ensure we delete it, otherwise we would insert a new file extent item
    783		 * with the same key (offset) as that 0 bytes length file
    784		 * extent item in the call to setup_items_for_insert() later
    785		 * in this function.
    786		 */
    787		if (extent_end == key.offset && extent_end >= search_start) {
    788			last_end = extent_end;
    789			goto delete_extent_item;
    790		}
    791
    792		if (extent_end <= search_start) {
    793			path->slots[0]++;
    794			goto next_slot;
    795		}
    796
    797		found = 1;
    798		search_start = max(key.offset, args->start);
    799		if (recow || !modify_tree) {
    800			modify_tree = -1;
    801			btrfs_release_path(path);
    802			continue;
    803		}
    804
    805		/*
    806		 *     | - range to drop - |
    807		 *  | -------- extent -------- |
    808		 */
    809		if (args->start > key.offset && args->end < extent_end) {
    810			BUG_ON(del_nr > 0);
    811			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
    812				ret = -EOPNOTSUPP;
    813				break;
    814			}
    815
    816			memcpy(&new_key, &key, sizeof(new_key));
    817			new_key.offset = args->start;
    818			ret = btrfs_duplicate_item(trans, root, path,
    819						   &new_key);
    820			if (ret == -EAGAIN) {
    821				btrfs_release_path(path);
    822				continue;
    823			}
    824			if (ret < 0)
    825				break;
    826
    827			leaf = path->nodes[0];
    828			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
    829					    struct btrfs_file_extent_item);
    830			btrfs_set_file_extent_num_bytes(leaf, fi,
    831							args->start - key.offset);
    832
    833			fi = btrfs_item_ptr(leaf, path->slots[0],
    834					    struct btrfs_file_extent_item);
    835
    836			extent_offset += args->start - key.offset;
    837			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
    838			btrfs_set_file_extent_num_bytes(leaf, fi,
    839							extent_end - args->start);
    840			btrfs_mark_buffer_dirty(leaf);
    841
    842			if (update_refs && disk_bytenr > 0) {
    843				btrfs_init_generic_ref(&ref,
    844						BTRFS_ADD_DELAYED_REF,
    845						disk_bytenr, num_bytes, 0);
    846				btrfs_init_data_ref(&ref,
    847						root->root_key.objectid,
    848						new_key.objectid,
    849						args->start - extent_offset,
    850						0, false);
    851				ret = btrfs_inc_extent_ref(trans, &ref);
    852				BUG_ON(ret); /* -ENOMEM */
    853			}
    854			key.offset = args->start;
    855		}
    856		/*
    857		 * From here on out we will have actually dropped something, so
    858		 * last_end can be updated.
    859		 */
    860		last_end = extent_end;
    861
    862		/*
    863		 *  | ---- range to drop ----- |
    864		 *      | -------- extent -------- |
    865		 */
    866		if (args->start <= key.offset && args->end < extent_end) {
    867			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
    868				ret = -EOPNOTSUPP;
    869				break;
    870			}
    871
    872			memcpy(&new_key, &key, sizeof(new_key));
    873			new_key.offset = args->end;
    874			btrfs_set_item_key_safe(fs_info, path, &new_key);
    875
    876			extent_offset += args->end - key.offset;
    877			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
    878			btrfs_set_file_extent_num_bytes(leaf, fi,
    879							extent_end - args->end);
    880			btrfs_mark_buffer_dirty(leaf);
    881			if (update_refs && disk_bytenr > 0)
    882				args->bytes_found += args->end - key.offset;
    883			break;
    884		}
    885
    886		search_start = extent_end;
    887		/*
    888		 *       | ---- range to drop ----- |
    889		 *  | -------- extent -------- |
    890		 */
    891		if (args->start > key.offset && args->end >= extent_end) {
    892			BUG_ON(del_nr > 0);
    893			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
    894				ret = -EOPNOTSUPP;
    895				break;
    896			}
    897
    898			btrfs_set_file_extent_num_bytes(leaf, fi,
    899							args->start - key.offset);
    900			btrfs_mark_buffer_dirty(leaf);
    901			if (update_refs && disk_bytenr > 0)
    902				args->bytes_found += extent_end - args->start;
    903			if (args->end == extent_end)
    904				break;
    905
    906			path->slots[0]++;
    907			goto next_slot;
    908		}
    909
    910		/*
    911		 *  | ---- range to drop ----- |
    912		 *    | ------ extent ------ |
    913		 */
    914		if (args->start <= key.offset && args->end >= extent_end) {
    915delete_extent_item:
    916			if (del_nr == 0) {
    917				del_slot = path->slots[0];
    918				del_nr = 1;
    919			} else {
    920				BUG_ON(del_slot + del_nr != path->slots[0]);
    921				del_nr++;
    922			}
    923
    924			if (update_refs &&
    925			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
    926				args->bytes_found += extent_end - key.offset;
    927				extent_end = ALIGN(extent_end,
    928						   fs_info->sectorsize);
    929			} else if (update_refs && disk_bytenr > 0) {
    930				btrfs_init_generic_ref(&ref,
    931						BTRFS_DROP_DELAYED_REF,
    932						disk_bytenr, num_bytes, 0);
    933				btrfs_init_data_ref(&ref,
    934						root->root_key.objectid,
    935						key.objectid,
    936						key.offset - extent_offset, 0,
    937						false);
    938				ret = btrfs_free_extent(trans, &ref);
    939				BUG_ON(ret); /* -ENOMEM */
    940				args->bytes_found += extent_end - key.offset;
    941			}
    942
    943			if (args->end == extent_end)
    944				break;
    945
    946			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
    947				path->slots[0]++;
    948				goto next_slot;
    949			}
    950
    951			ret = btrfs_del_items(trans, root, path, del_slot,
    952					      del_nr);
    953			if (ret) {
    954				btrfs_abort_transaction(trans, ret);
    955				break;
    956			}
    957
    958			del_nr = 0;
    959			del_slot = 0;
    960
    961			btrfs_release_path(path);
    962			continue;
    963		}
    964
    965		BUG();
    966	}
    967
    968	if (!ret && del_nr > 0) {
    969		/*
    970		 * Set path->slots[0] to first slot, so that after the delete
    971		 * if items are moved off from our leaf to its immediate left or
    972		 * right neighbor leaves, we end up with a correct and adjusted
    973		 * path->slots[0] for our insertion (if args->replace_extent).
    974		 */
    975		path->slots[0] = del_slot;
    976		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
    977		if (ret)
    978			btrfs_abort_transaction(trans, ret);
    979	}
    980
    981	leaf = path->nodes[0];
    982	/*
    983	 * If btrfs_del_items() was called, it might have deleted a leaf, in
    984	 * which case it unlocked our path, so check path->locks[0] matches a
    985	 * write lock.
    986	 */
    987	if (!ret && args->replace_extent &&
    988	    path->locks[0] == BTRFS_WRITE_LOCK &&
    989	    btrfs_leaf_free_space(leaf) >=
    990	    sizeof(struct btrfs_item) + args->extent_item_size) {
    991
    992		key.objectid = ino;
    993		key.type = BTRFS_EXTENT_DATA_KEY;
    994		key.offset = args->start;
    995		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
    996			struct btrfs_key slot_key;
    997
    998			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
    999			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
   1000				path->slots[0]++;
   1001		}
   1002		btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
   1003		args->extent_inserted = true;
   1004	}
   1005
   1006	if (!args->path)
   1007		btrfs_free_path(path);
   1008	else if (!args->extent_inserted)
   1009		btrfs_release_path(path);
   1010out:
   1011	args->drop_end = found ? min(args->end, last_end) : args->end;
   1012
   1013	return ret;
   1014}
   1015
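       /*
        * Check whether the file extent item at @slot belongs to @objectid, is a
        * plain (uncompressed, unencrypted) regular extent backed by the on-disk
        * extent at @bytenr whose data begins at file offset @orig_offset, and
        * matches any boundary already recorded in *start / *end.  On success,
        * *start and *end are updated to the item's file range and 1 is
        * returned, meaning the caller may merge with it.
        */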
   1016static int extent_mergeable(struct extent_buffer *leaf, int slot,
   1017			    u64 objectid, u64 bytenr, u64 orig_offset,
   1018			    u64 *start, u64 *end)
   1019{
   1020	struct btrfs_file_extent_item *fi;
   1021	struct btrfs_key key;
   1022	u64 extent_end;
   1023
   1024	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
   1025		return 0;
   1026
   1027	btrfs_item_key_to_cpu(leaf, &key, slot);
   1028	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
   1029		return 0;
   1030
   1031	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
   1032	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
   1033	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
   1034	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
   1035	    btrfs_file_extent_compression(leaf, fi) ||
   1036	    btrfs_file_extent_encryption(leaf, fi) ||
   1037	    btrfs_file_extent_other_encoding(leaf, fi))
   1038		return 0;
   1039
   1040	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
   1041	if ((*start && *start != key.offset) || (*end && *end != extent_end))
   1042		return 0;
   1043
   1044	*start = key.offset;
   1045	*end = extent_end;
   1046	return 1;
   1047}
   1048
   1049/*
   1050 * Mark extent in the range start - end as written.
   1051 *
   1052 * This changes extent type from 'pre-allocated' to 'regular'. If only
   1053 * part of extent is marked as written, the extent will be split into
   1054 * two or three.
   1055 */
   1056int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
   1057			      struct btrfs_inode *inode, u64 start, u64 end)
   1058{
   1059	struct btrfs_fs_info *fs_info = trans->fs_info;
   1060	struct btrfs_root *root = inode->root;
   1061	struct extent_buffer *leaf;
   1062	struct btrfs_path *path;
   1063	struct btrfs_file_extent_item *fi;
   1064	struct btrfs_ref ref = { 0 };
   1065	struct btrfs_key key;
   1066	struct btrfs_key new_key;
   1067	u64 bytenr;
   1068	u64 num_bytes;
   1069	u64 extent_end;
   1070	u64 orig_offset;
   1071	u64 other_start;
   1072	u64 other_end;
   1073	u64 split;
   1074	int del_nr = 0;
   1075	int del_slot = 0;
   1076	int recow;
   1077	int ret = 0;
   1078	u64 ino = btrfs_ino(inode);
   1079
   1080	path = btrfs_alloc_path();
   1081	if (!path)
   1082		return -ENOMEM;
   1083again:
   1084	recow = 0;
   1085	split = start;
   1086	key.objectid = ino;
   1087	key.type = BTRFS_EXTENT_DATA_KEY;
   1088	key.offset = split;
   1089
   1090	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
   1091	if (ret < 0)
   1092		goto out;
   1093	if (ret > 0 && path->slots[0] > 0)
   1094		path->slots[0]--;
   1095
   1096	leaf = path->nodes[0];
   1097	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   1098	if (key.objectid != ino ||
   1099	    key.type != BTRFS_EXTENT_DATA_KEY) {
   1100		ret = -EINVAL;
   1101		btrfs_abort_transaction(trans, ret);
   1102		goto out;
   1103	}
   1104	fi = btrfs_item_ptr(leaf, path->slots[0],
   1105			    struct btrfs_file_extent_item);
   1106	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
   1107		ret = -EINVAL;
   1108		btrfs_abort_transaction(trans, ret);
   1109		goto out;
   1110	}
   1111	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
   1112	if (key.offset > start || extent_end < end) {
   1113		ret = -EINVAL;
   1114		btrfs_abort_transaction(trans, ret);
   1115		goto out;
   1116	}
   1117
   1118	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
   1119	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
   1120	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
   1121	memcpy(&new_key, &key, sizeof(new_key));
   1122
   1123	if (start == key.offset && end < extent_end) {
   1124		other_start = 0;
   1125		other_end = start;
   1126		if (extent_mergeable(leaf, path->slots[0] - 1,
   1127				     ino, bytenr, orig_offset,
   1128				     &other_start, &other_end)) {
   1129			new_key.offset = end;
   1130			btrfs_set_item_key_safe(fs_info, path, &new_key);
   1131			fi = btrfs_item_ptr(leaf, path->slots[0],
   1132					    struct btrfs_file_extent_item);
   1133			btrfs_set_file_extent_generation(leaf, fi,
   1134							 trans->transid);
   1135			btrfs_set_file_extent_num_bytes(leaf, fi,
   1136							extent_end - end);
   1137			btrfs_set_file_extent_offset(leaf, fi,
   1138						     end - orig_offset);
   1139			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
   1140					    struct btrfs_file_extent_item);
   1141			btrfs_set_file_extent_generation(leaf, fi,
   1142							 trans->transid);
   1143			btrfs_set_file_extent_num_bytes(leaf, fi,
   1144							end - other_start);
   1145			btrfs_mark_buffer_dirty(leaf);
   1146			goto out;
   1147		}
   1148	}
   1149
   1150	if (start > key.offset && end == extent_end) {
   1151		other_start = end;
   1152		other_end = 0;
   1153		if (extent_mergeable(leaf, path->slots[0] + 1,
   1154				     ino, bytenr, orig_offset,
   1155				     &other_start, &other_end)) {
   1156			fi = btrfs_item_ptr(leaf, path->slots[0],
   1157					    struct btrfs_file_extent_item);
   1158			btrfs_set_file_extent_num_bytes(leaf, fi,
   1159							start - key.offset);
   1160			btrfs_set_file_extent_generation(leaf, fi,
   1161							 trans->transid);
   1162			path->slots[0]++;
   1163			new_key.offset = start;
   1164			btrfs_set_item_key_safe(fs_info, path, &new_key);
   1165
   1166			fi = btrfs_item_ptr(leaf, path->slots[0],
   1167					    struct btrfs_file_extent_item);
   1168			btrfs_set_file_extent_generation(leaf, fi,
   1169							 trans->transid);
   1170			btrfs_set_file_extent_num_bytes(leaf, fi,
   1171							other_end - start);
   1172			btrfs_set_file_extent_offset(leaf, fi,
   1173						     start - orig_offset);
   1174			btrfs_mark_buffer_dirty(leaf);
   1175			goto out;
   1176		}
   1177	}
   1178
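       	/*
       	 * The written range could not be merged with a neighbour, so split
       	 * the preallocated item around it: each pass duplicates the item at
       	 * 'split' and takes an extra reference on the underlying extent for
       	 * the new item.
       	 */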
   1179	while (start > key.offset || end < extent_end) {
   1180		if (key.offset == start)
   1181			split = end;
   1182
   1183		new_key.offset = split;
   1184		ret = btrfs_duplicate_item(trans, root, path, &new_key);
   1185		if (ret == -EAGAIN) {
   1186			btrfs_release_path(path);
   1187			goto again;
   1188		}
   1189		if (ret < 0) {
   1190			btrfs_abort_transaction(trans, ret);
   1191			goto out;
   1192		}
   1193
   1194		leaf = path->nodes[0];
   1195		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
   1196				    struct btrfs_file_extent_item);
   1197		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
   1198		btrfs_set_file_extent_num_bytes(leaf, fi,
   1199						split - key.offset);
   1200
   1201		fi = btrfs_item_ptr(leaf, path->slots[0],
   1202				    struct btrfs_file_extent_item);
   1203
   1204		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
   1205		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
   1206		btrfs_set_file_extent_num_bytes(leaf, fi,
   1207						extent_end - split);
   1208		btrfs_mark_buffer_dirty(leaf);
   1209
   1210		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
   1211				       num_bytes, 0);
   1212		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
   1213				    orig_offset, 0, false);
   1214		ret = btrfs_inc_extent_ref(trans, &ref);
   1215		if (ret) {
   1216			btrfs_abort_transaction(trans, ret);
   1217			goto out;
   1218		}
   1219
   1220		if (split == start) {
   1221			key.offset = start;
   1222		} else {
   1223			if (start != key.offset) {
   1224				ret = -EINVAL;
   1225				btrfs_abort_transaction(trans, ret);
   1226				goto out;
   1227			}
   1228			path->slots[0]--;
   1229			extent_end = end;
   1230		}
   1231		recow = 1;
   1232	}
   1233
   1234	other_start = end;
   1235	other_end = 0;
   1236	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
   1237			       num_bytes, 0);
   1238	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
   1239			    0, false);
   1240	if (extent_mergeable(leaf, path->slots[0] + 1,
   1241			     ino, bytenr, orig_offset,
   1242			     &other_start, &other_end)) {
   1243		if (recow) {
   1244			btrfs_release_path(path);
   1245			goto again;
   1246		}
   1247		extent_end = other_end;
   1248		del_slot = path->slots[0] + 1;
   1249		del_nr++;
   1250		ret = btrfs_free_extent(trans, &ref);
   1251		if (ret) {
   1252			btrfs_abort_transaction(trans, ret);
   1253			goto out;
   1254		}
   1255	}
   1256	other_start = 0;
   1257	other_end = start;
   1258	if (extent_mergeable(leaf, path->slots[0] - 1,
   1259			     ino, bytenr, orig_offset,
   1260			     &other_start, &other_end)) {
   1261		if (recow) {
   1262			btrfs_release_path(path);
   1263			goto again;
   1264		}
   1265		key.offset = other_start;
   1266		del_slot = path->slots[0];
   1267		del_nr++;
   1268		ret = btrfs_free_extent(trans, &ref);
   1269		if (ret) {
   1270			btrfs_abort_transaction(trans, ret);
   1271			goto out;
   1272		}
   1273	}
   1274	if (del_nr == 0) {
   1275		fi = btrfs_item_ptr(leaf, path->slots[0],
   1276			   struct btrfs_file_extent_item);
   1277		btrfs_set_file_extent_type(leaf, fi,
   1278					   BTRFS_FILE_EXTENT_REG);
   1279		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
   1280		btrfs_mark_buffer_dirty(leaf);
   1281	} else {
   1282		fi = btrfs_item_ptr(leaf, del_slot - 1,
   1283			   struct btrfs_file_extent_item);
   1284		btrfs_set_file_extent_type(leaf, fi,
   1285					   BTRFS_FILE_EXTENT_REG);
   1286		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
   1287		btrfs_set_file_extent_num_bytes(leaf, fi,
   1288						extent_end - key.offset);
   1289		btrfs_mark_buffer_dirty(leaf);
   1290
   1291		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
   1292		if (ret < 0) {
   1293			btrfs_abort_transaction(trans, ret);
   1294			goto out;
   1295		}
   1296	}
   1297out:
   1298	btrfs_free_path(path);
   1299	return ret;
   1300}
   1301
   1302/*
   1303 * on error we return an unlocked page and the error value
   1304 * on success we return a locked page and 0
   1305 */
   1306static int prepare_uptodate_page(struct inode *inode,
   1307				 struct page *page, u64 pos,
   1308				 bool force_uptodate)
   1309{
   1310	struct folio *folio = page_folio(page);
   1311	int ret = 0;
   1312
   1313	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
   1314	    !PageUptodate(page)) {
   1315		ret = btrfs_read_folio(NULL, folio);
   1316		if (ret)
   1317			return ret;
   1318		lock_page(page);
   1319		if (!PageUptodate(page)) {
   1320			unlock_page(page);
   1321			return -EIO;
   1322		}
   1323
   1324		/*
   1325		 * Since btrfs_read_folio() will unlock the folio before it
   1326		 * returns, there is a window where btrfs_release_folio() can be
   1327		 * called to release the page.  Here we check both inode
   1328		 * mapping and PagePrivate() to make sure the page was not
   1329		 * released.
   1330		 *
   1331		 * The private flag check is essential for subpage as we need
   1332		 * to store extra bitmap using page->private.
   1333		 */
   1334		if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
   1335			unlock_page(page);
   1336			return -EAGAIN;
   1337		}
   1338	}
   1339	return 0;
   1340}
   1341
   1342/*
   1343 * this just gets pages into the page cache and locks them down.
   1344 */
   1345static noinline int prepare_pages(struct inode *inode, struct page **pages,
   1346				  size_t num_pages, loff_t pos,
   1347				  size_t write_bytes, bool force_uptodate)
   1348{
   1349	int i;
   1350	unsigned long index = pos >> PAGE_SHIFT;
   1351	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
   1352	int err = 0;
   1353	int faili;
   1354
   1355	for (i = 0; i < num_pages; i++) {
   1356again:
   1357		pages[i] = find_or_create_page(inode->i_mapping, index + i,
   1358					       mask | __GFP_WRITE);
   1359		if (!pages[i]) {
   1360			faili = i - 1;
   1361			err = -ENOMEM;
   1362			goto fail;
   1363		}
   1364
   1365		err = set_page_extent_mapped(pages[i]);
   1366		if (err < 0) {
   1367			faili = i;
   1368			goto fail;
   1369		}
   1370
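       		/*
       		 * Only the first and the last page of the write can be
       		 * partially covered, so only those may need to be read in
       		 * and brought uptodate before the copy.
       		 */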
   1371		if (i == 0)
   1372			err = prepare_uptodate_page(inode, pages[i], pos,
   1373						    force_uptodate);
   1374		if (!err && i == num_pages - 1)
   1375			err = prepare_uptodate_page(inode, pages[i],
   1376						    pos + write_bytes, false);
   1377		if (err) {
   1378			put_page(pages[i]);
   1379			if (err == -EAGAIN) {
   1380				err = 0;
   1381				goto again;
   1382			}
   1383			faili = i - 1;
   1384			goto fail;
   1385		}
   1386		wait_on_page_writeback(pages[i]);
   1387	}
   1388
   1389	return 0;
   1390fail:
   1391	while (faili >= 0) {
   1392		unlock_page(pages[faili]);
   1393		put_page(pages[faili]);
   1394		faili--;
   1395	}
   1396	return err;
   1397
   1398}
   1399
   1400/*
   1401 * This function locks the extent and properly waits for data=ordered extents
   1402 * to finish before allowing the pages to be modified if needed.
   1403 *
   1404 * The return value:
   1405 * 1 - the extent is locked
   1406 * 0 - the extent is not locked, and everything is OK
   1407 * -EAGAIN - the pages need to be re-prepared
   1408 * any other negative value - something went wrong
   1409 */
   1410static noinline int
   1411lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
   1412				size_t num_pages, loff_t pos,
   1413				size_t write_bytes,
   1414				u64 *lockstart, u64 *lockend,
   1415				struct extent_state **cached_state)
   1416{
   1417	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   1418	u64 start_pos;
   1419	u64 last_pos;
   1420	int i;
   1421	int ret = 0;
   1422
   1423	start_pos = round_down(pos, fs_info->sectorsize);
   1424	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
   1425
   1426	if (start_pos < inode->vfs_inode.i_size) {
   1427		struct btrfs_ordered_extent *ordered;
   1428
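       		/*
       		 * Lock the range and look for an overlapping ordered extent.
       		 * If one is found, drop the extent lock and the pages, wait
       		 * for the ordered extent to complete and ask the caller to
       		 * re-prepare the pages by returning -EAGAIN.
       		 */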
   1429		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
   1430				cached_state);
   1431		ordered = btrfs_lookup_ordered_range(inode, start_pos,
   1432						     last_pos - start_pos + 1);
   1433		if (ordered &&
   1434		    ordered->file_offset + ordered->num_bytes > start_pos &&
   1435		    ordered->file_offset <= last_pos) {
   1436			unlock_extent_cached(&inode->io_tree, start_pos,
   1437					last_pos, cached_state);
   1438			for (i = 0; i < num_pages; i++) {
   1439				unlock_page(pages[i]);
   1440				put_page(pages[i]);
   1441			}
   1442			btrfs_start_ordered_extent(ordered, 1);
   1443			btrfs_put_ordered_extent(ordered);
   1444			return -EAGAIN;
   1445		}
   1446		if (ordered)
   1447			btrfs_put_ordered_extent(ordered);
   1448
   1449		*lockstart = start_pos;
   1450		*lockend = last_pos;
   1451		ret = 1;
   1452	}
   1453
   1454	/*
   1455	 * We should be called after prepare_pages() which should have locked
   1456	 * all pages in the range.
   1457	 */
   1458	for (i = 0; i < num_pages; i++)
   1459		WARN_ON(!PageLocked(pages[i]));
   1460
   1461	return ret;
   1462}
   1463
   1464/*
   1465 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
   1466 *
   1467 * @pos:         File offset.
   1468 * @write_bytes: The length to write, will be updated to the nocow writeable
   1469 *               range.
   1470 *
   1471 * This function will flush ordered extents in the range to ensure proper
   1472 * nocow checks.
   1473 *
   1474 * Return:
   1475 * > 0          If we can nocow, and updates @write_bytes.
   1476 *  0           If we can't do a nocow write.
   1477 * -EAGAIN      If we can't do a nocow write because snapshotting of the inode's
   1478 *              root is in progress.
   1479 * < 0          If an error happened.
   1480 *
   1481 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
   1482 */
   1483int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
   1484			   size_t *write_bytes)
   1485{
   1486	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   1487	struct btrfs_root *root = inode->root;
   1488	u64 lockstart, lockend;
   1489	u64 num_bytes;
   1490	int ret;
   1491
   1492	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
   1493		return 0;
   1494
   1495	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
   1496		return -EAGAIN;
   1497
   1498	lockstart = round_down(pos, fs_info->sectorsize);
   1499	lockend = round_up(pos + *write_bytes,
   1500			   fs_info->sectorsize) - 1;
   1501	num_bytes = lockend - lockstart + 1;
   1502
   1503	btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
   1504	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
   1505			NULL, NULL, NULL, false);
   1506	if (ret <= 0) {
   1507		ret = 0;
   1508		btrfs_drew_write_unlock(&root->snapshot_lock);
   1509	} else {
   1510		*write_bytes = min_t(size_t, *write_bytes,
   1511				     num_bytes - pos + lockstart);
   1512	}
   1513	unlock_extent(&inode->io_tree, lockstart, lockend);
   1514
   1515	return ret;
   1516}
   1517
   1518void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
   1519{
   1520	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
   1521}
   1522
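       /*
        * Bump mtime/ctime (and i_version) for a write, unless the inode is
        * flagged to skip c/mtime updates.
        */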
   1523static void update_time_for_write(struct inode *inode)
   1524{
   1525	struct timespec64 now;
   1526
   1527	if (IS_NOCMTIME(inode))
   1528		return;
   1529
   1530	now = current_time(inode);
   1531	if (!timespec64_equal(&inode->i_mtime, &now))
   1532		inode->i_mtime = now;
   1533
   1534	if (!timespec64_equal(&inode->i_ctime, &now))
   1535		inode->i_ctime = now;
   1536
   1537	if (IS_I_VERSION(inode))
   1538		inode_inc_iversion(inode);
   1539}
   1540
   1541static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
   1542			     size_t count)
   1543{
   1544	struct file *file = iocb->ki_filp;
   1545	struct inode *inode = file_inode(file);
   1546	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   1547	loff_t pos = iocb->ki_pos;
   1548	int ret;
   1549	loff_t oldsize;
   1550	loff_t start_pos;
   1551
   1552	/*
   1553	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
   1554	 * prealloc flags, as without those flags we always have to COW. We will
   1555	 * later check if we can really do a NOCOW write into the target range (using
   1556	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
   1557	 */
   1558	if ((iocb->ki_flags & IOCB_NOWAIT) &&
   1559	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
   1560		return -EAGAIN;
   1561
   1562	current->backing_dev_info = inode_to_bdi(inode);
   1563	ret = file_remove_privs(file);
   1564	if (ret)
   1565		return ret;
   1566
   1567	/*
   1568	 * We reserve space for updating the inode when we reserve space for the
   1569	 * extent we are going to write, so we will enospc out there.  We don't
   1570	 * need to start yet another transaction to update the inode as we will
   1571	 * update the inode when we finish writing whatever data we write.
   1572	 */
   1573	update_time_for_write(inode);
   1574
   1575	start_pos = round_down(pos, fs_info->sectorsize);
   1576	oldsize = i_size_read(inode);
   1577	if (start_pos > oldsize) {
   1578		/* Expand hole size to cover write data, preventing empty gap */
   1579		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
   1580
   1581		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
   1582		if (ret) {
   1583			current->backing_dev_info = NULL;
   1584			return ret;
   1585		}
   1586	}
   1587
   1588	return 0;
   1589}
   1590
   1591static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
   1592					       struct iov_iter *i)
   1593{
   1594	struct file *file = iocb->ki_filp;
   1595	loff_t pos;
   1596	struct inode *inode = file_inode(file);
   1597	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   1598	struct page **pages = NULL;
   1599	struct extent_changeset *data_reserved = NULL;
   1600	u64 release_bytes = 0;
   1601	u64 lockstart;
   1602	u64 lockend;
   1603	size_t num_written = 0;
   1604	int nrptrs;
   1605	ssize_t ret;
   1606	bool only_release_metadata = false;
   1607	bool force_page_uptodate = false;
   1608	loff_t old_isize = i_size_read(inode);
   1609	unsigned int ilock_flags = 0;
   1610
   1611	if (iocb->ki_flags & IOCB_NOWAIT)
   1612		ilock_flags |= BTRFS_ILOCK_TRY;
   1613
   1614	ret = btrfs_inode_lock(inode, ilock_flags);
   1615	if (ret < 0)
   1616		return ret;
   1617
   1618	ret = generic_write_checks(iocb, i);
   1619	if (ret <= 0)
   1620		goto out;
   1621
   1622	ret = btrfs_write_check(iocb, i, ret);
   1623	if (ret < 0)
   1624		goto out;
   1625
   1626	pos = iocb->ki_pos;
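       	/*
       	 * Size the page pointer array: enough pages to cover the whole iov
       	 * (capped at one page worth of pointers), bounded by the task's
       	 * dirty throttling headroom, but always at least 8.
       	 */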
   1627	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
   1628			PAGE_SIZE / (sizeof(struct page *)));
   1629	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
   1630	nrptrs = max(nrptrs, 8);
   1631	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
   1632	if (!pages) {
   1633		ret = -ENOMEM;
   1634		goto out;
   1635	}
   1636
   1637	while (iov_iter_count(i) > 0) {
   1638		struct extent_state *cached_state = NULL;
   1639		size_t offset = offset_in_page(pos);
   1640		size_t sector_offset;
   1641		size_t write_bytes = min(iov_iter_count(i),
   1642					 nrptrs * (size_t)PAGE_SIZE -
   1643					 offset);
   1644		size_t num_pages;
   1645		size_t reserve_bytes;
   1646		size_t dirty_pages;
   1647		size_t copied;
   1648		size_t dirty_sectors;
   1649		size_t num_sectors;
   1650		int extents_locked;
   1651
   1652		/*
   1653		 * Fault pages before locking them in prepare_pages
   1654		 * to avoid recursive lock
   1655		 */
   1656		if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
   1657			ret = -EFAULT;
   1658			break;
   1659		}
   1660
   1661		only_release_metadata = false;
   1662		sector_offset = pos & (fs_info->sectorsize - 1);
   1663
   1664		extent_changeset_release(data_reserved);
   1665		ret = btrfs_check_data_free_space(BTRFS_I(inode),
   1666						  &data_reserved, pos,
   1667						  write_bytes);
   1668		if (ret < 0) {
   1669			/*
   1670			 * If we don't have to COW at the offset, reserve
   1671			 * metadata only. write_bytes may get smaller than
   1672			 * requested here.
   1673			 */
   1674			if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
   1675						   &write_bytes) > 0)
   1676				only_release_metadata = true;
   1677			else
   1678				break;
   1679		}
   1680
   1681		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
   1682		WARN_ON(num_pages > nrptrs);
   1683		reserve_bytes = round_up(write_bytes + sector_offset,
   1684					 fs_info->sectorsize);
   1685		WARN_ON(reserve_bytes == 0);
   1686		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
   1687						      reserve_bytes,
   1688						      reserve_bytes, false);
   1689		if (ret) {
   1690			if (!only_release_metadata)
   1691				btrfs_free_reserved_data_space(BTRFS_I(inode),
   1692						data_reserved, pos,
   1693						write_bytes);
   1694			else
   1695				btrfs_check_nocow_unlock(BTRFS_I(inode));
   1696			break;
   1697		}
   1698
   1699		release_bytes = reserve_bytes;
   1700again:
   1701		/*
   1702		 * This is going to set up the pages array with the number of
   1703		 * pages we want, so we don't really need to worry about the
   1704		 * contents of pages from loop to loop
   1705		 */
   1706		ret = prepare_pages(inode, pages, num_pages,
   1707				    pos, write_bytes,
   1708				    force_page_uptodate);
   1709		if (ret) {
   1710			btrfs_delalloc_release_extents(BTRFS_I(inode),
   1711						       reserve_bytes);
   1712			break;
   1713		}
   1714
   1715		extents_locked = lock_and_cleanup_extent_if_need(
   1716				BTRFS_I(inode), pages,
   1717				num_pages, pos, write_bytes, &lockstart,
   1718				&lockend, &cached_state);
   1719		if (extents_locked < 0) {
   1720			if (extents_locked == -EAGAIN)
   1721				goto again;
   1722			btrfs_delalloc_release_extents(BTRFS_I(inode),
   1723						       reserve_bytes);
   1724			ret = extents_locked;
   1725			break;
   1726		}
   1727
   1728		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
   1729
   1730		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
   1731		dirty_sectors = round_up(copied + sector_offset,
   1732					fs_info->sectorsize);
   1733		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
   1734
   1735		/*
   1736		 * if we have trouble faulting in the pages, fall
   1737		 * back to one page at a time
   1738		 */
   1739		if (copied < write_bytes)
   1740			nrptrs = 1;
   1741
   1742		if (copied == 0) {
   1743			force_page_uptodate = true;
   1744			dirty_sectors = 0;
   1745			dirty_pages = 0;
   1746		} else {
   1747			force_page_uptodate = false;
   1748			dirty_pages = DIV_ROUND_UP(copied + offset,
   1749						   PAGE_SIZE);
   1750		}
   1751
   1752		if (num_sectors > dirty_sectors) {
   1753			/* release everything except the sectors we dirtied */
   1754			release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
   1755			if (only_release_metadata) {
   1756				btrfs_delalloc_release_metadata(BTRFS_I(inode),
   1757							release_bytes, true);
   1758			} else {
   1759				u64 __pos;
   1760
   1761				__pos = round_down(pos,
   1762						   fs_info->sectorsize) +
   1763					(dirty_pages << PAGE_SHIFT);
   1764				btrfs_delalloc_release_space(BTRFS_I(inode),
   1765						data_reserved, __pos,
   1766						release_bytes, true);
   1767			}
   1768		}
   1769
   1770		release_bytes = round_up(copied + sector_offset,
   1771					fs_info->sectorsize);
   1772
   1773		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
   1774					dirty_pages, pos, copied,
   1775					&cached_state, only_release_metadata);
   1776
   1777		/*
   1778		 * If we have not locked the extent range, because the range's
   1779		 * start offset is >= i_size, we might still have a non-NULL
   1780		 * cached extent state, acquired while marking the extent range
   1781		 * as delalloc through btrfs_dirty_pages(). Therefore free any
   1782		 * possible cached extent state to avoid a memory leak.
   1783		 */
   1784		if (extents_locked)
   1785			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
   1786					     lockstart, lockend, &cached_state);
   1787		else
   1788			free_extent_state(cached_state);
   1789
   1790		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
   1791		if (ret) {
   1792			btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
   1793			break;
   1794		}
   1795
   1796		release_bytes = 0;
   1797		if (only_release_metadata)
   1798			btrfs_check_nocow_unlock(BTRFS_I(inode));
   1799
   1800		btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
   1801
   1802		cond_resched();
   1803
   1804		balance_dirty_pages_ratelimited(inode->i_mapping);
   1805
   1806		pos += copied;
   1807		num_written += copied;
   1808	}
   1809
   1810	kfree(pages);
   1811
   1812	if (release_bytes) {
   1813		if (only_release_metadata) {
   1814			btrfs_check_nocow_unlock(BTRFS_I(inode));
   1815			btrfs_delalloc_release_metadata(BTRFS_I(inode),
   1816					release_bytes, true);
   1817		} else {
   1818			btrfs_delalloc_release_space(BTRFS_I(inode),
   1819					data_reserved,
   1820					round_down(pos, fs_info->sectorsize),
   1821					release_bytes, true);
   1822		}
   1823	}
   1824
   1825	extent_changeset_free(data_reserved);
   1826	if (num_written > 0) {
   1827		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
   1828		iocb->ki_pos += num_written;
   1829	}
   1830out:
   1831	btrfs_inode_unlock(inode, ilock_flags);
   1832	return num_written ? num_written : ret;
   1833}
   1834
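       /*
        * Check that a direct IO write is properly aligned: both the file
        * offset and the memory addresses/lengths in the iov_iter must be
        * multiples of the sector size, otherwise the caller falls back to
        * buffered IO.
        */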
   1835static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
   1836			       const struct iov_iter *iter, loff_t offset)
   1837{
   1838	const u32 blocksize_mask = fs_info->sectorsize - 1;
   1839
   1840	if (offset & blocksize_mask)
   1841		return -EINVAL;
   1842
   1843	if (iov_iter_alignment(iter) & blocksize_mask)
   1844		return -EINVAL;
   1845
   1846	return 0;
   1847}
   1848
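       /*
        * Direct IO write path. It goes through iomap (btrfs_dio_rw()) with
        * page faults disabled on the iov_iter, retrying after faulting pages
        * in, and falls back to buffered IO (the "buffered:" label) when the
        * IO is not sector aligned or no forward progress can be made.
        */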
   1849static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
   1850{
   1851	const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
   1852	struct file *file = iocb->ki_filp;
   1853	struct inode *inode = file_inode(file);
   1854	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   1855	loff_t pos;
   1856	ssize_t written = 0;
   1857	ssize_t written_buffered;
   1858	size_t prev_left = 0;
   1859	loff_t endbyte;
   1860	ssize_t err;
   1861	unsigned int ilock_flags = 0;
   1862
   1863	if (iocb->ki_flags & IOCB_NOWAIT)
   1864		ilock_flags |= BTRFS_ILOCK_TRY;
   1865
   1866	/* If the DIO write is entirely within EOF, use a shared lock */
   1867	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
   1868		ilock_flags |= BTRFS_ILOCK_SHARED;
   1869
   1870relock:
   1871	err = btrfs_inode_lock(inode, ilock_flags);
   1872	if (err < 0)
   1873		return err;
   1874
   1875	err = generic_write_checks(iocb, from);
   1876	if (err <= 0) {
   1877		btrfs_inode_unlock(inode, ilock_flags);
   1878		return err;
   1879	}
   1880
   1881	err = btrfs_write_check(iocb, from, err);
   1882	if (err < 0) {
   1883		btrfs_inode_unlock(inode, ilock_flags);
   1884		goto out;
   1885	}
   1886
   1887	pos = iocb->ki_pos;
   1888	/*
   1889	 * Re-check since the file size may have changed just before taking the
   1890	 * lock, or pos may have changed because of O_APPEND in generic_write_checks().
   1891	 */
   1892	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
   1893	    pos + iov_iter_count(from) > i_size_read(inode)) {
   1894		btrfs_inode_unlock(inode, ilock_flags);
   1895		ilock_flags &= ~BTRFS_ILOCK_SHARED;
   1896		goto relock;
   1897	}
   1898
   1899	if (check_direct_IO(fs_info, from, pos)) {
   1900		btrfs_inode_unlock(inode, ilock_flags);
   1901		goto buffered;
   1902	}
   1903
   1904	/*
   1905	 * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
   1906	 * calls generic_write_sync() (through iomap_dio_complete()), because
   1907	 * that results in calling fsync (btrfs_sync_file()) which will try to
   1908	 * lock the inode in exclusive/write mode.
   1909	 */
   1910	if (is_sync_write)
   1911		iocb->ki_flags &= ~IOCB_DSYNC;
   1912
   1913	/*
   1914	 * The iov_iter can be mapped to the same file range we are writing to.
   1915	 * If that's the case, then we will deadlock in the iomap code, because
   1916	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
   1917	 * an ordered extent, and after that it will fault in the pages that the
   1918	 * iov_iter refers to. During the fault in we end up in the readahead
   1919	 * pages code (starting at btrfs_readahead()), which will lock the range,
   1920	 * find that ordered extent and then wait for it to complete (at
   1921	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
   1922	 * obviously the ordered extent can never complete, as we haven't yet
   1923	 * submitted the respective bio(s). This always happens when the buffer is
   1924	 * memory mapped to the same file range, since the iomap DIO code always
   1925	 * invalidates pages in the target file range (after starting and waiting
   1926	 * for any writeback).
   1927	 *
   1928	 * So here we disable page faults in the iov_iter and then retry if we
   1929	 * got -EFAULT, faulting in the pages before the retry.
   1930	 */
   1931again:
   1932	from->nofault = true;
   1933	err = btrfs_dio_rw(iocb, from, written);
   1934	from->nofault = false;
   1935
   1936	/* No increment (+=) because iomap returns a cumulative value. */
   1937	if (err > 0)
   1938		written = err;
   1939
   1940	if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
   1941		const size_t left = iov_iter_count(from);
   1942		/*
   1943		 * We have more data left to write. Try to fault in as many of the
   1944		 * remaining pages as possible and retry. We do this without
   1945		 * releasing and re-locking the inode, to prevent races with
   1946		 * truncate.
   1947		 *
   1948		 * Also, in case the iov refers to pages in the file range of the
   1949		 * file we want to write to (due to a mmap), we could enter an
   1950		 * infinite loop if we retry after faulting the pages in, since
   1951		 * iomap will invalidate any pages in the range early on, before
   1952		 * it tries to fault in the pages of the iov. So we keep track of
   1953		 * how much was left of the iov in the previous EFAULT and fall back
   1954		 * to buffered IO in case we haven't made any progress.
   1955		 */
   1956		if (left == prev_left) {
   1957			err = -ENOTBLK;
   1958		} else {
   1959			fault_in_iov_iter_readable(from, left);
   1960			prev_left = left;
   1961			goto again;
   1962		}
   1963	}
   1964
   1965	btrfs_inode_unlock(inode, ilock_flags);
   1966
   1967	/*
   1968	 * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
   1969	 * the fsync (call generic_write_sync()).
   1970	 */
   1971	if (is_sync_write)
   1972		iocb->ki_flags |= IOCB_DSYNC;
   1973
   1974	/* If 'err' is -ENOTBLK then it means we must fall back to buffered IO. */
   1975	if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
   1976		goto out;
   1977
   1978buffered:
   1979	pos = iocb->ki_pos;
   1980	written_buffered = btrfs_buffered_write(iocb, from);
   1981	if (written_buffered < 0) {
   1982		err = written_buffered;
   1983		goto out;
   1984	}
   1985	/*
   1986	 * Ensure all data is persisted. We want the next direct IO read to be
   1987	 * able to read what was just written.
   1988	 */
   1989	endbyte = pos + written_buffered - 1;
   1990	err = btrfs_fdatawrite_range(inode, pos, endbyte);
   1991	if (err)
   1992		goto out;
   1993	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
   1994	if (err)
   1995		goto out;
   1996	written += written_buffered;
   1997	iocb->ki_pos = pos + written_buffered;
   1998	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
   1999				 endbyte >> PAGE_SHIFT);
   2000out:
   2001	return err < 0 ? err : written;
   2002}
   2003
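       /*
        * Write pre-compressed ("encoded") data as passed in via the encoded
        * write ioctl. The write must cover the whole encoded extent: if
        * generic_write_checks_count() would truncate it, fail with -EFBIG.
        */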
   2004static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
   2005			const struct btrfs_ioctl_encoded_io_args *encoded)
   2006{
   2007	struct file *file = iocb->ki_filp;
   2008	struct inode *inode = file_inode(file);
   2009	loff_t count;
   2010	ssize_t ret;
   2011
   2012	btrfs_inode_lock(inode, 0);
   2013	count = encoded->len;
   2014	ret = generic_write_checks_count(iocb, &count);
   2015	if (ret == 0 && count != encoded->len) {
   2016		/*
   2017		 * The write got truncated by generic_write_checks_count(). We
   2018		 * can't do a partial encoded write.
   2019		 */
   2020		ret = -EFBIG;
   2021	}
   2022	if (ret || encoded->len == 0)
   2023		goto out;
   2024
   2025	ret = btrfs_write_check(iocb, from, encoded->len);
   2026	if (ret < 0)
   2027		goto out;
   2028
   2029	ret = btrfs_do_encoded_write(iocb, from, encoded);
   2030out:
   2031	btrfs_inode_unlock(inode, 0);
   2032	return ret;
   2033}
   2034
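       /*
        * Common entry point for buffered, direct and encoded writes. Rejects
        * writes if the fs has turned read-only due to errors, and handles
        * O_DSYNC by calling generic_write_sync() once the write returns.
        */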
   2035ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
   2036			    const struct btrfs_ioctl_encoded_io_args *encoded)
   2037{
   2038	struct file *file = iocb->ki_filp;
   2039	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
   2040	ssize_t num_written, num_sync;
   2041	const bool sync = iocb->ki_flags & IOCB_DSYNC;
   2042
   2043	/*
   2044	 * If the fs flips readonly due to some impossible error, although we
   2045	 * have opened a file as writable, we have to stop this write operation
   2046	 * to ensure consistency.
   2047	 */
   2048	if (BTRFS_FS_ERROR(inode->root->fs_info))
   2049		return -EROFS;
   2050
   2051	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
   2052		return -EOPNOTSUPP;
   2053
   2054	if (sync)
   2055		atomic_inc(&inode->sync_writers);
   2056
   2057	if (encoded) {
   2058		num_written = btrfs_encoded_write(iocb, from, encoded);
   2059		num_sync = encoded->len;
   2060	} else if (iocb->ki_flags & IOCB_DIRECT) {
   2061		num_written = num_sync = btrfs_direct_write(iocb, from);
   2062	} else {
   2063		num_written = num_sync = btrfs_buffered_write(iocb, from);
   2064	}
   2065
   2066	btrfs_set_inode_last_sub_trans(inode);
   2067
   2068	if (num_sync > 0) {
   2069		num_sync = generic_write_sync(iocb, num_sync);
   2070		if (num_sync < 0)
   2071			num_written = num_sync;
   2072	}
   2073
   2074	if (sync)
   2075		atomic_dec(&inode->sync_writers);
   2076
   2077	current->backing_dev_info = NULL;
   2078	return num_written;
   2079}
   2080
   2081static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
   2082{
   2083	return btrfs_do_write_iter(iocb, from, NULL);
   2084}
   2085
   2086int btrfs_release_file(struct inode *inode, struct file *filp)
   2087{
   2088	struct btrfs_file_private *private = filp->private_data;
   2089
   2090	if (private && private->filldir_buf)
   2091		kfree(private->filldir_buf);
   2092	kfree(private);
   2093	filp->private_data = NULL;
   2094
   2095	/*
   2096	 * Set by setattr when we are about to truncate a file from a non-zero
   2097	 * size to a zero size.  This tries to flush down new bytes that may
   2098	 * have been written if the application were using truncate to replace
   2099	 * a file in place.
   2100	 */
   2101	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
   2102			       &BTRFS_I(inode)->runtime_flags))
   2103		filemap_flush(inode->i_mapping);
   2104	return 0;
   2105}
   2106
   2107static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
   2108{
   2109	int ret;
   2110	struct blk_plug plug;
   2111
   2112	/*
   2113	 * This is only called in fsync, which does synchronous writes, so a
   2114	 * plug can merge adjacent IOs as much as possible.  Especially with
   2115	 * multiple disks using a RAID profile, a large IO can be split into
   2116	 * several segments of stripe length (currently 64K).
   2117	 */
   2118	blk_start_plug(&plug);
   2119	atomic_inc(&BTRFS_I(inode)->sync_writers);
   2120	ret = btrfs_fdatawrite_range(inode, start, end);
   2121	atomic_dec(&BTRFS_I(inode)->sync_writers);
   2122	blk_finish_plug(&plug);
   2123
   2124	return ret;
   2125}
   2126
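       /*
        * Return true if fsync can skip logging the inode: either the inode
        * is already fully in the log for the current transaction, or it was
        * not modified since the last transaction commit (with the fast fsync
        * caveat explained in the comment below).
        */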
   2127static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
   2128{
   2129	struct btrfs_inode *inode = BTRFS_I(ctx->inode);
   2130	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   2131
   2132	if (btrfs_inode_in_log(inode, fs_info->generation) &&
   2133	    list_empty(&ctx->ordered_extents))
   2134		return true;
   2135
   2136	/*
   2137	 * If we are doing a fast fsync we cannot bail out if the inode's
   2138	 * last_trans is <= the last committed transaction, because we only
   2139	 * update the last_trans of the inode during ordered extent completion,
   2140	 * and for a fast fsync we don't wait for that, we only wait for the
   2141	 * writeback to complete.
   2142	 */
   2143	if (inode->last_trans <= fs_info->last_trans_committed &&
   2144	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
   2145	     list_empty(&ctx->ordered_extents)))
   2146		return true;
   2147
   2148	return false;
   2149}
   2150
   2151/*
   2152 * fsync call for both files and directories.  This logs the inode into
   2153 * the tree log instead of forcing full commits whenever possible.
   2154 *
   2155 * It needs to call filemap_fdatawait so that all ordered extent updates in
   2156 * the metadata btree are up to date for copying to the log.
   2157 *
   2158 * It drops the inode mutex before doing the tree log commit.  This is an
   2159 * important optimization for directories because holding the mutex prevents
   2160 * new operations on the dir while we write to disk.
   2161 */
   2162int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
   2163{
   2164	struct dentry *dentry = file_dentry(file);
   2165	struct inode *inode = d_inode(dentry);
   2166	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   2167	struct btrfs_root *root = BTRFS_I(inode)->root;
   2168	struct btrfs_trans_handle *trans;
   2169	struct btrfs_log_ctx ctx;
   2170	int ret = 0, err;
   2171	u64 len;
   2172	bool full_sync;
   2173
   2174	trace_btrfs_sync_file(file, datasync);
   2175
   2176	btrfs_init_log_ctx(&ctx, inode);
   2177
   2178	/*
   2179	 * Always set the range to a full range, otherwise we can get into
   2180	 * several problems, from missing file extent items to represent holes
   2181	 * when not using the NO_HOLES feature, to log tree corruption due to
   2182	 * races between hole detection during logging and completion of ordered
   2183	 * extents outside the range, to missing checksums due to ordered extents
   2184	 * for which we flushed only a subset of their pages.
   2185	 */
   2186	start = 0;
   2187	end = LLONG_MAX;
   2188	len = (u64)LLONG_MAX + 1;
   2189
   2190	/*
   2191	 * We write the dirty pages in the range and wait until they complete
   2192	 * outside of the ->i_mutex, so that multiple tasks can flush dirty
   2193	 * pages concurrently and improve performance.  See
   2194	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
   2195	 */
   2196	ret = start_ordered_ops(inode, start, end);
   2197	if (ret)
   2198		goto out;
   2199
   2200	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
   2201
   2202	atomic_inc(&root->log_batch);
   2203
   2204	/*
   2205	 * Always check for the full sync flag while holding the inode's lock,
   2206	 * to avoid races with other tasks. The flag must be either set all the
   2207	 * time during logging or off all the time while logging.
   2208	 */
   2209	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
   2210			     &BTRFS_I(inode)->runtime_flags);
   2211
   2212	/*
   2213	 * Before we acquired the inode's lock and the mmap lock, someone may
   2214	 * have dirtied more pages in the target range. We need to make sure
   2215	 * that writeback for any such pages does not start while we are logging
   2216	 * the inode, because if it does, any of the following might happen when
   2217	 * we are not doing a full inode sync:
   2218	 *
   2219	 * 1) We log an extent after its writeback finishes but before its
   2220	 *    checksums are added to the csum tree, leading to -EIO errors
   2221	 *    when attempting to read the extent after a log replay.
   2222	 *
   2223	 * 2) We can end up logging an extent before its writeback finishes.
   2224	 *    Therefore after the log replay we will have a file extent item
   2225	 *    pointing to an unwritten extent (and no data checksums as well).
   2226	 *
   2227	 * So trigger writeback for any eventual new dirty pages and then we
   2228	 * wait for all ordered extents to complete below.
   2229	 */
   2230	ret = start_ordered_ops(inode, start, end);
   2231	if (ret) {
   2232		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
   2233		goto out;
   2234	}
   2235
   2236	/*
   2237	 * We have to do this here to avoid the priority inversion of waiting on
   2238	 * IO of a lower priority task while holding a transaction open.
   2239	 *
   2240	 * For a full fsync we wait for the ordered extents to complete while
   2241	 * for a fast fsync we wait just for writeback to complete, and then
   2242	 * attach the ordered extents to the transaction so that a transaction
   2243	 * commit waits for their completion, to avoid data loss if we fsync,
   2244	 * the current transaction commits before the ordered extents complete
   2245	 * and a power failure happens right after that.
   2246	 *
   2247	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
   2248	 * logical address recorded in the ordered extent may change. We need
   2249	 * to wait for the IO to stabilize the logical address.
   2250	 */
   2251	if (full_sync || btrfs_is_zoned(fs_info)) {
   2252		ret = btrfs_wait_ordered_range(inode, start, len);
   2253	} else {
   2254		/*
   2255		 * Get our ordered extents as soon as possible to avoid doing
   2256		 * checksum lookups in the csum tree, and use instead the
   2257		 * checksums attached to the ordered extents.
   2258		 */
   2259		btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
   2260						      &ctx.ordered_extents);
   2261		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
   2262	}
   2263
   2264	if (ret)
   2265		goto out_release_extents;
   2266
   2267	atomic_inc(&root->log_batch);
   2268
   2269	smp_mb();
   2270	if (skip_inode_logging(&ctx)) {
   2271		/*
   2272		 * We've had everything committed since the last time we were
   2273		 * modified so clear this flag in case it was set for whatever
   2274		 * reason, it's no longer relevant.
   2275		 */
   2276		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
   2277			  &BTRFS_I(inode)->runtime_flags);
   2278		/*
   2279		 * An ordered extent might have started before and completed
   2280		 * already with io errors, in which case the inode was not
   2281		 * updated and we end up here. So check the inode's mapping
   2282		 * for any errors that might have happened since we last
   2283		 * called fsync.
   2284		 */
   2285		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
   2286		goto out_release_extents;
   2287	}
   2288
   2289	/*
   2290	 * We use start here because we will need to wait on the IO to complete
   2291	 * in btrfs_sync_log, which could require joining a transaction (for
   2292	 * example checking cross references in the nocow path).  If we use join
   2293	 * here we could get into a situation where we're waiting on IO to
   2294	 * happen that is blocked on a transaction trying to commit.  With start
   2295	 * we inc the extwriter counter, so we wait for all extwriters to exit
   2296	 * before we start blocking joiners.  This comment is to keep somebody
   2297	 * from thinking they are super smart and changing this to
   2298	 * btrfs_join_transaction *cough*Josef*cough*.
   2299	 */
   2300	trans = btrfs_start_transaction(root, 0);
   2301	if (IS_ERR(trans)) {
   2302		ret = PTR_ERR(trans);
   2303		goto out_release_extents;
   2304	}
   2305	trans->in_fsync = true;
   2306
   2307	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
   2308	btrfs_release_log_ctx_extents(&ctx);
   2309	if (ret < 0) {
   2310		/* Fallthrough and commit/free transaction. */
   2311		ret = 1;
   2312	}
   2313
   2314	/* we've logged all the items and now have a consistent
   2315	 * version of the file in the log.  It is possible that
   2316	 * someone will come in and modify the file, but that's
   2317	 * fine because the log is consistent on disk, and we
   2318	 * have references to all of the file's extents
   2319	 *
   2320	 * It is possible that someone will come in and log the
   2321	 * file again, but that will end up using the synchronization
   2322	 * inside btrfs_sync_log to keep things safe.
   2323	 */
   2324	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
   2325
   2326	if (ret == BTRFS_NO_LOG_SYNC) {
   2327		ret = btrfs_end_transaction(trans);
   2328		goto out;
   2329	}
   2330
   2331	/* We successfully logged the inode, attempt to sync the log. */
   2332	if (!ret) {
   2333		ret = btrfs_sync_log(trans, root, &ctx);
   2334		if (!ret) {
   2335			ret = btrfs_end_transaction(trans);
   2336			goto out;
   2337		}
   2338	}
   2339
   2340	/*
   2341	 * At this point we need to commit the transaction because we had
   2342	 * btrfs_need_log_full_commit() or some other error.
   2343	 *
   2344	 * If we didn't do a full sync we have to stop the trans handle, wait on
   2345	 * the ordered extents, start it again and commit the transaction.  If
   2346	 * we attempt to wait on the ordered extents here we could deadlock with
   2347	 * something like fallocate() that is holding the extent lock trying to
   2348	 * start a transaction while some other thread is trying to commit the
   2349	 * transaction while we (fsync) are currently holding the transaction
   2350	 * open.
   2351	 */
   2352	if (!full_sync) {
   2353		ret = btrfs_end_transaction(trans);
   2354		if (ret)
   2355			goto out;
   2356		ret = btrfs_wait_ordered_range(inode, start, len);
   2357		if (ret)
   2358			goto out;
   2359
   2360		/*
   2361		 * This is safe to use here because we're only interested in
   2362		 * making sure the transaction that had the ordered extents is
   2363		 * committed.  We aren't waiting on anything past this point,
   2364		 * we're purely getting the transaction and committing it.
   2365		 */
   2366		trans = btrfs_attach_transaction_barrier(root);
   2367		if (IS_ERR(trans)) {
   2368			ret = PTR_ERR(trans);
   2369
   2370			/*
   2371			 * We committed the transaction and there's no currently
   2372			 * running transaction, this means everything we care
   2373			 * about made it to disk and we are done.
   2374			 */
   2375			if (ret == -ENOENT)
   2376				ret = 0;
   2377			goto out;
   2378		}
   2379	}
   2380
   2381	ret = btrfs_commit_transaction(trans);
   2382out:
   2383	ASSERT(list_empty(&ctx.list));
   2384	err = file_check_and_advance_wb_err(file);
   2385	if (!ret)
   2386		ret = err;
   2387	return ret > 0 ? -EIO : ret;
   2388
   2389out_release_extents:
   2390	btrfs_release_log_ctx_extents(&ctx);
   2391	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
   2392	goto out;
   2393}
   2394
   2395static const struct vm_operations_struct btrfs_file_vm_ops = {
   2396	.fault		= filemap_fault,
   2397	.map_pages	= filemap_map_pages,
   2398	.page_mkwrite	= btrfs_page_mkwrite,
   2399};
   2400
   2401static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
   2402{
   2403	struct address_space *mapping = filp->f_mapping;
   2404
   2405	if (!mapping->a_ops->read_folio)
   2406		return -ENOEXEC;
   2407
   2408	file_accessed(filp);
   2409	vma->vm_ops = &btrfs_file_vm_ops;
   2410
   2411	return 0;
   2412}
   2413
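       /*
        * Check if the file extent item at @slot can be merged with a hole
        * covering [start, end): it must be a regular extent item of this
        * inode with a zero disk bytenr (i.e. a hole) that either starts at
        * @end or ends at @start.
        */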
   2414static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
   2415			  int slot, u64 start, u64 end)
   2416{
   2417	struct btrfs_file_extent_item *fi;
   2418	struct btrfs_key key;
   2419
   2420	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
   2421		return 0;
   2422
   2423	btrfs_item_key_to_cpu(leaf, &key, slot);
   2424	if (key.objectid != btrfs_ino(inode) ||
   2425	    key.type != BTRFS_EXTENT_DATA_KEY)
   2426		return 0;
   2427
   2428	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
   2429
   2430	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
   2431		return 0;
   2432
   2433	if (btrfs_file_extent_disk_bytenr(leaf, fi))
   2434		return 0;
   2435
   2436	if (key.offset == end)
   2437		return 1;
   2438	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
   2439		return 1;
   2440	return 0;
   2441}
   2442
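       /*
        * Insert a file extent item representing a hole for [offset, end),
        * extending an adjacent hole item when possible, and update the
        * inode's extent map cache. With the NO_HOLES feature no item is
        * needed and only the extent maps are updated.
        */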
   2443static int fill_holes(struct btrfs_trans_handle *trans,
   2444		struct btrfs_inode *inode,
   2445		struct btrfs_path *path, u64 offset, u64 end)
   2446{
   2447	struct btrfs_fs_info *fs_info = trans->fs_info;
   2448	struct btrfs_root *root = inode->root;
   2449	struct extent_buffer *leaf;
   2450	struct btrfs_file_extent_item *fi;
   2451	struct extent_map *hole_em;
   2452	struct extent_map_tree *em_tree = &inode->extent_tree;
   2453	struct btrfs_key key;
   2454	int ret;
   2455
   2456	if (btrfs_fs_incompat(fs_info, NO_HOLES))
   2457		goto out;
   2458
   2459	key.objectid = btrfs_ino(inode);
   2460	key.type = BTRFS_EXTENT_DATA_KEY;
   2461	key.offset = offset;
   2462
   2463	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
   2464	if (ret <= 0) {
   2465		/*
   2466		 * We should have dropped this offset, so if we find it then
   2467		 * something has gone horribly wrong.
   2468		 */
   2469		if (ret == 0)
   2470			ret = -EINVAL;
   2471		return ret;
   2472	}
   2473
   2474	leaf = path->nodes[0];
   2475	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
   2476		u64 num_bytes;
   2477
   2478		path->slots[0]--;
   2479		fi = btrfs_item_ptr(leaf, path->slots[0],
   2480				    struct btrfs_file_extent_item);
   2481		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
   2482			end - offset;
   2483		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
   2484		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
   2485		btrfs_set_file_extent_offset(leaf, fi, 0);
   2486		btrfs_mark_buffer_dirty(leaf);
   2487		goto out;
   2488	}
   2489
   2490	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
   2491		u64 num_bytes;
   2492
   2493		key.offset = offset;
   2494		btrfs_set_item_key_safe(fs_info, path, &key);
   2495		fi = btrfs_item_ptr(leaf, path->slots[0],
   2496				    struct btrfs_file_extent_item);
   2497		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
   2498			offset;
   2499		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
   2500		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
   2501		btrfs_set_file_extent_offset(leaf, fi, 0);
   2502		btrfs_mark_buffer_dirty(leaf);
   2503		goto out;
   2504	}
   2505	btrfs_release_path(path);
   2506
   2507	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
   2508			offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
   2509	if (ret)
   2510		return ret;
   2511
   2512out:
   2513	btrfs_release_path(path);
   2514
   2515	hole_em = alloc_extent_map();
   2516	if (!hole_em) {
   2517		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
   2518		btrfs_set_inode_full_sync(inode);
   2519	} else {
   2520		hole_em->start = offset;
   2521		hole_em->len = end - offset;
   2522		hole_em->ram_bytes = hole_em->len;
   2523		hole_em->orig_start = offset;
   2524
   2525		hole_em->block_start = EXTENT_MAP_HOLE;
   2526		hole_em->block_len = 0;
   2527		hole_em->orig_block_len = 0;
   2528		hole_em->compress_type = BTRFS_COMPRESS_NONE;
   2529		hole_em->generation = trans->transid;
   2530
   2531		do {
   2532			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
   2533			write_lock(&em_tree->lock);
   2534			ret = add_extent_mapping(em_tree, hole_em, 1);
   2535			write_unlock(&em_tree->lock);
   2536		} while (ret == -EEXIST);
   2537		free_extent_map(hole_em);
   2538		if (ret)
   2539			btrfs_set_inode_full_sync(inode);
   2540	}
   2541
   2542	return 0;
   2543}
   2544
   2545/*
   2546 * Find a hole extent on the given inode and change start/len to the end of
   2547 * the hole extent (a hole/vacuum extent is one whose em->start <= start &&
   2548 *	   em->start + em->len > start).
   2549 * When a hole extent is found, return 1 and modify start/len.
   2550 */
   2551static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
   2552{
   2553	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   2554	struct extent_map *em;
   2555	int ret = 0;
   2556
   2557	em = btrfs_get_extent(inode, NULL, 0,
   2558			      round_down(*start, fs_info->sectorsize),
   2559			      round_up(*len, fs_info->sectorsize));
   2560	if (IS_ERR(em))
   2561		return PTR_ERR(em);
   2562
   2563	/* Hole or vacuum extent (the latter only exists in no-holes mode) */
   2564	if (em->block_start == EXTENT_MAP_HOLE) {
   2565		ret = 1;
   2566		*len = em->start + em->len > *start + *len ?
   2567		       0 : *start + *len - em->start - em->len;
   2568		*start = em->start + em->len;
   2569	}
   2570	free_extent_map(em);
   2571	return ret;
   2572}
   2573
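       /*
        * Lock the io_tree range [lockstart, lockend] for hole punching,
        * retrying until no pages remain in the page cache for that range.
        */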
   2574static void btrfs_punch_hole_lock_range(struct inode *inode,
   2575					const u64 lockstart,
   2576					const u64 lockend,
   2577					struct extent_state **cached_state)
   2578{
   2579	/*
   2580	 * For the subpage case, if the range is not page aligned, we could
   2581	 * have pages at the leading/trailing part of the range.
   2582	 * This could lead to an infinite loop since filemap_range_has_page()
   2583	 * will always return true.
   2584	 * So here we need to do extra page alignment for
   2585	 * filemap_range_has_page().
   2586	 */
   2587	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
   2588	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
   2589
   2590	while (1) {
   2591		truncate_pagecache_range(inode, lockstart, lockend);
   2592
   2593		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
   2594				 cached_state);
   2595		/*
   2596		 * We can't have ordered extents in the range, nor dirty/writeback
   2597		 * pages, because we have locked the inode's VFS lock in exclusive
   2598		 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
   2599		 * we have flushed all delalloc in the range and we have waited
   2600		 * for any ordered extents in the range to complete.
   2601		 * We can race with anyone reading pages from this range, so after
   2602		 * locking the range check if we have pages in the range, and if
   2603		 * we do, unlock the range and retry.
   2604		 */
   2605		if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
   2606					    page_lockend))
   2607			break;
   2608
   2609		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
   2610				     lockend, cached_state);
   2611	}
   2612
   2613	btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
   2614}
   2615
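       /*
        * Insert a new file extent item for the range being replaced (clone,
        * dedupe or fallocate), based on @extent_info. For a real (non-hole)
        * extent also add the reference: a new reserved extent for the first
        * insertion of a freshly allocated extent, a backref increment
        * otherwise. Holes are skipped when the NO_HOLES feature is enabled.
        */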
   2616static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
   2617				     struct btrfs_inode *inode,
   2618				     struct btrfs_path *path,
   2619				     struct btrfs_replace_extent_info *extent_info,
   2620				     const u64 replace_len,
   2621				     const u64 bytes_to_drop)
   2622{
   2623	struct btrfs_fs_info *fs_info = trans->fs_info;
   2624	struct btrfs_root *root = inode->root;
   2625	struct btrfs_file_extent_item *extent;
   2626	struct extent_buffer *leaf;
   2627	struct btrfs_key key;
   2628	int slot;
   2629	struct btrfs_ref ref = { 0 };
   2630	int ret;
   2631
   2632	if (replace_len == 0)
   2633		return 0;
   2634
   2635	if (extent_info->disk_offset == 0 &&
   2636	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
   2637		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
   2638		return 0;
   2639	}
   2640
   2641	key.objectid = btrfs_ino(inode);
   2642	key.type = BTRFS_EXTENT_DATA_KEY;
   2643	key.offset = extent_info->file_offset;
   2644	ret = btrfs_insert_empty_item(trans, root, path, &key,
   2645				      sizeof(struct btrfs_file_extent_item));
   2646	if (ret)
   2647		return ret;
   2648	leaf = path->nodes[0];
   2649	slot = path->slots[0];
   2650	write_extent_buffer(leaf, extent_info->extent_buf,
   2651			    btrfs_item_ptr_offset(leaf, slot),
   2652			    sizeof(struct btrfs_file_extent_item));
   2653	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
   2654	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
   2655	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
   2656	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
   2657	if (extent_info->is_new_extent)
   2658		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
   2659	btrfs_mark_buffer_dirty(leaf);
   2660	btrfs_release_path(path);
   2661
   2662	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
   2663						replace_len);
   2664	if (ret)
   2665		return ret;
   2666
   2667	/* If it's a hole, nothing more needs to be done. */
   2668	if (extent_info->disk_offset == 0) {
   2669		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
   2670		return 0;
   2671	}
   2672
   2673	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
   2674
   2675	if (extent_info->is_new_extent && extent_info->insertions == 0) {
   2676		key.objectid = extent_info->disk_offset;
   2677		key.type = BTRFS_EXTENT_ITEM_KEY;
   2678		key.offset = extent_info->disk_len;
   2679		ret = btrfs_alloc_reserved_file_extent(trans, root,
   2680						       btrfs_ino(inode),
   2681						       extent_info->file_offset,
   2682						       extent_info->qgroup_reserved,
   2683						       &key);
   2684	} else {
   2685		u64 ref_offset;
   2686
   2687		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
   2688				       extent_info->disk_offset,
   2689				       extent_info->disk_len, 0);
   2690		ref_offset = extent_info->file_offset - extent_info->data_offset;
   2691		btrfs_init_data_ref(&ref, root->root_key.objectid,
   2692				    btrfs_ino(inode), ref_offset, 0, false);
   2693		ret = btrfs_inc_extent_ref(trans, &ref);
   2694	}
   2695
   2696	extent_info->insertions++;
   2697
   2698	return ret;
   2699}
   2700
   2701/*
   2702 * The respective range must have been previously locked, as well as the inode.
   2703 * The end offset is inclusive (last byte of the range).
   2704 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
   2705 * the file range with an extent.
   2706 * When not punching a hole, we don't want to end up in a state where we dropped
   2707 * extents without inserting a new one, so we must abort the transaction to avoid
   2708 * a corruption.
   2709 */
   2710int btrfs_replace_file_extents(struct btrfs_inode *inode,
   2711			       struct btrfs_path *path, const u64 start,
   2712			       const u64 end,
   2713			       struct btrfs_replace_extent_info *extent_info,
   2714			       struct btrfs_trans_handle **trans_out)
   2715{
   2716	struct btrfs_drop_extents_args drop_args = { 0 };
   2717	struct btrfs_root *root = inode->root;
   2718	struct btrfs_fs_info *fs_info = root->fs_info;
   2719	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
   2720	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
   2721	struct btrfs_trans_handle *trans = NULL;
   2722	struct btrfs_block_rsv *rsv;
   2723	unsigned int rsv_count;
   2724	u64 cur_offset;
   2725	u64 len = end - start;
   2726	int ret = 0;
   2727
   2728	if (end <= start)
   2729		return -EINVAL;
   2730
   2731	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
   2732	if (!rsv) {
   2733		ret = -ENOMEM;
   2734		goto out;
   2735	}
   2736	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
   2737	rsv->failfast = 1;
   2738
   2739	/*
   2740	 * 1 - update the inode
   2741	 * 1 - removing the extents in the range
   2742	 * 1 - adding the hole extent if no_holes isn't set or if we are
   2743	 *     replacing the range with a new extent
   2744	 */
   2745	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
   2746		rsv_count = 3;
   2747	else
   2748		rsv_count = 2;
   2749
   2750	trans = btrfs_start_transaction(root, rsv_count);
   2751	if (IS_ERR(trans)) {
   2752		ret = PTR_ERR(trans);
   2753		trans = NULL;
   2754		goto out_free;
   2755	}
   2756
   2757	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
   2758				      min_size, false);
   2759	if (WARN_ON(ret))
   2760		goto out_trans;
   2761	trans->block_rsv = rsv;
   2762
   2763	cur_offset = start;
   2764	drop_args.path = path;
   2765	drop_args.end = end + 1;
   2766	drop_args.drop_cache = true;
   2767	while (cur_offset < end) {
   2768		drop_args.start = cur_offset;
   2769		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
   2770		/* If we are punching a hole decrement the inode's byte count */
   2771		if (!extent_info)
   2772			btrfs_update_inode_bytes(inode, 0,
   2773						 drop_args.bytes_found);
   2774		if (ret != -ENOSPC) {
   2775			/*
   2776			 * The only time we don't want to abort is if we are
   2777			 * attempting to clone a partial inline extent, in which
   2778			 * case we'll get EOPNOTSUPP.  However if we aren't
   2779			 * cloning we need to abort no matter what, because if we
   2780			 * got EOPNOTSUPP via prealloc then we messed up and
   2781			 * need to abort.
   2782			 */
   2783			if (ret &&
   2784			    (ret != -EOPNOTSUPP ||
   2785			     (extent_info && extent_info->is_new_extent)))
   2786				btrfs_abort_transaction(trans, ret);
   2787			break;
   2788		}
   2789
   2790		trans->block_rsv = &fs_info->trans_block_rsv;
   2791
   2792		if (!extent_info && cur_offset < drop_args.drop_end &&
   2793		    cur_offset < ino_size) {
   2794			ret = fill_holes(trans, inode, path, cur_offset,
   2795					 drop_args.drop_end);
   2796			if (ret) {
   2797				/*
   2798				 * If we failed then we didn't insert our hole
   2799				 * entries for the area we dropped, so now the
   2800				 * fs is corrupted and we must abort the
   2801				 * transaction.
   2802				 */
   2803				btrfs_abort_transaction(trans, ret);
   2804				break;
   2805			}
   2806		} else if (!extent_info && cur_offset < drop_args.drop_end) {
   2807			/*
   2808			 * We are past the i_size here, but since we didn't
   2809			 * insert holes we need to clear the mapped area so we
   2810			 * know to not set disk_i_size in this area until a new
   2811			 * file extent is inserted here.
   2812			 */
   2813			ret = btrfs_inode_clear_file_extent_range(inode,
   2814					cur_offset,
   2815					drop_args.drop_end - cur_offset);
   2816			if (ret) {
   2817				/*
   2818				 * We couldn't clear our area, so we could
   2819				 * presumably adjust up and corrupt the fs, so
   2820				 * we need to abort.
   2821				 */
   2822				btrfs_abort_transaction(trans, ret);
   2823				break;
   2824			}
   2825		}
   2826
   2827		if (extent_info &&
   2828		    drop_args.drop_end > extent_info->file_offset) {
   2829			u64 replace_len = drop_args.drop_end -
   2830					  extent_info->file_offset;
   2831
   2832			ret = btrfs_insert_replace_extent(trans, inode,	path,
   2833					extent_info, replace_len,
   2834					drop_args.bytes_found);
   2835			if (ret) {
   2836				btrfs_abort_transaction(trans, ret);
   2837				break;
   2838			}
   2839			extent_info->data_len -= replace_len;
   2840			extent_info->data_offset += replace_len;
   2841			extent_info->file_offset += replace_len;
   2842		}
   2843
   2844		/*
   2845		 * We are releasing our handle on the transaction, balance the
   2846		 * dirty pages of the btree inode and flush delayed items, and
   2847		 * then get a new transaction handle, which may now point to a
   2848		 * new transaction in case someone else may have committed the
   2849		 * transaction we used to replace/drop file extent items. So
   2850		 * bump the inode's iversion and update mtime and ctime except
   2851		 * if we are called from a dedupe context. This is because a
   2852		 * power failure/crash may happen after the transaction is
   2853		 * committed and before we finish replacing/dropping all the
   2854		 * file extent items we need.
   2855		 */
   2856		inode_inc_iversion(&inode->vfs_inode);
   2857
   2858		if (!extent_info || extent_info->update_times) {
   2859			inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
   2860			inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
   2861		}
   2862
   2863		ret = btrfs_update_inode(trans, root, inode);
   2864		if (ret)
   2865			break;
   2866
   2867		btrfs_end_transaction(trans);
   2868		btrfs_btree_balance_dirty(fs_info);
   2869
   2870		trans = btrfs_start_transaction(root, rsv_count);
   2871		if (IS_ERR(trans)) {
   2872			ret = PTR_ERR(trans);
   2873			trans = NULL;
   2874			break;
   2875		}
   2876
   2877		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
   2878					      rsv, min_size, false);
   2879		if (WARN_ON(ret))
   2880			break;
   2881		trans->block_rsv = rsv;
   2882
   2883		cur_offset = drop_args.drop_end;
   2884		len = end - cur_offset;
   2885		if (!extent_info && len) {
   2886			ret = find_first_non_hole(inode, &cur_offset, &len);
   2887			if (unlikely(ret < 0))
   2888				break;
   2889			if (ret && !len) {
   2890				ret = 0;
   2891				break;
   2892			}
   2893		}
   2894	}
   2895
   2896	/*
   2897	 * If we were cloning, force the next fsync to be a full one since we
   2898	 * replaced (or just dropped in the case of cloning holes when
   2899	 * NO_HOLES is enabled) file extent items and did not set up new extent
   2900	 * maps for the replacement extents (or holes).
   2901	 */
   2902	if (extent_info && !extent_info->is_new_extent)
   2903		btrfs_set_inode_full_sync(inode);
   2904
   2905	if (ret)
   2906		goto out_trans;
   2907
   2908	trans->block_rsv = &fs_info->trans_block_rsv;
   2909	/*
   2910	 * If we are using the NO_HOLES feature we might already have had a
   2911	 * hole that overlaps a part of the region [lockstart, lockend] and
   2912	 * ends at (or beyond) lockend. Since we have no file extent items to
   2913	 * represent holes, drop_end can be less than lockend and so we must
   2914	 * make sure we have an extent map representing the existing hole (the
   2915	 * call to __btrfs_drop_extents() might have dropped the existing extent
   2916	 * map representing the existing hole), otherwise the fast fsync path
   2917	 * will not record the existence of the hole region
   2918	 * [existing_hole_start, lockend].
   2919	 */
   2920	if (drop_args.drop_end <= end)
   2921		drop_args.drop_end = end + 1;
   2922	/*
   2923	 * Don't insert a file hole extent item if it's for a range beyond eof
   2924	 * (because it's useless) or if it represents a 0-byte range (when
   2925	 * cur_offset == drop_end).
   2926	 */
   2927	if (!extent_info && cur_offset < ino_size &&
   2928	    cur_offset < drop_args.drop_end) {
   2929		ret = fill_holes(trans, inode, path, cur_offset,
   2930				 drop_args.drop_end);
   2931		if (ret) {
   2932			/* Same comment as above. */
   2933			btrfs_abort_transaction(trans, ret);
   2934			goto out_trans;
   2935		}
   2936	} else if (!extent_info && cur_offset < drop_args.drop_end) {
   2937		/* See the comment in the loop above for the reasoning here. */
   2938		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
   2939					drop_args.drop_end - cur_offset);
   2940		if (ret) {
   2941			btrfs_abort_transaction(trans, ret);
   2942			goto out_trans;
   2943		}
   2944
   2945	}
   2946	if (extent_info) {
   2947		ret = btrfs_insert_replace_extent(trans, inode, path,
   2948				extent_info, extent_info->data_len,
   2949				drop_args.bytes_found);
   2950		if (ret) {
   2951			btrfs_abort_transaction(trans, ret);
   2952			goto out_trans;
   2953		}
   2954	}
   2955
   2956out_trans:
   2957	if (!trans)
   2958		goto out_free;
   2959
   2960	trans->block_rsv = &fs_info->trans_block_rsv;
   2961	if (ret)
   2962		btrfs_end_transaction(trans);
   2963	else
   2964		*trans_out = trans;
   2965out_free:
   2966	btrfs_free_block_rsv(fs_info, rsv);
   2967out:
   2968	return ret;
   2969}
   2970
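       /*
        * Punch a hole in the range [offset, offset + len): zero any partial
        * blocks at the edges, then drop the fully covered blocks via
        * btrfs_replace_file_extents() and update the inode.
        */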
   2971static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
   2972{
   2973	struct inode *inode = file_inode(file);
   2974	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
   2975	struct btrfs_root *root = BTRFS_I(inode)->root;
   2976	struct extent_state *cached_state = NULL;
   2977	struct btrfs_path *path;
   2978	struct btrfs_trans_handle *trans = NULL;
   2979	u64 lockstart;
   2980	u64 lockend;
   2981	u64 tail_start;
   2982	u64 tail_len;
   2983	u64 orig_start = offset;
   2984	int ret = 0;
   2985	bool same_block;
   2986	u64 ino_size;
   2987	bool truncated_block = false;
   2988	bool updated_inode = false;
   2989
   2990	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
   2991
   2992	ret = btrfs_wait_ordered_range(inode, offset, len);
   2993	if (ret)
   2994		goto out_only_mutex;
   2995
   2996	ino_size = round_up(inode->i_size, fs_info->sectorsize);
   2997	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
   2998	if (ret < 0)
   2999		goto out_only_mutex;
   3000	if (ret && !len) {
   3001		/* Already in a large hole */
   3002		ret = 0;
   3003		goto out_only_mutex;
   3004	}
   3005
   3006	ret = file_modified(file);
   3007	if (ret)
   3008		goto out_only_mutex;
   3009
   3010	lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
   3011	lockend = round_down(offset + len,
   3012			     btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
   3013	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
   3014		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
   3015	/*
   3016	 * We needn't truncate any block which is beyond the end of the file
   3017	 * because we are sure there is no data there.
   3018	 *
   3019	 * Only do this partial-block zeroing if the range starts and ends in
   3020	 * the same block and it doesn't cover the entire block; otherwise the
   3021	 * general path below handles it.
   3022	 */
   3023	if (same_block && len < fs_info->sectorsize) {
   3024		if (offset < ino_size) {
   3025			truncated_block = true;
   3026			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
   3027						   0);
   3028		} else {
   3029			ret = 0;
   3030		}
   3031		goto out_only_mutex;
   3032	}
   3033
   3034	/* zero back part of the first block */
   3035	if (offset < ino_size) {
   3036		truncated_block = true;
   3037		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
   3038		if (ret) {
   3039			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
   3040			return ret;
   3041		}
   3042	}
   3043
   3044	/* Check the aligned pages after the first unaligned page.
   3045	 * If offset != orig_start, the first unaligned page and several
   3046	 * following pages are already holes, so the extra
   3047	 * check can be skipped. */
   3048	if (offset == orig_start) {
   3049		/* After truncating the block, check for a hole again */
   3050		len = offset + len - lockstart;
   3051		offset = lockstart;
   3052		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
   3053		if (ret < 0)
   3054			goto out_only_mutex;
   3055		if (ret && !len) {
   3056			ret = 0;
   3057			goto out_only_mutex;
   3058		}
   3059		lockstart = offset;
   3060	}
   3061
   3062	/* Check whether the unaligned tail part is in a hole */
   3063	tail_start = lockend + 1;
   3064	tail_len = offset + len - tail_start;
   3065	if (tail_len) {
   3066		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
   3067		if (unlikely(ret < 0))
   3068			goto out_only_mutex;
   3069		if (!ret) {
   3070			/* zero the front end of the last page */
   3071			if (tail_start + tail_len < ino_size) {
   3072				truncated_block = true;
   3073				ret = btrfs_truncate_block(BTRFS_I(inode),
   3074							tail_start + tail_len,
   3075							0, 1);
   3076				if (ret)
   3077					goto out_only_mutex;
   3078			}
   3079		}
   3080	}
   3081
   3082	if (lockend < lockstart) {
   3083		ret = 0;
   3084		goto out_only_mutex;
   3085	}
   3086
   3087	btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
   3088
   3089	path = btrfs_alloc_path();
   3090	if (!path) {
   3091		ret = -ENOMEM;
   3092		goto out;
   3093	}
   3094
   3095	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
   3096					 lockend, NULL, &trans);
   3097	btrfs_free_path(path);
   3098	if (ret)
   3099		goto out;
   3100
   3101	ASSERT(trans != NULL);
   3102	inode_inc_iversion(inode);
   3103	inode->i_mtime = inode->i_ctime = current_time(inode);
   3104	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   3105	updated_inode = true;
   3106	btrfs_end_transaction(trans);
   3107	btrfs_btree_balance_dirty(fs_info);
   3108out:
   3109	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
   3110			     &cached_state);
   3111out_only_mutex:
   3112	if (!updated_inode && truncated_block && !ret) {
   3113		/*
   3114		 * If we only end up zeroing part of a page, we still need to
   3115		 * updated as well as the necessary in-memory btrfs inode fields
   3116		 * updated as well as the necessary btrfs inode in memory fields
   3117		 * for detecting, at fsync time, if the inode isn't yet in the
   3118		 * log tree or it's there but not up to date.
   3119		 */
   3120		struct timespec64 now = current_time(inode);
   3121
   3122		inode_inc_iversion(inode);
   3123		inode->i_mtime = now;
   3124		inode->i_ctime = now;
   3125		trans = btrfs_start_transaction(root, 1);
   3126		if (IS_ERR(trans)) {
   3127			ret = PTR_ERR(trans);
   3128		} else {
   3129			int ret2;
   3130
   3131			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   3132			ret2 = btrfs_end_transaction(trans);
   3133			if (!ret)
   3134				ret = ret2;
   3135		}
   3136	}
   3137	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
   3138	return ret;
   3139}
   3140
   3141/* Helper structure to record which range is already reserved */
   3142struct falloc_range {
   3143	struct list_head list;
   3144	u64 start;
   3145	u64 len;
   3146};
   3147
   3148/*
   3149 * Helper function to add falloc range
   3150 *
   3151 * Caller should have locked the larger extent range containing
   3152 * [start, len)
   3153 */
   3154static int add_falloc_range(struct list_head *head, u64 start, u64 len)
   3155{
   3156	struct falloc_range *range = NULL;
   3157
   3158	if (!list_empty(head)) {
   3159		/*
   3160		 * As fallocate iterates by bytenr order, we only need to check
   3161		 * the last range.
   3162		 */
   3163		range = list_last_entry(head, struct falloc_range, list);
   3164		if (range->start + range->len == start) {
   3165			range->len += len;
   3166			return 0;
   3167		}
   3168	}
   3169
   3170	range = kmalloc(sizeof(*range), GFP_KERNEL);
   3171	if (!range)
   3172		return -ENOMEM;
   3173	range->start = start;
   3174	range->len = len;
   3175	list_add_tail(&range->list, head);
   3176	return 0;
   3177}
   3178
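       /*
        * After fallocate extends the file, update i_size and the on-disk
        * i_size unless FALLOC_FL_KEEP_SIZE was requested or the file is
        * already large enough.
        */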
   3179static int btrfs_fallocate_update_isize(struct inode *inode,
   3180					const u64 end,
   3181					const int mode)
   3182{
   3183	struct btrfs_trans_handle *trans;
   3184	struct btrfs_root *root = BTRFS_I(inode)->root;
   3185	int ret;
   3186	int ret2;
   3187
   3188	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
   3189		return 0;
   3190
   3191	trans = btrfs_start_transaction(root, 1);
   3192	if (IS_ERR(trans))
   3193		return PTR_ERR(trans);
   3194
   3195	inode->i_ctime = current_time(inode);
   3196	i_size_write(inode, end);
   3197	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
   3198	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   3199	ret2 = btrfs_end_transaction(trans);
   3200
   3201	return ret ? ret : ret2;
   3202}
   3203
   3204enum {
   3205	RANGE_BOUNDARY_WRITTEN_EXTENT,
   3206	RANGE_BOUNDARY_PREALLOC_EXTENT,
   3207	RANGE_BOUNDARY_HOLE,
   3208};
   3209
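       /*
        * Classify the block containing @offset as a written extent, a
        * prealloc extent or a hole, so the zero range code knows whether the
        * boundary needs zeroing or must be covered by the allocation.
        */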
   3210static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
   3211						 u64 offset)
   3212{
   3213	const u64 sectorsize = btrfs_inode_sectorsize(inode);
   3214	struct extent_map *em;
   3215	int ret;
   3216
   3217	offset = round_down(offset, sectorsize);
   3218	em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
   3219	if (IS_ERR(em))
   3220		return PTR_ERR(em);
   3221
   3222	if (em->block_start == EXTENT_MAP_HOLE)
   3223		ret = RANGE_BOUNDARY_HOLE;
   3224	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
   3225		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
   3226	else
   3227		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
   3228
   3229	free_extent_map(em);
   3230	return ret;
   3231}
   3232
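       /*
        * Implement FALLOC_FL_ZERO_RANGE: ranges already backed by prealloc
        * extents are left alone (only i_size may need updating), partial
        * boundary blocks are zeroed in place, and the remaining aligned
        * range is covered by preallocated extents.
        */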
   3233static int btrfs_zero_range(struct inode *inode,
   3234			    loff_t offset,
   3235			    loff_t len,
   3236			    const int mode)
   3237{
   3238	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
   3239	struct extent_map *em;
   3240	struct extent_changeset *data_reserved = NULL;
   3241	int ret;
   3242	u64 alloc_hint = 0;
   3243	const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
   3244	u64 alloc_start = round_down(offset, sectorsize);
   3245	u64 alloc_end = round_up(offset + len, sectorsize);
   3246	u64 bytes_to_reserve = 0;
   3247	bool space_reserved = false;
   3248
   3249	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
   3250			      alloc_end - alloc_start);
   3251	if (IS_ERR(em)) {
   3252		ret = PTR_ERR(em);
   3253		goto out;
   3254	}
   3255
   3256	/*
   3257	 * Avoid hole punching and extent allocation for some cases. More cases
   3258	 * could be considered, but these are unlikely to be common and we keep things
   3259	 * as simple as possible for now. Also, intentionally, if the target
   3260	 * range contains one or more prealloc extents together with regular
   3261	 * extents and holes, we drop all the existing extents and allocate a
   3262	 * new prealloc extent, so that we get a larger contiguous disk extent.
   3263	 */
   3264	if (em->start <= alloc_start &&
   3265	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
   3266		const u64 em_end = em->start + em->len;
   3267
   3268		if (em_end >= offset + len) {
   3269			/*
   3270			 * The whole range is already a prealloc extent,
   3271			 * do nothing except updating the inode's i_size if
   3272			 * needed.
   3273			 */
   3274			free_extent_map(em);
   3275			ret = btrfs_fallocate_update_isize(inode, offset + len,
   3276							   mode);
   3277			goto out;
   3278		}
   3279		/*
   3280		 * Part of the range is already a prealloc extent, so operate
   3281		 * only on the remaining part of the range.
   3282		 */
   3283		alloc_start = em_end;
   3284		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
   3285		len = offset + len - alloc_start;
   3286		offset = alloc_start;
   3287		alloc_hint = em->block_start + em->len;
   3288	}
   3289	free_extent_map(em);
   3290
   3291	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
   3292	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
   3293		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
   3294				      sectorsize);
   3295		if (IS_ERR(em)) {
   3296			ret = PTR_ERR(em);
   3297			goto out;
   3298		}
   3299
   3300		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
   3301			free_extent_map(em);
   3302			ret = btrfs_fallocate_update_isize(inode, offset + len,
   3303							   mode);
   3304			goto out;
   3305		}
   3306		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
   3307			free_extent_map(em);
   3308			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
   3309						   0);
   3310			if (!ret)
   3311				ret = btrfs_fallocate_update_isize(inode,
   3312								   offset + len,
   3313								   mode);
   3314			return ret;
   3315		}
   3316		free_extent_map(em);
   3317		alloc_start = round_down(offset, sectorsize);
   3318		alloc_end = alloc_start + sectorsize;
   3319		goto reserve_space;
   3320	}
   3321
   3322	alloc_start = round_up(offset, sectorsize);
   3323	alloc_end = round_down(offset + len, sectorsize);
   3324
   3325	/*
   3326	 * For unaligned ranges, check the pages at the boundaries: they might
   3327	 * map to an extent, in which case we need to partially zero them, or
   3328	 * they might map to a hole, in which case we need our allocation range
   3329	 * to cover them.
   3330	 */
   3331	if (!IS_ALIGNED(offset, sectorsize)) {
   3332		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
   3333							    offset);
   3334		if (ret < 0)
   3335			goto out;
   3336		if (ret == RANGE_BOUNDARY_HOLE) {
   3337			alloc_start = round_down(offset, sectorsize);
   3338			ret = 0;
   3339		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
   3340			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
   3341			if (ret)
   3342				goto out;
   3343		} else {
   3344			ret = 0;
   3345		}
   3346	}
   3347
   3348	if (!IS_ALIGNED(offset + len, sectorsize)) {
   3349		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
   3350							    offset + len);
   3351		if (ret < 0)
   3352			goto out;
   3353		if (ret == RANGE_BOUNDARY_HOLE) {
   3354			alloc_end = round_up(offset + len, sectorsize);
   3355			ret = 0;
   3356		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
   3357			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
   3358						   0, 1);
   3359			if (ret)
   3360				goto out;
   3361		} else {
   3362			ret = 0;
   3363		}
   3364	}
   3365
   3366reserve_space:
   3367	if (alloc_start < alloc_end) {
   3368		struct extent_state *cached_state = NULL;
   3369		const u64 lockstart = alloc_start;
   3370		const u64 lockend = alloc_end - 1;
   3371
   3372		bytes_to_reserve = alloc_end - alloc_start;
   3373		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
   3374						      bytes_to_reserve);
   3375		if (ret < 0)
   3376			goto out;
   3377		space_reserved = true;
   3378		btrfs_punch_hole_lock_range(inode, lockstart, lockend,
   3379					    &cached_state);
   3380		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
   3381						alloc_start, bytes_to_reserve);
   3382		if (ret) {
   3383			unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
   3384					     lockend, &cached_state);
   3385			goto out;
   3386		}
   3387		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
   3388						alloc_end - alloc_start,
   3389						i_blocksize(inode),
   3390						offset + len, &alloc_hint);
   3391		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
   3392				     lockend, &cached_state);
   3393		/* btrfs_prealloc_file_range releases reserved space on error */
   3394		if (ret) {
   3395			space_reserved = false;
   3396			goto out;
   3397		}
   3398	}
   3399	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
   3400 out:
   3401	if (ret && space_reserved)
   3402		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
   3403					       alloc_start, bytes_to_reserve);
   3404	extent_changeset_free(data_reserved);
   3405
   3406	return ret;
   3407}
   3408
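       /*
        * Entry point for the fallocate(2) system call on btrfs. Only
        * FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE and FALLOC_FL_ZERO_RANGE
        * are supported; hole punching is handed off to btrfs_punch_hole(),
        * zero range to btrfs_zero_range(), and fallocate is not supported at
        * all on zoned filesystems. A minimal userspace sketch of how these
        * modes are exercised (illustrative only, not part of this file):
        *
        *	int fd = open("f", O_RDWR | O_CREAT, 0644);
        *
        *	// reserve 1 MiB of space without changing i_size
        *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
        *
        *	// make the first 64 KiB read back as zeroes
        *	fallocate(fd, FALLOC_FL_ZERO_RANGE, 0, 64 * 1024);
        *
        *	// punch a hole; the fallocate(2) API requires KEEP_SIZE here
        *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 4096);
        */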
   3409static long btrfs_fallocate(struct file *file, int mode,
   3410			    loff_t offset, loff_t len)
   3411{
   3412	struct inode *inode = file_inode(file);
   3413	struct extent_state *cached_state = NULL;
   3414	struct extent_changeset *data_reserved = NULL;
   3415	struct falloc_range *range;
   3416	struct falloc_range *tmp;
   3417	struct list_head reserve_list;
   3418	u64 cur_offset;
   3419	u64 last_byte;
   3420	u64 alloc_start;
   3421	u64 alloc_end;
   3422	u64 alloc_hint = 0;
   3423	u64 locked_end;
   3424	u64 actual_end = 0;
   3425	u64 data_space_needed = 0;
   3426	u64 data_space_reserved = 0;
   3427	u64 qgroup_reserved = 0;
   3428	struct extent_map *em;
   3429	int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
   3430	int ret;
   3431
   3432	/* Do not allow fallocate in ZONED mode */
   3433	if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
   3434		return -EOPNOTSUPP;
   3435
   3436	alloc_start = round_down(offset, blocksize);
   3437	alloc_end = round_up(offset + len, blocksize);
   3438	cur_offset = alloc_start;
   3439
   3440	/* Make sure we aren't being given some crap mode */
   3441	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
   3442		     FALLOC_FL_ZERO_RANGE))
   3443		return -EOPNOTSUPP;
   3444
   3445	if (mode & FALLOC_FL_PUNCH_HOLE)
   3446		return btrfs_punch_hole(file, offset, len);
   3447
   3448	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
   3449
   3450	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
   3451		ret = inode_newsize_ok(inode, offset + len);
   3452		if (ret)
   3453			goto out;
   3454	}
   3455
   3456	ret = file_modified(file);
   3457	if (ret)
   3458		goto out;
   3459
   3460	/*
   3461	 * TODO: Move these two operations after we have checked
   3462	 * accurate reserved space, or fallocate can still fail but
   3463	 * leave the page truncated or the size expanded.
   3464	 *
   3465	 * That is a minor problem though and won't do much harm.
   3466	 */
   3467	if (alloc_start > inode->i_size) {
   3468		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
   3469					alloc_start);
   3470		if (ret)
   3471			goto out;
   3472	} else if (offset + len > inode->i_size) {
   3473		/*
   3474		 * If we are fallocating from the end of the file onward we
   3475		 * need to zero out the end of the block if i_size lands in the
   3476		 * middle of a block.
   3477		 */
   3478		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
   3479		if (ret)
   3480			goto out;
   3481	}
   3482
   3483	/*
   3484	 * We have locked the inode at the VFS level (in exclusive mode) and we
   3485	 * have locked the i_mmap_lock lock (in exclusive mode). Now before
   3486	 * locking the file range, flush all delalloc in the range and wait for
   3487	 * all ordered extents in the range to complete. After this we can lock
   3488	 * the file range and, due to the previous locking we did, we know there
   3489	 * can't be more delalloc or ordered extents in the range.
   3490	 */
   3491	ret = btrfs_wait_ordered_range(inode, alloc_start,
   3492				       alloc_end - alloc_start);
   3493	if (ret)
   3494		goto out;
   3495
   3496	if (mode & FALLOC_FL_ZERO_RANGE) {
   3497		ret = btrfs_zero_range(inode, offset, len, mode);
   3498		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
   3499		return ret;
   3500	}
   3501
   3502	locked_end = alloc_end - 1;
   3503	lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
   3504			 &cached_state);
   3505
   3506	btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
   3507
   3508	/* First, check if we exceed the qgroup limit */
   3509	INIT_LIST_HEAD(&reserve_list);
   3510	while (cur_offset < alloc_end) {
   3511		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
   3512				      alloc_end - cur_offset);
   3513		if (IS_ERR(em)) {
   3514			ret = PTR_ERR(em);
   3515			break;
   3516		}
   3517		last_byte = min(extent_map_end(em), alloc_end);
   3518		actual_end = min_t(u64, extent_map_end(em), offset + len);
   3519		last_byte = ALIGN(last_byte, blocksize);
   3520		if (em->block_start == EXTENT_MAP_HOLE ||
   3521		    (cur_offset >= inode->i_size &&
   3522		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
   3523			const u64 range_len = last_byte - cur_offset;
   3524
   3525			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
   3526			if (ret < 0) {
   3527				free_extent_map(em);
   3528				break;
   3529			}
   3530			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
   3531					&data_reserved, cur_offset, range_len);
   3532			if (ret < 0) {
   3533				free_extent_map(em);
   3534				break;
   3535			}
   3536			qgroup_reserved += range_len;
   3537			data_space_needed += range_len;
   3538		}
   3539		free_extent_map(em);
   3540		cur_offset = last_byte;
   3541	}
   3542
   3543	if (!ret && data_space_needed > 0) {
   3544		/*
   3545		 * We are safe to reserve space here as we can't have delalloc
   3546		 * in the range, see above.
   3547		 */
   3548		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
   3549						      data_space_needed);
   3550		if (!ret)
   3551			data_space_reserved = data_space_needed;
   3552	}
   3553
   3554	/*
   3555	 * If ret is still 0, we're OK to fallocate.
   3556	 * Otherwise just clean up the list and exit.
   3557	 */
   3558	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
   3559		if (!ret) {
   3560			ret = btrfs_prealloc_file_range(inode, mode,
   3561					range->start,
   3562					range->len, i_blocksize(inode),
   3563					offset + len, &alloc_hint);
   3564			/*
   3565			 * btrfs_prealloc_file_range() releases space even
   3566			 * if it returns an error.
   3567			 */
   3568			data_space_reserved -= range->len;
   3569			qgroup_reserved -= range->len;
   3570		} else if (data_space_reserved > 0) {
   3571			btrfs_free_reserved_data_space(BTRFS_I(inode),
   3572					       data_reserved, range->start,
   3573					       range->len);
   3574			data_space_reserved -= range->len;
   3575			qgroup_reserved -= range->len;
   3576		} else if (qgroup_reserved > 0) {
   3577			btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
   3578					       range->start, range->len);
   3579			qgroup_reserved -= range->len;
   3580		}
   3581		list_del(&range->list);
   3582		kfree(range);
   3583	}
   3584	if (ret < 0)
   3585		goto out_unlock;
   3586
   3587	/*
   3588	 * We didn't need to allocate any more space, but we still extended the
   3589	 * size of the file so we need to update i_size and the inode item.
   3590	 */
   3591	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
   3592out_unlock:
   3593	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
   3594			     &cached_state);
   3595out:
   3596	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
   3597	extent_changeset_free(data_reserved);
   3598	return ret;
   3599}
   3600
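       /*
        * Back end for SEEK_DATA / SEEK_HOLE, e.g. a userspace
        * lseek(fd, 0, SEEK_DATA): walk the extent maps from @offset towards
        * i_size with the range locked, treating holes and prealloc extents as
        * "hole" and everything else as "data". Returns -ENXIO when @offset is
        * at or past i_size, or when SEEK_DATA finds no data before i_size.
        */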
   3601static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
   3602				  int whence)
   3603{
   3604	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   3605	struct extent_map *em = NULL;
   3606	struct extent_state *cached_state = NULL;
   3607	loff_t i_size = inode->vfs_inode.i_size;
   3608	u64 lockstart;
   3609	u64 lockend;
   3610	u64 start;
   3611	u64 len;
   3612	int ret = 0;
   3613
   3614	if (i_size == 0 || offset >= i_size)
   3615		return -ENXIO;
   3616
   3617	/*
   3618	 * offset can be negative; in that case we start finding DATA/HOLE from
   3619	 * the very start of the file.
   3620	 */
   3621	start = max_t(loff_t, 0, offset);
   3622
   3623	lockstart = round_down(start, fs_info->sectorsize);
   3624	lockend = round_up(i_size, fs_info->sectorsize);
   3625	if (lockend <= lockstart)
   3626		lockend = lockstart + fs_info->sectorsize;
   3627	lockend--;
   3628	len = lockend - lockstart + 1;
   3629
   3630	lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
   3631
   3632	while (start < i_size) {
   3633		em = btrfs_get_extent_fiemap(inode, start, len);
   3634		if (IS_ERR(em)) {
   3635			ret = PTR_ERR(em);
   3636			em = NULL;
   3637			break;
   3638		}
   3639
   3640		if (whence == SEEK_HOLE &&
   3641		    (em->block_start == EXTENT_MAP_HOLE ||
   3642		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
   3643			break;
   3644		else if (whence == SEEK_DATA &&
   3645			   (em->block_start != EXTENT_MAP_HOLE &&
   3646			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
   3647			break;
   3648
   3649		start = em->start + em->len;
   3650		free_extent_map(em);
   3651		em = NULL;
   3652		cond_resched();
   3653	}
   3654	free_extent_map(em);
   3655	unlock_extent_cached(&inode->io_tree, lockstart, lockend,
   3656			     &cached_state);
   3657	if (ret) {
   3658		offset = ret;
   3659	} else {
   3660		if (whence == SEEK_DATA && start >= i_size)
   3661			offset = -ENXIO;
   3662		else
   3663			offset = min_t(loff_t, start, i_size);
   3664	}
   3665
   3666	return offset;
   3667}
   3668
   3669static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
   3670{
   3671	struct inode *inode = file->f_mapping->host;
   3672
   3673	switch (whence) {
   3674	default:
   3675		return generic_file_llseek(file, offset, whence);
   3676	case SEEK_DATA:
   3677	case SEEK_HOLE:
   3678		btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
   3679		offset = find_desired_extent(BTRFS_I(inode), offset, whence);
   3680		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
   3681		break;
   3682	}
   3683
   3684	if (offset < 0)
   3685		return offset;
   3686
   3687	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
   3688}
   3689
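       /*
        * Called on open(2): advertise support for non-blocking (RWF_NOWAIT)
        * I/O and async buffered reads, run the fs-verity open checks, then
        * fall through to generic_file_open().
        */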
   3690static int btrfs_file_open(struct inode *inode, struct file *filp)
   3691{
   3692	int ret;
   3693
   3694	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
   3695
   3696	ret = fsverity_file_open(inode, filp);
   3697	if (ret)
   3698		return ret;
   3699	return generic_file_open(inode, filp);
   3700}
   3701
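       /*
        * Validate a direct I/O read: check_direct_IO() enforces the alignment
        * requirements, and iovec-backed iterators are additionally rejected if
        * two segments share the same base address.
        */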
   3702static int check_direct_read(struct btrfs_fs_info *fs_info,
   3703			     const struct iov_iter *iter, loff_t offset)
   3704{
   3705	int ret;
   3706	int i, seg;
   3707
   3708	ret = check_direct_IO(fs_info, iter, offset);
   3709	if (ret < 0)
   3710		return ret;
   3711
   3712	if (!iter_is_iovec(iter))
   3713		return 0;
   3714
   3715	for (seg = 0; seg < iter->nr_segs; seg++)
   3716		for (i = seg + 1; i < iter->nr_segs; i++)
   3717			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
   3718				return -EINVAL;
   3719	return 0;
   3720}
   3721
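       /*
        * Direct (O_DIRECT) read path. Runs under the shared inode lock with
        * page faults disabled around btrfs_dio_rw(); on -EFAULT or a short
        * read it faults in the destination pages and retries, giving up (so
        * the caller can fall back to a buffered read) once a retry makes no
        * progress.
        */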
   3722static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
   3723{
   3724	struct inode *inode = file_inode(iocb->ki_filp);
   3725	size_t prev_left = 0;
   3726	ssize_t read = 0;
   3727	ssize_t ret;
   3728
   3729	if (fsverity_active(inode))
   3730		return 0;
   3731
   3732	if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
   3733		return 0;
   3734
   3735	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
   3736again:
   3737	/*
   3738	 * This is similar to what we do for direct IO writes, see the comment
   3739	 * at btrfs_direct_write(), but we also disable page faults in addition
   3740	 * to disabling them only at the iov_iter level. This is because when
   3741	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
   3742	 * which can still trigger page fault-ins despite having set ->nofault
   3743	 * to true on our 'to' iov_iter.
   3744	 *
   3745	 * The difference to direct IO writes is that we deadlock when trying
   3746	 * to lock the extent range in the inode's tree during the page reads
   3747	 * triggered by the fault-in (while for writes it is due to waiting for
   3748	 * our own ordered extent). This is because for direct IO reads,
   3749	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
   3750	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
   3751	 */
   3752	pagefault_disable();
   3753	to->nofault = true;
   3754	ret = btrfs_dio_rw(iocb, to, read);
   3755	to->nofault = false;
   3756	pagefault_enable();
   3757
   3758	/* No increment (+=) because iomap returns a cumulative value. */
   3759	if (ret > 0)
   3760		read = ret;
   3761
   3762	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
   3763		const size_t left = iov_iter_count(to);
   3764
   3765		if (left == prev_left) {
   3766			/*
   3767			 * We didn't make any progress since the last attempt,
   3768			 * fallback to a buffered read for the remainder of the
   3769			 * range. This is just to avoid any possibility of looping
   3770			 * for too long.
   3771			 */
   3772			ret = read;
   3773		} else {
   3774			/*
   3775			 * We made some progress since the last retry or this is
   3776			 * the first time we are retrying. Fault in as many pages
   3777			 * as possible and retry.
   3778			 */
   3779			fault_in_iov_iter_writeable(to, left);
   3780			prev_left = left;
   3781			goto again;
   3782		}
   3783	}
   3784	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
   3785	return ret < 0 ? ret : read;
   3786}
   3787
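       /*
        * ->read_iter: O_DIRECT reads go through btrfs_direct_read() first;
        * whatever is left (or a plain buffered read) is completed with
        * filemap_read().
        */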
   3788static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
   3789{
   3790	ssize_t ret = 0;
   3791
   3792	if (iocb->ki_flags & IOCB_DIRECT) {
   3793		ret = btrfs_direct_read(iocb, to);
   3794		if (ret < 0 || !iov_iter_count(to) ||
   3795		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
   3796			return ret;
   3797	}
   3798
   3799	return filemap_read(iocb, to, ret);
   3800}
   3801
   3802const struct file_operations btrfs_file_operations = {
   3803	.llseek		= btrfs_file_llseek,
   3804	.read_iter      = btrfs_file_read_iter,
   3805	.splice_read	= generic_file_splice_read,
   3806	.write_iter	= btrfs_file_write_iter,
   3807	.splice_write	= iter_file_splice_write,
   3808	.mmap		= btrfs_file_mmap,
   3809	.open		= btrfs_file_open,
   3810	.release	= btrfs_release_file,
   3811	.fsync		= btrfs_sync_file,
   3812	.fallocate	= btrfs_fallocate,
   3813	.unlocked_ioctl	= btrfs_ioctl,
   3814#ifdef CONFIG_COMPAT
   3815	.compat_ioctl	= btrfs_compat_ioctl,
   3816#endif
   3817	.remap_file_range = btrfs_remap_file_range,
   3818};
   3819
   3820void __cold btrfs_auto_defrag_exit(void)
   3821{
   3822	kmem_cache_destroy(btrfs_inode_defrag_cachep);
   3823}
   3824
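       /* Create the slab cache that backs the inode_defrag records. */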
   3825int __init btrfs_auto_defrag_init(void)
   3826{
   3827	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
   3828					sizeof(struct inode_defrag), 0,
   3829					SLAB_MEM_SPREAD,
   3830					NULL);
   3831	if (!btrfs_inode_defrag_cachep)
   3832		return -ENOMEM;
   3833
   3834	return 0;
   3835}
   3836
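       /*
        * Start writeback for [start, end] on @inode. See the comment inside
        * for why the range is written twice when async (compressed) extents
        * are involved.
        */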
   3837int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
   3838{
   3839	int ret;
   3840
   3841	/*
   3842	 * So with compression we will find and lock a dirty page and clear the
   3843	 * first one as dirty, setup an async extent, and immediately return
   3844	 * with the entire range locked but with nobody actually marked with
   3845	 * writeback.  So we can't just filemap_write_and_wait_range() and
   3846	 * expect it to work since it will just kick off a thread to do the
   3847	 * actual work.  So we need to call filemap_fdatawrite_range _again_
   3848	 * since it will wait on the page lock, which won't be unlocked until
   3849	 * after the pages have been marked as writeback and so we're good to go
   3850	 * from there.  We have to do this otherwise we'll miss the ordered
   3851	 * extents and that results in badness.  Please Josef, do not think you
   3852	 * know better and pull this out at some point in the future, it is
   3853	 * right and you are wrong.
   3854	 */
   3855	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
   3856	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
   3857			     &BTRFS_I(inode)->runtime_flags))
   3858		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
   3859
   3860	return ret;
   3861}