cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

extent-tree.c (171424B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2007 Oracle.  All rights reserved.
      4 */
      5
      6#include <linux/sched.h>
      7#include <linux/sched/signal.h>
      8#include <linux/pagemap.h>
      9#include <linux/writeback.h>
     10#include <linux/blkdev.h>
     11#include <linux/sort.h>
     12#include <linux/rcupdate.h>
     13#include <linux/kthread.h>
     14#include <linux/slab.h>
     15#include <linux/ratelimit.h>
     16#include <linux/percpu_counter.h>
     17#include <linux/lockdep.h>
     18#include <linux/crc32c.h>
     19#include "misc.h"
     20#include "tree-log.h"
     21#include "disk-io.h"
     22#include "print-tree.h"
     23#include "volumes.h"
     24#include "raid56.h"
     25#include "locking.h"
     26#include "free-space-cache.h"
     27#include "free-space-tree.h"
     28#include "sysfs.h"
     29#include "qgroup.h"
     30#include "ref-verify.h"
     31#include "space-info.h"
     32#include "block-rsv.h"
     33#include "delalloc-space.h"
     34#include "block-group.h"
     35#include "discard.h"
     36#include "rcu-string.h"
     37#include "zoned.h"
     38#include "dev-replace.h"
     39
     40#undef SCRAMBLE_DELAYED_REFS
     41
     42
     43static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
     44			       struct btrfs_delayed_ref_node *node, u64 parent,
     45			       u64 root_objectid, u64 owner_objectid,
     46			       u64 owner_offset, int refs_to_drop,
     47			       struct btrfs_delayed_extent_op *extra_op);
     48static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
     49				    struct extent_buffer *leaf,
     50				    struct btrfs_extent_item *ei);
     51static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
     52				      u64 parent, u64 root_objectid,
     53				      u64 flags, u64 owner, u64 offset,
     54				      struct btrfs_key *ins, int ref_mod);
     55static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
     56				     struct btrfs_delayed_ref_node *node,
     57				     struct btrfs_delayed_extent_op *extent_op);
     58static int find_next_key(struct btrfs_path *path, int level,
     59			 struct btrfs_key *key);
     60
     61static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
     62{
     63	return (cache->flags & bits) == bits;
     64}
     65
     66int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
     67			      u64 start, u64 num_bytes)
     68{
     69	u64 end = start + num_bytes - 1;
     70	set_extent_bits(&fs_info->excluded_extents, start, end,
     71			EXTENT_UPTODATE);
     72	return 0;
     73}
     74
     75void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
     76{
     77	struct btrfs_fs_info *fs_info = cache->fs_info;
     78	u64 start, end;
     79
     80	start = cache->start;
     81	end = start + cache->length - 1;
     82
     83	clear_extent_bits(&fs_info->excluded_extents, start, end,
     84			  EXTENT_UPTODATE);
     85}
     86
     87/* simple helper to search for an existing data extent at a given offset */
     88int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
     89{
     90	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
     91	int ret;
     92	struct btrfs_key key;
     93	struct btrfs_path *path;
     94
     95	path = btrfs_alloc_path();
     96	if (!path)
     97		return -ENOMEM;
     98
     99	key.objectid = start;
    100	key.offset = len;
    101	key.type = BTRFS_EXTENT_ITEM_KEY;
    102	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    103	btrfs_free_path(path);
    104	return ret;
    105}
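/*
 * Editorial sketch (not part of the original file): one way a caller might
 * interpret the btrfs_search_slot()-style return value of
 * btrfs_lookup_data_extent() above. 0 means the EXTENT_ITEM exists at
 * (start, len), a positive value means it was not found, and a negative
 * value is an error such as -ENOMEM. The helper name is hypothetical.
 */
static inline bool example_data_extent_exists(struct btrfs_fs_info *fs_info,
					      u64 start, u64 len, int *errp)
{
	int ret = btrfs_lookup_data_extent(fs_info, start, len);

	if (ret < 0) {
		/* the search itself failed, report the error */
		*errp = ret;
		return false;
	}
	*errp = 0;
	/* 0 means the key (start, BTRFS_EXTENT_ITEM_KEY, len) was found */
	return ret == 0;
}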
    106
    107/*
    108 * Helper function to look up the reference count and flags of a tree block.
    109 *
    110 * The head node for a delayed ref is used to store the sum of all the
    111 * reference count modifications queued up in the rbtree. The head
    112 * node may also store the extent flags to set. This way you can check
    113 * what the reference count and extent flags would be once all of the
    114 * queued delayed refs have been processed, without running them.
    115 */
    116int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
    117			     struct btrfs_fs_info *fs_info, u64 bytenr,
    118			     u64 offset, int metadata, u64 *refs, u64 *flags)
    119{
    120	struct btrfs_root *extent_root;
    121	struct btrfs_delayed_ref_head *head;
    122	struct btrfs_delayed_ref_root *delayed_refs;
    123	struct btrfs_path *path;
    124	struct btrfs_extent_item *ei;
    125	struct extent_buffer *leaf;
    126	struct btrfs_key key;
    127	u32 item_size;
    128	u64 num_refs;
    129	u64 extent_flags;
    130	int ret;
    131
    132	/*
    133	 * If we don't have skinny metadata, don't bother doing anything
    134	 * different
    135	 */
    136	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
    137		offset = fs_info->nodesize;
    138		metadata = 0;
    139	}
    140
    141	path = btrfs_alloc_path();
    142	if (!path)
    143		return -ENOMEM;
    144
    145	if (!trans) {
    146		path->skip_locking = 1;
    147		path->search_commit_root = 1;
    148	}
    149
    150search_again:
    151	key.objectid = bytenr;
    152	key.offset = offset;
    153	if (metadata)
    154		key.type = BTRFS_METADATA_ITEM_KEY;
    155	else
    156		key.type = BTRFS_EXTENT_ITEM_KEY;
    157
    158	extent_root = btrfs_extent_root(fs_info, bytenr);
    159	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
    160	if (ret < 0)
    161		goto out_free;
    162
    163	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
    164		if (path->slots[0]) {
    165			path->slots[0]--;
    166			btrfs_item_key_to_cpu(path->nodes[0], &key,
    167					      path->slots[0]);
    168			if (key.objectid == bytenr &&
    169			    key.type == BTRFS_EXTENT_ITEM_KEY &&
    170			    key.offset == fs_info->nodesize)
    171				ret = 0;
    172		}
    173	}
    174
    175	if (ret == 0) {
    176		leaf = path->nodes[0];
    177		item_size = btrfs_item_size(leaf, path->slots[0]);
    178		if (item_size >= sizeof(*ei)) {
    179			ei = btrfs_item_ptr(leaf, path->slots[0],
    180					    struct btrfs_extent_item);
    181			num_refs = btrfs_extent_refs(leaf, ei);
    182			extent_flags = btrfs_extent_flags(leaf, ei);
    183		} else {
    184			ret = -EINVAL;
    185			btrfs_print_v0_err(fs_info);
    186			if (trans)
    187				btrfs_abort_transaction(trans, ret);
    188			else
    189				btrfs_handle_fs_error(fs_info, ret, NULL);
    190
    191			goto out_free;
    192		}
    193
    194		BUG_ON(num_refs == 0);
    195	} else {
    196		num_refs = 0;
    197		extent_flags = 0;
    198		ret = 0;
    199	}
    200
    201	if (!trans)
    202		goto out;
    203
    204	delayed_refs = &trans->transaction->delayed_refs;
    205	spin_lock(&delayed_refs->lock);
    206	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
    207	if (head) {
    208		if (!mutex_trylock(&head->mutex)) {
    209			refcount_inc(&head->refs);
    210			spin_unlock(&delayed_refs->lock);
    211
    212			btrfs_release_path(path);
    213
    214			/*
    215			 * Mutex was contended, block until it's released and try
    216			 * again
    217			 */
    218			mutex_lock(&head->mutex);
    219			mutex_unlock(&head->mutex);
    220			btrfs_put_delayed_ref_head(head);
    221			goto search_again;
    222		}
    223		spin_lock(&head->lock);
    224		if (head->extent_op && head->extent_op->update_flags)
    225			extent_flags |= head->extent_op->flags_to_set;
    226		else
    227			BUG_ON(num_refs == 0);
    228
    229		num_refs += head->ref_mod;
    230		spin_unlock(&head->lock);
    231		mutex_unlock(&head->mutex);
    232	}
    233	spin_unlock(&delayed_refs->lock);
    234out:
    235	WARN_ON(num_refs == 0);
    236	if (refs)
    237		*refs = num_refs;
    238	if (flags)
    239		*flags = extent_flags;
    240out_free:
    241	btrfs_free_path(path);
    242	return ret;
    243}
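/*
 * Editorial sketch (not part of the original file): using
 * btrfs_lookup_extent_info() above to ask whether a tree block will still
 * be shared once all queued delayed refs have run. With skinny metadata the
 * offset argument carries the block's level; without it the function falls
 * back to the node size internally. The helper name is hypothetical.
 */
static inline int example_tree_block_is_shared(struct btrfs_trans_handle *trans,
					       struct btrfs_fs_info *fs_info,
					       u64 bytenr, int level)
{
	u64 refs = 0;
	u64 flags = 0;
	int ret;

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level, 1,
				       &refs, &flags);
	if (ret)
		return ret;
	/* more than one remaining reference means the block is shared */
	return refs > 1;
}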
    244
    245/*
    246 * Back reference rules.  Back refs have three main goals:
    247 *
    248 * 1) differentiate between all holders of references to an extent so that
    249 *    when a reference is dropped we can make sure it was a valid reference
    250 *    before freeing the extent.
    251 *
    252 * 2) Provide enough information to quickly find the holders of an extent
    253 *    if we notice a given block is corrupted or bad.
    254 *
    255 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
    256 *    maintenance.  This is actually the same as #2, but with a slightly
    257 *    different use case.
    258 *
    259 * There are two kinds of back refs. Implicit back refs are optimized
    260 * for pointers in non-shared tree blocks. For a given pointer in a block,
    261 * back refs of this kind provide information about the block's owner tree
    262 * and the pointer's key. This information allows us to find the block by
    263 * b-tree searching. Full back refs are for pointers in tree blocks not
    264 * referenced by their owner trees. The location of the tree block is recorded
    265 * in the back ref. Full back refs are actually generic and can be
    266 * used in all cases where implicit back refs are used. The major shortcoming
    267 * of full back refs is their overhead: every time a tree block gets
    268 * COWed, we have to update the back ref entries for all pointers in it.
    269 *
    270 * For a newly allocated tree block, we use implicit back refs for the
    271 * pointers in it. This means most tree related operations only involve
    272 * implicit back refs. For a tree block created in an old transaction, the
    273 * only way to drop a reference to it is to COW it. So we can detect the
    274 * event that a tree block loses its owner tree's reference and do the
    275 * back ref conversion.
    276 *
    277 * When a tree block is COWed through a tree, there are four cases:
    278 *
    279 * The reference count of the block is one and the tree is the block's
    280 * owner tree. Nothing to do in this case.
    281 *
    282 * The reference count of the block is one and the tree is not the
    283 * block's owner tree. In this case, full back refs are used for the pointers
    284 * in the block. Remove these full back refs and add implicit back refs for
    285 * every pointer in the new block.
    286 *
    287 * The reference count of the block is greater than one and the tree is
    288 * the block's owner tree. In this case, implicit back refs are used for the
    289 * pointers in the block. Add full back refs for every pointer in the
    290 * block and increase the lower level extents' reference counts. The original
    291 * implicit back refs are carried over to the new block.
    292 *
    293 * The reference count of the block is greater than one and the tree is
    294 * not the block's owner tree. Add implicit back refs for every pointer in
    295 * the new block and increase the lower level extents' reference counts.
    296 *
    297 * Back Reference Key composing:
    298 *
    299 * The key objectid corresponds to the first byte in the extent,
    300 * The key type is used to differentiate between types of back refs.
    301 * There are different meanings of the key offset for different types
    302 * of back refs.
    303 *
    304 * File extents can be referenced by:
    305 *
    306 * - multiple snapshots, subvolumes, or different generations in one subvol
    307 * - different files inside a single subvolume
    308 * - different offsets inside a file (bookend extents in file.c)
    309 *
    310 * The extent ref structure for the implicit back refs has fields for:
    311 *
    312 * - Objectid of the subvolume root
    313 * - objectid of the file holding the reference
    314 * - original offset in the file
    315 * - how many bookend extents
    316 *
    317 * The key offset for the implicit back refs is the hash of the first
    318 * three fields.
    319 *
    320 * The extent ref structure for the full back refs has a field for:
    321 *
    322 * - number of pointers in the tree leaf
    323 *
    324 * The key offset for the full back refs is the first byte of
    325 * the tree leaf.
    326 *
    327 * When a file extent is allocated, an implicit back ref is used and
    328 * the fields are filled in:
    329 *
    330 *     (root_key.objectid, inode objectid, offset in file, 1)
    331 *
    332 * When a file extent is removed during file truncation, we find the
    333 * corresponding implicit back ref and check the following fields:
    334 *
    335 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
    336 *
    337 * Btree extents can be referenced by:
    338 *
    339 * - Different subvolumes
    340 *
    341 * Both the implicit back refs and the full back refs for tree blocks
    342 * consist only of a key. The key offset for the implicit back refs is the
    343 * objectid of the block's owner tree. The key offset for the full back refs
    344 * is the first byte of the parent block.
    345 *
    346 * When implicit back refs are used, information about the lowest key and
    347 * the level of the tree block is required. This information is stored in
    348 * the tree block info structure.
    349 */
    350
    351/*
    352 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
    353 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
    354 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
    355 */
    356int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
    357				     struct btrfs_extent_inline_ref *iref,
    358				     enum btrfs_inline_ref_type is_data)
    359{
    360	int type = btrfs_extent_inline_ref_type(eb, iref);
    361	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
    362
    363	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
    364	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
    365	    type == BTRFS_SHARED_DATA_REF_KEY ||
    366	    type == BTRFS_EXTENT_DATA_REF_KEY) {
    367		if (is_data == BTRFS_REF_TYPE_BLOCK) {
    368			if (type == BTRFS_TREE_BLOCK_REF_KEY)
    369				return type;
    370			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
    371				ASSERT(eb->fs_info);
    372				/*
    373				 * Every shared ref has a parent tree block,
    374				 * which must be aligned to the sector size.
    375				 */
    376				if (offset &&
    377				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
    378					return type;
    379			}
    380		} else if (is_data == BTRFS_REF_TYPE_DATA) {
    381			if (type == BTRFS_EXTENT_DATA_REF_KEY)
    382				return type;
    383			if (type == BTRFS_SHARED_DATA_REF_KEY) {
    384				ASSERT(eb->fs_info);
    385				/*
    386				 * Every shared ref has a parent tree block,
    387				 * which must be aligned to the sector size.
    388				 */
    389				if (offset &&
    390				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
    391					return type;
    392			}
    393		} else {
    394			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
    395			return type;
    396		}
    397	}
    398
    399	btrfs_print_leaf((struct extent_buffer *)eb);
    400	btrfs_err(eb->fs_info,
    401		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
    402		  eb->start, (unsigned long)iref, type);
    403	WARN_ON(1);
    404
    405	return BTRFS_REF_TYPE_INVALID;
    406}
    407
    408u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
    409{
    410	u32 high_crc = ~(u32)0;
    411	u32 low_crc = ~(u32)0;
    412	__le64 lenum;
    413
    414	lenum = cpu_to_le64(root_objectid);
    415	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
    416	lenum = cpu_to_le64(owner);
    417	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
    418	lenum = cpu_to_le64(offset);
    419	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
    420
    421	return ((u64)high_crc << 31) ^ (u64)low_crc;
    422}
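/*
 * Editorial sketch (not part of the original file): composing the key of an
 * implicit data back ref as described in the "Back Reference Key composing"
 * comment above. The key offset is the hash of (root, inode, file offset),
 * which is exactly what lookup_extent_data_ref() below searches for. The
 * helper name is hypothetical.
 */
static inline void example_data_ref_key(struct btrfs_key *key, u64 bytenr,
					u64 root_objectid, u64 owner,
					u64 offset)
{
	key->objectid = bytenr;			/* first byte of the extent */
	key->type = BTRFS_EXTENT_DATA_REF_KEY;	/* implicit data back ref */
	key->offset = hash_extent_data_ref(root_objectid, owner, offset);
}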
    423
    424static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
    425				     struct btrfs_extent_data_ref *ref)
    426{
    427	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
    428				    btrfs_extent_data_ref_objectid(leaf, ref),
    429				    btrfs_extent_data_ref_offset(leaf, ref));
    430}
    431
    432static int match_extent_data_ref(struct extent_buffer *leaf,
    433				 struct btrfs_extent_data_ref *ref,
    434				 u64 root_objectid, u64 owner, u64 offset)
    435{
    436	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
    437	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
    438	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
    439		return 0;
    440	return 1;
    441}
    442
    443static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
    444					   struct btrfs_path *path,
    445					   u64 bytenr, u64 parent,
    446					   u64 root_objectid,
    447					   u64 owner, u64 offset)
    448{
    449	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
    450	struct btrfs_key key;
    451	struct btrfs_extent_data_ref *ref;
    452	struct extent_buffer *leaf;
    453	u32 nritems;
    454	int ret;
    455	int recow;
    456	int err = -ENOENT;
    457
    458	key.objectid = bytenr;
    459	if (parent) {
    460		key.type = BTRFS_SHARED_DATA_REF_KEY;
    461		key.offset = parent;
    462	} else {
    463		key.type = BTRFS_EXTENT_DATA_REF_KEY;
    464		key.offset = hash_extent_data_ref(root_objectid,
    465						  owner, offset);
    466	}
    467again:
    468	recow = 0;
    469	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    470	if (ret < 0) {
    471		err = ret;
    472		goto fail;
    473	}
    474
    475	if (parent) {
    476		if (!ret)
    477			return 0;
    478		goto fail;
    479	}
    480
    481	leaf = path->nodes[0];
    482	nritems = btrfs_header_nritems(leaf);
    483	while (1) {
    484		if (path->slots[0] >= nritems) {
    485			ret = btrfs_next_leaf(root, path);
    486			if (ret < 0)
    487				err = ret;
    488			if (ret)
    489				goto fail;
    490
    491			leaf = path->nodes[0];
    492			nritems = btrfs_header_nritems(leaf);
    493			recow = 1;
    494		}
    495
    496		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    497		if (key.objectid != bytenr ||
    498		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
    499			goto fail;
    500
    501		ref = btrfs_item_ptr(leaf, path->slots[0],
    502				     struct btrfs_extent_data_ref);
    503
    504		if (match_extent_data_ref(leaf, ref, root_objectid,
    505					  owner, offset)) {
    506			if (recow) {
    507				btrfs_release_path(path);
    508				goto again;
    509			}
    510			err = 0;
    511			break;
    512		}
    513		path->slots[0]++;
    514	}
    515fail:
    516	return err;
    517}
    518
    519static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
    520					   struct btrfs_path *path,
    521					   u64 bytenr, u64 parent,
    522					   u64 root_objectid, u64 owner,
    523					   u64 offset, int refs_to_add)
    524{
    525	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
    526	struct btrfs_key key;
    527	struct extent_buffer *leaf;
    528	u32 size;
    529	u32 num_refs;
    530	int ret;
    531
    532	key.objectid = bytenr;
    533	if (parent) {
    534		key.type = BTRFS_SHARED_DATA_REF_KEY;
    535		key.offset = parent;
    536		size = sizeof(struct btrfs_shared_data_ref);
    537	} else {
    538		key.type = BTRFS_EXTENT_DATA_REF_KEY;
    539		key.offset = hash_extent_data_ref(root_objectid,
    540						  owner, offset);
    541		size = sizeof(struct btrfs_extent_data_ref);
    542	}
    543
    544	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
    545	if (ret && ret != -EEXIST)
    546		goto fail;
    547
    548	leaf = path->nodes[0];
    549	if (parent) {
    550		struct btrfs_shared_data_ref *ref;
    551		ref = btrfs_item_ptr(leaf, path->slots[0],
    552				     struct btrfs_shared_data_ref);
    553		if (ret == 0) {
    554			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
    555		} else {
    556			num_refs = btrfs_shared_data_ref_count(leaf, ref);
    557			num_refs += refs_to_add;
    558			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
    559		}
    560	} else {
    561		struct btrfs_extent_data_ref *ref;
    562		while (ret == -EEXIST) {
    563			ref = btrfs_item_ptr(leaf, path->slots[0],
    564					     struct btrfs_extent_data_ref);
    565			if (match_extent_data_ref(leaf, ref, root_objectid,
    566						  owner, offset))
    567				break;
    568			btrfs_release_path(path);
    569			key.offset++;
    570			ret = btrfs_insert_empty_item(trans, root, path, &key,
    571						      size);
    572			if (ret && ret != -EEXIST)
    573				goto fail;
    574
    575			leaf = path->nodes[0];
    576		}
    577		ref = btrfs_item_ptr(leaf, path->slots[0],
    578				     struct btrfs_extent_data_ref);
    579		if (ret == 0) {
    580			btrfs_set_extent_data_ref_root(leaf, ref,
    581						       root_objectid);
    582			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
    583			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
    584			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
    585		} else {
    586			num_refs = btrfs_extent_data_ref_count(leaf, ref);
    587			num_refs += refs_to_add;
    588			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
    589		}
    590	}
    591	btrfs_mark_buffer_dirty(leaf);
    592	ret = 0;
    593fail:
    594	btrfs_release_path(path);
    595	return ret;
    596}
    597
    598static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
    599					   struct btrfs_root *root,
    600					   struct btrfs_path *path,
    601					   int refs_to_drop)
    602{
    603	struct btrfs_key key;
    604	struct btrfs_extent_data_ref *ref1 = NULL;
    605	struct btrfs_shared_data_ref *ref2 = NULL;
    606	struct extent_buffer *leaf;
    607	u32 num_refs = 0;
    608	int ret = 0;
    609
    610	leaf = path->nodes[0];
    611	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    612
    613	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
    614		ref1 = btrfs_item_ptr(leaf, path->slots[0],
    615				      struct btrfs_extent_data_ref);
    616		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
    617	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
    618		ref2 = btrfs_item_ptr(leaf, path->slots[0],
    619				      struct btrfs_shared_data_ref);
    620		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
    621	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
    622		btrfs_print_v0_err(trans->fs_info);
    623		btrfs_abort_transaction(trans, -EINVAL);
    624		return -EINVAL;
    625	} else {
    626		BUG();
    627	}
    628
    629	BUG_ON(num_refs < refs_to_drop);
    630	num_refs -= refs_to_drop;
    631
    632	if (num_refs == 0) {
    633		ret = btrfs_del_item(trans, root, path);
    634	} else {
    635		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
    636			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
    637		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
    638			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
    639		btrfs_mark_buffer_dirty(leaf);
    640	}
    641	return ret;
    642}
    643
    644static noinline u32 extent_data_ref_count(struct btrfs_path *path,
    645					  struct btrfs_extent_inline_ref *iref)
    646{
    647	struct btrfs_key key;
    648	struct extent_buffer *leaf;
    649	struct btrfs_extent_data_ref *ref1;
    650	struct btrfs_shared_data_ref *ref2;
    651	u32 num_refs = 0;
    652	int type;
    653
    654	leaf = path->nodes[0];
    655	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    656
    657	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
    658	if (iref) {
    659		/*
    660		 * If type is invalid, we should have bailed out earlier than
    661		 * this call.
    662		 */
    663		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
    664		ASSERT(type != BTRFS_REF_TYPE_INVALID);
    665		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
    666			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
    667			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
    668		} else {
    669			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
    670			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
    671		}
    672	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
    673		ref1 = btrfs_item_ptr(leaf, path->slots[0],
    674				      struct btrfs_extent_data_ref);
    675		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
    676	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
    677		ref2 = btrfs_item_ptr(leaf, path->slots[0],
    678				      struct btrfs_shared_data_ref);
    679		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
    680	} else {
    681		WARN_ON(1);
    682	}
    683	return num_refs;
    684}
    685
    686static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
    687					  struct btrfs_path *path,
    688					  u64 bytenr, u64 parent,
    689					  u64 root_objectid)
    690{
    691	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
    692	struct btrfs_key key;
    693	int ret;
    694
    695	key.objectid = bytenr;
    696	if (parent) {
    697		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
    698		key.offset = parent;
    699	} else {
    700		key.type = BTRFS_TREE_BLOCK_REF_KEY;
    701		key.offset = root_objectid;
    702	}
    703
    704	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    705	if (ret > 0)
    706		ret = -ENOENT;
    707	return ret;
    708}
    709
    710static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
    711					  struct btrfs_path *path,
    712					  u64 bytenr, u64 parent,
    713					  u64 root_objectid)
    714{
    715	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
    716	struct btrfs_key key;
    717	int ret;
    718
    719	key.objectid = bytenr;
    720	if (parent) {
    721		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
    722		key.offset = parent;
    723	} else {
    724		key.type = BTRFS_TREE_BLOCK_REF_KEY;
    725		key.offset = root_objectid;
    726	}
    727
    728	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
    729	btrfs_release_path(path);
    730	return ret;
    731}
    732
    733static inline int extent_ref_type(u64 parent, u64 owner)
    734{
    735	int type;
    736	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
    737		if (parent > 0)
    738			type = BTRFS_SHARED_BLOCK_REF_KEY;
    739		else
    740			type = BTRFS_TREE_BLOCK_REF_KEY;
    741	} else {
    742		if (parent > 0)
    743			type = BTRFS_SHARED_DATA_REF_KEY;
    744		else
    745			type = BTRFS_EXTENT_DATA_REF_KEY;
    746	}
    747	return type;
    748}
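/*
 * Summary of the mapping implemented by extent_ref_type() above:
 *
 *   metadata (owner < BTRFS_FIRST_FREE_OBJECTID), parent set -> BTRFS_SHARED_BLOCK_REF_KEY
 *   metadata, no parent                                      -> BTRFS_TREE_BLOCK_REF_KEY
 *   data (owner >= BTRFS_FIRST_FREE_OBJECTID), parent set    -> BTRFS_SHARED_DATA_REF_KEY
 *   data, no parent                                          -> BTRFS_EXTENT_DATA_REF_KEY
 */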
    749
    750static int find_next_key(struct btrfs_path *path, int level,
    751			 struct btrfs_key *key)
    752
    753{
    754	for (; level < BTRFS_MAX_LEVEL; level++) {
    755		if (!path->nodes[level])
    756			break;
    757		if (path->slots[level] + 1 >=
    758		    btrfs_header_nritems(path->nodes[level]))
    759			continue;
    760		if (level == 0)
    761			btrfs_item_key_to_cpu(path->nodes[level], key,
    762					      path->slots[level] + 1);
    763		else
    764			btrfs_node_key_to_cpu(path->nodes[level], key,
    765					      path->slots[level] + 1);
    766		return 0;
    767	}
    768	return 1;
    769}
    770
    771/*
    772 * look for inline back ref. if back ref is found, *ref_ret is set
    773 * to the address of inline back ref, and 0 is returned.
    774 *
    775 * if back ref isn't found, *ref_ret is set to the address where it
    776 * should be inserted, and -ENOENT is returned.
    777 *
    778 * if insert is true and there are too many inline back refs, the path
    779 * points to the extent item, and -EAGAIN is returned.
    780 *
    781 * NOTE: inline back refs are ordered in the same way that back ref
    782 *	 items in the tree are ordered.
    783 */
    784static noinline_for_stack
    785int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
    786				 struct btrfs_path *path,
    787				 struct btrfs_extent_inline_ref **ref_ret,
    788				 u64 bytenr, u64 num_bytes,
    789				 u64 parent, u64 root_objectid,
    790				 u64 owner, u64 offset, int insert)
    791{
    792	struct btrfs_fs_info *fs_info = trans->fs_info;
    793	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
    794	struct btrfs_key key;
    795	struct extent_buffer *leaf;
    796	struct btrfs_extent_item *ei;
    797	struct btrfs_extent_inline_ref *iref;
    798	u64 flags;
    799	u64 item_size;
    800	unsigned long ptr;
    801	unsigned long end;
    802	int extra_size;
    803	int type;
    804	int want;
    805	int ret;
    806	int err = 0;
    807	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
    808	int needed;
    809
    810	key.objectid = bytenr;
    811	key.type = BTRFS_EXTENT_ITEM_KEY;
    812	key.offset = num_bytes;
    813
    814	want = extent_ref_type(parent, owner);
    815	if (insert) {
    816		extra_size = btrfs_extent_inline_ref_size(want);
    817		path->search_for_extension = 1;
    818		path->keep_locks = 1;
    819	} else
    820		extra_size = -1;
    821
    822	/*
    823	 * Owner is our level, so we can just add one to get the level for the
    824	 * block we are interested in.
    825	 */
    826	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
    827		key.type = BTRFS_METADATA_ITEM_KEY;
    828		key.offset = owner;
    829	}
    830
    831again:
    832	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
    833	if (ret < 0) {
    834		err = ret;
    835		goto out;
    836	}
    837
    838	/*
    839	 * We may be a newly converted file system which still has the old fat
    840	 * extent entries for metadata, so try and see if we have one of those.
    841	 */
    842	if (ret > 0 && skinny_metadata) {
    843		skinny_metadata = false;
    844		if (path->slots[0]) {
    845			path->slots[0]--;
    846			btrfs_item_key_to_cpu(path->nodes[0], &key,
    847					      path->slots[0]);
    848			if (key.objectid == bytenr &&
    849			    key.type == BTRFS_EXTENT_ITEM_KEY &&
    850			    key.offset == num_bytes)
    851				ret = 0;
    852		}
    853		if (ret) {
    854			key.objectid = bytenr;
    855			key.type = BTRFS_EXTENT_ITEM_KEY;
    856			key.offset = num_bytes;
    857			btrfs_release_path(path);
    858			goto again;
    859		}
    860	}
    861
    862	if (ret && !insert) {
    863		err = -ENOENT;
    864		goto out;
    865	} else if (WARN_ON(ret)) {
    866		err = -EIO;
    867		goto out;
    868	}
    869
    870	leaf = path->nodes[0];
    871	item_size = btrfs_item_size(leaf, path->slots[0]);
    872	if (unlikely(item_size < sizeof(*ei))) {
    873		err = -EINVAL;
    874		btrfs_print_v0_err(fs_info);
    875		btrfs_abort_transaction(trans, err);
    876		goto out;
    877	}
    878
    879	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
    880	flags = btrfs_extent_flags(leaf, ei);
    881
    882	ptr = (unsigned long)(ei + 1);
    883	end = (unsigned long)ei + item_size;
    884
    885	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
    886		ptr += sizeof(struct btrfs_tree_block_info);
    887		BUG_ON(ptr > end);
    888	}
    889
    890	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
    891		needed = BTRFS_REF_TYPE_DATA;
    892	else
    893		needed = BTRFS_REF_TYPE_BLOCK;
    894
    895	err = -ENOENT;
    896	while (1) {
    897		if (ptr >= end) {
    898			if (ptr > end) {
    899				err = -EUCLEAN;
    900				btrfs_print_leaf(path->nodes[0]);
    901				btrfs_crit(fs_info,
    902"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
    903					path->slots[0], root_objectid, owner, offset, parent);
    904			}
    905			break;
    906		}
    907		iref = (struct btrfs_extent_inline_ref *)ptr;
    908		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
    909		if (type == BTRFS_REF_TYPE_INVALID) {
    910			err = -EUCLEAN;
    911			goto out;
    912		}
    913
    914		if (want < type)
    915			break;
    916		if (want > type) {
    917			ptr += btrfs_extent_inline_ref_size(type);
    918			continue;
    919		}
    920
    921		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
    922			struct btrfs_extent_data_ref *dref;
    923			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
    924			if (match_extent_data_ref(leaf, dref, root_objectid,
    925						  owner, offset)) {
    926				err = 0;
    927				break;
    928			}
    929			if (hash_extent_data_ref_item(leaf, dref) <
    930			    hash_extent_data_ref(root_objectid, owner, offset))
    931				break;
    932		} else {
    933			u64 ref_offset;
    934			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
    935			if (parent > 0) {
    936				if (parent == ref_offset) {
    937					err = 0;
    938					break;
    939				}
    940				if (ref_offset < parent)
    941					break;
    942			} else {
    943				if (root_objectid == ref_offset) {
    944					err = 0;
    945					break;
    946				}
    947				if (ref_offset < root_objectid)
    948					break;
    949			}
    950		}
    951		ptr += btrfs_extent_inline_ref_size(type);
    952	}
    953	if (err == -ENOENT && insert) {
    954		if (item_size + extra_size >=
    955		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
    956			err = -EAGAIN;
    957			goto out;
    958		}
    959		/*
    960		 * To add a new inline back ref, we have to make sure
    961		 * there is no corresponding back ref item.
    962		 * For simplicity, we just do not add a new inline back
    963		 * ref if there is any kind of item for this block.
    964		 */
    965		if (find_next_key(path, 0, &key) == 0 &&
    966		    key.objectid == bytenr &&
    967		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
    968			err = -EAGAIN;
    969			goto out;
    970		}
    971	}
    972	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
    973out:
    974	if (insert) {
    975		path->keep_locks = 0;
    976		path->search_for_extension = 0;
    977		btrfs_unlock_up_safe(path, 1);
    978	}
    979	return err;
    980}
    981
    982/*
    983 * helper to add new inline back ref
    984 */
    985static noinline_for_stack
    986void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
    987				 struct btrfs_path *path,
    988				 struct btrfs_extent_inline_ref *iref,
    989				 u64 parent, u64 root_objectid,
    990				 u64 owner, u64 offset, int refs_to_add,
    991				 struct btrfs_delayed_extent_op *extent_op)
    992{
    993	struct extent_buffer *leaf;
    994	struct btrfs_extent_item *ei;
    995	unsigned long ptr;
    996	unsigned long end;
    997	unsigned long item_offset;
    998	u64 refs;
    999	int size;
   1000	int type;
   1001
   1002	leaf = path->nodes[0];
   1003	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
   1004	item_offset = (unsigned long)iref - (unsigned long)ei;
   1005
   1006	type = extent_ref_type(parent, owner);
   1007	size = btrfs_extent_inline_ref_size(type);
   1008
   1009	btrfs_extend_item(path, size);
   1010
   1011	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
   1012	refs = btrfs_extent_refs(leaf, ei);
   1013	refs += refs_to_add;
   1014	btrfs_set_extent_refs(leaf, ei, refs);
   1015	if (extent_op)
   1016		__run_delayed_extent_op(extent_op, leaf, ei);
   1017
   1018	ptr = (unsigned long)ei + item_offset;
   1019	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
   1020	if (ptr < end - size)
   1021		memmove_extent_buffer(leaf, ptr + size, ptr,
   1022				      end - size - ptr);
   1023
   1024	iref = (struct btrfs_extent_inline_ref *)ptr;
   1025	btrfs_set_extent_inline_ref_type(leaf, iref, type);
   1026	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
   1027		struct btrfs_extent_data_ref *dref;
   1028		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
   1029		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
   1030		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
   1031		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
   1032		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
   1033	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
   1034		struct btrfs_shared_data_ref *sref;
   1035		sref = (struct btrfs_shared_data_ref *)(iref + 1);
   1036		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
   1037		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
   1038	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
   1039		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
   1040	} else {
   1041		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
   1042	}
   1043	btrfs_mark_buffer_dirty(leaf);
   1044}
   1045
   1046static int lookup_extent_backref(struct btrfs_trans_handle *trans,
   1047				 struct btrfs_path *path,
   1048				 struct btrfs_extent_inline_ref **ref_ret,
   1049				 u64 bytenr, u64 num_bytes, u64 parent,
   1050				 u64 root_objectid, u64 owner, u64 offset)
   1051{
   1052	int ret;
   1053
   1054	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
   1055					   num_bytes, parent, root_objectid,
   1056					   owner, offset, 0);
   1057	if (ret != -ENOENT)
   1058		return ret;
   1059
   1060	btrfs_release_path(path);
   1061	*ref_ret = NULL;
   1062
   1063	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
   1064		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
   1065					    root_objectid);
   1066	} else {
   1067		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
   1068					     root_objectid, owner, offset);
   1069	}
   1070	return ret;
   1071}
   1072
   1073/*
   1074 * helper to update/remove inline back ref
   1075 */
   1076static noinline_for_stack
   1077void update_inline_extent_backref(struct btrfs_path *path,
   1078				  struct btrfs_extent_inline_ref *iref,
   1079				  int refs_to_mod,
   1080				  struct btrfs_delayed_extent_op *extent_op)
   1081{
   1082	struct extent_buffer *leaf = path->nodes[0];
   1083	struct btrfs_extent_item *ei;
   1084	struct btrfs_extent_data_ref *dref = NULL;
   1085	struct btrfs_shared_data_ref *sref = NULL;
   1086	unsigned long ptr;
   1087	unsigned long end;
   1088	u32 item_size;
   1089	int size;
   1090	int type;
   1091	u64 refs;
   1092
   1093	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
   1094	refs = btrfs_extent_refs(leaf, ei);
   1095	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
   1096	refs += refs_to_mod;
   1097	btrfs_set_extent_refs(leaf, ei, refs);
   1098	if (extent_op)
   1099		__run_delayed_extent_op(extent_op, leaf, ei);
   1100
   1101	/*
   1102	 * If type is invalid, we should have bailed out after
   1103	 * lookup_inline_extent_backref().
   1104	 */
   1105	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
   1106	ASSERT(type != BTRFS_REF_TYPE_INVALID);
   1107
   1108	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
   1109		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
   1110		refs = btrfs_extent_data_ref_count(leaf, dref);
   1111	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
   1112		sref = (struct btrfs_shared_data_ref *)(iref + 1);
   1113		refs = btrfs_shared_data_ref_count(leaf, sref);
   1114	} else {
   1115		refs = 1;
   1116		BUG_ON(refs_to_mod != -1);
   1117	}
   1118
   1119	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
   1120	refs += refs_to_mod;
   1121
   1122	if (refs > 0) {
   1123		if (type == BTRFS_EXTENT_DATA_REF_KEY)
   1124			btrfs_set_extent_data_ref_count(leaf, dref, refs);
   1125		else
   1126			btrfs_set_shared_data_ref_count(leaf, sref, refs);
   1127	} else {
   1128		size =  btrfs_extent_inline_ref_size(type);
   1129		item_size = btrfs_item_size(leaf, path->slots[0]);
   1130		ptr = (unsigned long)iref;
   1131		end = (unsigned long)ei + item_size;
   1132		if (ptr + size < end)
   1133			memmove_extent_buffer(leaf, ptr, ptr + size,
   1134					      end - ptr - size);
   1135		item_size -= size;
   1136		btrfs_truncate_item(path, item_size, 1);
   1137	}
   1138	btrfs_mark_buffer_dirty(leaf);
   1139}
   1140
   1141static noinline_for_stack
   1142int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
   1143				 struct btrfs_path *path,
   1144				 u64 bytenr, u64 num_bytes, u64 parent,
   1145				 u64 root_objectid, u64 owner,
   1146				 u64 offset, int refs_to_add,
   1147				 struct btrfs_delayed_extent_op *extent_op)
   1148{
   1149	struct btrfs_extent_inline_ref *iref;
   1150	int ret;
   1151
   1152	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
   1153					   num_bytes, parent, root_objectid,
   1154					   owner, offset, 1);
   1155	if (ret == 0) {
   1156		/*
   1157		 * We're adding refs to a tree block we already own; this
   1158		 * should not happen at all.
   1159		 */
   1160		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
   1161			btrfs_crit(trans->fs_info,
   1162"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
   1163				   bytenr, num_bytes, root_objectid);
   1164			if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
   1165				WARN_ON(1);
   1166				btrfs_crit(trans->fs_info,
   1167			"path->slots[0]=%d path->nodes[0]:", path->slots[0]);
   1168				btrfs_print_leaf(path->nodes[0]);
   1169			}
   1170			return -EUCLEAN;
   1171		}
   1172		update_inline_extent_backref(path, iref, refs_to_add, extent_op);
   1173	} else if (ret == -ENOENT) {
   1174		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
   1175					    root_objectid, owner, offset,
   1176					    refs_to_add, extent_op);
   1177		ret = 0;
   1178	}
   1179	return ret;
   1180}
   1181
   1182static int remove_extent_backref(struct btrfs_trans_handle *trans,
   1183				 struct btrfs_root *root,
   1184				 struct btrfs_path *path,
   1185				 struct btrfs_extent_inline_ref *iref,
   1186				 int refs_to_drop, int is_data)
   1187{
   1188	int ret = 0;
   1189
   1190	BUG_ON(!is_data && refs_to_drop != 1);
   1191	if (iref)
   1192		update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
   1193	else if (is_data)
   1194		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
   1195	else
   1196		ret = btrfs_del_item(trans, root, path);
   1197	return ret;
   1198}
   1199
   1200static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
   1201			       u64 *discarded_bytes)
   1202{
   1203	int j, ret = 0;
   1204	u64 bytes_left, end;
   1205	u64 aligned_start = ALIGN(start, 1 << 9);
   1206
   1207	if (WARN_ON(start != aligned_start)) {
   1208		len -= aligned_start - start;
   1209		len = round_down(len, 1 << 9);
   1210		start = aligned_start;
   1211	}
   1212
   1213	*discarded_bytes = 0;
   1214
   1215	if (!len)
   1216		return 0;
   1217
   1218	end = start + len;
   1219	bytes_left = len;
   1220
   1221	/* Skip any superblocks on this device. */
   1222	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
   1223		u64 sb_start = btrfs_sb_offset(j);
   1224		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
   1225		u64 size = sb_start - start;
   1226
   1227		if (!in_range(sb_start, start, bytes_left) &&
   1228		    !in_range(sb_end, start, bytes_left) &&
   1229		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
   1230			continue;
   1231
   1232		/*
   1233		 * Superblock spans beginning of range.  Adjust start and
   1234		 * try again.
   1235		 */
   1236		if (sb_start <= start) {
   1237			start += sb_end - start;
   1238			if (start > end) {
   1239				bytes_left = 0;
   1240				break;
   1241			}
   1242			bytes_left = end - start;
   1243			continue;
   1244		}
   1245
   1246		if (size) {
   1247			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
   1248						   GFP_NOFS);
   1249			if (!ret)
   1250				*discarded_bytes += size;
   1251			else if (ret != -EOPNOTSUPP)
   1252				return ret;
   1253		}
   1254
   1255		start = sb_end;
   1256		if (start > end) {
   1257			bytes_left = 0;
   1258			break;
   1259		}
   1260		bytes_left = end - start;
   1261	}
   1262
   1263	if (bytes_left) {
   1264		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
   1265					   GFP_NOFS);
   1266		if (!ret)
   1267			*discarded_bytes += bytes_left;
   1268	}
   1269	return ret;
   1270}
   1271
   1272static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
   1273{
   1274	struct btrfs_device *dev = stripe->dev;
   1275	struct btrfs_fs_info *fs_info = dev->fs_info;
   1276	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   1277	u64 phys = stripe->physical;
   1278	u64 len = stripe->length;
   1279	u64 discarded = 0;
   1280	int ret = 0;
   1281
   1282	/* Zone reset on a zoned filesystem */
   1283	if (btrfs_can_zone_reset(dev, phys, len)) {
   1284		u64 src_disc;
   1285
   1286		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
   1287		if (ret)
   1288			goto out;
   1289
   1290		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
   1291		    dev != dev_replace->srcdev)
   1292			goto out;
   1293
   1294		src_disc = discarded;
   1295
   1296		/* Send to replace target as well */
   1297		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
   1298					      &discarded);
   1299		discarded += src_disc;
   1300	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
   1301		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
   1302	} else {
   1303		ret = 0;
   1304		*bytes = 0;
   1305	}
   1306
   1307out:
   1308	*bytes = discarded;
   1309	return ret;
   1310}
   1311
   1312int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
   1313			 u64 num_bytes, u64 *actual_bytes)
   1314{
   1315	int ret = 0;
   1316	u64 discarded_bytes = 0;
   1317	u64 end = bytenr + num_bytes;
   1318	u64 cur = bytenr;
   1319	struct btrfs_io_context *bioc = NULL;
   1320
   1321	/*
   1322	 * Avoid races with device replace and make sure our bioc has devices
   1323	 * associated with its stripes that don't go away while we are discarding.
   1324	 */
   1325	btrfs_bio_counter_inc_blocked(fs_info);
   1326	while (cur < end) {
   1327		struct btrfs_io_stripe *stripe;
   1328		int i;
   1329
   1330		num_bytes = end - cur;
   1331		/* Tell the block device(s) that the sectors can be discarded */
   1332		ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
   1333				      &num_bytes, &bioc, 0);
   1334		/*
   1335		 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
   1336		 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
   1337		 * thus we can't continue anyway.
   1338		 */
   1339		if (ret < 0)
   1340			goto out;
   1341
   1342		stripe = bioc->stripes;
   1343		for (i = 0; i < bioc->num_stripes; i++, stripe++) {
   1344			u64 bytes;
   1345			struct btrfs_device *device = stripe->dev;
   1346
   1347			if (!device->bdev) {
   1348				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
   1349				continue;
   1350			}
   1351
   1352			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
   1353				continue;
   1354
   1355			ret = do_discard_extent(stripe, &bytes);
   1356			if (!ret) {
   1357				discarded_bytes += bytes;
   1358			} else if (ret != -EOPNOTSUPP) {
   1359				/*
   1360				 * Logic errors, -ENOMEM, or -EIO, but these
   1361				 * are unlikely to happen.
   1362				 *
   1363				 * And since there are two loops, explicitly
   1364				 * go to out to avoid confusion.
   1365				 */
   1366				btrfs_put_bioc(bioc);
   1367				goto out;
   1368			}
   1369
   1370			/*
   1371			 * In case we get back -EOPNOTSUPP for some reason,
   1372			 * just ignore the return value so we don't screw up
   1373			 * people calling discard_extent.
   1374			 */
   1375			ret = 0;
   1376		}
   1377		btrfs_put_bioc(bioc);
   1378		cur += num_bytes;
   1379	}
   1380out:
   1381	btrfs_bio_counter_dec(fs_info);
   1382
   1383	if (actual_bytes)
   1384		*actual_bytes = discarded_bytes;
   1385
   1386
   1387	if (ret == -EOPNOTSUPP)
   1388		ret = 0;
   1389	return ret;
   1390}
   1391
   1392/* Can return -ENOMEM */
   1393int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
   1394			 struct btrfs_ref *generic_ref)
   1395{
   1396	struct btrfs_fs_info *fs_info = trans->fs_info;
   1397	int ret;
   1398
   1399	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
   1400	       generic_ref->action);
   1401	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
   1402	       generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
   1403
   1404	if (generic_ref->type == BTRFS_REF_METADATA)
   1405		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
   1406	else
   1407		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
   1408
   1409	btrfs_ref_tree_mod(fs_info, generic_ref);
   1410
   1411	return ret;
   1412}
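/*
 * Note on btrfs_inc_extent_ref() above: it does not touch the extent tree
 * directly. It only queues a delayed ref (tree or data, depending on
 * generic_ref->type) and records the modification for ref-verify; the
 * actual extent item update happens later when the delayed refs are run.
 */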
   1413
   1414/*
   1415 * __btrfs_inc_extent_ref - insert backreference for a given extent
   1416 *
   1417 * The counterpart is in __btrfs_free_extent(), with examples and more details
   1418 * how it works.
   1419 *
   1420 * @trans:	    Handle of transaction
   1421 *
   1422 * @node:	    The delayed ref node used to get the bytenr/length for
   1423 *		    extent whose references are incremented.
   1424 *
   1425 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
   1426 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
   1427 *		    bytenr of the parent block. Since new extents are always
   1428 *		    created with indirect references, this will only be the case
   1429 *		    when relocating a shared extent. In that case, root_objectid
   1430 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
   1431 *		    be 0
   1432 *
   1433 * @root_objectid:  The id of the root where this modification has originated,
   1434 *		    this can be either one of the well-known metadata trees or
   1435 *		    the subvolume id which references this extent.
   1436 *
   1437 * @owner:	    For data extents it is the inode number of the owning file.
   1438 *		    For metadata extents this parameter holds the level in the
   1439 *		    tree of the extent.
   1440 *
   1441 * @offset:	    For metadata extents the offset is ignored and is currently
   1442 *		    always passed as 0. For data extents it is the fileoffset
   1443 *		    this extent belongs to.
   1444 *
   1445 * @refs_to_add:    Number of references to add
   1446 *
   1447 * @extent_op:      Pointer to a structure holding information necessary when
   1448 *                  updating a tree block's flags
   1449 *
   1450 */
   1451static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
   1452				  struct btrfs_delayed_ref_node *node,
   1453				  u64 parent, u64 root_objectid,
   1454				  u64 owner, u64 offset, int refs_to_add,
   1455				  struct btrfs_delayed_extent_op *extent_op)
   1456{
   1457	struct btrfs_path *path;
   1458	struct extent_buffer *leaf;
   1459	struct btrfs_extent_item *item;
   1460	struct btrfs_key key;
   1461	u64 bytenr = node->bytenr;
   1462	u64 num_bytes = node->num_bytes;
   1463	u64 refs;
   1464	int ret;
   1465
   1466	path = btrfs_alloc_path();
   1467	if (!path)
   1468		return -ENOMEM;
   1469
   1470	/* this will set up the path even if it fails to insert the back ref */
   1471	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
   1472					   parent, root_objectid, owner,
   1473					   offset, refs_to_add, extent_op);
   1474	if ((ret < 0 && ret != -EAGAIN) || !ret)
   1475		goto out;
   1476
   1477	/*
   1478	 * Ok we had -EAGAIN which means we didn't have space to insert an
   1479	 * inline extent ref, so just update the reference count and add a
   1480	 * normal backref.
   1481	 */
   1482	leaf = path->nodes[0];
   1483	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   1484	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
   1485	refs = btrfs_extent_refs(leaf, item);
   1486	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
   1487	if (extent_op)
   1488		__run_delayed_extent_op(extent_op, leaf, item);
   1489
   1490	btrfs_mark_buffer_dirty(leaf);
   1491	btrfs_release_path(path);
   1492
   1493	/* now insert the actual backref */
   1494	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
   1495		BUG_ON(refs_to_add != 1);
   1496		ret = insert_tree_block_ref(trans, path, bytenr, parent,
   1497					    root_objectid);
   1498	} else {
   1499		ret = insert_extent_data_ref(trans, path, bytenr, parent,
   1500					     root_objectid, owner, offset,
   1501					     refs_to_add);
   1502	}
   1503	if (ret)
   1504		btrfs_abort_transaction(trans, ret);
   1505out:
   1506	btrfs_free_path(path);
   1507	return ret;
   1508}
   1509
   1510static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
   1511				struct btrfs_delayed_ref_node *node,
   1512				struct btrfs_delayed_extent_op *extent_op,
   1513				int insert_reserved)
   1514{
   1515	int ret = 0;
   1516	struct btrfs_delayed_data_ref *ref;
   1517	struct btrfs_key ins;
   1518	u64 parent = 0;
   1519	u64 ref_root = 0;
   1520	u64 flags = 0;
   1521
   1522	ins.objectid = node->bytenr;
   1523	ins.offset = node->num_bytes;
   1524	ins.type = BTRFS_EXTENT_ITEM_KEY;
   1525
   1526	ref = btrfs_delayed_node_to_data_ref(node);
   1527	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
   1528
   1529	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
   1530		parent = ref->parent;
   1531	ref_root = ref->root;
   1532
   1533	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
   1534		if (extent_op)
   1535			flags |= extent_op->flags_to_set;
   1536		ret = alloc_reserved_file_extent(trans, parent, ref_root,
   1537						 flags, ref->objectid,
   1538						 ref->offset, &ins,
   1539						 node->ref_mod);
   1540	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
   1541		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
   1542					     ref->objectid, ref->offset,
   1543					     node->ref_mod, extent_op);
   1544	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
   1545		ret = __btrfs_free_extent(trans, node, parent,
   1546					  ref_root, ref->objectid,
   1547					  ref->offset, node->ref_mod,
   1548					  extent_op);
   1549	} else {
   1550		BUG();
   1551	}
   1552	return ret;
   1553}
   1554
   1555static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
   1556				    struct extent_buffer *leaf,
   1557				    struct btrfs_extent_item *ei)
   1558{
   1559	u64 flags = btrfs_extent_flags(leaf, ei);
   1560	if (extent_op->update_flags) {
   1561		flags |= extent_op->flags_to_set;
   1562		btrfs_set_extent_flags(leaf, ei, flags);
   1563	}
   1564
   1565	if (extent_op->update_key) {
   1566		struct btrfs_tree_block_info *bi;
   1567		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
   1568		bi = (struct btrfs_tree_block_info *)(ei + 1);
   1569		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
   1570	}
   1571}
   1572
   1573static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
   1574				 struct btrfs_delayed_ref_head *head,
   1575				 struct btrfs_delayed_extent_op *extent_op)
   1576{
   1577	struct btrfs_fs_info *fs_info = trans->fs_info;
   1578	struct btrfs_root *root;
   1579	struct btrfs_key key;
   1580	struct btrfs_path *path;
   1581	struct btrfs_extent_item *ei;
   1582	struct extent_buffer *leaf;
   1583	u32 item_size;
   1584	int ret;
   1585	int err = 0;
   1586	int metadata = 1;
   1587
   1588	if (TRANS_ABORTED(trans))
   1589		return 0;
   1590
   1591	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
   1592		metadata = 0;
   1593
   1594	path = btrfs_alloc_path();
   1595	if (!path)
   1596		return -ENOMEM;
   1597
   1598	key.objectid = head->bytenr;
   1599
   1600	if (metadata) {
   1601		key.type = BTRFS_METADATA_ITEM_KEY;
   1602		key.offset = extent_op->level;
   1603	} else {
   1604		key.type = BTRFS_EXTENT_ITEM_KEY;
   1605		key.offset = head->num_bytes;
   1606	}
   1607
   1608	root = btrfs_extent_root(fs_info, key.objectid);
   1609again:
   1610	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
   1611	if (ret < 0) {
   1612		err = ret;
   1613		goto out;
   1614	}
   1615	if (ret > 0) {
   1616		if (metadata) {
   1617			if (path->slots[0] > 0) {
   1618				path->slots[0]--;
   1619				btrfs_item_key_to_cpu(path->nodes[0], &key,
   1620						      path->slots[0]);
   1621				if (key.objectid == head->bytenr &&
   1622				    key.type == BTRFS_EXTENT_ITEM_KEY &&
   1623				    key.offset == head->num_bytes)
   1624					ret = 0;
   1625			}
   1626			if (ret > 0) {
   1627				btrfs_release_path(path);
   1628				metadata = 0;
   1629
   1630				key.objectid = head->bytenr;
   1631				key.offset = head->num_bytes;
   1632				key.type = BTRFS_EXTENT_ITEM_KEY;
   1633				goto again;
   1634			}
   1635		} else {
   1636			err = -EIO;
   1637			goto out;
   1638		}
   1639	}
   1640
   1641	leaf = path->nodes[0];
   1642	item_size = btrfs_item_size(leaf, path->slots[0]);
   1643
   1644	if (unlikely(item_size < sizeof(*ei))) {
   1645		err = -EINVAL;
   1646		btrfs_print_v0_err(fs_info);
   1647		btrfs_abort_transaction(trans, err);
   1648		goto out;
   1649	}
   1650
   1651	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
   1652	__run_delayed_extent_op(extent_op, leaf, ei);
   1653
   1654	btrfs_mark_buffer_dirty(leaf);
   1655out:
   1656	btrfs_free_path(path);
   1657	return err;
   1658}
   1659
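/*
 * Process a single delayed ref for a tree block.  A tree block ref node is
 * expected to carry exactly one reference, anything else is rejected with
 * -EIO.  Depending on the action this inserts the reserved extent, adds a
 * backref or drops one.
 */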
   1660static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
   1661				struct btrfs_delayed_ref_node *node,
   1662				struct btrfs_delayed_extent_op *extent_op,
   1663				int insert_reserved)
   1664{
   1665	int ret = 0;
   1666	struct btrfs_delayed_tree_ref *ref;
   1667	u64 parent = 0;
   1668	u64 ref_root = 0;
   1669
   1670	ref = btrfs_delayed_node_to_tree_ref(node);
   1671	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
   1672
   1673	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
   1674		parent = ref->parent;
   1675	ref_root = ref->root;
   1676
   1677	if (node->ref_mod != 1) {
   1678		btrfs_err(trans->fs_info,
   1679	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
   1680			  node->bytenr, node->ref_mod, node->action, ref_root,
   1681			  parent);
   1682		return -EIO;
   1683	}
   1684	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
   1685		BUG_ON(!extent_op || !extent_op->update_flags);
   1686		ret = alloc_reserved_tree_block(trans, node, extent_op);
   1687	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
   1688		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
   1689					     ref->level, 0, 1, extent_op);
   1690	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
   1691		ret = __btrfs_free_extent(trans, node, parent, ref_root,
   1692					  ref->level, 0, 1, extent_op);
   1693	} else {
   1694		BUG();
   1695	}
   1696	return ret;
   1697}
   1698
   1699/* helper function to actually process a single delayed ref entry */
   1700static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
   1701			       struct btrfs_delayed_ref_node *node,
   1702			       struct btrfs_delayed_extent_op *extent_op,
   1703			       int insert_reserved)
   1704{
   1705	int ret = 0;
   1706
   1707	if (TRANS_ABORTED(trans)) {
   1708		if (insert_reserved)
   1709			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
   1710		return 0;
   1711	}
   1712
   1713	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
   1714	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
   1715		ret = run_delayed_tree_ref(trans, node, extent_op,
   1716					   insert_reserved);
   1717	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
   1718		 node->type == BTRFS_SHARED_DATA_REF_KEY)
   1719		ret = run_delayed_data_ref(trans, node, extent_op,
   1720					   insert_reserved);
   1721	else
   1722		BUG();
   1723	if (ret && insert_reserved)
   1724		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
   1725	return ret;
   1726}
   1727
   1728static inline struct btrfs_delayed_ref_node *
   1729select_delayed_ref(struct btrfs_delayed_ref_head *head)
   1730{
   1731	struct btrfs_delayed_ref_node *ref;
   1732
   1733	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
   1734		return NULL;
   1735
   1736	/*
   1737	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
    1738	 * This is to prevent a ref count from going down to zero while there
    1739	 * are still references to add: that would delete the extent item from
    1740	 * the extent tree, and the pending additions would then fail to find it.
   1741	 */
   1742	if (!list_empty(&head->ref_add_list))
   1743		return list_first_entry(&head->ref_add_list,
   1744				struct btrfs_delayed_ref_node, add_list);
   1745
   1746	ref = rb_entry(rb_first_cached(&head->ref_tree),
   1747		       struct btrfs_delayed_ref_node, ref_node);
   1748	ASSERT(list_empty(&ref->add_list));
   1749	return ref;
   1750}
   1751
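/*
 * Hand a ref head back to the delayed ref root after a failed or deferred
 * attempt to process it, so that a later run can pick it up again.
 */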
   1752static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
   1753				      struct btrfs_delayed_ref_head *head)
   1754{
   1755	spin_lock(&delayed_refs->lock);
   1756	head->processing = 0;
   1757	delayed_refs->num_heads_ready++;
   1758	spin_unlock(&delayed_refs->lock);
   1759	btrfs_delayed_ref_unlock(head);
   1760}
   1761
   1762static struct btrfs_delayed_extent_op *cleanup_extent_op(
   1763				struct btrfs_delayed_ref_head *head)
   1764{
   1765	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
   1766
   1767	if (!extent_op)
   1768		return NULL;
   1769
   1770	if (head->must_insert_reserved) {
   1771		head->extent_op = NULL;
   1772		btrfs_free_delayed_extent_op(extent_op);
   1773		return NULL;
   1774	}
   1775	return extent_op;
   1776}
   1777
   1778static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
   1779				     struct btrfs_delayed_ref_head *head)
   1780{
   1781	struct btrfs_delayed_extent_op *extent_op;
   1782	int ret;
   1783
   1784	extent_op = cleanup_extent_op(head);
   1785	if (!extent_op)
   1786		return 0;
   1787	head->extent_op = NULL;
   1788	spin_unlock(&head->lock);
   1789	ret = run_delayed_extent_op(trans, head, extent_op);
   1790	btrfs_free_delayed_extent_op(extent_op);
   1791	return ret ? ret : 1;
   1792}
   1793
   1794void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
   1795				  struct btrfs_delayed_ref_root *delayed_refs,
   1796				  struct btrfs_delayed_ref_head *head)
   1797{
   1798	int nr_items = 1;	/* Dropping this ref head update. */
   1799
   1800	/*
   1801	 * We had csum deletions accounted for in our delayed refs rsv, we need
   1802	 * to drop the csum leaves for this update from our delayed_refs_rsv.
   1803	 */
   1804	if (head->total_ref_mod < 0 && head->is_data) {
   1805		spin_lock(&delayed_refs->lock);
   1806		delayed_refs->pending_csums -= head->num_bytes;
   1807		spin_unlock(&delayed_refs->lock);
   1808		nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
   1809	}
   1810
   1811	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
   1812}
   1813
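/*
 * Final stage of processing a ref head once its individual refs have been
 * run: apply any remaining extent op, re-check under the delayed refs lock
 * that no new refs were added, remove the head from the tree, pin the extent
 * (and delete its csums for data) when must_insert_reserved is set, and
 * release the delayed refs rsv accounting.
 */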
   1814static int cleanup_ref_head(struct btrfs_trans_handle *trans,
   1815			    struct btrfs_delayed_ref_head *head)
   1816{
   1817
   1818	struct btrfs_fs_info *fs_info = trans->fs_info;
   1819	struct btrfs_delayed_ref_root *delayed_refs;
   1820	int ret;
   1821
   1822	delayed_refs = &trans->transaction->delayed_refs;
   1823
   1824	ret = run_and_cleanup_extent_op(trans, head);
   1825	if (ret < 0) {
   1826		unselect_delayed_ref_head(delayed_refs, head);
   1827		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
   1828		return ret;
   1829	} else if (ret) {
   1830		return ret;
   1831	}
   1832
   1833	/*
   1834	 * Need to drop our head ref lock and re-acquire the delayed ref lock
   1835	 * and then re-check to make sure nobody got added.
   1836	 */
   1837	spin_unlock(&head->lock);
   1838	spin_lock(&delayed_refs->lock);
   1839	spin_lock(&head->lock);
   1840	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
   1841		spin_unlock(&head->lock);
   1842		spin_unlock(&delayed_refs->lock);
   1843		return 1;
   1844	}
   1845	btrfs_delete_ref_head(delayed_refs, head);
   1846	spin_unlock(&head->lock);
   1847	spin_unlock(&delayed_refs->lock);
   1848
   1849	if (head->must_insert_reserved) {
   1850		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
   1851		if (head->is_data) {
   1852			struct btrfs_root *csum_root;
   1853
   1854			csum_root = btrfs_csum_root(fs_info, head->bytenr);
   1855			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
   1856					      head->num_bytes);
   1857		}
   1858	}
   1859
   1860	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
   1861
   1862	trace_run_delayed_ref_head(fs_info, head, 0);
   1863	btrfs_delayed_ref_unlock(head);
   1864	btrfs_put_delayed_ref_head(head);
   1865	return ret;
   1866}
   1867
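/*
 * Pick the next ref head to process and lock it.  Returns NULL when nothing
 * is ready, or ERR_PTR(-EAGAIN) if the head went away while we waited for
 * its mutex and the caller should simply try again.
 */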
   1868static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
   1869					struct btrfs_trans_handle *trans)
   1870{
   1871	struct btrfs_delayed_ref_root *delayed_refs =
   1872		&trans->transaction->delayed_refs;
   1873	struct btrfs_delayed_ref_head *head = NULL;
   1874	int ret;
   1875
   1876	spin_lock(&delayed_refs->lock);
   1877	head = btrfs_select_ref_head(delayed_refs);
   1878	if (!head) {
   1879		spin_unlock(&delayed_refs->lock);
   1880		return head;
   1881	}
   1882
   1883	/*
   1884	 * Grab the lock that says we are going to process all the refs for
   1885	 * this head
   1886	 */
   1887	ret = btrfs_delayed_ref_lock(delayed_refs, head);
   1888	spin_unlock(&delayed_refs->lock);
   1889
   1890	/*
   1891	 * We may have dropped the spin lock to get the head mutex lock, and
   1892	 * that might have given someone else time to free the head.  If that's
   1893	 * true, it has been removed from our list and we can move on.
   1894	 */
   1895	if (ret == -EAGAIN)
   1896		head = ERR_PTR(-EAGAIN);
   1897
   1898	return head;
   1899}
   1900
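/*
 * Run the refs queued on @locked_ref one at a time, always preferring ADD
 * refs over DROP refs (see select_delayed_ref()).  Each ref is removed from
 * the head, the head's ref_mod is corrected accordingly and the ref is
 * handed to run_one_delayed_ref().  Returns -EAGAIN when a ref is still
 * blocked on a tree mod log sequence number, in which case the head is
 * unselected again.
 */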
   1901static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
   1902				    struct btrfs_delayed_ref_head *locked_ref,
   1903				    unsigned long *run_refs)
   1904{
   1905	struct btrfs_fs_info *fs_info = trans->fs_info;
   1906	struct btrfs_delayed_ref_root *delayed_refs;
   1907	struct btrfs_delayed_extent_op *extent_op;
   1908	struct btrfs_delayed_ref_node *ref;
   1909	int must_insert_reserved = 0;
   1910	int ret;
   1911
   1912	delayed_refs = &trans->transaction->delayed_refs;
   1913
   1914	lockdep_assert_held(&locked_ref->mutex);
   1915	lockdep_assert_held(&locked_ref->lock);
   1916
   1917	while ((ref = select_delayed_ref(locked_ref))) {
   1918		if (ref->seq &&
   1919		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
   1920			spin_unlock(&locked_ref->lock);
   1921			unselect_delayed_ref_head(delayed_refs, locked_ref);
   1922			return -EAGAIN;
   1923		}
   1924
   1925		(*run_refs)++;
   1926		ref->in_tree = 0;
   1927		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
   1928		RB_CLEAR_NODE(&ref->ref_node);
   1929		if (!list_empty(&ref->add_list))
   1930			list_del(&ref->add_list);
   1931		/*
   1932		 * When we play the delayed ref, also correct the ref_mod on
   1933		 * head
   1934		 */
   1935		switch (ref->action) {
   1936		case BTRFS_ADD_DELAYED_REF:
   1937		case BTRFS_ADD_DELAYED_EXTENT:
   1938			locked_ref->ref_mod -= ref->ref_mod;
   1939			break;
   1940		case BTRFS_DROP_DELAYED_REF:
   1941			locked_ref->ref_mod += ref->ref_mod;
   1942			break;
   1943		default:
   1944			WARN_ON(1);
   1945		}
   1946		atomic_dec(&delayed_refs->num_entries);
   1947
   1948		/*
   1949		 * Record the must_insert_reserved flag before we drop the
   1950		 * spin lock.
   1951		 */
   1952		must_insert_reserved = locked_ref->must_insert_reserved;
   1953		locked_ref->must_insert_reserved = 0;
   1954
   1955		extent_op = locked_ref->extent_op;
   1956		locked_ref->extent_op = NULL;
   1957		spin_unlock(&locked_ref->lock);
   1958
   1959		ret = run_one_delayed_ref(trans, ref, extent_op,
   1960					  must_insert_reserved);
   1961
   1962		btrfs_free_delayed_extent_op(extent_op);
   1963		if (ret) {
   1964			unselect_delayed_ref_head(delayed_refs, locked_ref);
   1965			btrfs_put_delayed_ref(ref);
   1966			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
   1967				    ret);
   1968			return ret;
   1969		}
   1970
   1971		btrfs_put_delayed_ref(ref);
   1972		cond_resched();
   1973
   1974		spin_lock(&locked_ref->lock);
   1975		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
   1976	}
   1977
   1978	return 0;
   1979}
   1980
   1981/*
   1982 * Returns 0 on success or if called with an already aborted transaction.
   1983 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
   1984 */
   1985static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
   1986					     unsigned long nr)
   1987{
   1988	struct btrfs_fs_info *fs_info = trans->fs_info;
   1989	struct btrfs_delayed_ref_root *delayed_refs;
   1990	struct btrfs_delayed_ref_head *locked_ref = NULL;
   1991	ktime_t start = ktime_get();
   1992	int ret;
   1993	unsigned long count = 0;
   1994	unsigned long actual_count = 0;
   1995
   1996	delayed_refs = &trans->transaction->delayed_refs;
   1997	do {
   1998		if (!locked_ref) {
   1999			locked_ref = btrfs_obtain_ref_head(trans);
   2000			if (IS_ERR_OR_NULL(locked_ref)) {
   2001				if (PTR_ERR(locked_ref) == -EAGAIN) {
   2002					continue;
   2003				} else {
   2004					break;
   2005				}
   2006			}
   2007			count++;
   2008		}
   2009		/*
   2010		 * We need to try and merge add/drops of the same ref since we
   2011		 * can run into issues with relocate dropping the implicit ref
   2012		 * and then it being added back again before the drop can
   2013		 * finish.  If we merged anything we need to re-loop so we can
   2014		 * get a good ref.
   2015		 * Or we can get node references of the same type that weren't
   2016		 * merged when created due to bumps in the tree mod seq, and
   2017		 * we need to merge them to prevent adding an inline extent
   2018		 * backref before dropping it (triggering a BUG_ON at
   2019		 * insert_inline_extent_backref()).
   2020		 */
   2021		spin_lock(&locked_ref->lock);
   2022		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
   2023
   2024		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
   2025						      &actual_count);
   2026		if (ret < 0 && ret != -EAGAIN) {
   2027			/*
   2028			 * Error, btrfs_run_delayed_refs_for_head already
   2029			 * unlocked everything so just bail out
   2030			 */
   2031			return ret;
   2032		} else if (!ret) {
   2033			/*
   2034			 * Success, perform the usual cleanup of a processed
   2035			 * head
   2036			 */
   2037			ret = cleanup_ref_head(trans, locked_ref);
    2038			if (ret > 0) {
   2039				/* We dropped our lock, we need to loop. */
   2040				ret = 0;
   2041				continue;
   2042			} else if (ret) {
   2043				return ret;
   2044			}
   2045		}
   2046
   2047		/*
   2048		 * Either success case or btrfs_run_delayed_refs_for_head
   2049		 * returned -EAGAIN, meaning we need to select another head
   2050		 */
   2051
   2052		locked_ref = NULL;
   2053		cond_resched();
   2054	} while ((nr != -1 && count < nr) || locked_ref);
   2055
   2056	/*
   2057	 * We don't want to include ref heads since we can have empty ref heads
   2058	 * and those will drastically skew our runtime down since we just do
   2059	 * accounting, no actual extent tree updates.
   2060	 */
   2061	if (actual_count > 0) {
   2062		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
   2063		u64 avg;
   2064
   2065		/*
   2066		 * We weigh the current average higher than our current runtime
   2067		 * to avoid large swings in the average.
   2068		 */
   2069		spin_lock(&delayed_refs->lock);
   2070		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
   2071		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
   2072		spin_unlock(&delayed_refs->lock);
   2073	}
   2074	return 0;
   2075}
   2076
   2077#ifdef SCRAMBLE_DELAYED_REFS
   2078/*
   2079 * Normally delayed refs get processed in ascending bytenr order. This
   2080 * correlates in most cases to the order added. To expose dependencies on this
   2081 * order, we start to process the tree in the middle instead of the beginning
   2082 */
   2083static u64 find_middle(struct rb_root *root)
   2084{
   2085	struct rb_node *n = root->rb_node;
   2086	struct btrfs_delayed_ref_node *entry;
   2087	int alt = 1;
   2088	u64 middle;
   2089	u64 first = 0, last = 0;
   2090
   2091	n = rb_first(root);
   2092	if (n) {
   2093		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
   2094		first = entry->bytenr;
   2095	}
   2096	n = rb_last(root);
   2097	if (n) {
   2098		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
   2099		last = entry->bytenr;
   2100	}
   2101	n = root->rb_node;
   2102
   2103	while (n) {
   2104		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
   2105		WARN_ON(!entry->in_tree);
   2106
   2107		middle = entry->bytenr;
   2108
   2109		if (alt)
   2110			n = n->rb_left;
   2111		else
   2112			n = n->rb_right;
   2113
   2114		alt = 1 - alt;
   2115	}
   2116	return middle;
   2117}
   2118#endif
   2119
   2120/*
   2121 * this starts processing the delayed reference count updates and
   2122 * extent insertions we have queued up so far.  count can be
   2123 * 0, which means to process everything in the tree at the start
   2124 * of the run (but not newly added entries), or it can be some target
   2125 * number you'd like to process.
   2126 *
   2127 * Returns 0 on success or if called with an aborted transaction
   2128 * Returns <0 on error and aborts the transaction
   2129 */
   2130int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
   2131			   unsigned long count)
   2132{
   2133	struct btrfs_fs_info *fs_info = trans->fs_info;
   2134	struct rb_node *node;
   2135	struct btrfs_delayed_ref_root *delayed_refs;
   2136	struct btrfs_delayed_ref_head *head;
   2137	int ret;
   2138	int run_all = count == (unsigned long)-1;
   2139
   2140	/* We'll clean this up in btrfs_cleanup_transaction */
   2141	if (TRANS_ABORTED(trans))
   2142		return 0;
   2143
   2144	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
   2145		return 0;
   2146
   2147	delayed_refs = &trans->transaction->delayed_refs;
   2148	if (count == 0)
   2149		count = delayed_refs->num_heads_ready;
   2150
   2151again:
   2152#ifdef SCRAMBLE_DELAYED_REFS
   2153	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
   2154#endif
   2155	ret = __btrfs_run_delayed_refs(trans, count);
   2156	if (ret < 0) {
   2157		btrfs_abort_transaction(trans, ret);
   2158		return ret;
   2159	}
   2160
   2161	if (run_all) {
   2162		btrfs_create_pending_block_groups(trans);
   2163
   2164		spin_lock(&delayed_refs->lock);
   2165		node = rb_first_cached(&delayed_refs->href_root);
   2166		if (!node) {
   2167			spin_unlock(&delayed_refs->lock);
   2168			goto out;
   2169		}
   2170		head = rb_entry(node, struct btrfs_delayed_ref_head,
   2171				href_node);
   2172		refcount_inc(&head->refs);
   2173		spin_unlock(&delayed_refs->lock);
   2174
   2175		/* Mutex was contended, block until it's released and retry. */
   2176		mutex_lock(&head->mutex);
   2177		mutex_unlock(&head->mutex);
   2178
   2179		btrfs_put_delayed_ref_head(head);
   2180		cond_resched();
   2181		goto again;
   2182	}
   2183out:
   2184	return 0;
   2185}
   2186
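/*
 * Queue a delayed extent op that sets @flags on the extent item backing
 * @eb.  The change is applied later, when the corresponding ref head is
 * processed (see run_delayed_extent_op()).
 */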
   2187int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
   2188				struct extent_buffer *eb, u64 flags,
   2189				int level)
   2190{
   2191	struct btrfs_delayed_extent_op *extent_op;
   2192	int ret;
   2193
   2194	extent_op = btrfs_alloc_delayed_extent_op();
   2195	if (!extent_op)
   2196		return -ENOMEM;
   2197
   2198	extent_op->flags_to_set = flags;
   2199	extent_op->update_flags = true;
   2200	extent_op->update_key = false;
   2201	extent_op->level = level;
   2202
   2203	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
   2204	if (ret)
   2205		btrfs_free_delayed_extent_op(extent_op);
   2206	return ret;
   2207}
   2208
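/*
 * Look through the running transaction's delayed refs on @bytenr for a ref
 * that belongs to somebody other than (root, objectid, offset).  Returns 1
 * if such a cross reference exists, 0 if not, and -EAGAIN if the head's
 * mutex was contended and the caller should retry.
 */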
   2209static noinline int check_delayed_ref(struct btrfs_root *root,
   2210				      struct btrfs_path *path,
   2211				      u64 objectid, u64 offset, u64 bytenr)
   2212{
   2213	struct btrfs_delayed_ref_head *head;
   2214	struct btrfs_delayed_ref_node *ref;
   2215	struct btrfs_delayed_data_ref *data_ref;
   2216	struct btrfs_delayed_ref_root *delayed_refs;
   2217	struct btrfs_transaction *cur_trans;
   2218	struct rb_node *node;
   2219	int ret = 0;
   2220
   2221	spin_lock(&root->fs_info->trans_lock);
   2222	cur_trans = root->fs_info->running_transaction;
   2223	if (cur_trans)
   2224		refcount_inc(&cur_trans->use_count);
   2225	spin_unlock(&root->fs_info->trans_lock);
   2226	if (!cur_trans)
   2227		return 0;
   2228
   2229	delayed_refs = &cur_trans->delayed_refs;
   2230	spin_lock(&delayed_refs->lock);
   2231	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
   2232	if (!head) {
   2233		spin_unlock(&delayed_refs->lock);
   2234		btrfs_put_transaction(cur_trans);
   2235		return 0;
   2236	}
   2237
   2238	if (!mutex_trylock(&head->mutex)) {
   2239		refcount_inc(&head->refs);
   2240		spin_unlock(&delayed_refs->lock);
   2241
   2242		btrfs_release_path(path);
   2243
   2244		/*
   2245		 * Mutex was contended, block until it's released and let
   2246		 * caller try again
   2247		 */
   2248		mutex_lock(&head->mutex);
   2249		mutex_unlock(&head->mutex);
   2250		btrfs_put_delayed_ref_head(head);
   2251		btrfs_put_transaction(cur_trans);
   2252		return -EAGAIN;
   2253	}
   2254	spin_unlock(&delayed_refs->lock);
   2255
   2256	spin_lock(&head->lock);
   2257	/*
   2258	 * XXX: We should replace this with a proper search function in the
   2259	 * future.
   2260	 */
   2261	for (node = rb_first_cached(&head->ref_tree); node;
   2262	     node = rb_next(node)) {
   2263		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
   2264		/* If it's a shared ref we know a cross reference exists */
   2265		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
   2266			ret = 1;
   2267			break;
   2268		}
   2269
   2270		data_ref = btrfs_delayed_node_to_data_ref(ref);
   2271
   2272		/*
   2273		 * If our ref doesn't match the one we're currently looking at
   2274		 * then we have a cross reference.
   2275		 */
   2276		if (data_ref->root != root->root_key.objectid ||
   2277		    data_ref->objectid != objectid ||
   2278		    data_ref->offset != offset) {
   2279			ret = 1;
   2280			break;
   2281		}
   2282	}
   2283	spin_unlock(&head->lock);
   2284	mutex_unlock(&head->mutex);
   2285	btrfs_put_transaction(cur_trans);
   2286	return ret;
   2287}
   2288
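/*
 * Check the committed extent tree for references on @bytenr other than a
 * single inline data ref owned by (root, objectid, offset).  Returns 0 when
 * the extent is provably exclusive, 1 when it may be shared, and -ENOENT if
 * no matching extent item exists.
 */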
   2289static noinline int check_committed_ref(struct btrfs_root *root,
   2290					struct btrfs_path *path,
   2291					u64 objectid, u64 offset, u64 bytenr,
   2292					bool strict)
   2293{
   2294	struct btrfs_fs_info *fs_info = root->fs_info;
   2295	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
   2296	struct extent_buffer *leaf;
   2297	struct btrfs_extent_data_ref *ref;
   2298	struct btrfs_extent_inline_ref *iref;
   2299	struct btrfs_extent_item *ei;
   2300	struct btrfs_key key;
   2301	u32 item_size;
   2302	int type;
   2303	int ret;
   2304
   2305	key.objectid = bytenr;
   2306	key.offset = (u64)-1;
   2307	key.type = BTRFS_EXTENT_ITEM_KEY;
   2308
   2309	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
   2310	if (ret < 0)
   2311		goto out;
   2312	BUG_ON(ret == 0); /* Corruption */
   2313
   2314	ret = -ENOENT;
   2315	if (path->slots[0] == 0)
   2316		goto out;
   2317
   2318	path->slots[0]--;
   2319	leaf = path->nodes[0];
   2320	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   2321
   2322	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
   2323		goto out;
   2324
   2325	ret = 1;
   2326	item_size = btrfs_item_size(leaf, path->slots[0]);
   2327	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
   2328
   2329	/* If extent item has more than 1 inline ref then it's shared */
   2330	if (item_size != sizeof(*ei) +
   2331	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
   2332		goto out;
   2333
   2334	/*
   2335	 * If extent created before last snapshot => it's shared unless the
   2336	 * snapshot has been deleted. Use the heuristic if strict is false.
   2337	 */
   2338	if (!strict &&
   2339	    (btrfs_extent_generation(leaf, ei) <=
   2340	     btrfs_root_last_snapshot(&root->root_item)))
   2341		goto out;
   2342
   2343	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
   2344
   2345	/* If this extent has SHARED_DATA_REF then it's shared */
   2346	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
   2347	if (type != BTRFS_EXTENT_DATA_REF_KEY)
   2348		goto out;
   2349
   2350	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
   2351	if (btrfs_extent_refs(leaf, ei) !=
   2352	    btrfs_extent_data_ref_count(leaf, ref) ||
   2353	    btrfs_extent_data_ref_root(leaf, ref) !=
   2354	    root->root_key.objectid ||
   2355	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
   2356	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
   2357		goto out;
   2358
   2359	ret = 0;
   2360out:
   2361	return ret;
   2362}
   2363
   2364int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
   2365			  u64 bytenr, bool strict, struct btrfs_path *path)
   2366{
   2367	int ret;
   2368
   2369	do {
   2370		ret = check_committed_ref(root, path, objectid,
   2371					  offset, bytenr, strict);
   2372		if (ret && ret != -ENOENT)
   2373			goto out;
   2374
   2375		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
   2376	} while (ret == -EAGAIN);
   2377
   2378out:
   2379	btrfs_release_path(path);
   2380	if (btrfs_is_data_reloc_root(root))
   2381		WARN_ON(ret > 0);
   2382	return ret;
   2383}
   2384
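/*
 * Add or drop one reference for every extent referenced by @buf: for leaves
 * that is every file extent that actually points at disk space, for nodes
 * every child block pointer.  Callers use the btrfs_inc_ref()/btrfs_dec_ref()
 * wrappers below.
 */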
   2385static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
   2386			   struct btrfs_root *root,
   2387			   struct extent_buffer *buf,
   2388			   int full_backref, int inc)
   2389{
   2390	struct btrfs_fs_info *fs_info = root->fs_info;
   2391	u64 bytenr;
   2392	u64 num_bytes;
   2393	u64 parent;
   2394	u64 ref_root;
   2395	u32 nritems;
   2396	struct btrfs_key key;
   2397	struct btrfs_file_extent_item *fi;
   2398	struct btrfs_ref generic_ref = { 0 };
   2399	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
   2400	int i;
   2401	int action;
   2402	int level;
   2403	int ret = 0;
   2404
   2405	if (btrfs_is_testing(fs_info))
   2406		return 0;
   2407
   2408	ref_root = btrfs_header_owner(buf);
   2409	nritems = btrfs_header_nritems(buf);
   2410	level = btrfs_header_level(buf);
   2411
   2412	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
   2413		return 0;
   2414
   2415	if (full_backref)
   2416		parent = buf->start;
   2417	else
   2418		parent = 0;
   2419	if (inc)
   2420		action = BTRFS_ADD_DELAYED_REF;
   2421	else
   2422		action = BTRFS_DROP_DELAYED_REF;
   2423
   2424	for (i = 0; i < nritems; i++) {
   2425		if (level == 0) {
   2426			btrfs_item_key_to_cpu(buf, &key, i);
   2427			if (key.type != BTRFS_EXTENT_DATA_KEY)
   2428				continue;
   2429			fi = btrfs_item_ptr(buf, i,
   2430					    struct btrfs_file_extent_item);
   2431			if (btrfs_file_extent_type(buf, fi) ==
   2432			    BTRFS_FILE_EXTENT_INLINE)
   2433				continue;
   2434			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
   2435			if (bytenr == 0)
   2436				continue;
   2437
   2438			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
   2439			key.offset -= btrfs_file_extent_offset(buf, fi);
   2440			btrfs_init_generic_ref(&generic_ref, action, bytenr,
   2441					       num_bytes, parent);
   2442			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
   2443					    key.offset, root->root_key.objectid,
   2444					    for_reloc);
   2445			if (inc)
   2446				ret = btrfs_inc_extent_ref(trans, &generic_ref);
   2447			else
   2448				ret = btrfs_free_extent(trans, &generic_ref);
   2449			if (ret)
   2450				goto fail;
   2451		} else {
   2452			bytenr = btrfs_node_blockptr(buf, i);
   2453			num_bytes = fs_info->nodesize;
   2454			btrfs_init_generic_ref(&generic_ref, action, bytenr,
   2455					       num_bytes, parent);
   2456			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
   2457					    root->root_key.objectid, for_reloc);
   2458			if (inc)
   2459				ret = btrfs_inc_extent_ref(trans, &generic_ref);
   2460			else
   2461				ret = btrfs_free_extent(trans, &generic_ref);
   2462			if (ret)
   2463				goto fail;
   2464		}
   2465	}
   2466	return 0;
   2467fail:
   2468	return ret;
   2469}
   2470
   2471int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
   2472		  struct extent_buffer *buf, int full_backref)
   2473{
   2474	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
   2475}
   2476
   2477int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
   2478		  struct extent_buffer *buf, int full_backref)
   2479{
   2480	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
   2481}
   2482
   2483static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
   2484{
   2485	struct btrfs_fs_info *fs_info = root->fs_info;
   2486	u64 flags;
   2487	u64 ret;
   2488
   2489	if (data)
   2490		flags = BTRFS_BLOCK_GROUP_DATA;
   2491	else if (root == fs_info->chunk_root)
   2492		flags = BTRFS_BLOCK_GROUP_SYSTEM;
   2493	else
   2494		flags = BTRFS_BLOCK_GROUP_METADATA;
   2495
   2496	ret = btrfs_get_alloc_profile(fs_info, flags);
   2497	return ret;
   2498}
   2499
   2500static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
   2501{
   2502	struct rb_node *leftmost;
   2503	u64 bytenr = 0;
   2504
   2505	read_lock(&fs_info->block_group_cache_lock);
   2506	/* Get the block group with the lowest logical start address. */
   2507	leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
   2508	if (leftmost) {
   2509		struct btrfs_block_group *bg;
   2510
   2511		bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
   2512		bytenr = bg->start;
   2513	}
   2514	read_unlock(&fs_info->block_group_cache_lock);
   2515
   2516	return bytenr;
   2517}
   2518
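/*
 * Account @num_bytes at @bytenr as pinned in the block group and its space
 * info, optionally moving the bytes out of the reserved counters, and mark
 * the range dirty in the transaction's pinned_extents tree so it can be
 * given back by btrfs_finish_extent_commit().
 */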
   2519static int pin_down_extent(struct btrfs_trans_handle *trans,
   2520			   struct btrfs_block_group *cache,
   2521			   u64 bytenr, u64 num_bytes, int reserved)
   2522{
   2523	struct btrfs_fs_info *fs_info = cache->fs_info;
   2524
   2525	spin_lock(&cache->space_info->lock);
   2526	spin_lock(&cache->lock);
   2527	cache->pinned += num_bytes;
   2528	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
   2529					     num_bytes);
   2530	if (reserved) {
   2531		cache->reserved -= num_bytes;
   2532		cache->space_info->bytes_reserved -= num_bytes;
   2533	}
   2534	spin_unlock(&cache->lock);
   2535	spin_unlock(&cache->space_info->lock);
   2536
   2537	set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
   2538			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
   2539	return 0;
   2540}
   2541
   2542int btrfs_pin_extent(struct btrfs_trans_handle *trans,
   2543		     u64 bytenr, u64 num_bytes, int reserved)
   2544{
   2545	struct btrfs_block_group *cache;
   2546
   2547	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
   2548	BUG_ON(!cache); /* Logic error */
   2549
   2550	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
   2551
   2552	btrfs_put_block_group(cache);
   2553	return 0;
   2554}
   2555
   2556/*
    2557 * this function must be called within a transaction
   2558 */
   2559int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
   2560				    u64 bytenr, u64 num_bytes)
   2561{
   2562	struct btrfs_block_group *cache;
   2563	int ret;
   2564
   2565	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
   2566	if (!cache)
   2567		return -EINVAL;
   2568
   2569	/*
   2570	 * pull in the free space cache (if any) so that our pin
   2571	 * removes the free space from the cache.  We have load_only set
   2572	 * to one because the slow code to read in the free extents does check
   2573	 * the pinned extents.
   2574	 */
   2575	btrfs_cache_block_group(cache, 1);
   2576	/*
   2577	 * Make sure we wait until the cache is completely built in case it is
   2578	 * missing or is invalid and therefore needs to be rebuilt.
   2579	 */
   2580	ret = btrfs_wait_block_group_cache_done(cache);
   2581	if (ret)
   2582		goto out;
   2583
   2584	pin_down_extent(trans, cache, bytenr, num_bytes, 0);
   2585
   2586	/* remove us from the free space cache (if we're there at all) */
   2587	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
   2588out:
   2589	btrfs_put_block_group(cache);
   2590	return ret;
   2591}
   2592
   2593static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
   2594				   u64 start, u64 num_bytes)
   2595{
   2596	int ret;
   2597	struct btrfs_block_group *block_group;
   2598
   2599	block_group = btrfs_lookup_block_group(fs_info, start);
   2600	if (!block_group)
   2601		return -EINVAL;
   2602
   2603	btrfs_cache_block_group(block_group, 1);
   2604	/*
   2605	 * Make sure we wait until the cache is completely built in case it is
   2606	 * missing or is invalid and therefore needs to be rebuilt.
   2607	 */
   2608	ret = btrfs_wait_block_group_cache_done(block_group);
   2609	if (ret)
   2610		goto out;
   2611
   2612	ret = btrfs_remove_free_space(block_group, start, num_bytes);
   2613out:
   2614	btrfs_put_block_group(block_group);
   2615	return ret;
   2616}
   2617
   2618int btrfs_exclude_logged_extents(struct extent_buffer *eb)
   2619{
   2620	struct btrfs_fs_info *fs_info = eb->fs_info;
   2621	struct btrfs_file_extent_item *item;
   2622	struct btrfs_key key;
   2623	int found_type;
   2624	int i;
   2625	int ret = 0;
   2626
   2627	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
   2628		return 0;
   2629
   2630	for (i = 0; i < btrfs_header_nritems(eb); i++) {
   2631		btrfs_item_key_to_cpu(eb, &key, i);
   2632		if (key.type != BTRFS_EXTENT_DATA_KEY)
   2633			continue;
   2634		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
   2635		found_type = btrfs_file_extent_type(eb, item);
   2636		if (found_type == BTRFS_FILE_EXTENT_INLINE)
   2637			continue;
   2638		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
   2639			continue;
   2640		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
   2641		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
   2642		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
   2643		if (ret)
   2644			break;
   2645	}
   2646
   2647	return ret;
   2648}
   2649
   2650static void
   2651btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
   2652{
   2653	atomic_inc(&bg->reservations);
   2654}
   2655
   2656/*
   2657 * Returns the free cluster for the given space info and sets empty_cluster to
   2658 * what it should be based on the mount options.
   2659 */
   2660static struct btrfs_free_cluster *
   2661fetch_cluster_info(struct btrfs_fs_info *fs_info,
   2662		   struct btrfs_space_info *space_info, u64 *empty_cluster)
   2663{
   2664	struct btrfs_free_cluster *ret = NULL;
   2665
   2666	*empty_cluster = 0;
   2667	if (btrfs_mixed_space_info(space_info))
   2668		return ret;
   2669
   2670	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
   2671		ret = &fs_info->meta_alloc_cluster;
   2672		if (btrfs_test_opt(fs_info, SSD))
   2673			*empty_cluster = SZ_2M;
   2674		else
   2675			*empty_cluster = SZ_64K;
   2676	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
   2677		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
   2678		*empty_cluster = SZ_2M;
   2679		ret = &fs_info->data_alloc_cluster;
   2680	}
   2681
   2682	return ret;
   2683}
   2684
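/*
 * Walk [start, end] block group by block group, dropping the pinned byte
 * counts and, when @return_free_space is set, returning the space to the
 * free space cache, the global block reservation and any waiting tickets.
 */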
   2685static int unpin_extent_range(struct btrfs_fs_info *fs_info,
   2686			      u64 start, u64 end,
   2687			      const bool return_free_space)
   2688{
   2689	struct btrfs_block_group *cache = NULL;
   2690	struct btrfs_space_info *space_info;
   2691	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
   2692	struct btrfs_free_cluster *cluster = NULL;
   2693	u64 len;
   2694	u64 total_unpinned = 0;
   2695	u64 empty_cluster = 0;
   2696	bool readonly;
   2697
   2698	while (start <= end) {
   2699		readonly = false;
   2700		if (!cache ||
   2701		    start >= cache->start + cache->length) {
   2702			if (cache)
   2703				btrfs_put_block_group(cache);
   2704			total_unpinned = 0;
   2705			cache = btrfs_lookup_block_group(fs_info, start);
   2706			BUG_ON(!cache); /* Logic error */
   2707
   2708			cluster = fetch_cluster_info(fs_info,
   2709						     cache->space_info,
   2710						     &empty_cluster);
   2711			empty_cluster <<= 1;
   2712		}
   2713
   2714		len = cache->start + cache->length - start;
   2715		len = min(len, end + 1 - start);
   2716
   2717		down_read(&fs_info->commit_root_sem);
   2718		if (start < cache->last_byte_to_unpin && return_free_space) {
   2719			u64 add_len = min(len, cache->last_byte_to_unpin - start);
   2720
   2721			btrfs_add_free_space(cache, start, add_len);
   2722		}
   2723		up_read(&fs_info->commit_root_sem);
   2724
   2725		start += len;
   2726		total_unpinned += len;
   2727		space_info = cache->space_info;
   2728
   2729		/*
   2730		 * If this space cluster has been marked as fragmented and we've
   2731		 * unpinned enough in this block group to potentially allow a
   2732		 * cluster to be created inside of it go ahead and clear the
   2733		 * fragmented check.
   2734		 */
   2735		if (cluster && cluster->fragmented &&
   2736		    total_unpinned > empty_cluster) {
   2737			spin_lock(&cluster->lock);
   2738			cluster->fragmented = 0;
   2739			spin_unlock(&cluster->lock);
   2740		}
   2741
   2742		spin_lock(&space_info->lock);
   2743		spin_lock(&cache->lock);
   2744		cache->pinned -= len;
   2745		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
   2746		space_info->max_extent_size = 0;
   2747		if (cache->ro) {
   2748			space_info->bytes_readonly += len;
   2749			readonly = true;
   2750		} else if (btrfs_is_zoned(fs_info)) {
   2751			/* Need reset before reusing in a zoned block group */
   2752			space_info->bytes_zone_unusable += len;
   2753			readonly = true;
   2754		}
   2755		spin_unlock(&cache->lock);
   2756		if (!readonly && return_free_space &&
   2757		    global_rsv->space_info == space_info) {
   2758			spin_lock(&global_rsv->lock);
   2759			if (!global_rsv->full) {
   2760				u64 to_add = min(len, global_rsv->size -
   2761						      global_rsv->reserved);
   2762
   2763				global_rsv->reserved += to_add;
   2764				btrfs_space_info_update_bytes_may_use(fs_info,
   2765						space_info, to_add);
   2766				if (global_rsv->reserved >= global_rsv->size)
   2767					global_rsv->full = 1;
   2768				len -= to_add;
   2769			}
   2770			spin_unlock(&global_rsv->lock);
   2771		}
   2772		/* Add to any tickets we may have */
   2773		if (!readonly && return_free_space && len)
   2774			btrfs_try_granting_tickets(fs_info, space_info);
   2775		spin_unlock(&space_info->lock);
   2776	}
   2777
   2778	if (cache)
   2779		btrfs_put_block_group(cache);
   2780	return 0;
   2781}
   2782
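/*
 * Release everything pinned during this transaction: walk the transaction's
 * pinned_extents tree, optionally discard the ranges, hand them back to the
 * free space caches via unpin_extent_range(), and finally discard the block
 * groups that were deleted during the transaction.
 */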
   2783int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
   2784{
   2785	struct btrfs_fs_info *fs_info = trans->fs_info;
   2786	struct btrfs_block_group *block_group, *tmp;
   2787	struct list_head *deleted_bgs;
   2788	struct extent_io_tree *unpin;
   2789	u64 start;
   2790	u64 end;
   2791	int ret;
   2792
   2793	unpin = &trans->transaction->pinned_extents;
   2794
   2795	while (!TRANS_ABORTED(trans)) {
   2796		struct extent_state *cached_state = NULL;
   2797
   2798		mutex_lock(&fs_info->unused_bg_unpin_mutex);
   2799		ret = find_first_extent_bit(unpin, 0, &start, &end,
   2800					    EXTENT_DIRTY, &cached_state);
   2801		if (ret) {
   2802			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
   2803			break;
   2804		}
   2805
   2806		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
   2807			ret = btrfs_discard_extent(fs_info, start,
   2808						   end + 1 - start, NULL);
   2809
   2810		clear_extent_dirty(unpin, start, end, &cached_state);
   2811		unpin_extent_range(fs_info, start, end, true);
   2812		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
   2813		free_extent_state(cached_state);
   2814		cond_resched();
   2815	}
   2816
   2817	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
   2818		btrfs_discard_calc_delay(&fs_info->discard_ctl);
   2819		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
   2820	}
   2821
   2822	/*
   2823	 * Transaction is finished.  We don't need the lock anymore.  We
   2824	 * do need to clean up the block groups in case of a transaction
   2825	 * abort.
   2826	 */
   2827	deleted_bgs = &trans->transaction->deleted_bgs;
   2828	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
   2829		u64 trimmed = 0;
   2830
   2831		ret = -EROFS;
   2832		if (!TRANS_ABORTED(trans))
   2833			ret = btrfs_discard_extent(fs_info,
   2834						   block_group->start,
   2835						   block_group->length,
   2836						   &trimmed);
   2837
   2838		list_del_init(&block_group->bg_list);
   2839		btrfs_unfreeze_block_group(block_group);
   2840		btrfs_put_block_group(block_group);
   2841
   2842		if (ret) {
   2843			const char *errstr = btrfs_decode_error(ret);
   2844			btrfs_warn(fs_info,
   2845			   "discard failed while removing blockgroup: errno=%d %s",
   2846				   ret, errstr);
   2847		}
   2848	}
   2849
   2850	return 0;
   2851}
   2852
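/*
 * Common bookkeeping once an extent item has been removed: drop csums for
 * data extents, return the range to the free space tree and update the
 * block group counters.  Any failure here aborts the transaction.
 */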
   2853static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
   2854				     u64 bytenr, u64 num_bytes, bool is_data)
   2855{
   2856	int ret;
   2857
   2858	if (is_data) {
   2859		struct btrfs_root *csum_root;
   2860
   2861		csum_root = btrfs_csum_root(trans->fs_info, bytenr);
   2862		ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
   2863		if (ret) {
   2864			btrfs_abort_transaction(trans, ret);
   2865			return ret;
   2866		}
   2867	}
   2868
   2869	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
   2870	if (ret) {
   2871		btrfs_abort_transaction(trans, ret);
   2872		return ret;
   2873	}
   2874
   2875	ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
   2876	if (ret)
   2877		btrfs_abort_transaction(trans, ret);
   2878
   2879	return ret;
   2880}
   2881
   2882/*
   2883 * Drop one or more refs of @node.
   2884 *
   2885 * 1. Locate the extent refs.
   2886 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
   2887 *    Locate it, then reduce the refs number or remove the ref line completely.
   2888 *
   2889 * 2. Update the refs count in EXTENT/METADATA_ITEM
   2890 *
   2891 * Inline backref case:
   2892 *
   2893 * in extent tree we have:
   2894 *
   2895 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
   2896 *		refs 2 gen 6 flags DATA
   2897 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
   2898 *		extent data backref root FS_TREE objectid 257 offset 0 count 1
   2899 *
   2900 * This function gets called with:
   2901 *
   2902 *    node->bytenr = 13631488
   2903 *    node->num_bytes = 1048576
   2904 *    root_objectid = FS_TREE
   2905 *    owner_objectid = 257
   2906 *    owner_offset = 0
   2907 *    refs_to_drop = 1
   2908 *
    2909 * Then we should get something like:
   2910 *
   2911 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
   2912 *		refs 1 gen 6 flags DATA
   2913 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
   2914 *
   2915 * Keyed backref case:
   2916 *
   2917 * in extent tree we have:
   2918 *
   2919 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
   2920 *		refs 754 gen 6 flags DATA
   2921 *	[...]
   2922 *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
   2923 *		extent data backref root FS_TREE objectid 866 offset 0 count 1
   2924 *
    2925 * This function gets called with:
   2926 *
   2927 *    node->bytenr = 13631488
   2928 *    node->num_bytes = 1048576
   2929 *    root_objectid = FS_TREE
   2930 *    owner_objectid = 866
   2931 *    owner_offset = 0
   2932 *    refs_to_drop = 1
   2933 *
    2934 * Then we should get something like:
   2935 *
   2936 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
   2937 *		refs 753 gen 6 flags DATA
   2938 *
   2939 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
   2940 */
   2941static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
   2942			       struct btrfs_delayed_ref_node *node, u64 parent,
   2943			       u64 root_objectid, u64 owner_objectid,
   2944			       u64 owner_offset, int refs_to_drop,
   2945			       struct btrfs_delayed_extent_op *extent_op)
   2946{
   2947	struct btrfs_fs_info *info = trans->fs_info;
   2948	struct btrfs_key key;
   2949	struct btrfs_path *path;
   2950	struct btrfs_root *extent_root;
   2951	struct extent_buffer *leaf;
   2952	struct btrfs_extent_item *ei;
   2953	struct btrfs_extent_inline_ref *iref;
   2954	int ret;
   2955	int is_data;
   2956	int extent_slot = 0;
   2957	int found_extent = 0;
   2958	int num_to_del = 1;
   2959	u32 item_size;
   2960	u64 refs;
   2961	u64 bytenr = node->bytenr;
   2962	u64 num_bytes = node->num_bytes;
   2963	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
   2964
   2965	extent_root = btrfs_extent_root(info, bytenr);
   2966	ASSERT(extent_root);
   2967
   2968	path = btrfs_alloc_path();
   2969	if (!path)
   2970		return -ENOMEM;
   2971
   2972	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
   2973
   2974	if (!is_data && refs_to_drop != 1) {
   2975		btrfs_crit(info,
   2976"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
   2977			   node->bytenr, refs_to_drop);
   2978		ret = -EINVAL;
   2979		btrfs_abort_transaction(trans, ret);
   2980		goto out;
   2981	}
   2982
   2983	if (is_data)
   2984		skinny_metadata = false;
   2985
   2986	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
   2987				    parent, root_objectid, owner_objectid,
   2988				    owner_offset);
   2989	if (ret == 0) {
   2990		/*
   2991		 * Either the inline backref or the SHARED_DATA_REF/
   2992		 * SHARED_BLOCK_REF is found
   2993		 *
   2994		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
   2995		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
   2996		 */
   2997		extent_slot = path->slots[0];
   2998		while (extent_slot >= 0) {
   2999			btrfs_item_key_to_cpu(path->nodes[0], &key,
   3000					      extent_slot);
   3001			if (key.objectid != bytenr)
   3002				break;
   3003			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
   3004			    key.offset == num_bytes) {
   3005				found_extent = 1;
   3006				break;
   3007			}
   3008			if (key.type == BTRFS_METADATA_ITEM_KEY &&
   3009			    key.offset == owner_objectid) {
   3010				found_extent = 1;
   3011				break;
   3012			}
   3013
    3014			/* Quick path didn't find the EXTENT/METADATA_ITEM */
   3015			if (path->slots[0] - extent_slot > 5)
   3016				break;
   3017			extent_slot--;
   3018		}
   3019
   3020		if (!found_extent) {
   3021			if (iref) {
   3022				btrfs_crit(info,
   3023"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
   3024				btrfs_abort_transaction(trans, -EUCLEAN);
   3025				goto err_dump;
   3026			}
   3027			/* Must be SHARED_* item, remove the backref first */
   3028			ret = remove_extent_backref(trans, extent_root, path,
   3029						    NULL, refs_to_drop, is_data);
   3030			if (ret) {
   3031				btrfs_abort_transaction(trans, ret);
   3032				goto out;
   3033			}
   3034			btrfs_release_path(path);
   3035
   3036			/* Slow path to locate EXTENT/METADATA_ITEM */
   3037			key.objectid = bytenr;
   3038			key.type = BTRFS_EXTENT_ITEM_KEY;
   3039			key.offset = num_bytes;
   3040
   3041			if (!is_data && skinny_metadata) {
   3042				key.type = BTRFS_METADATA_ITEM_KEY;
   3043				key.offset = owner_objectid;
   3044			}
   3045
   3046			ret = btrfs_search_slot(trans, extent_root,
   3047						&key, path, -1, 1);
   3048			if (ret > 0 && skinny_metadata && path->slots[0]) {
   3049				/*
   3050				 * Couldn't find our skinny metadata item,
   3051				 * see if we have ye olde extent item.
   3052				 */
   3053				path->slots[0]--;
   3054				btrfs_item_key_to_cpu(path->nodes[0], &key,
   3055						      path->slots[0]);
   3056				if (key.objectid == bytenr &&
   3057				    key.type == BTRFS_EXTENT_ITEM_KEY &&
   3058				    key.offset == num_bytes)
   3059					ret = 0;
   3060			}
   3061
   3062			if (ret > 0 && skinny_metadata) {
   3063				skinny_metadata = false;
   3064				key.objectid = bytenr;
   3065				key.type = BTRFS_EXTENT_ITEM_KEY;
   3066				key.offset = num_bytes;
   3067				btrfs_release_path(path);
   3068				ret = btrfs_search_slot(trans, extent_root,
   3069							&key, path, -1, 1);
   3070			}
   3071
   3072			if (ret) {
   3073				btrfs_err(info,
   3074					  "umm, got %d back from search, was looking for %llu",
   3075					  ret, bytenr);
   3076				if (ret > 0)
   3077					btrfs_print_leaf(path->nodes[0]);
   3078			}
   3079			if (ret < 0) {
   3080				btrfs_abort_transaction(trans, ret);
   3081				goto out;
   3082			}
   3083			extent_slot = path->slots[0];
   3084		}
   3085	} else if (WARN_ON(ret == -ENOENT)) {
   3086		btrfs_print_leaf(path->nodes[0]);
   3087		btrfs_err(info,
   3088			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
   3089			bytenr, parent, root_objectid, owner_objectid,
   3090			owner_offset);
   3091		btrfs_abort_transaction(trans, ret);
   3092		goto out;
   3093	} else {
   3094		btrfs_abort_transaction(trans, ret);
   3095		goto out;
   3096	}
   3097
   3098	leaf = path->nodes[0];
   3099	item_size = btrfs_item_size(leaf, extent_slot);
   3100	if (unlikely(item_size < sizeof(*ei))) {
   3101		ret = -EINVAL;
   3102		btrfs_print_v0_err(info);
   3103		btrfs_abort_transaction(trans, ret);
   3104		goto out;
   3105	}
   3106	ei = btrfs_item_ptr(leaf, extent_slot,
   3107			    struct btrfs_extent_item);
   3108	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
   3109	    key.type == BTRFS_EXTENT_ITEM_KEY) {
   3110		struct btrfs_tree_block_info *bi;
   3111		if (item_size < sizeof(*ei) + sizeof(*bi)) {
   3112			btrfs_crit(info,
   3113"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
   3114				   key.objectid, key.type, key.offset,
   3115				   owner_objectid, item_size,
   3116				   sizeof(*ei) + sizeof(*bi));
   3117			btrfs_abort_transaction(trans, -EUCLEAN);
   3118			goto err_dump;
   3119		}
   3120		bi = (struct btrfs_tree_block_info *)(ei + 1);
   3121		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
   3122	}
   3123
   3124	refs = btrfs_extent_refs(leaf, ei);
   3125	if (refs < refs_to_drop) {
   3126		btrfs_crit(info,
   3127		"trying to drop %d refs but we only have %llu for bytenr %llu",
   3128			  refs_to_drop, refs, bytenr);
   3129		btrfs_abort_transaction(trans, -EUCLEAN);
   3130		goto err_dump;
   3131	}
   3132	refs -= refs_to_drop;
   3133
   3134	if (refs > 0) {
   3135		if (extent_op)
   3136			__run_delayed_extent_op(extent_op, leaf, ei);
   3137		/*
   3138		 * In the case of inline back ref, reference count will
   3139		 * be updated by remove_extent_backref
   3140		 */
   3141		if (iref) {
   3142			if (!found_extent) {
   3143				btrfs_crit(info,
   3144"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
   3145				btrfs_abort_transaction(trans, -EUCLEAN);
   3146				goto err_dump;
   3147			}
   3148		} else {
   3149			btrfs_set_extent_refs(leaf, ei, refs);
   3150			btrfs_mark_buffer_dirty(leaf);
   3151		}
   3152		if (found_extent) {
   3153			ret = remove_extent_backref(trans, extent_root, path,
   3154						    iref, refs_to_drop, is_data);
   3155			if (ret) {
   3156				btrfs_abort_transaction(trans, ret);
   3157				goto out;
   3158			}
   3159		}
   3160	} else {
   3161		/* In this branch refs == 1 */
   3162		if (found_extent) {
   3163			if (is_data && refs_to_drop !=
   3164			    extent_data_ref_count(path, iref)) {
   3165				btrfs_crit(info,
   3166		"invalid refs_to_drop, current refs %u refs_to_drop %u",
   3167					   extent_data_ref_count(path, iref),
   3168					   refs_to_drop);
   3169				btrfs_abort_transaction(trans, -EUCLEAN);
   3170				goto err_dump;
   3171			}
   3172			if (iref) {
   3173				if (path->slots[0] != extent_slot) {
   3174					btrfs_crit(info,
   3175"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
   3176						   key.objectid, key.type,
   3177						   key.offset);
   3178					btrfs_abort_transaction(trans, -EUCLEAN);
   3179					goto err_dump;
   3180				}
   3181			} else {
   3182				/*
    3183				 * No inline ref, so we must be at a SHARED_* item,
    3184				 * and since it's a single ref, it must be:
   3185				 * |	extent_slot	  ||extent_slot + 1|
   3186				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
   3187				 */
   3188				if (path->slots[0] != extent_slot + 1) {
   3189					btrfs_crit(info,
   3190	"invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
   3191					btrfs_abort_transaction(trans, -EUCLEAN);
   3192					goto err_dump;
   3193				}
   3194				path->slots[0] = extent_slot;
   3195				num_to_del = 2;
   3196			}
   3197		}
   3198
   3199		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
   3200				      num_to_del);
   3201		if (ret) {
   3202			btrfs_abort_transaction(trans, ret);
   3203			goto out;
   3204		}
   3205		btrfs_release_path(path);
   3206
   3207		ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
   3208	}
   3209	btrfs_release_path(path);
   3210
   3211out:
   3212	btrfs_free_path(path);
   3213	return ret;
   3214err_dump:
   3215	/*
   3216	 * Leaf dump can take up a lot of log buffer, so we only do full leaf
   3217	 * dump for debug build.
   3218	 */
   3219	if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
   3220		btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
   3221			   path->slots[0], extent_slot);
   3222		btrfs_print_leaf(path->nodes[0]);
   3223	}
   3224
   3225	btrfs_free_path(path);
   3226	return -EUCLEAN;
   3227}
   3228
   3229/*
    3230 * when we free a block, it is possible (and likely) that we free the last
   3231 * delayed ref for that extent as well.  This searches the delayed ref tree for
   3232 * a given extent, and if there are no other delayed refs to be processed, it
   3233 * removes it from the tree.
   3234 */
   3235static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
   3236				      u64 bytenr)
   3237{
   3238	struct btrfs_delayed_ref_head *head;
   3239	struct btrfs_delayed_ref_root *delayed_refs;
   3240	int ret = 0;
   3241
   3242	delayed_refs = &trans->transaction->delayed_refs;
   3243	spin_lock(&delayed_refs->lock);
   3244	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
   3245	if (!head)
   3246		goto out_delayed_unlock;
   3247
   3248	spin_lock(&head->lock);
   3249	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
   3250		goto out;
   3251
   3252	if (cleanup_extent_op(head) != NULL)
   3253		goto out;
   3254
   3255	/*
   3256	 * waiting for the lock here would deadlock.  If someone else has it
   3257	 * locked they are already in the process of dropping it anyway
   3258	 */
   3259	if (!mutex_trylock(&head->mutex))
   3260		goto out;
   3261
   3262	btrfs_delete_ref_head(delayed_refs, head);
   3263	head->processing = 0;
   3264
   3265	spin_unlock(&head->lock);
   3266	spin_unlock(&delayed_refs->lock);
   3267
   3268	BUG_ON(head->extent_op);
   3269	if (head->must_insert_reserved)
   3270		ret = 1;
   3271
   3272	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
   3273	mutex_unlock(&head->mutex);
   3274	btrfs_put_delayed_ref_head(head);
   3275	return ret;
   3276out:
   3277	spin_unlock(&head->lock);
   3278
   3279out_delayed_unlock:
   3280	spin_unlock(&delayed_refs->lock);
   3281	return 0;
   3282}
   3283
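/*
 * Release @root_id's reference on the tree block @buf.  For non-log trees a
 * DROP delayed ref is queued.  If this was the last reference and the block
 * was created in the current transaction we also try to reclaim the space
 * immediately: the extent is pinned when the block was already written (or
 * must stay stable for the tree mod log or zoned allocation), otherwise it
 * goes straight back to the free space cache.
 */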
   3284void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
   3285			   u64 root_id,
   3286			   struct extent_buffer *buf,
   3287			   u64 parent, int last_ref)
   3288{
   3289	struct btrfs_fs_info *fs_info = trans->fs_info;
   3290	struct btrfs_ref generic_ref = { 0 };
   3291	int ret;
   3292
   3293	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
   3294			       buf->start, buf->len, parent);
   3295	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
   3296			    root_id, 0, false);
   3297
   3298	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
   3299		btrfs_ref_tree_mod(fs_info, &generic_ref);
   3300		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
   3301		BUG_ON(ret); /* -ENOMEM */
   3302	}
   3303
   3304	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
   3305		struct btrfs_block_group *cache;
   3306		bool must_pin = false;
   3307
   3308		if (root_id != BTRFS_TREE_LOG_OBJECTID) {
   3309			ret = check_ref_cleanup(trans, buf->start);
   3310			if (!ret) {
   3311				btrfs_redirty_list_add(trans->transaction, buf);
   3312				goto out;
   3313			}
   3314		}
   3315
   3316		cache = btrfs_lookup_block_group(fs_info, buf->start);
   3317
   3318		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
   3319			pin_down_extent(trans, cache, buf->start, buf->len, 1);
   3320			btrfs_put_block_group(cache);
   3321			goto out;
   3322		}
   3323
   3324		/*
   3325		 * If this is a leaf and there are tree mod log users, we may
   3326		 * have recorded mod log operations that point to this leaf.
   3327		 * So we must make sure no one reuses this leaf's extent before
   3328		 * mod log operations are applied to a node, otherwise after
   3329		 * rewinding a node using the mod log operations we get an
   3330		 * inconsistent btree, as the leaf's extent may now be used as
   3331		 * a node or leaf for another different btree.
   3332		 * We are safe from races here because at this point no other
   3333		 * node or root points to this extent buffer, so if after this
   3334		 * check a new tree mod log user joins, it will not be able to
   3335		 * find a node pointing to this leaf and record operations that
   3336		 * point to this leaf.
   3337		 */
   3338		if (btrfs_header_level(buf) == 0 &&
   3339		    test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
   3340			must_pin = true;
   3341
   3342		if (must_pin || btrfs_is_zoned(fs_info)) {
   3343			btrfs_redirty_list_add(trans->transaction, buf);
   3344			pin_down_extent(trans, cache, buf->start, buf->len, 1);
   3345			btrfs_put_block_group(cache);
   3346			goto out;
   3347		}
   3348
   3349		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
   3350
   3351		btrfs_add_free_space(cache, buf->start, buf->len);
   3352		btrfs_free_reserved_bytes(cache, buf->len, 0);
   3353		btrfs_put_block_group(cache);
   3354		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
   3355	}
   3356out:
   3357	if (last_ref) {
   3358		/*
   3359		 * Deleting the buffer, clear the corrupt flag since it doesn't
   3360		 * matter anymore.
   3361		 */
   3362		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
   3363	}
   3364}
   3365
   3366/* Can return -ENOMEM */
   3367int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
   3368{
   3369	struct btrfs_fs_info *fs_info = trans->fs_info;
   3370	int ret;
   3371
   3372	if (btrfs_is_testing(fs_info))
   3373		return 0;
   3374
   3375	/*
   3376	 * tree log blocks never actually go into the extent allocation
   3377	 * tree, just update pinning info and exit early.
   3378	 */
   3379	if ((ref->type == BTRFS_REF_METADATA &&
   3380	     ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
   3381	    (ref->type == BTRFS_REF_DATA &&
   3382	     ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
   3383		/* unlocks the pinned mutex */
   3384		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
   3385		ret = 0;
   3386	} else if (ref->type == BTRFS_REF_METADATA) {
   3387		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
   3388	} else {
   3389		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
   3390	}
   3391
   3392	if (!((ref->type == BTRFS_REF_METADATA &&
   3393	       ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
   3394	      (ref->type == BTRFS_REF_DATA &&
   3395	       ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
   3396		btrfs_ref_tree_mod(fs_info, ref);
   3397
   3398	return ret;
   3399}
   3400
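        /*
         * Allocation loop stages that find_free_extent() escalates through,
         * see find_free_extent_update_loop() for what each stage does.
         */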
   3401enum btrfs_loop_type {
   3402	LOOP_CACHING_NOWAIT,
   3403	LOOP_CACHING_WAIT,
   3404	LOOP_ALLOC_CHUNK,
   3405	LOOP_NO_EMPTY_SIZE,
   3406};
   3407
   3408static inline void
   3409btrfs_lock_block_group(struct btrfs_block_group *cache,
   3410		       int delalloc)
   3411{
   3412	if (delalloc)
   3413		down_read(&cache->data_rwsem);
   3414}
   3415
   3416static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
   3417		       int delalloc)
   3418{
   3419	btrfs_get_block_group(cache);
   3420	if (delalloc)
   3421		down_read(&cache->data_rwsem);
   3422}
   3423
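        /*
         * Return the block group currently backing @cluster, with
         * cluster->refill_lock held, or NULL if the cluster has none.  A
         * reference (and, for delalloc, data_rwsem) is taken only when the
         * returned group differs from @block_group, since the caller already
         * holds those on @block_group.
         */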
   3424static struct btrfs_block_group *btrfs_lock_cluster(
   3425		   struct btrfs_block_group *block_group,
   3426		   struct btrfs_free_cluster *cluster,
   3427		   int delalloc)
   3428	__acquires(&cluster->refill_lock)
   3429{
   3430	struct btrfs_block_group *used_bg = NULL;
   3431
   3432	spin_lock(&cluster->refill_lock);
   3433	while (1) {
   3434		used_bg = cluster->block_group;
   3435		if (!used_bg)
   3436			return NULL;
   3437
   3438		if (used_bg == block_group)
   3439			return used_bg;
   3440
   3441		btrfs_get_block_group(used_bg);
   3442
   3443		if (!delalloc)
   3444			return used_bg;
   3445
   3446		if (down_read_trylock(&used_bg->data_rwsem))
   3447			return used_bg;
   3448
   3449		spin_unlock(&cluster->refill_lock);
   3450
    3451		/* We should only have one level of nesting. */
   3452		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
   3453
   3454		spin_lock(&cluster->refill_lock);
   3455		if (used_bg == cluster->block_group)
   3456			return used_bg;
   3457
   3458		up_read(&used_bg->data_rwsem);
   3459		btrfs_put_block_group(used_bg);
   3460	}
   3461}
   3462
   3463static inline void
   3464btrfs_release_block_group(struct btrfs_block_group *cache,
   3465			 int delalloc)
   3466{
   3467	if (delalloc)
   3468		up_read(&cache->data_rwsem);
   3469	btrfs_put_block_group(cache);
   3470}
   3471
   3472enum btrfs_extent_allocation_policy {
   3473	BTRFS_EXTENT_ALLOC_CLUSTERED,
   3474	BTRFS_EXTENT_ALLOC_ZONED,
   3475};
   3476
   3477/*
   3478 * Structure used internally for find_free_extent() function.  Wraps needed
   3479 * parameters.
   3480 */
   3481struct find_free_extent_ctl {
   3482	/* Basic allocation info */
   3483	u64 ram_bytes;
   3484	u64 num_bytes;
   3485	u64 min_alloc_size;
   3486	u64 empty_size;
   3487	u64 flags;
   3488	int delalloc;
   3489
   3490	/* Where to start the search inside the bg */
   3491	u64 search_start;
   3492
   3493	/* For clustered allocation */
   3494	u64 empty_cluster;
   3495	struct btrfs_free_cluster *last_ptr;
   3496	bool use_cluster;
   3497
   3498	bool have_caching_bg;
   3499	bool orig_have_caching_bg;
   3500
   3501	/* Allocation is called for tree-log */
   3502	bool for_treelog;
   3503
   3504	/* Allocation is called for data relocation */
   3505	bool for_data_reloc;
   3506
   3507	/* RAID index, converted from flags */
   3508	int index;
   3509
   3510	/*
   3511	 * Current loop number, check find_free_extent_update_loop() for details
   3512	 */
   3513	int loop;
   3514
   3515	/*
   3516	 * Whether we're refilling a cluster, if true we need to re-search
   3517	 * current block group but don't try to refill the cluster again.
   3518	 */
   3519	bool retry_clustered;
   3520
   3521	/*
   3522	 * Whether we're updating free space cache, if true we need to re-search
   3523	 * current block group but don't try updating free space cache again.
   3524	 */
   3525	bool retry_unclustered;
   3526
   3527	/* If current block group is cached */
   3528	int cached;
   3529
   3530	/* Max contiguous hole found */
   3531	u64 max_extent_size;
   3532
   3533	/* Total free space from free space cache, not always contiguous */
   3534	u64 total_free_space;
   3535
   3536	/* Found result */
   3537	u64 found_offset;
   3538
   3539	/* Hint where to start looking for an empty space */
   3540	u64 hint_byte;
   3541
   3542	/* Allocation policy */
   3543	enum btrfs_extent_allocation_policy policy;
   3544};
   3545
   3546
   3547/*
   3548 * Helper function for find_free_extent().
   3549 *
    3550 * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
    3551 * Return -EAGAIN to inform the caller that we need to re-search this block group.
    3552 * Return >0 to inform the caller that we found nothing.
    3553 * Return 0 when we have found a location and set ffe_ctl->found_offset.
   3554 */
   3555static int find_free_extent_clustered(struct btrfs_block_group *bg,
   3556				      struct find_free_extent_ctl *ffe_ctl,
   3557				      struct btrfs_block_group **cluster_bg_ret)
   3558{
   3559	struct btrfs_block_group *cluster_bg;
   3560	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
   3561	u64 aligned_cluster;
   3562	u64 offset;
   3563	int ret;
   3564
   3565	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
   3566	if (!cluster_bg)
   3567		goto refill_cluster;
   3568	if (cluster_bg != bg && (cluster_bg->ro ||
   3569	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
   3570		goto release_cluster;
   3571
   3572	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
   3573			ffe_ctl->num_bytes, cluster_bg->start,
   3574			&ffe_ctl->max_extent_size);
   3575	if (offset) {
   3576		/* We have a block, we're done */
   3577		spin_unlock(&last_ptr->refill_lock);
   3578		trace_btrfs_reserve_extent_cluster(cluster_bg,
   3579				ffe_ctl->search_start, ffe_ctl->num_bytes);
   3580		*cluster_bg_ret = cluster_bg;
   3581		ffe_ctl->found_offset = offset;
   3582		return 0;
   3583	}
   3584	WARN_ON(last_ptr->block_group != cluster_bg);
   3585
   3586release_cluster:
   3587	/*
    3588	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
    3589	 * let's just skip it and let the allocator find whatever block it can
   3590	 * find. If we reach this point, we will have tried the cluster
   3591	 * allocator plenty of times and not have found anything, so we are
   3592	 * likely way too fragmented for the clustering stuff to find anything.
   3593	 *
   3594	 * However, if the cluster is taken from the current block group,
   3595	 * release the cluster first, so that we stand a better chance of
   3596	 * succeeding in the unclustered allocation.
   3597	 */
   3598	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
   3599		spin_unlock(&last_ptr->refill_lock);
   3600		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
   3601		return -ENOENT;
   3602	}
   3603
   3604	/* This cluster didn't work out, free it and start over */
   3605	btrfs_return_cluster_to_free_space(NULL, last_ptr);
   3606
   3607	if (cluster_bg != bg)
   3608		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
   3609
   3610refill_cluster:
   3611	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
   3612		spin_unlock(&last_ptr->refill_lock);
   3613		return -ENOENT;
   3614	}
   3615
   3616	aligned_cluster = max_t(u64,
   3617			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
   3618			bg->full_stripe_len);
   3619	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
   3620			ffe_ctl->num_bytes, aligned_cluster);
   3621	if (ret == 0) {
   3622		/* Now pull our allocation out of this cluster */
   3623		offset = btrfs_alloc_from_cluster(bg, last_ptr,
   3624				ffe_ctl->num_bytes, ffe_ctl->search_start,
   3625				&ffe_ctl->max_extent_size);
   3626		if (offset) {
   3627			/* We found one, proceed */
   3628			spin_unlock(&last_ptr->refill_lock);
   3629			trace_btrfs_reserve_extent_cluster(bg,
   3630					ffe_ctl->search_start,
   3631					ffe_ctl->num_bytes);
   3632			ffe_ctl->found_offset = offset;
   3633			return 0;
   3634		}
   3635	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
   3636		   !ffe_ctl->retry_clustered) {
   3637		spin_unlock(&last_ptr->refill_lock);
   3638
   3639		ffe_ctl->retry_clustered = true;
   3640		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
   3641				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
   3642		return -EAGAIN;
   3643	}
   3644	/*
   3645	 * At this point we either didn't find a cluster or we weren't able to
   3646	 * allocate a block from our cluster.  Free the cluster we've been
   3647	 * trying to use, and go to the next block group.
   3648	 */
   3649	btrfs_return_cluster_to_free_space(NULL, last_ptr);
   3650	spin_unlock(&last_ptr->refill_lock);
   3651	return 1;
   3652}
   3653
   3654/*
    3655 * Return >0 to inform the caller that we found nothing
    3656 * Return 0 when we found a free extent and set ffe_ctl->found_offset
   3657 * Return -EAGAIN to inform caller that we need to re-search this block group
   3658 */
   3659static int find_free_extent_unclustered(struct btrfs_block_group *bg,
   3660					struct find_free_extent_ctl *ffe_ctl)
   3661{
   3662	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
   3663	u64 offset;
   3664
   3665	/*
   3666	 * We are doing an unclustered allocation, set the fragmented flag so
    3667	 * we don't bother trying to set up a cluster again until we get more
   3668	 * space.
   3669	 */
   3670	if (unlikely(last_ptr)) {
   3671		spin_lock(&last_ptr->lock);
   3672		last_ptr->fragmented = 1;
   3673		spin_unlock(&last_ptr->lock);
   3674	}
   3675	if (ffe_ctl->cached) {
   3676		struct btrfs_free_space_ctl *free_space_ctl;
   3677
   3678		free_space_ctl = bg->free_space_ctl;
   3679		spin_lock(&free_space_ctl->tree_lock);
   3680		if (free_space_ctl->free_space <
   3681		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
   3682		    ffe_ctl->empty_size) {
   3683			ffe_ctl->total_free_space = max_t(u64,
   3684					ffe_ctl->total_free_space,
   3685					free_space_ctl->free_space);
   3686			spin_unlock(&free_space_ctl->tree_lock);
   3687			return 1;
   3688		}
   3689		spin_unlock(&free_space_ctl->tree_lock);
   3690	}
   3691
   3692	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
   3693			ffe_ctl->num_bytes, ffe_ctl->empty_size,
   3694			&ffe_ctl->max_extent_size);
   3695
   3696	/*
   3697	 * If we didn't find a chunk, and we haven't failed on this block group
   3698	 * before, and this block group is in the middle of caching and we are
   3699	 * ok with waiting, then go ahead and wait for progress to be made, and
   3700	 * set @retry_unclustered to true.
   3701	 *
   3702	 * If @retry_unclustered is true then we've already waited on this
   3703	 * block group once and should move on to the next block group.
   3704	 */
   3705	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
   3706	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
   3707		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
   3708						      ffe_ctl->empty_size);
   3709		ffe_ctl->retry_unclustered = true;
   3710		return -EAGAIN;
   3711	} else if (!offset) {
   3712		return 1;
   3713	}
   3714	ffe_ctl->found_offset = offset;
   3715	return 0;
   3716}
   3717
   3718static int do_allocation_clustered(struct btrfs_block_group *block_group,
   3719				   struct find_free_extent_ctl *ffe_ctl,
   3720				   struct btrfs_block_group **bg_ret)
   3721{
   3722	int ret;
   3723
    3724	/* We want to try and use the cluster allocator, so let's look there */
   3725	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
   3726		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
   3727		if (ret >= 0 || ret == -EAGAIN)
   3728			return ret;
   3729		/* ret == -ENOENT case falls through */
   3730	}
   3731
   3732	return find_free_extent_unclustered(block_group, ffe_ctl);
   3733}
   3734
   3735/*
   3736 * Tree-log block group locking
   3737 * ============================
   3738 *
   3739 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
   3740 * indicates the starting address of a block group, which is reserved only
   3741 * for tree-log metadata.
   3742 *
   3743 * Lock nesting
   3744 * ============
   3745 *
   3746 * space_info::lock
   3747 *   block_group::lock
   3748 *     fs_info::treelog_bg_lock
   3749 */
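        /*
         * fs_info::relocation_bg_lock protects fs_info::data_reloc_bg in the
         * same way and, as do_allocation_zoned() below shows, nests inside
         * fs_info::treelog_bg_lock.
         */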
   3750
   3751/*
   3752 * Simple allocator for sequential-only block group. It only allows sequential
   3753 * allocation. No need to play with trees. This function also reserves the
   3754 * bytes as in btrfs_add_reserved_bytes.
   3755 */
   3756static int do_allocation_zoned(struct btrfs_block_group *block_group,
   3757			       struct find_free_extent_ctl *ffe_ctl,
   3758			       struct btrfs_block_group **bg_ret)
   3759{
   3760	struct btrfs_fs_info *fs_info = block_group->fs_info;
   3761	struct btrfs_space_info *space_info = block_group->space_info;
   3762	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3763	u64 start = block_group->start;
   3764	u64 num_bytes = ffe_ctl->num_bytes;
   3765	u64 avail;
   3766	u64 bytenr = block_group->start;
   3767	u64 log_bytenr;
   3768	u64 data_reloc_bytenr;
   3769	int ret = 0;
   3770	bool skip = false;
   3771
   3772	ASSERT(btrfs_is_zoned(block_group->fs_info));
   3773
   3774	/*
   3775	 * Do not allow non-tree-log blocks in the dedicated tree-log block
   3776	 * group, and vice versa.
   3777	 */
   3778	spin_lock(&fs_info->treelog_bg_lock);
   3779	log_bytenr = fs_info->treelog_bg;
   3780	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
   3781			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
   3782		skip = true;
   3783	spin_unlock(&fs_info->treelog_bg_lock);
   3784	if (skip)
   3785		return 1;
   3786
   3787	/*
   3788	 * Do not allow non-relocation blocks in the dedicated relocation block
   3789	 * group, and vice versa.
   3790	 */
   3791	spin_lock(&fs_info->relocation_bg_lock);
   3792	data_reloc_bytenr = fs_info->data_reloc_bg;
   3793	if (data_reloc_bytenr &&
   3794	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
   3795	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
   3796		skip = true;
   3797	spin_unlock(&fs_info->relocation_bg_lock);
   3798	if (skip)
   3799		return 1;
   3800
   3801	/* Check RO and no space case before trying to activate it */
   3802	spin_lock(&block_group->lock);
   3803	if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
   3804		ret = 1;
   3805		/*
   3806		 * May need to clear fs_info->{treelog,data_reloc}_bg.
   3807		 * Return the error after taking the locks.
   3808		 */
   3809	}
   3810	spin_unlock(&block_group->lock);
   3811
   3812	if (!ret && !btrfs_zone_activate(block_group)) {
   3813		ret = 1;
   3814		/*
   3815		 * May need to clear fs_info->{treelog,data_reloc}_bg.
   3816		 * Return the error after taking the locks.
   3817		 */
   3818	}
   3819
   3820	spin_lock(&space_info->lock);
   3821	spin_lock(&block_group->lock);
   3822	spin_lock(&fs_info->treelog_bg_lock);
   3823	spin_lock(&fs_info->relocation_bg_lock);
   3824
   3825	if (ret)
   3826		goto out;
   3827
   3828	ASSERT(!ffe_ctl->for_treelog ||
   3829	       block_group->start == fs_info->treelog_bg ||
   3830	       fs_info->treelog_bg == 0);
   3831	ASSERT(!ffe_ctl->for_data_reloc ||
   3832	       block_group->start == fs_info->data_reloc_bg ||
   3833	       fs_info->data_reloc_bg == 0);
   3834
   3835	if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
   3836		ret = 1;
   3837		goto out;
   3838	}
   3839
   3840	/*
    3841	 * Do not allow a block group that is currently in use to become the
    3842	 * dedicated tree-log block group.
   3843	 */
   3844	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
   3845	    (block_group->used || block_group->reserved)) {
   3846		ret = 1;
   3847		goto out;
   3848	}
   3849
   3850	/*
    3851	 * Do not allow a block group that is currently in use to become the
    3852	 * dedicated data relocation block group.
   3853	 */
   3854	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
   3855	    (block_group->used || block_group->reserved)) {
   3856		ret = 1;
   3857		goto out;
   3858	}
   3859
   3860	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
   3861	avail = block_group->zone_capacity - block_group->alloc_offset;
   3862	if (avail < num_bytes) {
   3863		if (ffe_ctl->max_extent_size < avail) {
   3864			/*
    3865			 * With the sequential allocator, free space is
    3866			 * always contiguous.
   3867			 */
   3868			ffe_ctl->max_extent_size = avail;
   3869			ffe_ctl->total_free_space = avail;
   3870		}
   3871		ret = 1;
   3872		goto out;
   3873	}
   3874
   3875	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
   3876		fs_info->treelog_bg = block_group->start;
   3877
   3878	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
   3879		fs_info->data_reloc_bg = block_group->start;
   3880
   3881	ffe_ctl->found_offset = start + block_group->alloc_offset;
   3882	block_group->alloc_offset += num_bytes;
   3883	spin_lock(&ctl->tree_lock);
   3884	ctl->free_space -= num_bytes;
   3885	spin_unlock(&ctl->tree_lock);
   3886
   3887	/*
   3888	 * We do not check if found_offset is aligned to stripesize. The
   3889	 * address is anyway rewritten when using zone append writing.
   3890	 */
   3891
   3892	ffe_ctl->search_start = ffe_ctl->found_offset;
   3893
   3894out:
   3895	if (ret && ffe_ctl->for_treelog)
   3896		fs_info->treelog_bg = 0;
   3897	if (ret && ffe_ctl->for_data_reloc &&
   3898	    fs_info->data_reloc_bg == block_group->start) {
   3899		/*
   3900		 * Do not allow further allocations from this block group.
   3901		 * Compared to increasing the ->ro, setting the
   3902		 * ->zoned_data_reloc_ongoing flag still allows nocow
    3903		 * writers to come in. See btrfs_inc_nocow_writers().
    3904		 *
    3905		 * We need to disable allocation here to avoid allocating a
    3906		 * regular (non-relocation data) extent. With a mix of relocation
    3907		 * extents and regular extents, we can dispatch WRITE commands
    3908		 * (for relocation extents) and ZONE APPEND commands (for
    3909		 * regular extents) at the same time to the same zone, which
    3910		 * easily breaks the write pointer.
   3911		 */
   3912		block_group->zoned_data_reloc_ongoing = 1;
   3913		fs_info->data_reloc_bg = 0;
   3914	}
   3915	spin_unlock(&fs_info->relocation_bg_lock);
   3916	spin_unlock(&fs_info->treelog_bg_lock);
   3917	spin_unlock(&block_group->lock);
   3918	spin_unlock(&space_info->lock);
   3919	return ret;
   3920}
   3921
   3922static int do_allocation(struct btrfs_block_group *block_group,
   3923			 struct find_free_extent_ctl *ffe_ctl,
   3924			 struct btrfs_block_group **bg_ret)
   3925{
   3926	switch (ffe_ctl->policy) {
   3927	case BTRFS_EXTENT_ALLOC_CLUSTERED:
   3928		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
   3929	case BTRFS_EXTENT_ALLOC_ZONED:
   3930		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
   3931	default:
   3932		BUG();
   3933	}
   3934}
   3935
   3936static void release_block_group(struct btrfs_block_group *block_group,
   3937				struct find_free_extent_ctl *ffe_ctl,
   3938				int delalloc)
   3939{
   3940	switch (ffe_ctl->policy) {
   3941	case BTRFS_EXTENT_ALLOC_CLUSTERED:
   3942		ffe_ctl->retry_clustered = false;
   3943		ffe_ctl->retry_unclustered = false;
   3944		break;
   3945	case BTRFS_EXTENT_ALLOC_ZONED:
   3946		/* Nothing to do */
   3947		break;
   3948	default:
   3949		BUG();
   3950	}
   3951
   3952	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
   3953	       ffe_ctl->index);
   3954	btrfs_release_block_group(block_group, delalloc);
   3955}
   3956
   3957static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
   3958				   struct btrfs_key *ins)
   3959{
   3960	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
   3961
   3962	if (!ffe_ctl->use_cluster && last_ptr) {
   3963		spin_lock(&last_ptr->lock);
   3964		last_ptr->window_start = ins->objectid;
   3965		spin_unlock(&last_ptr->lock);
   3966	}
   3967}
   3968
   3969static void found_extent(struct find_free_extent_ctl *ffe_ctl,
   3970			 struct btrfs_key *ins)
   3971{
   3972	switch (ffe_ctl->policy) {
   3973	case BTRFS_EXTENT_ALLOC_CLUSTERED:
   3974		found_extent_clustered(ffe_ctl, ins);
   3975		break;
   3976	case BTRFS_EXTENT_ALLOC_ZONED:
   3977		/* Nothing to do */
   3978		break;
   3979	default:
   3980		BUG();
   3981	}
   3982}
   3983
   3984static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
   3985			       struct find_free_extent_ctl *ffe_ctl)
   3986{
   3987	switch (ffe_ctl->policy) {
   3988	case BTRFS_EXTENT_ALLOC_CLUSTERED:
   3989		return true;
   3990	case BTRFS_EXTENT_ALLOC_ZONED:
   3991		/*
   3992		 * If we have enough free space left in an already
   3993		 * active block group and we can't activate any other
   3994		 * zone now, do not allow allocating a new chunk and
   3995		 * let find_free_extent() retry with a smaller size.
   3996		 */
   3997		if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
   3998		    !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
   3999			return false;
   4000		return true;
   4001	default:
   4002		BUG();
   4003	}
   4004}
   4005
   4006static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
   4007{
   4008	switch (ffe_ctl->policy) {
   4009	case BTRFS_EXTENT_ALLOC_CLUSTERED:
   4010		/*
   4011		 * If we can't allocate a new chunk we've already looped through
   4012		 * at least once, move on to the NO_EMPTY_SIZE case.
   4013		 */
   4014		ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
   4015		return 0;
   4016	case BTRFS_EXTENT_ALLOC_ZONED:
   4017		/* Give up here */
   4018		return -ENOSPC;
   4019	default:
   4020		BUG();
   4021	}
   4022}
   4023
   4024/*
   4025 * Return >0 means caller needs to re-search for free extent
   4026 * Return 0 means we have the needed free extent.
   4027 * Return <0 means we failed to locate any free extent.
   4028 */
   4029static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
   4030					struct btrfs_key *ins,
   4031					struct find_free_extent_ctl *ffe_ctl,
   4032					bool full_search)
   4033{
   4034	struct btrfs_root *root = fs_info->chunk_root;
   4035	int ret;
   4036
   4037	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
   4038	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
   4039		ffe_ctl->orig_have_caching_bg = true;
   4040
   4041	if (ins->objectid) {
   4042		found_extent(ffe_ctl, ins);
   4043		return 0;
   4044	}
   4045
   4046	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
   4047		return 1;
   4048
   4049	ffe_ctl->index++;
   4050	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
   4051		return 1;
   4052
   4053	/*
   4054	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
   4055	 *			caching kthreads as we move along
   4056	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
   4057	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
   4058	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
   4059	 *		       again
   4060	 */
   4061	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
   4062		ffe_ctl->index = 0;
   4063		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
   4064			/*
   4065			 * We want to skip the LOOP_CACHING_WAIT step if we
   4066			 * don't have any uncached bgs and we've already done a
   4067			 * full search through.
   4068			 */
   4069			if (ffe_ctl->orig_have_caching_bg || !full_search)
   4070				ffe_ctl->loop = LOOP_CACHING_WAIT;
   4071			else
   4072				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
   4073		} else {
   4074			ffe_ctl->loop++;
   4075		}
   4076
   4077		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
   4078			struct btrfs_trans_handle *trans;
   4079			int exist = 0;
   4080
    4081			/* Check if the allocation policy allows creating a new chunk */
   4082			if (!can_allocate_chunk(fs_info, ffe_ctl))
   4083				return -ENOSPC;
   4084
   4085			trans = current->journal_info;
   4086			if (trans)
   4087				exist = 1;
   4088			else
   4089				trans = btrfs_join_transaction(root);
   4090
   4091			if (IS_ERR(trans)) {
   4092				ret = PTR_ERR(trans);
   4093				return ret;
   4094			}
   4095
   4096			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
   4097						CHUNK_ALLOC_FORCE_FOR_EXTENT);
   4098
   4099			/* Do not bail out on ENOSPC since we can do more. */
   4100			if (ret == -ENOSPC)
   4101				ret = chunk_allocation_failed(ffe_ctl);
   4102			else if (ret < 0)
   4103				btrfs_abort_transaction(trans, ret);
   4104			else
   4105				ret = 0;
   4106			if (!exist)
   4107				btrfs_end_transaction(trans);
   4108			if (ret)
   4109				return ret;
   4110		}
   4111
   4112		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
   4113			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
   4114				return -ENOSPC;
   4115
   4116			/*
   4117			 * Don't loop again if we already have no empty_size and
   4118			 * no empty_cluster.
   4119			 */
   4120			if (ffe_ctl->empty_size == 0 &&
   4121			    ffe_ctl->empty_cluster == 0)
   4122				return -ENOSPC;
   4123			ffe_ctl->empty_size = 0;
   4124			ffe_ctl->empty_cluster = 0;
   4125		}
   4126		return 1;
   4127	}
   4128	return -ENOSPC;
   4129}
   4130
   4131static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
   4132					struct find_free_extent_ctl *ffe_ctl,
   4133					struct btrfs_space_info *space_info,
   4134					struct btrfs_key *ins)
   4135{
   4136	/*
   4137	 * If our free space is heavily fragmented we may not be able to make
   4138	 * big contiguous allocations, so instead of doing the expensive search
   4139	 * for free space, simply return ENOSPC with our max_extent_size so we
   4140	 * can go ahead and search for a more manageable chunk.
   4141	 *
   4142	 * If our max_extent_size is large enough for our allocation simply
   4143	 * disable clustering since we will likely not be able to find enough
   4144	 * space to create a cluster and induce latency trying.
   4145	 */
   4146	if (space_info->max_extent_size) {
   4147		spin_lock(&space_info->lock);
   4148		if (space_info->max_extent_size &&
   4149		    ffe_ctl->num_bytes > space_info->max_extent_size) {
   4150			ins->offset = space_info->max_extent_size;
   4151			spin_unlock(&space_info->lock);
   4152			return -ENOSPC;
   4153		} else if (space_info->max_extent_size) {
   4154			ffe_ctl->use_cluster = false;
   4155		}
   4156		spin_unlock(&space_info->lock);
   4157	}
   4158
   4159	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
   4160					       &ffe_ctl->empty_cluster);
   4161	if (ffe_ctl->last_ptr) {
   4162		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
   4163
   4164		spin_lock(&last_ptr->lock);
   4165		if (last_ptr->block_group)
   4166			ffe_ctl->hint_byte = last_ptr->window_start;
   4167		if (last_ptr->fragmented) {
   4168			/*
   4169			 * We still set window_start so we can keep track of the
   4170			 * last place we found an allocation to try and save
   4171			 * some time.
   4172			 */
   4173			ffe_ctl->hint_byte = last_ptr->window_start;
   4174			ffe_ctl->use_cluster = false;
   4175		}
   4176		spin_unlock(&last_ptr->lock);
   4177	}
   4178
   4179	return 0;
   4180}
   4181
   4182static int prepare_allocation(struct btrfs_fs_info *fs_info,
   4183			      struct find_free_extent_ctl *ffe_ctl,
   4184			      struct btrfs_space_info *space_info,
   4185			      struct btrfs_key *ins)
   4186{
   4187	switch (ffe_ctl->policy) {
   4188	case BTRFS_EXTENT_ALLOC_CLUSTERED:
   4189		return prepare_allocation_clustered(fs_info, ffe_ctl,
   4190						    space_info, ins);
   4191	case BTRFS_EXTENT_ALLOC_ZONED:
   4192		if (ffe_ctl->for_treelog) {
   4193			spin_lock(&fs_info->treelog_bg_lock);
   4194			if (fs_info->treelog_bg)
   4195				ffe_ctl->hint_byte = fs_info->treelog_bg;
   4196			spin_unlock(&fs_info->treelog_bg_lock);
   4197		}
   4198		if (ffe_ctl->for_data_reloc) {
   4199			spin_lock(&fs_info->relocation_bg_lock);
   4200			if (fs_info->data_reloc_bg)
   4201				ffe_ctl->hint_byte = fs_info->data_reloc_bg;
   4202			spin_unlock(&fs_info->relocation_bg_lock);
   4203		}
   4204		return 0;
   4205	default:
   4206		BUG();
   4207	}
   4208}
   4209
   4210/*
    4211 * walks the btree of allocated extents and finds a hole of a given size.
   4212 * The key ins is changed to record the hole:
   4213 * ins->objectid == start position
   4214 * ins->flags = BTRFS_EXTENT_ITEM_KEY
   4215 * ins->offset == the size of the hole.
   4216 * Any available blocks before search_start are skipped.
   4217 *
    4218 * If there is no suitable free space, we will record the size of the
    4219 * largest free space extent currently available.
   4220 *
   4221 * The overall logic and call chain:
   4222 *
   4223 * find_free_extent()
   4224 * |- Iterate through all block groups
   4225 * |  |- Get a valid block group
   4226 * |  |- Try to do clustered allocation in that block group
   4227 * |  |- Try to do unclustered allocation in that block group
   4228 * |  |- Check if the result is valid
   4229 * |  |  |- If valid, then exit
   4230 * |  |- Jump to next block group
   4231 * |
   4232 * |- Push harder to find free extents
   4233 *    |- If not found, re-iterate all block groups
   4234 */
   4235static noinline int find_free_extent(struct btrfs_root *root,
   4236				     struct btrfs_key *ins,
   4237				     struct find_free_extent_ctl *ffe_ctl)
   4238{
   4239	struct btrfs_fs_info *fs_info = root->fs_info;
   4240	int ret = 0;
   4241	int cache_block_group_error = 0;
   4242	struct btrfs_block_group *block_group = NULL;
   4243	struct btrfs_space_info *space_info;
   4244	bool full_search = false;
   4245
   4246	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
   4247
   4248	ffe_ctl->search_start = 0;
   4249	/* For clustered allocation */
   4250	ffe_ctl->empty_cluster = 0;
   4251	ffe_ctl->last_ptr = NULL;
   4252	ffe_ctl->use_cluster = true;
   4253	ffe_ctl->have_caching_bg = false;
   4254	ffe_ctl->orig_have_caching_bg = false;
   4255	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
   4256	ffe_ctl->loop = 0;
   4257	/* For clustered allocation */
   4258	ffe_ctl->retry_clustered = false;
   4259	ffe_ctl->retry_unclustered = false;
   4260	ffe_ctl->cached = 0;
   4261	ffe_ctl->max_extent_size = 0;
   4262	ffe_ctl->total_free_space = 0;
   4263	ffe_ctl->found_offset = 0;
   4264	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
   4265
   4266	if (btrfs_is_zoned(fs_info))
   4267		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
   4268
   4269	ins->type = BTRFS_EXTENT_ITEM_KEY;
   4270	ins->objectid = 0;
   4271	ins->offset = 0;
   4272
   4273	trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
   4274			       ffe_ctl->flags);
   4275
   4276	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
   4277	if (!space_info) {
   4278		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
   4279		return -ENOSPC;
   4280	}
   4281
   4282	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
   4283	if (ret < 0)
   4284		return ret;
   4285
   4286	ffe_ctl->search_start = max(ffe_ctl->search_start,
   4287				    first_logical_byte(fs_info));
   4288	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
   4289	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
   4290		block_group = btrfs_lookup_block_group(fs_info,
   4291						       ffe_ctl->search_start);
   4292		/*
   4293		 * we don't want to use the block group if it doesn't match our
    4294		 * allocation bits, or if it's not cached.
   4295		 *
   4296		 * However if we are re-searching with an ideal block group
   4297		 * picked out then we don't care that the block group is cached.
   4298		 */
   4299		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
   4300		    block_group->cached != BTRFS_CACHE_NO) {
   4301			down_read(&space_info->groups_sem);
   4302			if (list_empty(&block_group->list) ||
   4303			    block_group->ro) {
   4304				/*
   4305				 * someone is removing this block group,
   4306				 * we can't jump into the have_block_group
   4307				 * target because our list pointers are not
   4308				 * valid
   4309				 */
   4310				btrfs_put_block_group(block_group);
   4311				up_read(&space_info->groups_sem);
   4312			} else {
   4313				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
   4314							block_group->flags);
   4315				btrfs_lock_block_group(block_group,
   4316						       ffe_ctl->delalloc);
   4317				goto have_block_group;
   4318			}
   4319		} else if (block_group) {
   4320			btrfs_put_block_group(block_group);
   4321		}
   4322	}
   4323search:
   4324	ffe_ctl->have_caching_bg = false;
   4325	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
   4326	    ffe_ctl->index == 0)
   4327		full_search = true;
   4328	down_read(&space_info->groups_sem);
   4329	list_for_each_entry(block_group,
   4330			    &space_info->block_groups[ffe_ctl->index], list) {
   4331		struct btrfs_block_group *bg_ret;
   4332
   4333		/* If the block group is read-only, we can skip it entirely. */
   4334		if (unlikely(block_group->ro)) {
   4335			if (ffe_ctl->for_treelog)
   4336				btrfs_clear_treelog_bg(block_group);
   4337			if (ffe_ctl->for_data_reloc)
   4338				btrfs_clear_data_reloc_bg(block_group);
   4339			continue;
   4340		}
   4341
   4342		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
   4343		ffe_ctl->search_start = block_group->start;
   4344
   4345		/*
   4346		 * this can happen if we end up cycling through all the
   4347		 * raid types, but we want to make sure we only allocate
   4348		 * for the proper type.
   4349		 */
   4350		if (!block_group_bits(block_group, ffe_ctl->flags)) {
   4351			u64 extra = BTRFS_BLOCK_GROUP_DUP |
   4352				BTRFS_BLOCK_GROUP_RAID1_MASK |
   4353				BTRFS_BLOCK_GROUP_RAID56_MASK |
   4354				BTRFS_BLOCK_GROUP_RAID10;
   4355
   4356			/*
   4357			 * if they asked for extra copies and this block group
   4358			 * doesn't provide them, bail.  This does allow us to
   4359			 * fill raid0 from raid1.
   4360			 */
   4361			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
   4362				goto loop;
   4363
   4364			/*
   4365			 * This block group has different flags than we want.
   4366			 * It's possible that we have MIXED_GROUP flag but no
   4367			 * block group is mixed.  Just skip such block group.
   4368			 */
   4369			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
   4370			continue;
   4371		}
   4372
   4373have_block_group:
   4374		ffe_ctl->cached = btrfs_block_group_done(block_group);
   4375		if (unlikely(!ffe_ctl->cached)) {
   4376			ffe_ctl->have_caching_bg = true;
   4377			ret = btrfs_cache_block_group(block_group, 0);
   4378
   4379			/*
   4380			 * If we get ENOMEM here or something else we want to
   4381			 * try other block groups, because it may not be fatal.
   4382			 * However if we can't find anything else we need to
   4383			 * save our return here so that we return the actual
   4384			 * error that caused problems, not ENOSPC.
   4385			 */
   4386			if (ret < 0) {
   4387				if (!cache_block_group_error)
   4388					cache_block_group_error = ret;
   4389				ret = 0;
   4390				goto loop;
   4391			}
   4392			ret = 0;
   4393		}
   4394
   4395		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
   4396			goto loop;
   4397
   4398		bg_ret = NULL;
   4399		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
   4400		if (ret == 0) {
   4401			if (bg_ret && bg_ret != block_group) {
   4402				btrfs_release_block_group(block_group,
   4403							  ffe_ctl->delalloc);
   4404				block_group = bg_ret;
   4405			}
   4406		} else if (ret == -EAGAIN) {
   4407			goto have_block_group;
   4408		} else if (ret > 0) {
   4409			goto loop;
   4410		}
   4411
   4412		/* Checks */
   4413		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
   4414						 fs_info->stripesize);
   4415
   4416		/* move on to the next group */
   4417		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
   4418		    block_group->start + block_group->length) {
   4419			btrfs_add_free_space_unused(block_group,
   4420					    ffe_ctl->found_offset,
   4421					    ffe_ctl->num_bytes);
   4422			goto loop;
   4423		}
   4424
   4425		if (ffe_ctl->found_offset < ffe_ctl->search_start)
   4426			btrfs_add_free_space_unused(block_group,
   4427					ffe_ctl->found_offset,
   4428					ffe_ctl->search_start - ffe_ctl->found_offset);
   4429
   4430		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
   4431					       ffe_ctl->num_bytes,
   4432					       ffe_ctl->delalloc);
   4433		if (ret == -EAGAIN) {
   4434			btrfs_add_free_space_unused(block_group,
   4435					ffe_ctl->found_offset,
   4436					ffe_ctl->num_bytes);
   4437			goto loop;
   4438		}
   4439		btrfs_inc_block_group_reservations(block_group);
   4440
    4441		/* we are all good, let's return */
   4442		ins->objectid = ffe_ctl->search_start;
   4443		ins->offset = ffe_ctl->num_bytes;
   4444
   4445		trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
   4446					   ffe_ctl->num_bytes);
   4447		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
   4448		break;
   4449loop:
   4450		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
   4451		cond_resched();
   4452	}
   4453	up_read(&space_info->groups_sem);
   4454
   4455	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
   4456	if (ret > 0)
   4457		goto search;
   4458
   4459	if (ret == -ENOSPC && !cache_block_group_error) {
   4460		/*
   4461		 * Use ffe_ctl->total_free_space as fallback if we can't find
   4462		 * any contiguous hole.
   4463		 */
   4464		if (!ffe_ctl->max_extent_size)
   4465			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
   4466		spin_lock(&space_info->lock);
   4467		space_info->max_extent_size = ffe_ctl->max_extent_size;
   4468		spin_unlock(&space_info->lock);
   4469		ins->offset = ffe_ctl->max_extent_size;
   4470	} else if (ret == -ENOSPC) {
   4471		ret = cache_block_group_error;
   4472	}
   4473	return ret;
   4474}
   4475
   4476/*
   4477 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
   4478 *			  hole that is at least as big as @num_bytes.
   4479 *
   4480 * @root           -	The root that will contain this extent
   4481 *
   4482 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
   4483 *			is used for accounting purposes. This value differs
   4484 *			from @num_bytes only in the case of compressed extents.
   4485 *
   4486 * @num_bytes      -	Number of bytes to allocate on-disk.
   4487 *
   4488 * @min_alloc_size -	Indicates the minimum amount of space that the
   4489 *			allocator should try to satisfy. In some cases
   4490 *			@num_bytes may be larger than what is required and if
   4491 *			the filesystem is fragmented then allocation fails.
   4492 *			However, the presence of @min_alloc_size gives a
   4493 *			chance to try and satisfy the smaller allocation.
   4494 *
   4495 * @empty_size     -	A hint that you plan on doing more COW. This is the
   4496 *			size in bytes the allocator should try to find free
   4497 *			next to the block it returns.  This is just a hint and
   4498 *			may be ignored by the allocator.
   4499 *
   4500 * @hint_byte      -	Hint to the allocator to start searching above the byte
   4501 *			address passed. It might be ignored.
   4502 *
   4503 * @ins            -	This key is modified to record the found hole. It will
   4504 *			have the following values:
   4505 *			ins->objectid == start position
   4506 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
   4507 *			ins->offset == the size of the hole.
   4508 *
   4509 * @is_data        -	Boolean flag indicating whether an extent is
   4510 *			allocated for data (true) or metadata (false)
   4511 *
   4512 * @delalloc       -	Boolean flag indicating whether this allocation is for
   4513 *			delalloc or not. If 'true' data_rwsem of block groups
   4514 *			is going to be acquired.
   4515 *
   4516 *
   4517 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
   4518 * case -ENOSPC is returned then @ins->offset will contain the size of the
   4519 * largest available hole the allocator managed to find.
   4520 */
   4521int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
   4522			 u64 num_bytes, u64 min_alloc_size,
   4523			 u64 empty_size, u64 hint_byte,
   4524			 struct btrfs_key *ins, int is_data, int delalloc)
   4525{
   4526	struct btrfs_fs_info *fs_info = root->fs_info;
   4527	struct find_free_extent_ctl ffe_ctl = {};
   4528	bool final_tried = num_bytes == min_alloc_size;
   4529	u64 flags;
   4530	int ret;
   4531	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
   4532	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
   4533
   4534	flags = get_alloc_profile_by_root(root, is_data);
   4535again:
   4536	WARN_ON(num_bytes < fs_info->sectorsize);
   4537
   4538	ffe_ctl.ram_bytes = ram_bytes;
   4539	ffe_ctl.num_bytes = num_bytes;
   4540	ffe_ctl.min_alloc_size = min_alloc_size;
   4541	ffe_ctl.empty_size = empty_size;
   4542	ffe_ctl.flags = flags;
   4543	ffe_ctl.delalloc = delalloc;
   4544	ffe_ctl.hint_byte = hint_byte;
   4545	ffe_ctl.for_treelog = for_treelog;
   4546	ffe_ctl.for_data_reloc = for_data_reloc;
   4547
   4548	ret = find_free_extent(root, ins, &ffe_ctl);
   4549	if (!ret && !is_data) {
   4550		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
   4551	} else if (ret == -ENOSPC) {
   4552		if (!final_tried && ins->offset) {
   4553			num_bytes = min(num_bytes >> 1, ins->offset);
   4554			num_bytes = round_down(num_bytes,
   4555					       fs_info->sectorsize);
   4556			num_bytes = max(num_bytes, min_alloc_size);
   4557			ram_bytes = num_bytes;
   4558			if (num_bytes == min_alloc_size)
   4559				final_tried = true;
   4560			goto again;
   4561		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
   4562			struct btrfs_space_info *sinfo;
   4563
   4564			sinfo = btrfs_find_space_info(fs_info, flags);
   4565			btrfs_err(fs_info,
   4566	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
   4567				  flags, num_bytes, for_treelog, for_data_reloc);
   4568			if (sinfo)
   4569				btrfs_dump_space_info(fs_info, sinfo,
   4570						      num_bytes, 1);
   4571		}
   4572	}
   4573
   4574	return ret;
   4575}
   4576
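        /*
         * Return a reserved but unused extent to the free space cache and
         * release its reservation in the block group.
         */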
   4577int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
   4578			       u64 start, u64 len, int delalloc)
   4579{
   4580	struct btrfs_block_group *cache;
   4581
   4582	cache = btrfs_lookup_block_group(fs_info, start);
   4583	if (!cache) {
   4584		btrfs_err(fs_info, "Unable to find block group for %llu",
   4585			  start);
   4586		return -ENOSPC;
   4587	}
   4588
   4589	btrfs_add_free_space(cache, start, len);
   4590	btrfs_free_reserved_bytes(cache, len, delalloc);
   4591	trace_btrfs_reserved_extent_free(fs_info, start, len);
   4592
   4593	btrfs_put_block_group(cache);
   4594	return 0;
   4595}
   4596
   4597int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
   4598			      u64 len)
   4599{
   4600	struct btrfs_block_group *cache;
   4601	int ret = 0;
   4602
   4603	cache = btrfs_lookup_block_group(trans->fs_info, start);
   4604	if (!cache) {
   4605		btrfs_err(trans->fs_info, "unable to find block group for %llu",
   4606			  start);
   4607		return -ENOSPC;
   4608	}
   4609
   4610	ret = pin_down_extent(trans, cache, start, len, 1);
   4611	btrfs_put_block_group(cache);
   4612	return ret;
   4613}
   4614
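        /*
         * Common accounting once the extent item for a newly allocated extent
         * has been inserted: remove the range from the free space tree and
         * account it as used in its block group.
         */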
   4615static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
   4616				 u64 num_bytes)
   4617{
   4618	struct btrfs_fs_info *fs_info = trans->fs_info;
   4619	int ret;
   4620
   4621	ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
   4622	if (ret)
   4623		return ret;
   4624
   4625	ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
   4626	if (ret) {
   4627		ASSERT(!ret);
   4628		btrfs_err(fs_info, "update block group failed for %llu %llu",
   4629			  bytenr, num_bytes);
   4630		return ret;
   4631	}
   4632
   4633	trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
   4634	return 0;
   4635}
   4636
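        /*
         * Insert the extent item plus a single inline ref for a newly
         * allocated data extent: a shared data ref keyed on @parent when it is
         * set, otherwise an extent data ref recording root, owner and offset.
         */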
   4637static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
   4638				      u64 parent, u64 root_objectid,
   4639				      u64 flags, u64 owner, u64 offset,
   4640				      struct btrfs_key *ins, int ref_mod)
   4641{
   4642	struct btrfs_fs_info *fs_info = trans->fs_info;
   4643	struct btrfs_root *extent_root;
   4644	int ret;
   4645	struct btrfs_extent_item *extent_item;
   4646	struct btrfs_extent_inline_ref *iref;
   4647	struct btrfs_path *path;
   4648	struct extent_buffer *leaf;
   4649	int type;
   4650	u32 size;
   4651
   4652	if (parent > 0)
   4653		type = BTRFS_SHARED_DATA_REF_KEY;
   4654	else
   4655		type = BTRFS_EXTENT_DATA_REF_KEY;
   4656
   4657	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
   4658
   4659	path = btrfs_alloc_path();
   4660	if (!path)
   4661		return -ENOMEM;
   4662
   4663	extent_root = btrfs_extent_root(fs_info, ins->objectid);
   4664	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
   4665	if (ret) {
   4666		btrfs_free_path(path);
   4667		return ret;
   4668	}
   4669
   4670	leaf = path->nodes[0];
   4671	extent_item = btrfs_item_ptr(leaf, path->slots[0],
   4672				     struct btrfs_extent_item);
   4673	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
   4674	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
   4675	btrfs_set_extent_flags(leaf, extent_item,
   4676			       flags | BTRFS_EXTENT_FLAG_DATA);
   4677
   4678	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
   4679	btrfs_set_extent_inline_ref_type(leaf, iref, type);
   4680	if (parent > 0) {
   4681		struct btrfs_shared_data_ref *ref;
   4682		ref = (struct btrfs_shared_data_ref *)(iref + 1);
   4683		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
   4684		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
   4685	} else {
   4686		struct btrfs_extent_data_ref *ref;
   4687		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
   4688		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
   4689		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
   4690		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
   4691		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
   4692	}
   4693
   4694	btrfs_mark_buffer_dirty(path->nodes[0]);
   4695	btrfs_free_path(path);
   4696
   4697	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
   4698}
   4699
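        /*
         * Insert the extent item and inline ref for a newly allocated tree
         * block.  With the SKINNY_METADATA feature the level goes into the key
         * offset and no btrfs_tree_block_info is written, otherwise the
         * block_info carries the first key and the level.
         */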
   4700static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
   4701				     struct btrfs_delayed_ref_node *node,
   4702				     struct btrfs_delayed_extent_op *extent_op)
   4703{
   4704	struct btrfs_fs_info *fs_info = trans->fs_info;
   4705	struct btrfs_root *extent_root;
   4706	int ret;
   4707	struct btrfs_extent_item *extent_item;
   4708	struct btrfs_key extent_key;
   4709	struct btrfs_tree_block_info *block_info;
   4710	struct btrfs_extent_inline_ref *iref;
   4711	struct btrfs_path *path;
   4712	struct extent_buffer *leaf;
   4713	struct btrfs_delayed_tree_ref *ref;
   4714	u32 size = sizeof(*extent_item) + sizeof(*iref);
   4715	u64 flags = extent_op->flags_to_set;
   4716	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
   4717
   4718	ref = btrfs_delayed_node_to_tree_ref(node);
   4719
   4720	extent_key.objectid = node->bytenr;
   4721	if (skinny_metadata) {
   4722		extent_key.offset = ref->level;
   4723		extent_key.type = BTRFS_METADATA_ITEM_KEY;
   4724	} else {
   4725		extent_key.offset = node->num_bytes;
   4726		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
   4727		size += sizeof(*block_info);
   4728	}
   4729
   4730	path = btrfs_alloc_path();
   4731	if (!path)
   4732		return -ENOMEM;
   4733
   4734	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
   4735	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
   4736				      size);
   4737	if (ret) {
   4738		btrfs_free_path(path);
   4739		return ret;
   4740	}
   4741
   4742	leaf = path->nodes[0];
   4743	extent_item = btrfs_item_ptr(leaf, path->slots[0],
   4744				     struct btrfs_extent_item);
   4745	btrfs_set_extent_refs(leaf, extent_item, 1);
   4746	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
   4747	btrfs_set_extent_flags(leaf, extent_item,
   4748			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
   4749
   4750	if (skinny_metadata) {
   4751		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
   4752	} else {
   4753		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
   4754		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
   4755		btrfs_set_tree_block_level(leaf, block_info, ref->level);
   4756		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
   4757	}
   4758
   4759	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
   4760		btrfs_set_extent_inline_ref_type(leaf, iref,
   4761						 BTRFS_SHARED_BLOCK_REF_KEY);
   4762		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
   4763	} else {
   4764		btrfs_set_extent_inline_ref_type(leaf, iref,
   4765						 BTRFS_TREE_BLOCK_REF_KEY);
   4766		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
   4767	}
   4768
   4769	btrfs_mark_buffer_dirty(leaf);
   4770	btrfs_free_path(path);
   4771
   4772	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
   4773}
   4774
   4775int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
   4776				     struct btrfs_root *root, u64 owner,
   4777				     u64 offset, u64 ram_bytes,
   4778				     struct btrfs_key *ins)
   4779{
   4780	struct btrfs_ref generic_ref = { 0 };
   4781
   4782	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
   4783
   4784	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
   4785			       ins->objectid, ins->offset, 0);
   4786	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
   4787			    offset, 0, false);
   4788	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
   4789
   4790	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
   4791}
   4792
   4793/*
   4794 * this is used by the tree logging recovery code.  It records that
   4795 * an extent has been allocated and makes sure to clear the free
   4796 * space cache bits as well
   4797 */
   4798int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
   4799				   u64 root_objectid, u64 owner, u64 offset,
   4800				   struct btrfs_key *ins)
   4801{
   4802	struct btrfs_fs_info *fs_info = trans->fs_info;
   4803	int ret;
   4804	struct btrfs_block_group *block_group;
   4805	struct btrfs_space_info *space_info;
   4806
   4807	/*
   4808	 * Mixed block groups will exclude before processing the log so we only
   4809	 * need to do the exclude dance if this fs isn't mixed.
   4810	 */
   4811	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
   4812		ret = __exclude_logged_extent(fs_info, ins->objectid,
   4813					      ins->offset);
   4814		if (ret)
   4815			return ret;
   4816	}
   4817
   4818	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
   4819	if (!block_group)
   4820		return -EINVAL;
   4821
   4822	space_info = block_group->space_info;
   4823	spin_lock(&space_info->lock);
   4824	spin_lock(&block_group->lock);
   4825	space_info->bytes_reserved += ins->offset;
   4826	block_group->reserved += ins->offset;
   4827	spin_unlock(&block_group->lock);
   4828	spin_unlock(&space_info->lock);
   4829
   4830	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
   4831					 offset, ins, 1);
   4832	if (ret)
   4833		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
   4834	btrfs_put_block_group(block_group);
   4835	return ret;
   4836}
   4837
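        /*
         * Set up the extent buffer for a newly allocated tree block: lock it,
         * zero and refill the header (level, owner, bytenr, generation), and
         * record the range in the log root's dirty_log_pages or in the
         * transaction's dirty_pages so the new block gets written out.
         */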
   4838static struct extent_buffer *
   4839btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
   4840		      u64 bytenr, int level, u64 owner,
   4841		      enum btrfs_lock_nesting nest)
   4842{
   4843	struct btrfs_fs_info *fs_info = root->fs_info;
   4844	struct extent_buffer *buf;
   4845
   4846	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
   4847	if (IS_ERR(buf))
   4848		return buf;
   4849
   4850	/*
   4851	 * Extra safety check in case the extent tree is corrupted and extent
   4852	 * allocator chooses to use a tree block which is already used and
   4853	 * locked.
   4854	 */
   4855	if (buf->lock_owner == current->pid) {
   4856		btrfs_err_rl(fs_info,
   4857"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
   4858			buf->start, btrfs_header_owner(buf), current->pid);
   4859		free_extent_buffer(buf);
   4860		return ERR_PTR(-EUCLEAN);
   4861	}
   4862
   4863	/*
   4864	 * This needs to stay, because we could allocate a freed block from an
   4865	 * old tree into a new tree, so we need to make sure this new block is
   4866	 * set to the appropriate level and owner.
   4867	 */
   4868	btrfs_set_buffer_lockdep_class(owner, buf, level);
   4869	__btrfs_tree_lock(buf, nest);
   4870	btrfs_clean_tree_block(buf);
   4871	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
   4872	clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
   4873
   4874	set_extent_buffer_uptodate(buf);
   4875
   4876	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
   4877	btrfs_set_header_level(buf, level);
   4878	btrfs_set_header_bytenr(buf, buf->start);
   4879	btrfs_set_header_generation(buf, trans->transid);
   4880	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
   4881	btrfs_set_header_owner(buf, owner);
   4882	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
   4883	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
   4884	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
   4885		buf->log_index = root->log_transid % 2;
   4886		/*
   4887		 * we allow two log transactions at a time, so use different
   4888		 * EXTENT bits to differentiate dirty pages.
   4889		 */
   4890		if (buf->log_index == 0)
   4891			set_extent_dirty(&root->dirty_log_pages, buf->start,
   4892					buf->start + buf->len - 1, GFP_NOFS);
   4893		else
   4894			set_extent_new(&root->dirty_log_pages, buf->start,
   4895					buf->start + buf->len - 1);
   4896	} else {
   4897		buf->log_index = -1;
   4898		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
   4899			 buf->start + buf->len - 1, GFP_NOFS);
   4900	}
   4901	/* this returns a buffer locked for blocking */
   4902	return buf;
   4903}
   4904
   4905/*
   4906 * finds a free extent and does all the dirty work required for allocation.
   4907 * Returns the tree buffer or an ERR_PTR on error.
   4908 */
   4909struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
   4910					     struct btrfs_root *root,
   4911					     u64 parent, u64 root_objectid,
   4912					     const struct btrfs_disk_key *key,
   4913					     int level, u64 hint,
   4914					     u64 empty_size,
   4915					     enum btrfs_lock_nesting nest)
   4916{
   4917	struct btrfs_fs_info *fs_info = root->fs_info;
   4918	struct btrfs_key ins;
   4919	struct btrfs_block_rsv *block_rsv;
   4920	struct extent_buffer *buf;
   4921	struct btrfs_delayed_extent_op *extent_op;
   4922	struct btrfs_ref generic_ref = { 0 };
   4923	u64 flags = 0;
   4924	int ret;
   4925	u32 blocksize = fs_info->nodesize;
   4926	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
   4927
   4928#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
   4929	if (btrfs_is_testing(fs_info)) {
   4930		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
   4931					    level, root_objectid, nest);
   4932		if (!IS_ERR(buf))
   4933			root->alloc_bytenr += blocksize;
   4934		return buf;
   4935	}
   4936#endif
   4937
   4938	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
   4939	if (IS_ERR(block_rsv))
   4940		return ERR_CAST(block_rsv);
   4941
   4942	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
   4943				   empty_size, hint, &ins, 0, 0);
   4944	if (ret)
   4945		goto out_unuse;
   4946
   4947	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
   4948				    root_objectid, nest);
   4949	if (IS_ERR(buf)) {
   4950		ret = PTR_ERR(buf);
   4951		goto out_free_reserved;
   4952	}
   4953
   4954	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
   4955		if (parent == 0)
   4956			parent = ins.objectid;
   4957		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
   4958	} else
   4959		BUG_ON(parent > 0);
   4960
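       	/*
       	 * For the log tree we skip the delayed ref; for all other trees,
       	 * queue a delayed tree ref carrying the key and flags to set on the
       	 * new extent item.
       	 */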
   4961	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
   4962		extent_op = btrfs_alloc_delayed_extent_op();
   4963		if (!extent_op) {
   4964			ret = -ENOMEM;
   4965			goto out_free_buf;
   4966		}
   4967		if (key)
   4968			memcpy(&extent_op->key, key, sizeof(extent_op->key));
   4969		else
   4970			memset(&extent_op->key, 0, sizeof(extent_op->key));
   4971		extent_op->flags_to_set = flags;
   4972		extent_op->update_key = skinny_metadata ? false : true;
   4973		extent_op->update_flags = true;
   4974		extent_op->level = level;
   4975
   4976		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
   4977				       ins.objectid, ins.offset, parent);
   4978		btrfs_init_tree_ref(&generic_ref, level, root_objectid,
   4979				    root->root_key.objectid, false);
   4980		btrfs_ref_tree_mod(fs_info, &generic_ref);
   4981		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
   4982		if (ret)
   4983			goto out_free_delayed;
   4984	}
   4985	return buf;
   4986
   4987out_free_delayed:
   4988	btrfs_free_delayed_extent_op(extent_op);
   4989out_free_buf:
   4990	btrfs_tree_unlock(buf);
   4991	free_extent_buffer(buf);
   4992out_free_reserved:
   4993	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
   4994out_unuse:
   4995	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
   4996	return ERR_PTR(ret);
   4997}
   4998
   4999struct walk_control {
   5000	u64 refs[BTRFS_MAX_LEVEL];
   5001	u64 flags[BTRFS_MAX_LEVEL];
   5002	struct btrfs_key update_progress;
   5003	struct btrfs_key drop_progress;
   5004	int drop_level;
   5005	int stage;
   5006	int level;
   5007	int shared_level;
   5008	int update_ref;
   5009	int keep_locks;
   5010	int reada_slot;
   5011	int reada_count;
   5012	int restarted;
   5013};
   5014
   5015#define DROP_REFERENCE	1
   5016#define UPDATE_BACKREF	2
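       /*
        * A short summary of the two stages used by the tree walkers below:
        * in DROP_REFERENCE the walk drops this root's reference on each
        * block and frees the blocks it was the last user of (walk_up_proc).
        * When do_walk_down() hits a shared block that was created after the
        * snapshot and wc->update_ref is set, the stage switches to
        * UPDATE_BACKREF for that subtree so that full backrefs are added
        * (walk_down_proc), and walk_up_proc() switches back to
        * DROP_REFERENCE once the shared level has been processed again.
        */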
   5017
   5018static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
   5019				     struct btrfs_root *root,
   5020				     struct walk_control *wc,
   5021				     struct btrfs_path *path)
   5022{
   5023	struct btrfs_fs_info *fs_info = root->fs_info;
   5024	u64 bytenr;
   5025	u64 generation;
   5026	u64 refs;
   5027	u64 flags;
   5028	u32 nritems;
   5029	struct btrfs_key key;
   5030	struct extent_buffer *eb;
   5031	int ret;
   5032	int slot;
   5033	int nread = 0;
   5034
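       	/*
       	 * Scale the readahead window: shrink it (to no less than 2 nodes)
       	 * while we are still behind the previous readahead position,
       	 * otherwise grow it, capped at the number of node pointers per
       	 * block.
       	 */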
   5035	if (path->slots[wc->level] < wc->reada_slot) {
   5036		wc->reada_count = wc->reada_count * 2 / 3;
   5037		wc->reada_count = max(wc->reada_count, 2);
   5038	} else {
   5039		wc->reada_count = wc->reada_count * 3 / 2;
   5040		wc->reada_count = min_t(int, wc->reada_count,
   5041					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
   5042	}
   5043
   5044	eb = path->nodes[wc->level];
   5045	nritems = btrfs_header_nritems(eb);
   5046
   5047	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
   5048		if (nread >= wc->reada_count)
   5049			break;
   5050
   5051		cond_resched();
   5052		bytenr = btrfs_node_blockptr(eb, slot);
   5053		generation = btrfs_node_ptr_generation(eb, slot);
   5054
   5055		if (slot == path->slots[wc->level])
   5056			goto reada;
   5057
   5058		if (wc->stage == UPDATE_BACKREF &&
   5059		    generation <= root->root_key.offset)
   5060			continue;
   5061
   5062		/* We don't lock the tree block, it's OK to be racy here */
   5063		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
   5064					       wc->level - 1, 1, &refs,
   5065					       &flags);
   5066		/* We don't care about errors in readahead. */
   5067		if (ret < 0)
   5068			continue;
   5069		BUG_ON(refs == 0);
   5070
   5071		if (wc->stage == DROP_REFERENCE) {
   5072			if (refs == 1)
   5073				goto reada;
   5074
   5075			if (wc->level == 1 &&
   5076			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
   5077				continue;
   5078			if (!wc->update_ref ||
   5079			    generation <= root->root_key.offset)
   5080				continue;
   5081			btrfs_node_key_to_cpu(eb, &key, slot);
   5082			ret = btrfs_comp_cpu_keys(&key,
   5083						  &wc->update_progress);
   5084			if (ret < 0)
   5085				continue;
   5086		} else {
   5087			if (wc->level == 1 &&
   5088			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
   5089				continue;
   5090		}
   5091reada:
   5092		btrfs_readahead_node_child(eb, slot);
   5093		nread++;
   5094	}
   5095	wc->reada_slot = slot;
   5096}
   5097
   5098/*
   5099 * helper to process a tree block while walking down the tree.
   5100 *
   5101 * when wc->stage == UPDATE_BACKREF, this function updates
   5102 * back refs for pointers in the block.
   5103 *
   5104 * NOTE: return value 1 means we should stop walking down.
   5105 */
   5106static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
   5107				   struct btrfs_root *root,
   5108				   struct btrfs_path *path,
   5109				   struct walk_control *wc, int lookup_info)
   5110{
   5111	struct btrfs_fs_info *fs_info = root->fs_info;
   5112	int level = wc->level;
   5113	struct extent_buffer *eb = path->nodes[level];
   5114	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
   5115	int ret;
   5116
   5117	if (wc->stage == UPDATE_BACKREF &&
   5118	    btrfs_header_owner(eb) != root->root_key.objectid)
   5119		return 1;
   5120
   5121	/*
   5122	 * When the reference count of a tree block is 1, it won't increase
   5123	 * again. Once the full backref flag is set, we never clear it.
   5124	 */
   5125	if (lookup_info &&
   5126	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
   5127	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
   5128		BUG_ON(!path->locks[level]);
   5129		ret = btrfs_lookup_extent_info(trans, fs_info,
   5130					       eb->start, level, 1,
   5131					       &wc->refs[level],
   5132					       &wc->flags[level]);
   5133		BUG_ON(ret == -ENOMEM);
   5134		if (ret)
   5135			return ret;
   5136		BUG_ON(wc->refs[level] == 0);
   5137	}
   5138
   5139	if (wc->stage == DROP_REFERENCE) {
   5140		if (wc->refs[level] > 1)
   5141			return 1;
   5142
   5143		if (path->locks[level] && !wc->keep_locks) {
   5144			btrfs_tree_unlock_rw(eb, path->locks[level]);
   5145			path->locks[level] = 0;
   5146		}
   5147		return 0;
   5148	}
   5149
   5150	/* wc->stage == UPDATE_BACKREF */
   5151	if (!(wc->flags[level] & flag)) {
   5152		BUG_ON(!path->locks[level]);
   5153		ret = btrfs_inc_ref(trans, root, eb, 1);
   5154		BUG_ON(ret); /* -ENOMEM */
   5155		ret = btrfs_dec_ref(trans, root, eb, 0);
   5156		BUG_ON(ret); /* -ENOMEM */
   5157		ret = btrfs_set_disk_extent_flags(trans, eb, flag,
   5158						  btrfs_header_level(eb));
   5159		BUG_ON(ret); /* -ENOMEM */
   5160		wc->flags[level] |= flag;
   5161	}
   5162
   5163	/*
   5164	 * the block is shared by multiple trees, so it's not good to
   5165	 * keep the tree lock
   5166	 */
   5167	if (path->locks[level] && level > 0) {
   5168		btrfs_tree_unlock_rw(eb, path->locks[level]);
   5169		path->locks[level] = 0;
   5170	}
   5171	return 0;
   5172}
   5173
   5174/*
   5175 * This is used to verify a ref exists for this root to deal with a bug where we
   5176 * would have a drop_progress key that hadn't been updated properly.
   5177 */
   5178static int check_ref_exists(struct btrfs_trans_handle *trans,
   5179			    struct btrfs_root *root, u64 bytenr, u64 parent,
   5180			    int level)
   5181{
   5182	struct btrfs_path *path;
   5183	struct btrfs_extent_inline_ref *iref;
   5184	int ret;
   5185
   5186	path = btrfs_alloc_path();
   5187	if (!path)
   5188		return -ENOMEM;
   5189
   5190	ret = lookup_extent_backref(trans, path, &iref, bytenr,
   5191				    root->fs_info->nodesize, parent,
   5192				    root->root_key.objectid, level, 0);
   5193	btrfs_free_path(path);
   5194	if (ret == -ENOENT)
   5195		return 0;
   5196	if (ret < 0)
   5197		return ret;
   5198	return 1;
   5199}
   5200
   5201/*
   5202 * helper to process a tree block pointer.
   5203 *
   5204 * when wc->stage == DROP_REFERENCE, this function checks the
   5205 * reference count of the block pointed to. If the block is
   5206 * shared and we need to update back refs for the subtree
   5207 * rooted at the block, this function changes wc->stage to
   5208 * UPDATE_BACKREF. If the block is shared and there is no
   5209 * need to update back refs, this function drops the reference
   5210 * to the block.
   5211 *
   5212 * NOTE: return value 1 means we should stop walking down.
   5213 */
   5214static noinline int do_walk_down(struct btrfs_trans_handle *trans,
   5215				 struct btrfs_root *root,
   5216				 struct btrfs_path *path,
   5217				 struct walk_control *wc, int *lookup_info)
   5218{
   5219	struct btrfs_fs_info *fs_info = root->fs_info;
   5220	u64 bytenr;
   5221	u64 generation;
   5222	u64 parent;
   5223	struct btrfs_key key;
   5224	struct btrfs_key first_key;
   5225	struct btrfs_ref ref = { 0 };
   5226	struct extent_buffer *next;
   5227	int level = wc->level;
   5228	int reada = 0;
   5229	int ret = 0;
   5230	bool need_account = false;
   5231
   5232	generation = btrfs_node_ptr_generation(path->nodes[level],
   5233					       path->slots[level]);
   5234	/*
   5235	 * if the lower level block was created before the snapshot
   5236	 * was created, we know there is no need to update back refs
   5237	 * for the subtree
   5238	 */
   5239	if (wc->stage == UPDATE_BACKREF &&
   5240	    generation <= root->root_key.offset) {
   5241		*lookup_info = 1;
   5242		return 1;
   5243	}
   5244
   5245	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
   5246	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
   5247			      path->slots[level]);
   5248
   5249	next = find_extent_buffer(fs_info, bytenr);
   5250	if (!next) {
   5251		next = btrfs_find_create_tree_block(fs_info, bytenr,
   5252				root->root_key.objectid, level - 1);
   5253		if (IS_ERR(next))
   5254			return PTR_ERR(next);
   5255		reada = 1;
   5256	}
   5257	btrfs_tree_lock(next);
   5258
   5259	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
   5260				       &wc->refs[level - 1],
   5261				       &wc->flags[level - 1]);
   5262	if (ret < 0)
   5263		goto out_unlock;
   5264
   5265	if (unlikely(wc->refs[level - 1] == 0)) {
   5266		btrfs_err(fs_info, "Missing references.");
   5267		ret = -EIO;
   5268		goto out_unlock;
   5269	}
   5270	*lookup_info = 0;
   5271
   5272	if (wc->stage == DROP_REFERENCE) {
   5273		if (wc->refs[level - 1] > 1) {
   5274			need_account = true;
   5275			if (level == 1 &&
   5276			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
   5277				goto skip;
   5278
   5279			if (!wc->update_ref ||
   5280			    generation <= root->root_key.offset)
   5281				goto skip;
   5282
   5283			btrfs_node_key_to_cpu(path->nodes[level], &key,
   5284					      path->slots[level]);
   5285			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
   5286			if (ret < 0)
   5287				goto skip;
   5288
   5289			wc->stage = UPDATE_BACKREF;
   5290			wc->shared_level = level - 1;
   5291		}
   5292	} else {
   5293		if (level == 1 &&
   5294		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
   5295			goto skip;
   5296	}
   5297
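       	/*
       	 * If the cached buffer isn't up to date for this generation, drop
       	 * it and read the block from disk below, optionally kicking off
       	 * readahead for the remaining children of this node first.
       	 */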
   5298	if (!btrfs_buffer_uptodate(next, generation, 0)) {
   5299		btrfs_tree_unlock(next);
   5300		free_extent_buffer(next);
   5301		next = NULL;
   5302		*lookup_info = 1;
   5303	}
   5304
   5305	if (!next) {
   5306		if (reada && level == 1)
   5307			reada_walk_down(trans, root, wc, path);
   5308		next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
   5309				       generation, level - 1, &first_key);
   5310		if (IS_ERR(next)) {
   5311			return PTR_ERR(next);
   5312		} else if (!extent_buffer_uptodate(next)) {
   5313			free_extent_buffer(next);
   5314			return -EIO;
   5315		}
   5316		btrfs_tree_lock(next);
   5317	}
   5318
   5319	level--;
   5320	ASSERT(level == btrfs_header_level(next));
   5321	if (level != btrfs_header_level(next)) {
   5322		btrfs_err(root->fs_info, "mismatched level");
   5323		ret = -EIO;
   5324		goto out_unlock;
   5325	}
   5326	path->nodes[level] = next;
   5327	path->slots[level] = 0;
   5328	path->locks[level] = BTRFS_WRITE_LOCK;
   5329	wc->level = level;
   5330	if (wc->level == 1)
   5331		wc->reada_slot = 0;
   5332	return 0;
   5333skip:
   5334	wc->refs[level - 1] = 0;
   5335	wc->flags[level - 1] = 0;
   5336	if (wc->stage == DROP_REFERENCE) {
   5337		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
   5338			parent = path->nodes[level]->start;
   5339		} else {
   5340			ASSERT(root->root_key.objectid ==
   5341			       btrfs_header_owner(path->nodes[level]));
   5342			if (root->root_key.objectid !=
   5343			    btrfs_header_owner(path->nodes[level])) {
   5344				btrfs_err(root->fs_info,
   5345						"mismatched block owner");
   5346				ret = -EIO;
   5347				goto out_unlock;
   5348			}
   5349			parent = 0;
   5350		}
   5351
   5352		/*
   5353		 * If we had a drop_progress we need to verify the refs are set
   5354		 * as expected.  If we find our ref then we know that from here
   5355		 * on out everything should be correct, and we can clear the
   5356		 * ->restarted flag.
   5357		 */
   5358		if (wc->restarted) {
   5359			ret = check_ref_exists(trans, root, bytenr, parent,
   5360					       level - 1);
   5361			if (ret < 0)
   5362				goto out_unlock;
   5363			if (ret == 0)
   5364				goto no_delete;
   5365			ret = 0;
   5366			wc->restarted = 0;
   5367		}
   5368
   5369		/*
   5370		 * The reloc tree doesn't contribute to qgroup numbers, and we have
   5371		 * already accounted them at merge time (replace_path),
   5372		 * so we can skip the expensive subtree trace here.
   5373		 */
   5374		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
   5375		    need_account) {
   5376			ret = btrfs_qgroup_trace_subtree(trans, next,
   5377							 generation, level - 1);
   5378			if (ret) {
   5379				btrfs_err_rl(fs_info,
   5380					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
   5381					     ret);
   5382			}
   5383		}
   5384
   5385		/*
   5386		 * We need to update the next key in our walk control so we can
   5387		 * update the drop_progress key accordingly.  We don't care if
   5388		 * find_next_key doesn't find a key because that means we're at
   5389		 * the end and are going to clean up now.
   5390		 */
   5391		wc->drop_level = level;
   5392		find_next_key(path, level, &wc->drop_progress);
   5393
   5394		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
   5395				       fs_info->nodesize, parent);
   5396		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
   5397				    0, false);
   5398		ret = btrfs_free_extent(trans, &ref);
   5399		if (ret)
   5400			goto out_unlock;
   5401	}
   5402no_delete:
   5403	*lookup_info = 1;
   5404	ret = 1;
   5405
   5406out_unlock:
   5407	btrfs_tree_unlock(next);
   5408	free_extent_buffer(next);
   5409
   5410	return ret;
   5411}
   5412
   5413/*
   5414 * helper to process a tree block while walking up the tree.
   5415 *
   5416 * when wc->stage == DROP_REFERENCE, this function drops the
   5417 * reference count on the block.
   5418 *
   5419 * when wc->stage == UPDATE_BACKREF, this function changes
   5420 * wc->stage back to DROP_REFERENCE if we changed wc->stage
   5421 * to UPDATE_BACKREF previously while processing the block.
   5422 *
   5423 * NOTE: return value 1 means we should stop walking up.
   5424 */
   5425static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
   5426				 struct btrfs_root *root,
   5427				 struct btrfs_path *path,
   5428				 struct walk_control *wc)
   5429{
   5430	struct btrfs_fs_info *fs_info = root->fs_info;
   5431	int ret;
   5432	int level = wc->level;
   5433	struct extent_buffer *eb = path->nodes[level];
   5434	u64 parent = 0;
   5435
   5436	if (wc->stage == UPDATE_BACKREF) {
   5437		BUG_ON(wc->shared_level < level);
   5438		if (level < wc->shared_level)
   5439			goto out;
   5440
   5441		ret = find_next_key(path, level + 1, &wc->update_progress);
   5442		if (ret > 0)
   5443			wc->update_ref = 0;
   5444
   5445		wc->stage = DROP_REFERENCE;
   5446		wc->shared_level = -1;
   5447		path->slots[level] = 0;
   5448
   5449		/*
   5450		 * check the reference count again if the block isn't locked.
   5451		 * we should start walking down the tree again if the reference
   5452		 * count is one.
   5453		 */
   5454		if (!path->locks[level]) {
   5455			BUG_ON(level == 0);
   5456			btrfs_tree_lock(eb);
   5457			path->locks[level] = BTRFS_WRITE_LOCK;
   5458
   5459			ret = btrfs_lookup_extent_info(trans, fs_info,
   5460						       eb->start, level, 1,
   5461						       &wc->refs[level],
   5462						       &wc->flags[level]);
   5463			if (ret < 0) {
   5464				btrfs_tree_unlock_rw(eb, path->locks[level]);
   5465				path->locks[level] = 0;
   5466				return ret;
   5467			}
   5468			BUG_ON(wc->refs[level] == 0);
   5469			if (wc->refs[level] == 1) {
   5470				btrfs_tree_unlock_rw(eb, path->locks[level]);
   5471				path->locks[level] = 0;
   5472				return 1;
   5473			}
   5474		}
   5475	}
   5476
   5477	/* wc->stage == DROP_REFERENCE */
   5478	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
   5479
   5480	if (wc->refs[level] == 1) {
   5481		if (level == 0) {
   5482			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
   5483				ret = btrfs_dec_ref(trans, root, eb, 1);
   5484			else
   5485				ret = btrfs_dec_ref(trans, root, eb, 0);
   5486			BUG_ON(ret); /* -ENOMEM */
   5487			if (is_fstree(root->root_key.objectid)) {
   5488				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
   5489				if (ret) {
   5490					btrfs_err_rl(fs_info,
   5491	"error %d accounting leaf items, quota is out of sync, rescan required",
   5492					     ret);
   5493				}
   5494			}
   5495		}
   5496		/* make block locked assertion in btrfs_clean_tree_block happy */
   5497		if (!path->locks[level] &&
   5498		    btrfs_header_generation(eb) == trans->transid) {
   5499			btrfs_tree_lock(eb);
   5500			path->locks[level] = BTRFS_WRITE_LOCK;
   5501		}
   5502		btrfs_clean_tree_block(eb);
   5503	}
   5504
   5505	if (eb == root->node) {
   5506		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
   5507			parent = eb->start;
   5508		else if (root->root_key.objectid != btrfs_header_owner(eb))
   5509			goto owner_mismatch;
   5510	} else {
   5511		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
   5512			parent = path->nodes[level + 1]->start;
   5513		else if (root->root_key.objectid !=
   5514			 btrfs_header_owner(path->nodes[level + 1]))
   5515			goto owner_mismatch;
   5516	}
   5517
   5518	btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
   5519			      wc->refs[level] == 1);
   5520out:
   5521	wc->refs[level] = 0;
   5522	wc->flags[level] = 0;
   5523	return 0;
   5524
   5525owner_mismatch:
   5526	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
   5527		     btrfs_header_owner(eb), root->root_key.objectid);
   5528	return -EUCLEAN;
   5529}
   5530
   5531static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
   5532				   struct btrfs_root *root,
   5533				   struct btrfs_path *path,
   5534				   struct walk_control *wc)
   5535{
   5536	int level = wc->level;
   5537	int lookup_info = 1;
   5538	int ret;
   5539
   5540	while (level >= 0) {
   5541		ret = walk_down_proc(trans, root, path, wc, lookup_info);
   5542		if (ret > 0)
   5543			break;
   5544
   5545		if (level == 0)
   5546			break;
   5547
   5548		if (path->slots[level] >=
   5549		    btrfs_header_nritems(path->nodes[level]))
   5550			break;
   5551
   5552		ret = do_walk_down(trans, root, path, wc, &lookup_info);
   5553		if (ret > 0) {
   5554			path->slots[level]++;
   5555			continue;
   5556		} else if (ret < 0)
   5557			return ret;
   5558		level = wc->level;
   5559	}
   5560	return 0;
   5561}
   5562
   5563static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
   5564				 struct btrfs_root *root,
   5565				 struct btrfs_path *path,
   5566				 struct walk_control *wc, int max_level)
   5567{
   5568	int level = wc->level;
   5569	int ret;
   5570
   5571	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
   5572	while (level < max_level && path->nodes[level]) {
   5573		wc->level = level;
   5574		if (path->slots[level] + 1 <
   5575		    btrfs_header_nritems(path->nodes[level])) {
   5576			path->slots[level]++;
   5577			return 0;
   5578		} else {
   5579			ret = walk_up_proc(trans, root, path, wc);
   5580			if (ret > 0)
   5581				return 0;
   5582			if (ret < 0)
   5583				return ret;
   5584
   5585			if (path->locks[level]) {
   5586				btrfs_tree_unlock_rw(path->nodes[level],
   5587						     path->locks[level]);
   5588				path->locks[level] = 0;
   5589			}
   5590			free_extent_buffer(path->nodes[level]);
   5591			path->nodes[level] = NULL;
   5592			level++;
   5593		}
   5594	}
   5595	return 1;
   5596}
   5597
   5598/*
   5599 * drop a subvolume tree.
   5600 *
   5601 * this function traverses the tree, freeing any blocks that are only
   5602 * referenced by the tree.
   5603 *
   5604 * when a shared tree block is found, this function decreases its
   5605 * reference count by one. If update_ref is true, this function
   5606 * also makes sure backrefs for the shared block and all lower level
   5607 * blocks are properly updated.
   5608 *
   5609 * If called with for_reloc == 0, may exit early with -EAGAIN
   5610 */
   5611int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
   5612{
   5613	struct btrfs_fs_info *fs_info = root->fs_info;
   5614	struct btrfs_path *path;
   5615	struct btrfs_trans_handle *trans;
   5616	struct btrfs_root *tree_root = fs_info->tree_root;
   5617	struct btrfs_root_item *root_item = &root->root_item;
   5618	struct walk_control *wc;
   5619	struct btrfs_key key;
   5620	int err = 0;
   5621	int ret;
   5622	int level;
   5623	bool root_dropped = false;
   5624	bool unfinished_drop = false;
   5625
   5626	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
   5627
   5628	path = btrfs_alloc_path();
   5629	if (!path) {
   5630		err = -ENOMEM;
   5631		goto out;
   5632	}
   5633
   5634	wc = kzalloc(sizeof(*wc), GFP_NOFS);
   5635	if (!wc) {
   5636		btrfs_free_path(path);
   5637		err = -ENOMEM;
   5638		goto out;
   5639	}
   5640
   5641	/*
   5642	 * Use join to avoid potential EINTR from transaction start. See
   5643	 * wait_reserve_ticket and the whole reservation callchain.
   5644	 */
   5645	if (for_reloc)
   5646		trans = btrfs_join_transaction(tree_root);
   5647	else
   5648		trans = btrfs_start_transaction(tree_root, 0);
   5649	if (IS_ERR(trans)) {
   5650		err = PTR_ERR(trans);
   5651		goto out_free;
   5652	}
   5653
   5654	err = btrfs_run_delayed_items(trans);
   5655	if (err)
   5656		goto out_end_trans;
   5657
   5658	/*
   5659	 * This will help us catch people modifying the fs tree while we're
   5660	 * dropping it.  It is unsafe to mess with the fs tree while it's being
   5661	 * dropped as we unlock the root node and parent nodes as we walk down
   5662	 * the tree, assuming nothing will change.  If something does change
   5663	 * then we'll have stale information and drop references to blocks we've
   5664	 * already dropped.
   5665	 */
   5666	set_bit(BTRFS_ROOT_DELETING, &root->state);
   5667	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
   5668
   5669	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
   5670		level = btrfs_header_level(root->node);
   5671		path->nodes[level] = btrfs_lock_root_node(root);
   5672		path->slots[level] = 0;
   5673		path->locks[level] = BTRFS_WRITE_LOCK;
   5674		memset(&wc->update_progress, 0,
   5675		       sizeof(wc->update_progress));
   5676	} else {
   5677		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
   5678		memcpy(&wc->update_progress, &key,
   5679		       sizeof(wc->update_progress));
   5680
   5681		level = btrfs_root_drop_level(root_item);
   5682		BUG_ON(level == 0);
   5683		path->lowest_level = level;
   5684		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   5685		path->lowest_level = 0;
   5686		if (ret < 0) {
   5687			err = ret;
   5688			goto out_end_trans;
   5689		}
   5690		WARN_ON(ret > 0);
   5691
   5692		/*
   5693		 * unlock our path, this is safe because only this
   5694		 * function is allowed to delete this snapshot
   5695		 */
   5696		btrfs_unlock_up_safe(path, 0);
   5697
   5698		level = btrfs_header_level(root->node);
   5699		while (1) {
   5700			btrfs_tree_lock(path->nodes[level]);
   5701			path->locks[level] = BTRFS_WRITE_LOCK;
   5702
   5703			ret = btrfs_lookup_extent_info(trans, fs_info,
   5704						path->nodes[level]->start,
   5705						level, 1, &wc->refs[level],
   5706						&wc->flags[level]);
   5707			if (ret < 0) {
   5708				err = ret;
   5709				goto out_end_trans;
   5710			}
   5711			BUG_ON(wc->refs[level] == 0);
   5712
   5713			if (level == btrfs_root_drop_level(root_item))
   5714				break;
   5715
   5716			btrfs_tree_unlock(path->nodes[level]);
   5717			path->locks[level] = 0;
   5718			WARN_ON(wc->refs[level] != 1);
   5719			level--;
   5720		}
   5721	}
   5722
   5723	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
   5724	wc->level = level;
   5725	wc->shared_level = -1;
   5726	wc->stage = DROP_REFERENCE;
   5727	wc->update_ref = update_ref;
   5728	wc->keep_locks = 0;
   5729	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
   5730
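       	/*
       	 * Walk down and back up the tree repeatedly; each pass records the
       	 * current position in drop_progress/drop_level of the root item
       	 * (written out whenever the transaction is ended below) so the drop
       	 * can be resumed later.
       	 */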
   5731	while (1) {
   5732
   5733		ret = walk_down_tree(trans, root, path, wc);
   5734		if (ret < 0) {
   5735			err = ret;
   5736			break;
   5737		}
   5738
   5739		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
   5740		if (ret < 0) {
   5741			err = ret;
   5742			break;
   5743		}
   5744
   5745		if (ret > 0) {
   5746			BUG_ON(wc->stage != DROP_REFERENCE);
   5747			break;
   5748		}
   5749
   5750		if (wc->stage == DROP_REFERENCE) {
   5751			wc->drop_level = wc->level;
   5752			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
   5753					      &wc->drop_progress,
   5754					      path->slots[wc->drop_level]);
   5755		}
   5756		btrfs_cpu_key_to_disk(&root_item->drop_progress,
   5757				      &wc->drop_progress);
   5758		btrfs_set_root_drop_level(root_item, wc->drop_level);
   5759
   5760		BUG_ON(wc->level == 0);
   5761		if (btrfs_should_end_transaction(trans) ||
   5762		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
   5763			ret = btrfs_update_root(trans, tree_root,
   5764						&root->root_key,
   5765						root_item);
   5766			if (ret) {
   5767				btrfs_abort_transaction(trans, ret);
   5768				err = ret;
   5769				goto out_end_trans;
   5770			}
   5771
   5772			btrfs_end_transaction_throttle(trans);
   5773			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
   5774				btrfs_debug(fs_info,
   5775					    "drop snapshot early exit");
   5776				err = -EAGAIN;
   5777				goto out_free;
   5778			}
   5779
   5780		       /*
   5781			* Use join to avoid potential EINTR from transaction
   5782			* start. See wait_reserve_ticket and the whole
   5783			* reservation callchain.
   5784			*/
   5785			if (for_reloc)
   5786				trans = btrfs_join_transaction(tree_root);
   5787			else
   5788				trans = btrfs_start_transaction(tree_root, 0);
   5789			if (IS_ERR(trans)) {
   5790				err = PTR_ERR(trans);
   5791				goto out_free;
   5792			}
   5793		}
   5794	}
   5795	btrfs_release_path(path);
   5796	if (err)
   5797		goto out_end_trans;
   5798
   5799	ret = btrfs_del_root(trans, &root->root_key);
   5800	if (ret) {
   5801		btrfs_abort_transaction(trans, ret);
   5802		err = ret;
   5803		goto out_end_trans;
   5804	}
   5805
   5806	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
   5807		ret = btrfs_find_root(tree_root, &root->root_key, path,
   5808				      NULL, NULL);
   5809		if (ret < 0) {
   5810			btrfs_abort_transaction(trans, ret);
   5811			err = ret;
   5812			goto out_end_trans;
   5813		} else if (ret > 0) {
   5814			/* if we fail to delete the orphan item this time
   5815			 * around, it'll get picked up the next time.
   5816			 *
   5817			 * The most common failure here is just -ENOENT.
   5818			 */
   5819			btrfs_del_orphan_item(trans, tree_root,
   5820					      root->root_key.objectid);
   5821		}
   5822	}
   5823
   5824	/*
   5825	 * This subvolume is going to be completely dropped, and won't be
   5826	 * recorded as a dirty root, thus its pertrans meta rsv will not be
   5827	 * freed at transaction commit time.  So free it here manually.
   5828	 */
   5829	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
   5830	btrfs_qgroup_free_meta_all_pertrans(root);
   5831
   5832	if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
   5833		btrfs_add_dropped_root(trans, root);
   5834	else
   5835		btrfs_put_root(root);
   5836	root_dropped = true;
   5837out_end_trans:
   5838	btrfs_end_transaction_throttle(trans);
   5839out_free:
   5840	kfree(wc);
   5841	btrfs_free_path(path);
   5842out:
   5843	/*
   5844	 * We were an unfinished drop root, check to see if there are any
   5845	 * pending, and if not clear and wake up any waiters.
   5846	 */
   5847	if (!err && unfinished_drop)
   5848		btrfs_maybe_wake_unfinished_drop(fs_info);
   5849
   5850	/*
   5851	 * So if we need to stop dropping the snapshot for whatever reason we
   5852	 * need to make sure to add it back to the dead root list so that we
   5853	 * keep trying to do the work later.  This also cleans up roots if we
   5854	 * don't have it in the radix (like when we recover after a power fail
   5855	 * or unmount) so we don't leak memory.
   5856	 */
   5857	if (!for_reloc && !root_dropped)
   5858		btrfs_add_dead_root(root);
   5859	return err;
   5860}
   5861
   5862/*
   5863 * drop subtree rooted at tree block 'node'.
   5864 *
   5865 * NOTE: this function will unlock and release tree block 'node'.
   5866 * Only used by the relocation code.
   5867 */
   5868int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
   5869			struct btrfs_root *root,
   5870			struct extent_buffer *node,
   5871			struct extent_buffer *parent)
   5872{
   5873	struct btrfs_fs_info *fs_info = root->fs_info;
   5874	struct btrfs_path *path;
   5875	struct walk_control *wc;
   5876	int level;
   5877	int parent_level;
   5878	int ret = 0;
   5879	int wret;
   5880
   5881	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
   5882
   5883	path = btrfs_alloc_path();
   5884	if (!path)
   5885		return -ENOMEM;
   5886
   5887	wc = kzalloc(sizeof(*wc), GFP_NOFS);
   5888	if (!wc) {
   5889		btrfs_free_path(path);
   5890		return -ENOMEM;
   5891	}
   5892
   5893	btrfs_assert_tree_write_locked(parent);
   5894	parent_level = btrfs_header_level(parent);
   5895	atomic_inc(&parent->refs);
   5896	path->nodes[parent_level] = parent;
   5897	path->slots[parent_level] = btrfs_header_nritems(parent);
   5898
   5899	btrfs_assert_tree_write_locked(node);
   5900	level = btrfs_header_level(node);
   5901	path->nodes[level] = node;
   5902	path->slots[level] = 0;
   5903	path->locks[level] = BTRFS_WRITE_LOCK;
   5904
   5905	wc->refs[parent_level] = 1;
   5906	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
   5907	wc->level = level;
   5908	wc->shared_level = -1;
   5909	wc->stage = DROP_REFERENCE;
   5910	wc->update_ref = 0;
   5911	wc->keep_locks = 1;
   5912	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
   5913
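       	/*
       	 * Same walk_down/walk_up loop as btrfs_drop_snapshot(), but bounded
       	 * at parent_level and without persisting drop progress in the root
       	 * item.
       	 */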
   5914	while (1) {
   5915		wret = walk_down_tree(trans, root, path, wc);
   5916		if (wret < 0) {
   5917			ret = wret;
   5918			break;
   5919		}
   5920
   5921		wret = walk_up_tree(trans, root, path, wc, parent_level);
   5922		if (wret < 0)
   5923			ret = wret;
   5924		if (wret != 0)
   5925			break;
   5926	}
   5927
   5928	kfree(wc);
   5929	btrfs_free_path(path);
   5930	return ret;
   5931}
   5932
   5933/*
   5934 * helper to account the unused space of all the readonly block groups in
   5935 * the space_info. Takes mirrors into account.
   5936 */
   5937u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
   5938{
   5939	struct btrfs_block_group *block_group;
   5940	u64 free_bytes = 0;
   5941	int factor;
   5942
   5943	/* It's df, we don't care if it's racy */
   5944	if (list_empty(&sinfo->ro_bgs))
   5945		return 0;
   5946
   5947	spin_lock(&sinfo->lock);
   5948	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
   5949		spin_lock(&block_group->lock);
   5950
   5951		if (!block_group->ro) {
   5952			spin_unlock(&block_group->lock);
   5953			continue;
   5954		}
   5955
   5956		factor = btrfs_bg_type_to_factor(block_group->flags);
   5957		free_bytes += (block_group->length -
   5958			       block_group->used) * factor;
   5959
   5960		spin_unlock(&block_group->lock);
   5961	}
   5962	spin_unlock(&sinfo->lock);
   5963
   5964	return free_bytes;
   5965}
   5966
   5967int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
   5968				   u64 start, u64 end)
   5969{
   5970	return unpin_extent_range(fs_info, start, end, false);
   5971}
   5972
   5973/*
   5974 * It used to be that old block groups would be left around forever.
   5975 * Iterating over them would be enough to trim unused space.  Since we
   5976 * now automatically remove them, we also need to iterate over unallocated
   5977 * space.
   5978 *
   5979 * We don't want a transaction for this since the discard may take a
   5980 * substantial amount of time.  We don't require that a transaction be
   5981 * running, but we do need to take a running transaction into account
   5982 * to ensure that we're not discarding chunks that were released or
   5983 * allocated in the current transaction.
   5984 *
   5985 * Holding the chunks lock will prevent other threads from allocating
   5986 * or releasing chunks, but it won't prevent a running transaction
   5987 * from committing and releasing the memory that the pending chunks
   5988 * list head uses.  For that, we need to take a reference to the
   5989 * transaction and hold the commit root sem.  We only need to hold
   5990 * it while performing the free space search since we have already
   5991 * held back allocations.
   5992 */
   5993static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
   5994{
   5995	u64 start = SZ_1M, len = 0, end = 0;
   5996	int ret;
   5997
   5998	*trimmed = 0;
   5999
   6000	/* Discard not supported = nothing to do. */
   6001	if (!bdev_max_discard_sectors(device->bdev))
   6002		return 0;
   6003
   6004	/* Not writable = nothing to do. */
   6005	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
   6006		return 0;
   6007
   6008	/* No free space = nothing to do. */
   6009	if (device->total_bytes <= device->bytes_used)
   6010		return 0;
   6011
   6012	ret = 0;
   6013
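       	/*
       	 * Walk the device's alloc_state and discard every range that is
       	 * neither allocated nor already trimmed, holding chunk_mutex around
       	 * each lookup and discard so we don't race with chunk allocation.
       	 */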
   6014	while (1) {
   6015		struct btrfs_fs_info *fs_info = device->fs_info;
   6016		u64 bytes;
   6017
   6018		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
   6019		if (ret)
   6020			break;
   6021
   6022		find_first_clear_extent_bit(&device->alloc_state, start,
   6023					    &start, &end,
   6024					    CHUNK_TRIMMED | CHUNK_ALLOCATED);
   6025
   6026		/* Check if there are any CHUNK_* bits left */
   6027		if (start > device->total_bytes) {
   6028			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
   6029			btrfs_warn_in_rcu(fs_info,
   6030"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
   6031					  start, end - start + 1,
   6032					  rcu_str_deref(device->name),
   6033					  device->total_bytes);
   6034			mutex_unlock(&fs_info->chunk_mutex);
   6035			ret = 0;
   6036			break;
   6037		}
   6038
   6039		/* Ensure we skip the reserved area in the first 1M */
   6040		start = max_t(u64, start, SZ_1M);
   6041
   6042		/*
   6043		 * If find_first_clear_extent_bit finds a range that spans the
   6044		 * end of the device, it will set end to -1; in this case it's up
   6045		 * to the caller to trim the value to the size of the device.
   6046		 */
   6047		end = min(end, device->total_bytes - 1);
   6048
   6049		len = end - start + 1;
   6050
   6051		/* We didn't find any extents */
   6052		if (!len) {
   6053			mutex_unlock(&fs_info->chunk_mutex);
   6054			ret = 0;
   6055			break;
   6056		}
   6057
   6058		ret = btrfs_issue_discard(device->bdev, start, len,
   6059					  &bytes);
   6060		if (!ret)
   6061			set_extent_bits(&device->alloc_state, start,
   6062					start + bytes - 1,
   6063					CHUNK_TRIMMED);
   6064		mutex_unlock(&fs_info->chunk_mutex);
   6065
   6066		if (ret)
   6067			break;
   6068
   6069		start += len;
   6070		*trimmed += bytes;
   6071
   6072		if (fatal_signal_pending(current)) {
   6073			ret = -ERESTARTSYS;
   6074			break;
   6075		}
   6076
   6077		cond_resched();
   6078	}
   6079
   6080	return ret;
   6081}
   6082
   6083/*
   6084 * Trim the whole filesystem by:
   6085 * 1) trimming the free space in each block group
   6086 * 2) trimming the unallocated space on each device
   6087 *
   6088 * This will also continue trimming even if a block group or device encounters
   6089 * an error.  The return value will be the last error, or 0 if nothing bad
   6090 * happens.
   6091 */
   6092int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
   6093{
   6094	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   6095	struct btrfs_block_group *cache = NULL;
   6096	struct btrfs_device *device;
   6097	u64 group_trimmed;
   6098	u64 range_end = U64_MAX;
   6099	u64 start;
   6100	u64 end;
   6101	u64 trimmed = 0;
   6102	u64 bg_failed = 0;
   6103	u64 dev_failed = 0;
   6104	int bg_ret = 0;
   6105	int dev_ret = 0;
   6106	int ret = 0;
   6107
   6108	if (range->start == U64_MAX)
   6109		return -EINVAL;
   6110
   6111	/*
   6112	 * Check range overflow if range->len is set.
   6113	 * The default range->len is U64_MAX.
   6114	 */
   6115	if (range->len != U64_MAX &&
   6116	    check_add_overflow(range->start, range->len, &range_end))
   6117		return -EINVAL;
   6118
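       	/* Step 1: trim the free space of each block group in the range. */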
   6119	cache = btrfs_lookup_first_block_group(fs_info, range->start);
   6120	for (; cache; cache = btrfs_next_block_group(cache)) {
   6121		if (cache->start >= range_end) {
   6122			btrfs_put_block_group(cache);
   6123			break;
   6124		}
   6125
   6126		start = max(range->start, cache->start);
   6127		end = min(range_end, cache->start + cache->length);
   6128
   6129		if (end - start >= range->minlen) {
   6130			if (!btrfs_block_group_done(cache)) {
   6131				ret = btrfs_cache_block_group(cache, 0);
   6132				if (ret) {
   6133					bg_failed++;
   6134					bg_ret = ret;
   6135					continue;
   6136				}
   6137				ret = btrfs_wait_block_group_cache_done(cache);
   6138				if (ret) {
   6139					bg_failed++;
   6140					bg_ret = ret;
   6141					continue;
   6142				}
   6143			}
   6144			ret = btrfs_trim_block_group(cache,
   6145						     &group_trimmed,
   6146						     start,
   6147						     end,
   6148						     range->minlen);
   6149
   6150			trimmed += group_trimmed;
   6151			if (ret) {
   6152				bg_failed++;
   6153				bg_ret = ret;
   6154				continue;
   6155			}
   6156		}
   6157	}
   6158
   6159	if (bg_failed)
   6160		btrfs_warn(fs_info,
   6161			"failed to trim %llu block group(s), last error %d",
   6162			bg_failed, bg_ret);
   6163
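       	/* Step 2: trim the unallocated space on each device. */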
   6164	mutex_lock(&fs_devices->device_list_mutex);
   6165	list_for_each_entry(device, &fs_devices->devices, dev_list) {
   6166		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
   6167			continue;
   6168
   6169		ret = btrfs_trim_free_extents(device, &group_trimmed);
   6170		if (ret) {
   6171			dev_failed++;
   6172			dev_ret = ret;
   6173			break;
   6174		}
   6175
   6176		trimmed += group_trimmed;
   6177	}
   6178	mutex_unlock(&fs_devices->device_list_mutex);
   6179
   6180	if (dev_failed)
   6181		btrfs_warn(fs_info,
   6182			"failed to trim %llu device(s), last error %d",
   6183			dev_failed, dev_ret);
   6184	range->len = trimmed;
   6185	if (bg_ret)
   6186		return bg_ret;
   6187	return dev_ret;
   6188}