cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

transaction.c (74575B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2007 Oracle.  All rights reserved.
      4 */
      5
      6#include <linux/fs.h>
      7#include <linux/slab.h>
      8#include <linux/sched.h>
      9#include <linux/writeback.h>
     10#include <linux/pagemap.h>
     11#include <linux/blkdev.h>
     12#include <linux/uuid.h>
     13#include "misc.h"
     14#include "ctree.h"
     15#include "disk-io.h"
     16#include "transaction.h"
     17#include "locking.h"
     18#include "tree-log.h"
     19#include "volumes.h"
     20#include "dev-replace.h"
     21#include "qgroup.h"
     22#include "block-group.h"
     23#include "space-info.h"
     24#include "zoned.h"
     25
     26#define BTRFS_ROOT_TRANS_TAG				XA_MARK_0
     27
     28/*
     29 * Transaction states and transitions
     30 *
     31 * No running transaction (fs tree blocks are not modified)
     32 * |
     33 * | To next stage:
      34 * |  Call any of the start_transaction() variants except btrfs_join_transaction_nostart().
     35 * V
     36 * Transaction N [[TRANS_STATE_RUNNING]]
     37 * |
      38 * | New trans handles can be attached to transaction N by calling any
      39 * | of the start_transaction() variants.
     40 * |
     41 * | To next stage:
     42 * |  Call btrfs_commit_transaction() on any trans handle attached to
     43 * |  transaction N
     44 * V
     45 * Transaction N [[TRANS_STATE_COMMIT_START]]
     46 * |
     47 * | Will wait for previous running transaction to completely finish if there
     48 * | is one
     49 * |
      50 * | Then one of the following happens:
     51 * | - Wait for all other trans handle holders to release.
     52 * |   The btrfs_commit_transaction() caller will do the commit work.
     53 * | - Wait for current transaction to be committed by others.
     54 * |   Other btrfs_commit_transaction() caller will do the commit work.
     55 * |
     56 * | At this stage, only btrfs_join_transaction*() variants can attach
     57 * | to this running transaction.
     58 * | All other variants will wait for current one to finish and attach to
     59 * | transaction N+1.
     60 * |
     61 * | To next stage:
      62 * |  A caller is chosen to commit transaction N, and all other trans
      63 * |  handles have been released.
     64 * V
     65 * Transaction N [[TRANS_STATE_COMMIT_DOING]]
     66 * |
     67 * | The heavy lifting transaction work is started.
     68 * | From running delayed refs (modifying extent tree) to creating pending
     69 * | snapshots, running qgroups.
     70 * | In short, modify supporting trees to reflect modifications of subvolume
     71 * | trees.
     72 * |
     73 * | At this stage, all start_transaction() calls will wait for this
     74 * | transaction to finish and attach to transaction N+1.
     75 * |
     76 * | To next stage:
     77 * |  Until all supporting trees are updated.
     78 * V
     79 * Transaction N [[TRANS_STATE_UNBLOCKED]]
     80 * |						    Transaction N+1
     81 * | All needed trees are modified, thus we only    [[TRANS_STATE_RUNNING]]
     82 * | need to write them back to disk and update	    |
     83 * | super blocks.				    |
     84 * |						    |
     85 * | At this stage, new transaction is allowed to   |
     86 * | start.					    |
     87 * | All new start_transaction() calls will be	    |
     88 * | attached to transid N+1.			    |
     89 * |						    |
     90 * | To next stage:				    |
      91 * |  Until all tree blocks and super blocks are    |
     92 * |  written to block devices			    |
     93 * V						    |
     94 * Transaction N [[TRANS_STATE_COMPLETED]]	    V
     95 *   All tree blocks and super blocks are written.  Transaction N+1
     96 *   This transaction is finished and all its	    [[TRANS_STATE_COMMIT_START]]
     97 *   data structures will be cleaned up.	    | Life goes on
     98 */
     99static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
    100	[TRANS_STATE_RUNNING]		= 0U,
    101	[TRANS_STATE_COMMIT_START]	= (__TRANS_START | __TRANS_ATTACH),
    102	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_START |
    103					   __TRANS_ATTACH |
    104					   __TRANS_JOIN |
    105					   __TRANS_JOIN_NOSTART),
    106	[TRANS_STATE_UNBLOCKED]		= (__TRANS_START |
    107					   __TRANS_ATTACH |
    108					   __TRANS_JOIN |
    109					   __TRANS_JOIN_NOLOCK |
    110					   __TRANS_JOIN_NOSTART),
    111	[TRANS_STATE_SUPER_COMMITTED]	= (__TRANS_START |
    112					   __TRANS_ATTACH |
    113					   __TRANS_JOIN |
    114					   __TRANS_JOIN_NOLOCK |
    115					   __TRANS_JOIN_NOSTART),
    116	[TRANS_STATE_COMPLETED]		= (__TRANS_START |
    117					   __TRANS_ATTACH |
    118					   __TRANS_JOIN |
    119					   __TRANS_JOIN_NOLOCK |
    120					   __TRANS_JOIN_NOSTART),
    121};
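
/*
 * Illustrative sketch, not part of the original file: a minimal caller of the
 * state machine documented above. btrfs_start_transaction() reserves metadata
 * space for one item and attaches a handle to the running transaction (or
 * starts a new one), and btrfs_commit_transaction() drives it through
 * COMMIT_START/COMMIT_DOING/UNBLOCKED to COMPLETED. The helper name is
 * hypothetical and only exists for clarity.
 */
static int __maybe_unused example_modify_and_commit(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	/* Attach to transaction N (or create it), reserving 1 item's worth. */
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/* ... tree modifications would go here, under the handle ... */

	/* Pushes transaction N towards TRANS_STATE_COMPLETED. */
	return btrfs_commit_transaction(trans);
}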
    122
    123void btrfs_put_transaction(struct btrfs_transaction *transaction)
    124{
    125	WARN_ON(refcount_read(&transaction->use_count) == 0);
    126	if (refcount_dec_and_test(&transaction->use_count)) {
    127		BUG_ON(!list_empty(&transaction->list));
    128		WARN_ON(!RB_EMPTY_ROOT(
    129				&transaction->delayed_refs.href_root.rb_root));
    130		WARN_ON(!RB_EMPTY_ROOT(
    131				&transaction->delayed_refs.dirty_extent_root));
    132		if (transaction->delayed_refs.pending_csums)
    133			btrfs_err(transaction->fs_info,
    134				  "pending csums is %llu",
    135				  transaction->delayed_refs.pending_csums);
    136		/*
    137		 * If any block groups are found in ->deleted_bgs then it's
    138		 * because the transaction was aborted and a commit did not
    139		 * happen (things failed before writing the new superblock
    140		 * and calling btrfs_finish_extent_commit()), so we can not
    141		 * discard the physical locations of the block groups.
    142		 */
    143		while (!list_empty(&transaction->deleted_bgs)) {
    144			struct btrfs_block_group *cache;
    145
    146			cache = list_first_entry(&transaction->deleted_bgs,
    147						 struct btrfs_block_group,
    148						 bg_list);
    149			list_del_init(&cache->bg_list);
    150			btrfs_unfreeze_block_group(cache);
    151			btrfs_put_block_group(cache);
    152		}
    153		WARN_ON(!list_empty(&transaction->dev_update_list));
    154		kfree(transaction);
    155	}
    156}
    157
    158static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
    159{
    160	struct btrfs_transaction *cur_trans = trans->transaction;
    161	struct btrfs_fs_info *fs_info = trans->fs_info;
    162	struct btrfs_root *root, *tmp;
    163	struct btrfs_caching_control *caching_ctl, *next;
    164
    165	/*
    166	 * At this point no one can be using this transaction to modify any tree
    167	 * and no one can start another transaction to modify any tree either.
    168	 */
    169	ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
    170
    171	down_write(&fs_info->commit_root_sem);
    172
    173	if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
    174		fs_info->last_reloc_trans = trans->transid;
    175
    176	list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
    177				 dirty_list) {
    178		list_del_init(&root->dirty_list);
    179		free_extent_buffer(root->commit_root);
    180		root->commit_root = btrfs_root_node(root);
    181		extent_io_tree_release(&root->dirty_log_pages);
    182		btrfs_qgroup_clean_swapped_blocks(root);
    183	}
    184
    185	/* We can free old roots now. */
    186	spin_lock(&cur_trans->dropped_roots_lock);
    187	while (!list_empty(&cur_trans->dropped_roots)) {
    188		root = list_first_entry(&cur_trans->dropped_roots,
    189					struct btrfs_root, root_list);
    190		list_del_init(&root->root_list);
    191		spin_unlock(&cur_trans->dropped_roots_lock);
    192		btrfs_free_log(trans, root);
    193		btrfs_drop_and_free_fs_root(fs_info, root);
    194		spin_lock(&cur_trans->dropped_roots_lock);
    195	}
    196	spin_unlock(&cur_trans->dropped_roots_lock);
    197
    198	/*
    199	 * We have to update the last_byte_to_unpin under the commit_root_sem,
    200	 * at the same time we swap out the commit roots.
    201	 *
    202	 * This is because we must have a real view of the last spot the caching
    203	 * kthreads were while caching.  Consider the following views of the
    204	 * extent tree for a block group
    205	 *
    206	 * commit root
    207	 * +----+----+----+----+----+----+----+
    208	 * |\\\\|    |\\\\|\\\\|    |\\\\|\\\\|
    209	 * +----+----+----+----+----+----+----+
    210	 * 0    1    2    3    4    5    6    7
    211	 *
    212	 * new commit root
    213	 * +----+----+----+----+----+----+----+
    214	 * |    |    |    |\\\\|    |    |\\\\|
    215	 * +----+----+----+----+----+----+----+
    216	 * 0    1    2    3    4    5    6    7
    217	 *
    218	 * If the cache_ctl->progress was at 3, then we are only allowed to
    219	 * unpin [0,1) and [2,3], because the caching thread has already
    220	 * processed those extents.  We are not allowed to unpin [5,6), because
     221	 * the caching thread will restart its search from 3, and thus find
    222	 * the hole from [4,6) to add to the free space cache.
    223	 */
    224	write_lock(&fs_info->block_group_cache_lock);
    225	list_for_each_entry_safe(caching_ctl, next,
    226				 &fs_info->caching_block_groups, list) {
    227		struct btrfs_block_group *cache = caching_ctl->block_group;
    228
    229		if (btrfs_block_group_done(cache)) {
    230			cache->last_byte_to_unpin = (u64)-1;
    231			list_del_init(&caching_ctl->list);
    232			btrfs_put_caching_control(caching_ctl);
    233		} else {
    234			cache->last_byte_to_unpin = caching_ctl->progress;
    235		}
    236	}
    237	write_unlock(&fs_info->block_group_cache_lock);
    238	up_write(&fs_info->commit_root_sem);
    239}
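
/*
 * Illustrative sketch, not part of the original file: the unpin rule from the
 * diagram in switch_commit_roots() above, reduced to a predicate. With the
 * caching thread's progress at 3, [0,1) and [2,3] may be unpinned but [5,6)
 * may not, since the scan resumes from 3. The real code encodes this by
 * storing the progress in cache->last_byte_to_unpin. Hypothetical helper.
 */
static bool __maybe_unused example_range_may_be_unpinned(u64 start, u64 end,
							 u64 progress)
{
	/* Only ranges the caching thread has already scanned past are safe. */
	return end <= progress;
}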
    240
    241static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
    242					 unsigned int type)
    243{
    244	if (type & TRANS_EXTWRITERS)
    245		atomic_inc(&trans->num_extwriters);
    246}
    247
    248static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
    249					 unsigned int type)
    250{
    251	if (type & TRANS_EXTWRITERS)
    252		atomic_dec(&trans->num_extwriters);
    253}
    254
    255static inline void extwriter_counter_init(struct btrfs_transaction *trans,
    256					  unsigned int type)
    257{
    258	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
    259}
    260
    261static inline int extwriter_counter_read(struct btrfs_transaction *trans)
    262{
    263	return atomic_read(&trans->num_extwriters);
    264}
    265
    266/*
    267 * To be called after doing the chunk btree updates right after allocating a new
    268 * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
    269 * chunk after all chunk btree updates and after finishing the second phase of
    270 * chunk allocation (btrfs_create_pending_block_groups()) in case some block
    271 * group had its chunk item insertion delayed to the second phase.
    272 */
    273void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
    274{
    275	struct btrfs_fs_info *fs_info = trans->fs_info;
    276
    277	if (!trans->chunk_bytes_reserved)
    278		return;
    279
    280	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
    281				trans->chunk_bytes_reserved, NULL);
    282	trans->chunk_bytes_reserved = 0;
    283}
    284
    285/*
    286 * either allocate a new transaction or hop into the existing one
    287 */
    288static noinline int join_transaction(struct btrfs_fs_info *fs_info,
    289				     unsigned int type)
    290{
    291	struct btrfs_transaction *cur_trans;
    292
    293	spin_lock(&fs_info->trans_lock);
    294loop:
    295	/* The file system has been taken offline. No new transactions. */
    296	if (BTRFS_FS_ERROR(fs_info)) {
    297		spin_unlock(&fs_info->trans_lock);
    298		return -EROFS;
    299	}
    300
    301	cur_trans = fs_info->running_transaction;
    302	if (cur_trans) {
    303		if (TRANS_ABORTED(cur_trans)) {
    304			spin_unlock(&fs_info->trans_lock);
    305			return cur_trans->aborted;
    306		}
    307		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
    308			spin_unlock(&fs_info->trans_lock);
    309			return -EBUSY;
    310		}
    311		refcount_inc(&cur_trans->use_count);
    312		atomic_inc(&cur_trans->num_writers);
    313		extwriter_counter_inc(cur_trans, type);
    314		spin_unlock(&fs_info->trans_lock);
    315		return 0;
    316	}
    317	spin_unlock(&fs_info->trans_lock);
    318
    319	/*
    320	 * If we are ATTACH, we just want to catch the current transaction,
    321	 * and commit it. If there is no transaction, just return ENOENT.
    322	 */
    323	if (type == TRANS_ATTACH)
    324		return -ENOENT;
    325
    326	/*
    327	 * JOIN_NOLOCK only happens during the transaction commit, so
    328	 * it is impossible that ->running_transaction is NULL
    329	 */
    330	BUG_ON(type == TRANS_JOIN_NOLOCK);
    331
    332	cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
    333	if (!cur_trans)
    334		return -ENOMEM;
    335
    336	spin_lock(&fs_info->trans_lock);
    337	if (fs_info->running_transaction) {
    338		/*
    339		 * someone started a transaction after we unlocked.  Make sure
    340		 * to redo the checks above
    341		 */
    342		kfree(cur_trans);
    343		goto loop;
    344	} else if (BTRFS_FS_ERROR(fs_info)) {
    345		spin_unlock(&fs_info->trans_lock);
    346		kfree(cur_trans);
    347		return -EROFS;
    348	}
    349
    350	cur_trans->fs_info = fs_info;
    351	atomic_set(&cur_trans->pending_ordered, 0);
    352	init_waitqueue_head(&cur_trans->pending_wait);
    353	atomic_set(&cur_trans->num_writers, 1);
    354	extwriter_counter_init(cur_trans, type);
    355	init_waitqueue_head(&cur_trans->writer_wait);
    356	init_waitqueue_head(&cur_trans->commit_wait);
    357	cur_trans->state = TRANS_STATE_RUNNING;
    358	/*
    359	 * One for this trans handle, one so it will live on until we
    360	 * commit the transaction.
    361	 */
    362	refcount_set(&cur_trans->use_count, 2);
    363	cur_trans->flags = 0;
    364	cur_trans->start_time = ktime_get_seconds();
    365
    366	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
    367
    368	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
    369	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
    370	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
    371
    372	/*
    373	 * although the tree mod log is per file system and not per transaction,
    374	 * the log must never go across transaction boundaries.
    375	 */
    376	smp_mb();
    377	if (!list_empty(&fs_info->tree_mod_seq_list))
    378		WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
    379	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
    380		WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
    381	atomic64_set(&fs_info->tree_mod_seq, 0);
    382
    383	spin_lock_init(&cur_trans->delayed_refs.lock);
    384
    385	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
    386	INIT_LIST_HEAD(&cur_trans->dev_update_list);
    387	INIT_LIST_HEAD(&cur_trans->switch_commits);
    388	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
    389	INIT_LIST_HEAD(&cur_trans->io_bgs);
    390	INIT_LIST_HEAD(&cur_trans->dropped_roots);
    391	mutex_init(&cur_trans->cache_write_mutex);
    392	spin_lock_init(&cur_trans->dirty_bgs_lock);
    393	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
    394	spin_lock_init(&cur_trans->dropped_roots_lock);
    395	INIT_LIST_HEAD(&cur_trans->releasing_ebs);
    396	spin_lock_init(&cur_trans->releasing_ebs_lock);
    397	list_add_tail(&cur_trans->list, &fs_info->trans_list);
    398	extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
    399			IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
    400	extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
    401			IO_TREE_FS_PINNED_EXTENTS, NULL);
    402	fs_info->generation++;
    403	cur_trans->transid = fs_info->generation;
    404	fs_info->running_transaction = cur_trans;
    405	cur_trans->aborted = 0;
    406	spin_unlock(&fs_info->trans_lock);
    407
    408	return 0;
    409}
    410
    411/*
    412 * This does all the record keeping required to make sure that a shareable root
    413 * is properly recorded in a given transaction.  This is required to make sure
    414 * the old root from before we joined the transaction is deleted when the
    415 * transaction commits.
    416 */
    417static int record_root_in_trans(struct btrfs_trans_handle *trans,
    418			       struct btrfs_root *root,
    419			       int force)
    420{
    421	struct btrfs_fs_info *fs_info = root->fs_info;
    422	int ret = 0;
    423
    424	if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
    425	    root->last_trans < trans->transid) || force) {
    426		WARN_ON(!force && root->commit_root != root->node);
    427
    428		/*
     429		 * See below for IN_TRANS_SETUP usage rules;
     430		 * we have the reloc mutex held now, so there
     431		 * is only one writer in this function.
    432		 */
    433		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
    434
    435		/* make sure readers find IN_TRANS_SETUP before
    436		 * they find our root->last_trans update
    437		 */
    438		smp_wmb();
    439
    440		spin_lock(&fs_info->fs_roots_lock);
    441		if (root->last_trans == trans->transid && !force) {
    442			spin_unlock(&fs_info->fs_roots_lock);
    443			return 0;
    444		}
    445		xa_set_mark(&fs_info->fs_roots,
    446			    (unsigned long)root->root_key.objectid,
    447			    BTRFS_ROOT_TRANS_TAG);
    448		spin_unlock(&fs_info->fs_roots_lock);
    449		root->last_trans = trans->transid;
    450
    451		/* this is pretty tricky.  We don't want to
    452		 * take the relocation lock in btrfs_record_root_in_trans
    453		 * unless we're really doing the first setup for this root in
    454		 * this transaction.
    455		 *
    456		 * Normally we'd use root->last_trans as a flag to decide
    457		 * if we want to take the expensive mutex.
    458		 *
    459		 * But, we have to set root->last_trans before we
    460		 * init the relocation root, otherwise, we trip over warnings
    461		 * in ctree.c.  The solution used here is to flag ourselves
    462		 * with root IN_TRANS_SETUP.  When this is 1, we're still
    463		 * fixing up the reloc trees and everyone must wait.
    464		 *
    465		 * When this is zero, they can trust root->last_trans and fly
    466		 * through btrfs_record_root_in_trans without having to take the
    467		 * lock.  smp_wmb() makes sure that all the writes above are
    468		 * done before we pop in the zero below
    469		 */
    470		ret = btrfs_init_reloc_root(trans, root);
    471		smp_mb__before_atomic();
    472		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
    473	}
    474	return ret;
    475}
    476
    477
    478void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
    479			    struct btrfs_root *root)
    480{
    481	struct btrfs_fs_info *fs_info = root->fs_info;
    482	struct btrfs_transaction *cur_trans = trans->transaction;
    483
    484	/* Add ourselves to the transaction dropped list */
    485	spin_lock(&cur_trans->dropped_roots_lock);
    486	list_add_tail(&root->root_list, &cur_trans->dropped_roots);
    487	spin_unlock(&cur_trans->dropped_roots_lock);
    488
    489	/* Make sure we don't try to update the root at commit time */
    490	xa_clear_mark(&fs_info->fs_roots,
    491		      (unsigned long)root->root_key.objectid,
    492		      BTRFS_ROOT_TRANS_TAG);
    493}
    494
    495int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
    496			       struct btrfs_root *root)
    497{
    498	struct btrfs_fs_info *fs_info = root->fs_info;
    499	int ret;
    500
    501	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
    502		return 0;
    503
    504	/*
    505	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
    506	 * and barriers
    507	 */
    508	smp_rmb();
    509	if (root->last_trans == trans->transid &&
    510	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
    511		return 0;
    512
    513	mutex_lock(&fs_info->reloc_mutex);
    514	ret = record_root_in_trans(trans, root, 0);
    515	mutex_unlock(&fs_info->reloc_mutex);
    516
    517	return ret;
    518}
    519
    520static inline int is_transaction_blocked(struct btrfs_transaction *trans)
    521{
    522	return (trans->state >= TRANS_STATE_COMMIT_START &&
    523		trans->state < TRANS_STATE_UNBLOCKED &&
    524		!TRANS_ABORTED(trans));
    525}
    526
     527/* Wait for a commit against the current transaction to become unblocked.
     528 * When this is done, it is safe to start a new transaction, but the current
     529 * transaction might not be fully on disk.
    530 */
    531static void wait_current_trans(struct btrfs_fs_info *fs_info)
    532{
    533	struct btrfs_transaction *cur_trans;
    534
    535	spin_lock(&fs_info->trans_lock);
    536	cur_trans = fs_info->running_transaction;
    537	if (cur_trans && is_transaction_blocked(cur_trans)) {
    538		refcount_inc(&cur_trans->use_count);
    539		spin_unlock(&fs_info->trans_lock);
    540
    541		wait_event(fs_info->transaction_wait,
    542			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
    543			   TRANS_ABORTED(cur_trans));
    544		btrfs_put_transaction(cur_trans);
    545	} else {
    546		spin_unlock(&fs_info->trans_lock);
    547	}
    548}
    549
    550static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
    551{
    552	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
    553		return 0;
    554
    555	if (type == TRANS_START)
    556		return 1;
    557
    558	return 0;
    559}
    560
    561static inline bool need_reserve_reloc_root(struct btrfs_root *root)
    562{
    563	struct btrfs_fs_info *fs_info = root->fs_info;
    564
    565	if (!fs_info->reloc_ctl ||
    566	    !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
    567	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
    568	    root->reloc_root)
    569		return false;
    570
    571	return true;
    572}
    573
    574static struct btrfs_trans_handle *
    575start_transaction(struct btrfs_root *root, unsigned int num_items,
    576		  unsigned int type, enum btrfs_reserve_flush_enum flush,
    577		  bool enforce_qgroups)
    578{
    579	struct btrfs_fs_info *fs_info = root->fs_info;
    580	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
    581	struct btrfs_trans_handle *h;
    582	struct btrfs_transaction *cur_trans;
    583	u64 num_bytes = 0;
    584	u64 qgroup_reserved = 0;
    585	bool reloc_reserved = false;
    586	bool do_chunk_alloc = false;
    587	int ret;
    588
    589	if (BTRFS_FS_ERROR(fs_info))
    590		return ERR_PTR(-EROFS);
    591
    592	if (current->journal_info) {
    593		WARN_ON(type & TRANS_EXTWRITERS);
    594		h = current->journal_info;
    595		refcount_inc(&h->use_count);
    596		WARN_ON(refcount_read(&h->use_count) > 2);
    597		h->orig_rsv = h->block_rsv;
    598		h->block_rsv = NULL;
    599		goto got_it;
    600	}
    601
    602	/*
    603	 * Do the reservation before we join the transaction so we can do all
    604	 * the appropriate flushing if need be.
    605	 */
    606	if (num_items && root != fs_info->chunk_root) {
    607		struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
    608		u64 delayed_refs_bytes = 0;
    609
    610		qgroup_reserved = num_items * fs_info->nodesize;
    611		ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
    612				enforce_qgroups);
    613		if (ret)
    614			return ERR_PTR(ret);
    615
    616		/*
    617		 * We want to reserve all the bytes we may need all at once, so
    618		 * we only do 1 enospc flushing cycle per transaction start.  We
    619		 * accomplish this by simply assuming we'll do 2 x num_items
    620		 * worth of delayed refs updates in this trans handle, and
    621		 * refill that amount for whatever is missing in the reserve.
    622		 */
    623		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
    624		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
    625		    delayed_refs_rsv->full == 0) {
    626			delayed_refs_bytes = num_bytes;
    627			num_bytes <<= 1;
    628		}
    629
    630		/*
    631		 * Do the reservation for the relocation root creation
    632		 */
    633		if (need_reserve_reloc_root(root)) {
    634			num_bytes += fs_info->nodesize;
    635			reloc_reserved = true;
    636		}
    637
    638		ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
    639		if (ret)
    640			goto reserve_fail;
    641		if (delayed_refs_bytes) {
    642			btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
    643							  delayed_refs_bytes);
    644			num_bytes -= delayed_refs_bytes;
    645		}
    646
    647		if (rsv->space_info->force_alloc)
    648			do_chunk_alloc = true;
    649	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
    650		   !delayed_refs_rsv->full) {
    651		/*
    652		 * Some people call with btrfs_start_transaction(root, 0)
    653		 * because they can be throttled, but have some other mechanism
    654		 * for reserving space.  We still want these guys to refill the
     655		 * delayed block_rsv so just add one item's worth of reservation
    656		 * here.
    657		 */
    658		ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
    659		if (ret)
    660			goto reserve_fail;
    661	}
    662again:
    663	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
    664	if (!h) {
    665		ret = -ENOMEM;
    666		goto alloc_fail;
    667	}
    668
    669	/*
    670	 * If we are JOIN_NOLOCK we're already committing a transaction and
    671	 * waiting on this guy, so we don't need to do the sb_start_intwrite
    672	 * because we're already holding a ref.  We need this because we could
     673	 * have raced in and done an fsync() on a file, which can kick off a commit,
     674	 * and then we would deadlock with somebody doing a freeze.
    675	 *
    676	 * If we are ATTACH, it means we just want to catch the current
     677	 * transaction and commit it, so we needn't do sb_start_intwrite().
    678	 */
    679	if (type & __TRANS_FREEZABLE)
    680		sb_start_intwrite(fs_info->sb);
    681
    682	if (may_wait_transaction(fs_info, type))
    683		wait_current_trans(fs_info);
    684
    685	do {
    686		ret = join_transaction(fs_info, type);
    687		if (ret == -EBUSY) {
    688			wait_current_trans(fs_info);
    689			if (unlikely(type == TRANS_ATTACH ||
    690				     type == TRANS_JOIN_NOSTART))
    691				ret = -ENOENT;
    692		}
    693	} while (ret == -EBUSY);
    694
    695	if (ret < 0)
    696		goto join_fail;
    697
    698	cur_trans = fs_info->running_transaction;
    699
    700	h->transid = cur_trans->transid;
    701	h->transaction = cur_trans;
    702	refcount_set(&h->use_count, 1);
    703	h->fs_info = root->fs_info;
    704
    705	h->type = type;
    706	INIT_LIST_HEAD(&h->new_bgs);
    707
    708	smp_mb();
    709	if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
    710	    may_wait_transaction(fs_info, type)) {
    711		current->journal_info = h;
    712		btrfs_commit_transaction(h);
    713		goto again;
    714	}
    715
    716	if (num_bytes) {
    717		trace_btrfs_space_reservation(fs_info, "transaction",
    718					      h->transid, num_bytes, 1);
    719		h->block_rsv = &fs_info->trans_block_rsv;
    720		h->bytes_reserved = num_bytes;
    721		h->reloc_reserved = reloc_reserved;
    722	}
    723
    724got_it:
    725	if (!current->journal_info)
    726		current->journal_info = h;
    727
    728	/*
    729	 * If the space_info is marked ALLOC_FORCE then we'll get upgraded to
    730	 * ALLOC_FORCE the first run through, and then we won't allocate for
    731	 * anybody else who races in later.  We don't care about the return
    732	 * value here.
    733	 */
    734	if (do_chunk_alloc && num_bytes) {
    735		u64 flags = h->block_rsv->space_info->flags;
    736
    737		btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
    738				  CHUNK_ALLOC_NO_FORCE);
    739	}
    740
    741	/*
    742	 * btrfs_record_root_in_trans() needs to alloc new extents, and may
    743	 * call btrfs_join_transaction() while we're also starting a
    744	 * transaction.
    745	 *
     746	 * Thus it needs to be called after current->journal_info is initialized,
    747	 * or we can deadlock.
    748	 */
    749	ret = btrfs_record_root_in_trans(h, root);
    750	if (ret) {
    751		/*
    752		 * The transaction handle is fully initialized and linked with
    753		 * other structures so it needs to be ended in case of errors,
    754		 * not just freed.
    755		 */
    756		btrfs_end_transaction(h);
    757		return ERR_PTR(ret);
    758	}
    759
    760	return h;
    761
    762join_fail:
    763	if (type & __TRANS_FREEZABLE)
    764		sb_end_intwrite(fs_info->sb);
    765	kmem_cache_free(btrfs_trans_handle_cachep, h);
    766alloc_fail:
    767	if (num_bytes)
    768		btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
    769					num_bytes, NULL);
    770reserve_fail:
    771	btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
    772	return ERR_PTR(ret);
    773}
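
/*
 * Illustrative sketch, not part of the original file: nested use of a handle.
 * When current->journal_info is already set, start_transaction() above reuses
 * that handle and only bumps its use_count, so the inner
 * btrfs_end_transaction() merely drops the extra reference. Hypothetical
 * helper, with minimal error handling.
 */
static int __maybe_unused example_nested_handles(struct btrfs_root *root)
{
	struct btrfs_trans_handle *outer, *inner;
	int ret;

	outer = btrfs_start_transaction(root, 1);
	if (IS_ERR(outer))
		return PTR_ERR(outer);

	/* Same task, so this returns the handle above with use_count == 2. */
	inner = btrfs_join_transaction(root);
	if (IS_ERR(inner)) {
		btrfs_end_transaction(outer);
		return PTR_ERR(inner);
	}

	/* Drops the nested reference only; the transaction stays attached. */
	ret = btrfs_end_transaction(inner);
	if (ret) {
		btrfs_end_transaction(outer);
		return ret;
	}

	/* Really releases the handle. */
	return btrfs_end_transaction(outer);
}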
    774
    775struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
    776						   unsigned int num_items)
    777{
    778	return start_transaction(root, num_items, TRANS_START,
    779				 BTRFS_RESERVE_FLUSH_ALL, true);
    780}
    781
    782struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
    783					struct btrfs_root *root,
    784					unsigned int num_items)
    785{
    786	return start_transaction(root, num_items, TRANS_START,
    787				 BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
    788}
    789
    790struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
    791{
    792	return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
    793				 true);
    794}
    795
    796struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
    797{
    798	return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
    799				 BTRFS_RESERVE_NO_FLUSH, true);
    800}
    801
    802/*
    803 * Similar to regular join but it never starts a transaction when none is
    804 * running or after waiting for the current one to finish.
    805 */
    806struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
    807{
    808	return start_transaction(root, 0, TRANS_JOIN_NOSTART,
    809				 BTRFS_RESERVE_NO_FLUSH, true);
    810}
    811
    812/*
    813 * btrfs_attach_transaction() - catch the running transaction
    814 *
     815 * It is used when we want to commit the current transaction, but
    816 * don't want to start a new one.
    817 *
     818 * Note: If this function returns -ENOENT, it just means there is no
     819 * running transaction. But it is possible that the inactive transaction
     820 * is still in memory, not fully on disk. If you want to make sure there is
     821 * no inactive transaction in the fs when -ENOENT is returned, you should
    822 * invoke
    823 *     btrfs_attach_transaction_barrier()
    824 */
    825struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
    826{
    827	return start_transaction(root, 0, TRANS_ATTACH,
    828				 BTRFS_RESERVE_NO_FLUSH, true);
    829}
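
/*
 * Illustrative sketch, not part of the original file: one way a caller might
 * act on the note above, treating -ENOENT from btrfs_attach_transaction() as
 * "nothing running, nothing to commit". Callers that must also wait for any
 * not-yet-completed transaction would use btrfs_attach_transaction_barrier()
 * below instead. Hypothetical helper name.
 */
static int __maybe_unused example_commit_if_running(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	trans = btrfs_attach_transaction(root);
	if (IS_ERR(trans)) {
		/* No running transaction is not an error for this caller. */
		return PTR_ERR(trans) == -ENOENT ? 0 : PTR_ERR(trans);
	}
	return btrfs_commit_transaction(trans);
}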
    830
    831/*
    832 * btrfs_attach_transaction_barrier() - catch the running transaction
    833 *
     834 * It is similar to the above function, the difference being that this one
     835 * will wait for all inactive transactions until they fully
    836 * complete.
    837 */
    838struct btrfs_trans_handle *
    839btrfs_attach_transaction_barrier(struct btrfs_root *root)
    840{
    841	struct btrfs_trans_handle *trans;
    842
    843	trans = start_transaction(root, 0, TRANS_ATTACH,
    844				  BTRFS_RESERVE_NO_FLUSH, true);
    845	if (trans == ERR_PTR(-ENOENT))
    846		btrfs_wait_for_commit(root->fs_info, 0);
    847
    848	return trans;
    849}
    850
    851/* Wait for a transaction commit to reach at least the given state. */
    852static noinline void wait_for_commit(struct btrfs_transaction *commit,
    853				     const enum btrfs_trans_state min_state)
    854{
    855	struct btrfs_fs_info *fs_info = commit->fs_info;
    856	u64 transid = commit->transid;
    857	bool put = false;
    858
    859	while (1) {
    860		wait_event(commit->commit_wait, commit->state >= min_state);
    861		if (put)
    862			btrfs_put_transaction(commit);
    863
    864		if (min_state < TRANS_STATE_COMPLETED)
    865			break;
    866
    867		/*
    868		 * A transaction isn't really completed until all of the
    869		 * previous transactions are completed, but with fsync we can
    870		 * end up with SUPER_COMMITTED transactions before a COMPLETED
    871		 * transaction. Wait for those.
    872		 */
    873
    874		spin_lock(&fs_info->trans_lock);
    875		commit = list_first_entry_or_null(&fs_info->trans_list,
    876						  struct btrfs_transaction,
    877						  list);
    878		if (!commit || commit->transid > transid) {
    879			spin_unlock(&fs_info->trans_lock);
    880			break;
    881		}
    882		refcount_inc(&commit->use_count);
    883		put = true;
    884		spin_unlock(&fs_info->trans_lock);
    885	}
    886}
    887
    888int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
    889{
    890	struct btrfs_transaction *cur_trans = NULL, *t;
    891	int ret = 0;
    892
    893	if (transid) {
    894		if (transid <= fs_info->last_trans_committed)
    895			goto out;
    896
    897		/* find specified transaction */
    898		spin_lock(&fs_info->trans_lock);
    899		list_for_each_entry(t, &fs_info->trans_list, list) {
    900			if (t->transid == transid) {
    901				cur_trans = t;
    902				refcount_inc(&cur_trans->use_count);
    903				ret = 0;
    904				break;
    905			}
    906			if (t->transid > transid) {
    907				ret = 0;
    908				break;
    909			}
    910		}
    911		spin_unlock(&fs_info->trans_lock);
    912
    913		/*
    914		 * The specified transaction doesn't exist, or we
    915		 * raced with btrfs_commit_transaction
    916		 */
    917		if (!cur_trans) {
    918			if (transid > fs_info->last_trans_committed)
    919				ret = -EINVAL;
    920			goto out;
    921		}
    922	} else {
    923		/* find newest transaction that is committing | committed */
    924		spin_lock(&fs_info->trans_lock);
    925		list_for_each_entry_reverse(t, &fs_info->trans_list,
    926					    list) {
    927			if (t->state >= TRANS_STATE_COMMIT_START) {
    928				if (t->state == TRANS_STATE_COMPLETED)
    929					break;
    930				cur_trans = t;
    931				refcount_inc(&cur_trans->use_count);
    932				break;
    933			}
    934		}
    935		spin_unlock(&fs_info->trans_lock);
    936		if (!cur_trans)
    937			goto out;  /* nothing committing|committed */
    938	}
    939
    940	wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
    941	btrfs_put_transaction(cur_trans);
    942out:
    943	return ret;
    944}
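
/*
 * Illustrative sketch, not part of the original file: pairing a remembered
 * transid with btrfs_wait_for_commit() above, the "end now, wait for the
 * commit later" pattern. It assumes something (for example the transaction
 * kthread or another committer) eventually commits that transaction.
 * Hypothetical helper name.
 */
static int __maybe_unused example_end_then_wait(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 transid = trans->transid;
	int ret;

	ret = btrfs_end_transaction(trans);
	if (ret)
		return ret;

	/* Blocks until transaction "transid" reaches TRANS_STATE_COMPLETED. */
	return btrfs_wait_for_commit(fs_info, transid);
}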
    945
    946void btrfs_throttle(struct btrfs_fs_info *fs_info)
    947{
    948	wait_current_trans(fs_info);
    949}
    950
    951static bool should_end_transaction(struct btrfs_trans_handle *trans)
    952{
    953	struct btrfs_fs_info *fs_info = trans->fs_info;
    954
    955	if (btrfs_check_space_for_delayed_refs(fs_info))
    956		return true;
    957
    958	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
    959}
    960
    961bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
    962{
    963	struct btrfs_transaction *cur_trans = trans->transaction;
    964
    965	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
    966	    test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
    967		return true;
    968
    969	return should_end_transaction(trans);
    970}
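
/*
 * Illustrative sketch, not part of the original file: the usual consumer of
 * btrfs_should_end_transaction() above. Long-running operations do their work
 * in bounded batches and restart the handle when a commit is pending or space
 * for delayed refs is running low. "nr_steps" and the helper name are
 * hypothetical.
 */
static int __maybe_unused example_batched_operation(struct btrfs_root *root,
						    unsigned int nr_steps)
{
	struct btrfs_trans_handle *trans;
	unsigned int i;
	int ret;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	for (i = 0; i < nr_steps; i++) {
		/* ... one bounded batch of tree modifications goes here ... */

		if (btrfs_should_end_transaction(trans)) {
			ret = btrfs_end_transaction(trans);
			if (ret)
				return ret;
			/* Start over with a fresh reservation for the next batch. */
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans))
				return PTR_ERR(trans);
		}
	}

	return btrfs_end_transaction(trans);
}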
    971
    972static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
    973
    974{
    975	struct btrfs_fs_info *fs_info = trans->fs_info;
    976
    977	if (!trans->block_rsv) {
    978		ASSERT(!trans->bytes_reserved);
    979		return;
    980	}
    981
    982	if (!trans->bytes_reserved)
    983		return;
    984
    985	ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
    986	trace_btrfs_space_reservation(fs_info, "transaction",
    987				      trans->transid, trans->bytes_reserved, 0);
    988	btrfs_block_rsv_release(fs_info, trans->block_rsv,
    989				trans->bytes_reserved, NULL);
    990	trans->bytes_reserved = 0;
    991}
    992
    993static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
    994				   int throttle)
    995{
    996	struct btrfs_fs_info *info = trans->fs_info;
    997	struct btrfs_transaction *cur_trans = trans->transaction;
    998	int err = 0;
    999
   1000	if (refcount_read(&trans->use_count) > 1) {
   1001		refcount_dec(&trans->use_count);
   1002		trans->block_rsv = trans->orig_rsv;
   1003		return 0;
   1004	}
   1005
   1006	btrfs_trans_release_metadata(trans);
   1007	trans->block_rsv = NULL;
   1008
   1009	btrfs_create_pending_block_groups(trans);
   1010
   1011	btrfs_trans_release_chunk_metadata(trans);
   1012
   1013	if (trans->type & __TRANS_FREEZABLE)
   1014		sb_end_intwrite(info->sb);
   1015
   1016	WARN_ON(cur_trans != info->running_transaction);
   1017	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
   1018	atomic_dec(&cur_trans->num_writers);
   1019	extwriter_counter_dec(cur_trans, trans->type);
   1020
   1021	cond_wake_up(&cur_trans->writer_wait);
   1022	btrfs_put_transaction(cur_trans);
   1023
   1024	if (current->journal_info == trans)
   1025		current->journal_info = NULL;
   1026
   1027	if (throttle)
   1028		btrfs_run_delayed_iputs(info);
   1029
   1030	if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
   1031		wake_up_process(info->transaction_kthread);
   1032		if (TRANS_ABORTED(trans))
   1033			err = trans->aborted;
   1034		else
   1035			err = -EROFS;
   1036	}
   1037
   1038	kmem_cache_free(btrfs_trans_handle_cachep, trans);
   1039	return err;
   1040}
   1041
   1042int btrfs_end_transaction(struct btrfs_trans_handle *trans)
   1043{
   1044	return __btrfs_end_transaction(trans, 0);
   1045}
   1046
   1047int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
   1048{
   1049	return __btrfs_end_transaction(trans, 1);
   1050}
   1051
   1052/*
   1053 * when btree blocks are allocated, they have some corresponding bits set for
   1054 * them in one of two extent_io trees.  This is used to make sure all of
   1055 * those extents are sent to disk but does not wait on them
   1056 */
   1057int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
   1058			       struct extent_io_tree *dirty_pages, int mark)
   1059{
   1060	int err = 0;
   1061	int werr = 0;
   1062	struct address_space *mapping = fs_info->btree_inode->i_mapping;
   1063	struct extent_state *cached_state = NULL;
   1064	u64 start = 0;
   1065	u64 end;
   1066
   1067	atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
   1068	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
   1069				      mark, &cached_state)) {
   1070		bool wait_writeback = false;
   1071
   1072		err = convert_extent_bit(dirty_pages, start, end,
   1073					 EXTENT_NEED_WAIT,
   1074					 mark, &cached_state);
   1075		/*
   1076		 * convert_extent_bit can return -ENOMEM, which is most of the
   1077		 * time a temporary error. So when it happens, ignore the error
   1078		 * and wait for writeback of this range to finish - because we
   1079		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
   1080		 * to __btrfs_wait_marked_extents() would not know that
   1081		 * writeback for this range started and therefore wouldn't
   1082		 * wait for it to finish - we don't want to commit a
   1083		 * superblock that points to btree nodes/leafs for which
   1084		 * writeback hasn't finished yet (and without errors).
   1085		 * We cleanup any entries left in the io tree when committing
   1086		 * the transaction (through extent_io_tree_release()).
   1087		 */
   1088		if (err == -ENOMEM) {
   1089			err = 0;
   1090			wait_writeback = true;
   1091		}
   1092		if (!err)
   1093			err = filemap_fdatawrite_range(mapping, start, end);
   1094		if (err)
   1095			werr = err;
   1096		else if (wait_writeback)
   1097			werr = filemap_fdatawait_range(mapping, start, end);
   1098		free_extent_state(cached_state);
   1099		cached_state = NULL;
   1100		cond_resched();
   1101		start = end + 1;
   1102	}
   1103	atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
   1104	return werr;
   1105}
   1106
   1107/*
   1108 * when btree blocks are allocated, they have some corresponding bits set for
   1109 * them in one of two extent_io trees.  This is used to make sure all of
   1110 * those extents are on disk for transaction or log commit.  We wait
   1111 * on all the pages and clear them from the dirty pages state tree
   1112 */
   1113static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
   1114				       struct extent_io_tree *dirty_pages)
   1115{
   1116	int err = 0;
   1117	int werr = 0;
   1118	struct address_space *mapping = fs_info->btree_inode->i_mapping;
   1119	struct extent_state *cached_state = NULL;
   1120	u64 start = 0;
   1121	u64 end;
   1122
   1123	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
   1124				      EXTENT_NEED_WAIT, &cached_state)) {
   1125		/*
   1126		 * Ignore -ENOMEM errors returned by clear_extent_bit().
   1127		 * When committing the transaction, we'll remove any entries
   1128		 * left in the io tree. For a log commit, we don't remove them
   1129		 * after committing the log because the tree can be accessed
   1130		 * concurrently - we do it only at transaction commit time when
   1131		 * it's safe to do it (through extent_io_tree_release()).
   1132		 */
   1133		err = clear_extent_bit(dirty_pages, start, end,
   1134				       EXTENT_NEED_WAIT, 0, 0, &cached_state);
   1135		if (err == -ENOMEM)
   1136			err = 0;
   1137		if (!err)
   1138			err = filemap_fdatawait_range(mapping, start, end);
   1139		if (err)
   1140			werr = err;
   1141		free_extent_state(cached_state);
   1142		cached_state = NULL;
   1143		cond_resched();
   1144		start = end + 1;
   1145	}
   1146	if (err)
   1147		werr = err;
   1148	return werr;
   1149}
   1150
   1151static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
   1152		       struct extent_io_tree *dirty_pages)
   1153{
   1154	bool errors = false;
   1155	int err;
   1156
   1157	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
   1158	if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
   1159		errors = true;
   1160
   1161	if (errors && !err)
   1162		err = -EIO;
   1163	return err;
   1164}
   1165
   1166int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
   1167{
   1168	struct btrfs_fs_info *fs_info = log_root->fs_info;
   1169	struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
   1170	bool errors = false;
   1171	int err;
   1172
   1173	ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
   1174
   1175	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
   1176	if ((mark & EXTENT_DIRTY) &&
   1177	    test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
   1178		errors = true;
   1179
   1180	if ((mark & EXTENT_NEW) &&
   1181	    test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
   1182		errors = true;
   1183
   1184	if (errors && !err)
   1185		err = -EIO;
   1186	return err;
   1187}
   1188
   1189/*
   1190 * When btree blocks are allocated the corresponding extents are marked dirty.
   1191 * This function ensures such extents are persisted on disk for transaction or
   1192 * log commit.
   1193 *
   1194 * @trans: transaction whose dirty pages we'd like to write
   1195 */
   1196static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
   1197{
   1198	int ret;
   1199	int ret2;
   1200	struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
   1201	struct btrfs_fs_info *fs_info = trans->fs_info;
   1202	struct blk_plug plug;
   1203
   1204	blk_start_plug(&plug);
   1205	ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
   1206	blk_finish_plug(&plug);
   1207	ret2 = btrfs_wait_extents(fs_info, dirty_pages);
   1208
   1209	extent_io_tree_release(&trans->transaction->dirty_pages);
   1210
   1211	if (ret)
   1212		return ret;
   1213	else if (ret2)
   1214		return ret2;
   1215	else
   1216		return 0;
   1217}
   1218
   1219/*
   1220 * this is used to update the root pointer in the tree of tree roots.
   1221 *
   1222 * But, in the case of the extent allocation tree, updating the root
   1223 * pointer may allocate blocks which may change the root of the extent
   1224 * allocation tree.
   1225 *
   1226 * So, this loops and repeats and makes sure the cowonly root didn't
   1227 * change while the root pointer was being updated in the metadata.
   1228 */
   1229static int update_cowonly_root(struct btrfs_trans_handle *trans,
   1230			       struct btrfs_root *root)
   1231{
   1232	int ret;
   1233	u64 old_root_bytenr;
   1234	u64 old_root_used;
   1235	struct btrfs_fs_info *fs_info = root->fs_info;
   1236	struct btrfs_root *tree_root = fs_info->tree_root;
   1237
   1238	old_root_used = btrfs_root_used(&root->root_item);
   1239
   1240	while (1) {
   1241		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
   1242		if (old_root_bytenr == root->node->start &&
   1243		    old_root_used == btrfs_root_used(&root->root_item))
   1244			break;
   1245
   1246		btrfs_set_root_node(&root->root_item, root->node);
   1247		ret = btrfs_update_root(trans, tree_root,
   1248					&root->root_key,
   1249					&root->root_item);
   1250		if (ret)
   1251			return ret;
   1252
   1253		old_root_used = btrfs_root_used(&root->root_item);
   1254	}
   1255
   1256	return 0;
   1257}
   1258
   1259/*
   1260 * update all the cowonly tree roots on disk
   1261 *
   1262 * The error handling in this function may not be obvious. Any of the
   1263 * failures will cause the file system to go offline. We still need
   1264 * to clean up the delayed refs.
   1265 */
   1266static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
   1267{
   1268	struct btrfs_fs_info *fs_info = trans->fs_info;
   1269	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
   1270	struct list_head *io_bgs = &trans->transaction->io_bgs;
   1271	struct list_head *next;
   1272	struct extent_buffer *eb;
   1273	int ret;
   1274
   1275	/*
   1276	 * At this point no one can be using this transaction to modify any tree
   1277	 * and no one can start another transaction to modify any tree either.
   1278	 */
   1279	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
   1280
   1281	eb = btrfs_lock_root_node(fs_info->tree_root);
   1282	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
   1283			      0, &eb, BTRFS_NESTING_COW);
   1284	btrfs_tree_unlock(eb);
   1285	free_extent_buffer(eb);
   1286
   1287	if (ret)
   1288		return ret;
   1289
   1290	ret = btrfs_run_dev_stats(trans);
   1291	if (ret)
   1292		return ret;
   1293	ret = btrfs_run_dev_replace(trans);
   1294	if (ret)
   1295		return ret;
   1296	ret = btrfs_run_qgroups(trans);
   1297	if (ret)
   1298		return ret;
   1299
   1300	ret = btrfs_setup_space_cache(trans);
   1301	if (ret)
   1302		return ret;
   1303
   1304again:
   1305	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
   1306		struct btrfs_root *root;
   1307		next = fs_info->dirty_cowonly_roots.next;
   1308		list_del_init(next);
   1309		root = list_entry(next, struct btrfs_root, dirty_list);
   1310		clear_bit(BTRFS_ROOT_DIRTY, &root->state);
   1311
   1312		list_add_tail(&root->dirty_list,
   1313			      &trans->transaction->switch_commits);
   1314		ret = update_cowonly_root(trans, root);
   1315		if (ret)
   1316			return ret;
   1317	}
   1318
   1319	/* Now flush any delayed refs generated by updating all of the roots */
   1320	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
   1321	if (ret)
   1322		return ret;
   1323
   1324	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
   1325		ret = btrfs_write_dirty_block_groups(trans);
   1326		if (ret)
   1327			return ret;
   1328
   1329		/*
   1330		 * We're writing the dirty block groups, which could generate
   1331		 * delayed refs, which could generate more dirty block groups,
   1332		 * so we want to keep this flushing in this loop to make sure
   1333		 * everything gets run.
   1334		 */
   1335		ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
   1336		if (ret)
   1337			return ret;
   1338	}
   1339
   1340	if (!list_empty(&fs_info->dirty_cowonly_roots))
   1341		goto again;
   1342
   1343	/* Update dev-replace pointer once everything is committed */
   1344	fs_info->dev_replace.committed_cursor_left =
   1345		fs_info->dev_replace.cursor_left_last_write_of_item;
   1346
   1347	return 0;
   1348}
   1349
   1350/*
   1351 * If we had a pending drop we need to see if there are any others left in our
   1352 * dead roots list, and if not clear our bit and wake any waiters.
   1353 */
   1354void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
   1355{
   1356	/*
   1357	 * We put the drop in progress roots at the front of the list, so if the
   1358	 * first entry doesn't have UNFINISHED_DROP set we can wake everybody
   1359	 * up.
   1360	 */
   1361	spin_lock(&fs_info->trans_lock);
   1362	if (!list_empty(&fs_info->dead_roots)) {
   1363		struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
   1364							   struct btrfs_root,
   1365							   root_list);
   1366		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
   1367			spin_unlock(&fs_info->trans_lock);
   1368			return;
   1369		}
   1370	}
   1371	spin_unlock(&fs_info->trans_lock);
   1372
   1373	btrfs_wake_unfinished_drop(fs_info);
   1374}
   1375
   1376/*
   1377 * dead roots are old snapshots that need to be deleted.  This allocates
   1378 * a dirty root struct and adds it into the list of dead roots that need to
   1379 * be deleted
   1380 */
   1381void btrfs_add_dead_root(struct btrfs_root *root)
   1382{
   1383	struct btrfs_fs_info *fs_info = root->fs_info;
   1384
   1385	spin_lock(&fs_info->trans_lock);
   1386	if (list_empty(&root->root_list)) {
   1387		btrfs_grab_root(root);
   1388
   1389		/* We want to process the partially complete drops first. */
   1390		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
   1391			list_add(&root->root_list, &fs_info->dead_roots);
   1392		else
   1393			list_add_tail(&root->root_list, &fs_info->dead_roots);
   1394	}
   1395	spin_unlock(&fs_info->trans_lock);
   1396}
   1397
   1398/*
   1399 * Update each subvolume root and its relocation root, if it exists, in the tree
   1400 * of tree roots. Also free log roots if they exist.
   1401 */
   1402static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
   1403{
   1404	struct btrfs_fs_info *fs_info = trans->fs_info;
   1405	struct btrfs_root *root;
   1406	unsigned long index;
   1407
   1408	/*
   1409	 * At this point no one can be using this transaction to modify any tree
   1410	 * and no one can start another transaction to modify any tree either.
   1411	 */
   1412	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
   1413
   1414	spin_lock(&fs_info->fs_roots_lock);
   1415	xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) {
   1416		int ret;
   1417
   1418		/*
   1419		 * At this point we can neither have tasks logging inodes
   1420		 * from a root nor trying to commit a log tree.
   1421		 */
   1422		ASSERT(atomic_read(&root->log_writers) == 0);
   1423		ASSERT(atomic_read(&root->log_commit[0]) == 0);
   1424		ASSERT(atomic_read(&root->log_commit[1]) == 0);
   1425
   1426		xa_clear_mark(&fs_info->fs_roots,
   1427			      (unsigned long)root->root_key.objectid,
   1428			      BTRFS_ROOT_TRANS_TAG);
   1429		spin_unlock(&fs_info->fs_roots_lock);
   1430
   1431		btrfs_free_log(trans, root);
   1432		ret = btrfs_update_reloc_root(trans, root);
   1433		if (ret)
   1434			return ret;
   1435
   1436		/* See comments in should_cow_block() */
   1437		clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
   1438		smp_mb__after_atomic();
   1439
   1440		if (root->commit_root != root->node) {
   1441			list_add_tail(&root->dirty_list,
   1442				      &trans->transaction->switch_commits);
   1443			btrfs_set_root_node(&root->root_item, root->node);
   1444		}
   1445
   1446		ret = btrfs_update_root(trans, fs_info->tree_root,
   1447					&root->root_key, &root->root_item);
   1448		if (ret)
   1449			return ret;
   1450		spin_lock(&fs_info->fs_roots_lock);
   1451		btrfs_qgroup_free_meta_all_pertrans(root);
   1452	}
   1453	spin_unlock(&fs_info->fs_roots_lock);
   1454	return 0;
   1455}
   1456
   1457/*
   1458 * defrag a given btree.
   1459 * Every leaf in the btree is read and defragged.
   1460 */
   1461int btrfs_defrag_root(struct btrfs_root *root)
   1462{
   1463	struct btrfs_fs_info *info = root->fs_info;
   1464	struct btrfs_trans_handle *trans;
   1465	int ret;
   1466
   1467	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
   1468		return 0;
   1469
   1470	while (1) {
   1471		trans = btrfs_start_transaction(root, 0);
   1472		if (IS_ERR(trans)) {
   1473			ret = PTR_ERR(trans);
   1474			break;
   1475		}
   1476
   1477		ret = btrfs_defrag_leaves(trans, root);
   1478
   1479		btrfs_end_transaction(trans);
   1480		btrfs_btree_balance_dirty(info);
   1481		cond_resched();
   1482
   1483		if (btrfs_fs_closing(info) || ret != -EAGAIN)
   1484			break;
   1485
   1486		if (btrfs_defrag_cancelled(info)) {
   1487			btrfs_debug(info, "defrag_root cancelled");
   1488			ret = -EAGAIN;
   1489			break;
   1490		}
   1491	}
   1492	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
   1493	return ret;
   1494}
   1495
   1496/*
    1497 * Do all the special snapshot-related qgroup dirty hacks.
    1498 *
    1499 * Will do all needed qgroup inherits and dirty hacks, like switching commit
    1500 * roots inside one transaction and writing all btrees to disk, to make
    1501 * qgroups work.
   1502 */
   1503static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
   1504				   struct btrfs_root *src,
   1505				   struct btrfs_root *parent,
   1506				   struct btrfs_qgroup_inherit *inherit,
   1507				   u64 dst_objectid)
   1508{
   1509	struct btrfs_fs_info *fs_info = src->fs_info;
   1510	int ret;
   1511
   1512	/*
   1513	 * Save some performance in the case that qgroups are not
   1514	 * enabled. If this check races with the ioctl, rescan will
   1515	 * kick in anyway.
   1516	 */
   1517	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   1518		return 0;
   1519
   1520	/*
    1521	 * Ensure dirty @src will be committed.  Otherwise, after the upcoming
    1522	 * commit_fs_roots() and switch_commit_roots(), any dirty but not
    1523	 * recorded root will never be updated again, leaving an outdated root
   1524	 * item.
   1525	 */
   1526	ret = record_root_in_trans(trans, src, 1);
   1527	if (ret)
   1528		return ret;
   1529
   1530	/*
   1531	 * btrfs_qgroup_inherit relies on a consistent view of the usage for the
   1532	 * src root, so we must run the delayed refs here.
   1533	 *
   1534	 * However this isn't particularly fool proof, because there's no
   1535	 * synchronization keeping us from changing the tree after this point
   1536	 * before we do the qgroup_inherit, or even from making changes while
   1537	 * we're doing the qgroup_inherit.  But that's a problem for the future,
   1538	 * for now flush the delayed refs to narrow the race window where the
   1539	 * qgroup counters could end up wrong.
   1540	 */
   1541	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
   1542	if (ret) {
   1543		btrfs_abort_transaction(trans, ret);
   1544		return ret;
   1545	}
   1546
   1547	ret = commit_fs_roots(trans);
   1548	if (ret)
   1549		goto out;
   1550	ret = btrfs_qgroup_account_extents(trans);
   1551	if (ret < 0)
   1552		goto out;
   1553
    1554	/* Now qgroups are all updated, we can inherit them into the new qgroups */
   1555	ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
   1556				   inherit);
   1557	if (ret < 0)
   1558		goto out;
   1559
   1560	/*
   1561	 * Now we do a simplified transaction commit, which will:
   1562	 * 1) commit all subvolume and extent trees,
   1563	 *    to ensure every subvolume and extent tree has a valid commit_root
   1564	 *    for accounting the later insert_dir_item()
   1565	 * 2) write all btree blocks to disk,
   1566	 *    to make sure later btree modifications will be COWed rather than
   1567	 *    ending up in the commit roots and causing wrong qgroup numbers
   1568	 * In this simplified commit we don't really care about other trees,
   1569	 * like the chunk and root trees, as they won't affect qgroups.
   1570	 * And we don't write the super block, to avoid a half-committed state.
   1571	 */
   1572	ret = commit_cowonly_roots(trans);
   1573	if (ret)
   1574		goto out;
   1575	switch_commit_roots(trans);
   1576	ret = btrfs_write_and_wait_transaction(trans);
   1577	if (ret)
   1578		btrfs_handle_fs_error(fs_info, ret,
   1579			"Error while writing out transaction for qgroup");
   1580
   1581out:
   1582	/*
   1583	 * Force the parent root to be updated, as we recorded it above, so its
   1584	 * last_trans == cur_transid.
   1585	 * Otherwise it won't be committed to disk again after the later
   1586	 * insert_dir_item().
   1587	 */
   1588	if (!ret)
   1589		ret = record_root_in_trans(trans, parent, 1);
   1590	return ret;
   1591}
   1592
   1593/*
   1594 * New snapshots need to be created at a very specific time in the
   1595 * transaction commit.  This does the actual creation.
   1596 *
   1597 * Note:
   1598 * If an error happens that may affect the commit of the current transaction,
   1599 * we return the error number. If the error only affects the creation of the
   1600 * pending snapshots, we just return 0.
   1601 */
   1602static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
   1603				   struct btrfs_pending_snapshot *pending)
   1604{
   1605
   1606	struct btrfs_fs_info *fs_info = trans->fs_info;
   1607	struct btrfs_key key;
   1608	struct btrfs_root_item *new_root_item;
   1609	struct btrfs_root *tree_root = fs_info->tree_root;
   1610	struct btrfs_root *root = pending->root;
   1611	struct btrfs_root *parent_root;
   1612	struct btrfs_block_rsv *rsv;
   1613	struct inode *parent_inode;
   1614	struct btrfs_path *path;
   1615	struct btrfs_dir_item *dir_item;
   1616	struct dentry *dentry;
   1617	struct extent_buffer *tmp;
   1618	struct extent_buffer *old;
   1619	struct timespec64 cur_time;
   1620	int ret = 0;
   1621	u64 to_reserve = 0;
   1622	u64 index = 0;
   1623	u64 objectid;
   1624	u64 root_flags;
   1625
   1626	ASSERT(pending->path);
   1627	path = pending->path;
   1628
   1629	ASSERT(pending->root_item);
   1630	new_root_item = pending->root_item;
   1631
   1632	pending->error = btrfs_get_free_objectid(tree_root, &objectid);
   1633	if (pending->error)
   1634		goto no_free_objectid;
   1635
   1636	/*
   1637	 * Make qgroups skip the new snapshot's qgroupid, as it will be
   1638	 * accounted for by the later btrfs_qgroup_inherit().
   1639	 */
   1640	btrfs_set_skip_qgroup(trans, objectid);
   1641
   1642	btrfs_reloc_pre_snapshot(pending, &to_reserve);
   1643
   1644	if (to_reserve > 0) {
   1645		pending->error = btrfs_block_rsv_add(fs_info,
   1646						     &pending->block_rsv,
   1647						     to_reserve,
   1648						     BTRFS_RESERVE_NO_FLUSH);
   1649		if (pending->error)
   1650			goto clear_skip_qgroup;
   1651	}
   1652
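       	/*
       	 * Key of the root item for the new snapshot; the offset is replaced
       	 * with the creation transid right before btrfs_insert_root() below.
       	 */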
   1653	key.objectid = objectid;
   1654	key.offset = (u64)-1;
   1655	key.type = BTRFS_ROOT_ITEM_KEY;
   1656
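       	/*
       	 * Temporarily point trans->block_rsv at the snapshot's reservation so
       	 * the work below is charged against it; the original rsv is restored
       	 * at the dir_item_existed label.
       	 */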
   1657	rsv = trans->block_rsv;
   1658	trans->block_rsv = &pending->block_rsv;
   1659	trans->bytes_reserved = trans->block_rsv->reserved;
   1660	trace_btrfs_space_reservation(fs_info, "transaction",
   1661				      trans->transid,
   1662				      trans->bytes_reserved, 1);
   1663	dentry = pending->dentry;
   1664	parent_inode = pending->dir;
   1665	parent_root = BTRFS_I(parent_inode)->root;
   1666	ret = record_root_in_trans(trans, parent_root, 0);
   1667	if (ret)
   1668		goto fail;
   1669	cur_time = current_time(parent_inode);
   1670
   1671	/*
   1672	 * insert the directory item
   1673	 */
   1674	ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
   1675	BUG_ON(ret); /* -ENOMEM */
   1676
   1677	/* check if there is a file/dir which has the same name. */
   1678	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
   1679					 btrfs_ino(BTRFS_I(parent_inode)),
   1680					 dentry->d_name.name,
   1681					 dentry->d_name.len, 0);
   1682	if (dir_item != NULL && !IS_ERR(dir_item)) {
   1683		pending->error = -EEXIST;
   1684		goto dir_item_existed;
   1685	} else if (IS_ERR(dir_item)) {
   1686		ret = PTR_ERR(dir_item);
   1687		btrfs_abort_transaction(trans, ret);
   1688		goto fail;
   1689	}
   1690	btrfs_release_path(path);
   1691
   1692	/*
   1693	 * pull in the delayed directory update
   1694	 * and the delayed inode item
   1695	 * otherwise we corrupt the FS during
   1696	 * snapshot
   1697	 */
   1698	ret = btrfs_run_delayed_items(trans);
   1699	if (ret) {	/* Transaction aborted */
   1700		btrfs_abort_transaction(trans, ret);
   1701		goto fail;
   1702	}
   1703
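       	/*
       	 * Record the source root in this transaction before we copy its root
       	 * item for the new snapshot.
       	 */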
   1704	ret = record_root_in_trans(trans, root, 0);
   1705	if (ret) {
   1706		btrfs_abort_transaction(trans, ret);
   1707		goto fail;
   1708	}
   1709	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
   1710	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
   1711	btrfs_check_and_init_root_item(new_root_item);
   1712
   1713	root_flags = btrfs_root_flags(new_root_item);
   1714	if (pending->readonly)
   1715		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
   1716	else
   1717		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
   1718	btrfs_set_root_flags(new_root_item, root_flags);
   1719
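       	/*
       	 * Fill in the identity of the new root item: generation, a fresh
       	 * UUID and the parent's UUID.  The received UUID and stime/rtime are
       	 * cleared unless the snapshot is created read-only.
       	 */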
   1720	btrfs_set_root_generation_v2(new_root_item,
   1721			trans->transid);
   1722	generate_random_guid(new_root_item->uuid);
   1723	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
   1724			BTRFS_UUID_SIZE);
   1725	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
   1726		memset(new_root_item->received_uuid, 0,
   1727		       sizeof(new_root_item->received_uuid));
   1728		memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
   1729		memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
   1730		btrfs_set_root_stransid(new_root_item, 0);
   1731		btrfs_set_root_rtransid(new_root_item, 0);
   1732	}
   1733	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
   1734	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
   1735	btrfs_set_root_otransid(new_root_item, trans->transid);
   1736
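       	/*
       	 * COW the root node of the source tree and copy it into a new root
       	 * owned by @objectid; the copy becomes the root node of the snapshot.
       	 */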
   1737	old = btrfs_lock_root_node(root);
   1738	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
   1739			      BTRFS_NESTING_COW);
   1740	if (ret) {
   1741		btrfs_tree_unlock(old);
   1742		free_extent_buffer(old);
   1743		btrfs_abort_transaction(trans, ret);
   1744		goto fail;
   1745	}
   1746
   1747	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
   1748	/* clean up in any case */
   1749	btrfs_tree_unlock(old);
   1750	free_extent_buffer(old);
   1751	if (ret) {
   1752		btrfs_abort_transaction(trans, ret);
   1753		goto fail;
   1754	}
   1755	/* see comments in should_cow_block() */
   1756	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
   1757	smp_wmb();
   1758
   1759	btrfs_set_root_node(new_root_item, tmp);
   1760	/* record when the snapshot was created in key.offset */
   1761	key.offset = trans->transid;
   1762	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
   1763	btrfs_tree_unlock(tmp);
   1764	free_extent_buffer(tmp);
   1765	if (ret) {
   1766		btrfs_abort_transaction(trans, ret);
   1767		goto fail;
   1768	}
   1769
   1770	/*
   1771	 * insert root back/forward references
   1772	 */
   1773	ret = btrfs_add_root_ref(trans, objectid,
   1774				 parent_root->root_key.objectid,
   1775				 btrfs_ino(BTRFS_I(parent_inode)), index,
   1776				 dentry->d_name.name, dentry->d_name.len);
   1777	if (ret) {
   1778		btrfs_abort_transaction(trans, ret);
   1779		goto fail;
   1780	}
   1781
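       	/*
       	 * Look up the freshly inserted root as an in-memory fs root, using
       	 * the anon dev stashed in the pending snapshot.
       	 */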
   1782	key.offset = (u64)-1;
   1783	pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
   1784	if (IS_ERR(pending->snap)) {
   1785		ret = PTR_ERR(pending->snap);
   1786		pending->snap = NULL;
   1787		btrfs_abort_transaction(trans, ret);
   1788		goto fail;
   1789	}
   1790
   1791	ret = btrfs_reloc_post_snapshot(trans, pending);
   1792	if (ret) {
   1793		btrfs_abort_transaction(trans, ret);
   1794		goto fail;
   1795	}
   1796
   1797	/*
   1798	 * Do the special qgroup accounting for the snapshot: we use a qgroup
   1799	 * hack to make snapshot creation fast, and to cooperate with that
   1800	 * hack we do the accounting hack here as well. Otherwise the snapshot
   1801	 * would be greatly slowed down by a subtree qgroup rescan.
   1802	 */
   1803	ret = qgroup_account_snapshot(trans, root, parent_root,
   1804				      pending->inherit, objectid);
   1805	if (ret < 0)
   1806		goto fail;
   1807
   1808	ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
   1809				    dentry->d_name.len, BTRFS_I(parent_inode),
   1810				    &key, BTRFS_FT_DIR, index);
   1811	/* We checked the name at the beginning, so it is impossible. */
   1812	BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
   1813	if (ret) {
   1814		btrfs_abort_transaction(trans, ret);
   1815		goto fail;
   1816	}
   1817
   1818	btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
   1819					 dentry->d_name.len * 2);
   1820	parent_inode->i_mtime = parent_inode->i_ctime =
   1821		current_time(parent_inode);
   1822	ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
   1823	if (ret) {
   1824		btrfs_abort_transaction(trans, ret);
   1825		goto fail;
   1826	}
   1827	ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
   1828				  BTRFS_UUID_KEY_SUBVOL,
   1829				  objectid);
   1830	if (ret) {
   1831		btrfs_abort_transaction(trans, ret);
   1832		goto fail;
   1833	}
   1834	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
   1835		ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
   1836					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
   1837					  objectid);
   1838		if (ret && ret != -EEXIST) {
   1839			btrfs_abort_transaction(trans, ret);
   1840			goto fail;
   1841		}
   1842	}
   1843
   1844fail:
   1845	pending->error = ret;
   1846dir_item_existed:
   1847	trans->block_rsv = rsv;
   1848	trans->bytes_reserved = 0;
   1849clear_skip_qgroup:
   1850	btrfs_clear_skip_qgroup(trans);
   1851no_free_objectid:
   1852	kfree(new_root_item);
   1853	pending->root_item = NULL;
   1854	btrfs_free_path(path);
   1855	pending->path = NULL;
   1856
   1857	return ret;
   1858}
   1859
   1860/*
   1861 * create all the snapshots we've scheduled for creation
   1862 */
   1863static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
   1864{
   1865	struct btrfs_pending_snapshot *pending, *next;
   1866	struct list_head *head = &trans->transaction->pending_snapshots;
   1867	int ret = 0;
   1868
   1869	list_for_each_entry_safe(pending, next, head, list) {
   1870		list_del(&pending->list);
   1871		ret = create_pending_snapshot(trans, pending);
   1872		if (ret)
   1873			break;
   1874	}
   1875	return ret;
   1876}
   1877
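       /*
        * Copy the latest bytenr, generation and level of the core tree roots
        * into the in-memory super block copy, and refresh the space cache and
        * uuid tree generations.
        */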
   1878static void update_super_roots(struct btrfs_fs_info *fs_info)
   1879{
   1880	struct btrfs_root_item *root_item;
   1881	struct btrfs_super_block *super;
   1882
   1883	super = fs_info->super_copy;
   1884
   1885	root_item = &fs_info->chunk_root->root_item;
   1886	super->chunk_root = root_item->bytenr;
   1887	super->chunk_root_generation = root_item->generation;
   1888	super->chunk_root_level = root_item->level;
   1889
   1890	root_item = &fs_info->tree_root->root_item;
   1891	super->root = root_item->bytenr;
   1892	super->generation = root_item->generation;
   1893	super->root_level = root_item->level;
   1894	if (btrfs_test_opt(fs_info, SPACE_CACHE))
   1895		super->cache_generation = root_item->generation;
   1896	else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
   1897		super->cache_generation = 0;
   1898	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
   1899		super->uuid_tree_generation = root_item->generation;
   1900
   1901	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
   1902		root_item = &fs_info->block_group_root->root_item;
   1903
   1904		super->block_group_root = root_item->bytenr;
   1905		super->block_group_root_generation = root_item->generation;
   1906		super->block_group_root_level = root_item->level;
   1907	}
   1908}
   1909
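       /*
        * Report whether the currently running transaction has reached the
        * commit phase (state >= TRANS_STATE_COMMIT_START).
        */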
   1910int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
   1911{
   1912	struct btrfs_transaction *trans;
   1913	int ret = 0;
   1914
   1915	spin_lock(&info->trans_lock);
   1916	trans = info->running_transaction;
   1917	if (trans)
   1918		ret = (trans->state >= TRANS_STATE_COMMIT_START);
   1919	spin_unlock(&info->trans_lock);
   1920	return ret;
   1921}
   1922
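       /*
        * Report whether the currently running transaction is in a blocked
        * state, as defined by is_transaction_blocked().
        */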
   1923int btrfs_transaction_blocked(struct btrfs_fs_info *info)
   1924{
   1925	struct btrfs_transaction *trans;
   1926	int ret = 0;
   1927
   1928	spin_lock(&info->trans_lock);
   1929	trans = info->running_transaction;
   1930	if (trans)
   1931		ret = is_transaction_blocked(trans);
   1932	spin_unlock(&info->trans_lock);
   1933	return ret;
   1934}
   1935
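       /*
        * Kick the transaction kthread to do the commit and wait only until
        * the commit has started (or the transaction has been aborted).
        */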
   1936void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
   1937{
   1938	struct btrfs_fs_info *fs_info = trans->fs_info;
   1939	struct btrfs_transaction *cur_trans;
   1940
   1941	/* Kick the transaction kthread. */
   1942	set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
   1943	wake_up_process(fs_info->transaction_kthread);
   1944
   1945	/* take transaction reference */
   1946	cur_trans = trans->transaction;
   1947	refcount_inc(&cur_trans->use_count);
   1948
   1949	btrfs_end_transaction(trans);
   1950
   1951	/*
   1952	 * Wait for the current transaction commit to start and block
   1953	 * subsequent transaction joins
   1954	 */
   1955	wait_event(fs_info->transaction_blocked_wait,
   1956		   cur_trans->state >= TRANS_STATE_COMMIT_START ||
   1957		   TRANS_ABORTED(cur_trans));
   1958	btrfs_put_transaction(cur_trans);
   1959}
   1960
   1961static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
   1962{
   1963	struct btrfs_fs_info *fs_info = trans->fs_info;
   1964	struct btrfs_transaction *cur_trans = trans->transaction;
   1965
   1966	WARN_ON(refcount_read(&trans->use_count) > 1);
   1967
   1968	btrfs_abort_transaction(trans, err);
   1969
   1970	spin_lock(&fs_info->trans_lock);
   1971
   1972	/*
   1973	 * If the transaction is removed from the list, it means this
   1974	 * transaction has been committed successfully, so it is impossible
   1975	 * to call the cleanup function.
   1976	 */
   1977	BUG_ON(list_empty(&cur_trans->list));
   1978
   1979	if (cur_trans == fs_info->running_transaction) {
   1980		cur_trans->state = TRANS_STATE_COMMIT_DOING;
   1981		spin_unlock(&fs_info->trans_lock);
   1982		wait_event(cur_trans->writer_wait,
   1983			   atomic_read(&cur_trans->num_writers) == 1);
   1984
   1985		spin_lock(&fs_info->trans_lock);
   1986	}
   1987
   1988	/*
   1989	 * Now that we know no one else is still using the transaction we can
   1990	 * remove the transaction from the list of transactions. This avoids
   1991	 * the transaction kthread from cleaning up the transaction while some
   1992	 * other task is still using it, which could result in a use-after-free
   1993	 * on things like log trees, as it forces the transaction kthread to
   1994	 * wait for this transaction to be cleaned up by us.
   1995	 */
   1996	list_del_init(&cur_trans->list);
   1997
   1998	spin_unlock(&fs_info->trans_lock);
   1999
   2000	btrfs_cleanup_one_transaction(trans->transaction, fs_info);
   2001
   2002	spin_lock(&fs_info->trans_lock);
   2003	if (cur_trans == fs_info->running_transaction)
   2004		fs_info->running_transaction = NULL;
   2005	spin_unlock(&fs_info->trans_lock);
   2006
   2007	if (trans->type & __TRANS_FREEZABLE)
   2008		sb_end_intwrite(fs_info->sb);
   2009	btrfs_put_transaction(cur_trans);
   2010	btrfs_put_transaction(cur_trans);
   2011
   2012	trace_btrfs_transaction_commit(fs_info);
   2013
   2014	if (current->journal_info == trans)
   2015		current->journal_info = NULL;
   2016	btrfs_scrub_cancel(fs_info);
   2017
   2018	kmem_cache_free(btrfs_trans_handle_cachep, trans);
   2019}
   2020
   2021/*
   2022 * Release reserved delayed ref space of all pending block groups of the
   2023 * transaction and remove them from the list
   2024 */
   2025static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
   2026{
   2027	struct btrfs_fs_info *fs_info = trans->fs_info;
   2028	struct btrfs_block_group *block_group, *tmp;
   2029
   2030	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
   2031		btrfs_delayed_refs_rsv_release(fs_info, 1);
   2032		list_del_init(&block_group->bg_list);
   2033	}
   2034}
   2035
   2036static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
   2037{
   2038	/*
   2039	 * We use try_to_writeback_inodes_sb() here because if we used
   2040	 * btrfs_start_delalloc_roots we would deadlock with fs freeze.
   2041	 * We are currently holding the fs freeze lock; if we did an async flush
   2042	 * we would do btrfs_join_transaction() and deadlock because we would
   2043	 * need to wait for the fs freeze lock.  Using the direct flushing we
   2044	 * benefit from already being in a transaction and our join_transaction
   2045	 * doesn't have to re-take the fs freeze lock.
   2046	 *
   2047	 * Note that try_to_writeback_inodes_sb() will only trigger writeback
   2048	 * if it can read lock sb->s_umount. It will always be able to lock it,
   2049	 * except when the filesystem is being unmounted or being frozen, but in
   2050	 * those cases sync_filesystem() is called, which results in calling
   2051	 * writeback_inodes_sb() while holding a write lock on sb->s_umount.
   2052	 * Note that we don't call writeback_inodes_sb() directly, because it
   2053	 * will emit a warning if sb->s_umount is not locked.
   2054	 */
   2055	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
   2056		try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
   2057	return 0;
   2058}
   2059
   2060static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
   2061{
   2062	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
   2063		btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
   2064}
   2065
   2066/*
   2067 * Add the pending snapshot associated with the given transaction handle to
   2068 * the transaction's list of pending snapshots. This must be called after the
   2069 * transaction commit started and while holding fs_info->trans_lock.
   2070 * This serves to guarantee a caller of btrfs_commit_transaction() that it can
   2071 * safely free the pending snapshot pointer in case btrfs_commit_transaction()
   2072 * returns an error.
   2073 */
   2074static void add_pending_snapshot(struct btrfs_trans_handle *trans)
   2075{
   2076	struct btrfs_transaction *cur_trans = trans->transaction;
   2077
   2078	if (!trans->pending_snapshot)
   2079		return;
   2080
   2081	lockdep_assert_held(&trans->fs_info->trans_lock);
   2082	ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
   2083
   2084	list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
   2085}
   2086
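       /*
        * Commit the transaction @trans is attached to; the handle is consumed
        * in all cases.
        */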
   2087int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
   2088{
   2089	struct btrfs_fs_info *fs_info = trans->fs_info;
   2090	struct btrfs_transaction *cur_trans = trans->transaction;
   2091	struct btrfs_transaction *prev_trans = NULL;
   2092	int ret;
   2093
   2094	ASSERT(refcount_read(&trans->use_count) == 1);
   2095
   2096	/* Stop the commit early if ->aborted is set */
   2097	if (TRANS_ABORTED(cur_trans)) {
   2098		ret = cur_trans->aborted;
   2099		btrfs_end_transaction(trans);
   2100		return ret;
   2101	}
   2102
   2103	btrfs_trans_release_metadata(trans);
   2104	trans->block_rsv = NULL;
   2105
   2106	/*
   2107	 * We only want one transaction commit doing the flushing so we do not
   2108	 * waste a bunch of time on lock contention on the extent root node.
   2109	 */
   2110	if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
   2111			      &cur_trans->delayed_refs.flags)) {
   2112		/*
   2113		 * Make a pass through all the delayed refs we have so far.
   2114		 * Any running threads may add more while we are here.
   2115		 */
   2116		ret = btrfs_run_delayed_refs(trans, 0);
   2117		if (ret) {
   2118			btrfs_end_transaction(trans);
   2119			return ret;
   2120		}
   2121	}
   2122
   2123	btrfs_create_pending_block_groups(trans);
   2124
   2125	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
   2126		int run_it = 0;
   2127
   2128		/* this mutex is also taken before trying to set
   2129		 * block groups readonly.  We need to make sure
   2130		 * that nobody has set a block group readonly
   2131		 * after extents from that block group have been
   2132		 * allocated for cache files.  btrfs_set_block_group_ro
   2133		 * will wait for the transaction to commit if it
   2134		 * finds BTRFS_TRANS_DIRTY_BG_RUN set.
   2135		 *
   2136		 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
   2137		 * only one process starts all the block group IO.  It wouldn't
   2138		 * hurt to have more than one go through, but there's no
   2139		 * real advantage to it either.
   2140		 */
   2141		mutex_lock(&fs_info->ro_block_group_mutex);
   2142		if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
   2143				      &cur_trans->flags))
   2144			run_it = 1;
   2145		mutex_unlock(&fs_info->ro_block_group_mutex);
   2146
   2147		if (run_it) {
   2148			ret = btrfs_start_dirty_block_groups(trans);
   2149			if (ret) {
   2150				btrfs_end_transaction(trans);
   2151				return ret;
   2152			}
   2153		}
   2154	}
   2155
   2156	spin_lock(&fs_info->trans_lock);
   2157	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
   2158		enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
   2159
   2160		add_pending_snapshot(trans);
   2161
   2162		spin_unlock(&fs_info->trans_lock);
   2163		refcount_inc(&cur_trans->use_count);
   2164
   2165		if (trans->in_fsync)
   2166			want_state = TRANS_STATE_SUPER_COMMITTED;
   2167		ret = btrfs_end_transaction(trans);
   2168		wait_for_commit(cur_trans, want_state);
   2169
   2170		if (TRANS_ABORTED(cur_trans))
   2171			ret = cur_trans->aborted;
   2172
   2173		btrfs_put_transaction(cur_trans);
   2174
   2175		return ret;
   2176	}
   2177
   2178	cur_trans->state = TRANS_STATE_COMMIT_START;
   2179	wake_up(&fs_info->transaction_blocked_wait);
   2180
   2181	if (cur_trans->list.prev != &fs_info->trans_list) {
   2182		enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
   2183
   2184		if (trans->in_fsync)
   2185			want_state = TRANS_STATE_SUPER_COMMITTED;
   2186
   2187		prev_trans = list_entry(cur_trans->list.prev,
   2188					struct btrfs_transaction, list);
   2189		if (prev_trans->state < want_state) {
   2190			refcount_inc(&prev_trans->use_count);
   2191			spin_unlock(&fs_info->trans_lock);
   2192
   2193			wait_for_commit(prev_trans, want_state);
   2194
   2195			ret = READ_ONCE(prev_trans->aborted);
   2196
   2197			btrfs_put_transaction(prev_trans);
   2198			if (ret)
   2199				goto cleanup_transaction;
   2200		} else {
   2201			spin_unlock(&fs_info->trans_lock);
   2202		}
   2203	} else {
   2204		spin_unlock(&fs_info->trans_lock);
   2205		/*
   2206		 * The previous transaction was aborted and was already removed
   2207		 * from the list of transactions at fs_info->trans_list. So we
   2208		 * abort to prevent writing a new superblock that reflects a
   2209		 * corrupt state (pointing to trees with unwritten nodes/leaves).
   2210		 */
   2211		if (BTRFS_FS_ERROR(fs_info)) {
   2212			ret = -EROFS;
   2213			goto cleanup_transaction;
   2214		}
   2215	}
   2216
   2217	extwriter_counter_dec(cur_trans, trans->type);
   2218
   2219	ret = btrfs_start_delalloc_flush(fs_info);
   2220	if (ret)
   2221		goto cleanup_transaction;
   2222
   2223	ret = btrfs_run_delayed_items(trans);
   2224	if (ret)
   2225		goto cleanup_transaction;
   2226
   2227	wait_event(cur_trans->writer_wait,
   2228		   extwriter_counter_read(cur_trans) == 0);
   2229
   2230	/* Some pending items might have been added after the previous flush. */
   2231	ret = btrfs_run_delayed_items(trans);
   2232	if (ret)
   2233		goto cleanup_transaction;
   2234
   2235	btrfs_wait_delalloc_flush(fs_info);
   2236
   2237	/*
   2238	 * Wait for all ordered extents started by a fast fsync that joined this
   2239	 * transaction. Otherwise if this transaction commits before the ordered
   2240	 * extents complete we lose logged data after a power failure.
   2241	 */
   2242	wait_event(cur_trans->pending_wait,
   2243		   atomic_read(&cur_trans->pending_ordered) == 0);
   2244
   2245	btrfs_scrub_pause(fs_info);
   2246	/*
   2247	 * Ok now we need to make sure to block out any other joins while we
   2248	 * commit the transaction.  We could have started a join before setting
   2249	 * COMMIT_DOING, so make sure to wait for num_writers to drop to 1 again.
   2250	 */
   2251	spin_lock(&fs_info->trans_lock);
   2252	add_pending_snapshot(trans);
   2253	cur_trans->state = TRANS_STATE_COMMIT_DOING;
   2254	spin_unlock(&fs_info->trans_lock);
   2255	wait_event(cur_trans->writer_wait,
   2256		   atomic_read(&cur_trans->num_writers) == 1);
   2257
   2258	/*
   2259	 * We've started the commit, clear the flag in case we were triggered to
   2260	 * do an async commit but somebody else started before the transaction
   2261	 * kthread could do the work.
   2262	 */
   2263	clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
   2264
   2265	if (TRANS_ABORTED(cur_trans)) {
   2266		ret = cur_trans->aborted;
   2267		goto scrub_continue;
   2268	}
   2269	/*
   2270	 * the reloc mutex makes sure that we stop
   2271	 * the balancing code from coming in and moving
   2272	 * extents around in the middle of the commit
   2273	 */
   2274	mutex_lock(&fs_info->reloc_mutex);
   2275
   2276	/*
   2277	 * We needn't worry about the delayed items because we will
   2278	 * deal with them in create_pending_snapshot(), which is the
   2279	 * core function of the snapshot creation.
   2280	 */
   2281	ret = create_pending_snapshots(trans);
   2282	if (ret)
   2283		goto unlock_reloc;
   2284
   2285	/*
   2286	 * We insert the dir indexes of the snapshots and update the inode
   2287	 * of the snapshots' parents after the snapshot creation, so there
   2288	 * are some delayed items which are not dealt with. Now deal with
   2289	 * them.
   2290	 *
   2291	 * We needn't worry that this operation will corrupt the snapshots,
   2292	 * because all the trees which are snapshotted will be forced to COW
   2293	 * their nodes and leaves.
   2294	 */
   2295	ret = btrfs_run_delayed_items(trans);
   2296	if (ret)
   2297		goto unlock_reloc;
   2298
   2299	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
   2300	if (ret)
   2301		goto unlock_reloc;
   2302
   2303	/*
   2304	 * make sure none of the code above managed to slip in a
   2305	 * delayed item
   2306	 */
   2307	btrfs_assert_delayed_root_empty(fs_info);
   2308
   2309	WARN_ON(cur_trans != trans->transaction);
   2310
   2311	ret = commit_fs_roots(trans);
   2312	if (ret)
   2313		goto unlock_reloc;
   2314
   2315	/*
   2316	 * Since the transaction is done, we can apply the pending changes
   2317	 * before the next transaction.
   2318	 */
   2319	btrfs_apply_pending_changes(fs_info);
   2320
   2321	/* commit_fs_roots() gets rid of all the tree log roots, so it is
   2322	 * now safe to free the log root tree.
   2323	 */
   2324	btrfs_free_log_root_tree(trans, fs_info);
   2325
   2326	/*
   2327	 * Since fs roots are all committed, we can get a quite accurate
   2328	 * new_roots. So let's do quota accounting.
   2329	 */
   2330	ret = btrfs_qgroup_account_extents(trans);
   2331	if (ret < 0)
   2332		goto unlock_reloc;
   2333
   2334	ret = commit_cowonly_roots(trans);
   2335	if (ret)
   2336		goto unlock_reloc;
   2337
   2338	/*
   2339	 * The tasks which save the space cache and inode cache may also
   2340	 * update ->aborted, so check it.
   2341	 */
   2342	if (TRANS_ABORTED(cur_trans)) {
   2343		ret = cur_trans->aborted;
   2344		goto unlock_reloc;
   2345	}
   2346
   2347	cur_trans = fs_info->running_transaction;
   2348
   2349	btrfs_set_root_node(&fs_info->tree_root->root_item,
   2350			    fs_info->tree_root->node);
   2351	list_add_tail(&fs_info->tree_root->dirty_list,
   2352		      &cur_trans->switch_commits);
   2353
   2354	btrfs_set_root_node(&fs_info->chunk_root->root_item,
   2355			    fs_info->chunk_root->node);
   2356	list_add_tail(&fs_info->chunk_root->dirty_list,
   2357		      &cur_trans->switch_commits);
   2358
   2359	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
   2360		btrfs_set_root_node(&fs_info->block_group_root->root_item,
   2361				    fs_info->block_group_root->node);
   2362		list_add_tail(&fs_info->block_group_root->dirty_list,
   2363			      &cur_trans->switch_commits);
   2364	}
   2365
   2366	switch_commit_roots(trans);
   2367
   2368	ASSERT(list_empty(&cur_trans->dirty_bgs));
   2369	ASSERT(list_empty(&cur_trans->io_bgs));
   2370	update_super_roots(fs_info);
   2371
   2372	btrfs_set_super_log_root(fs_info->super_copy, 0);
   2373	btrfs_set_super_log_root_level(fs_info->super_copy, 0);
   2374	memcpy(fs_info->super_for_commit, fs_info->super_copy,
   2375	       sizeof(*fs_info->super_copy));
   2376
   2377	btrfs_commit_device_sizes(cur_trans);
   2378
   2379	clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
   2380	clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
   2381
   2382	btrfs_trans_release_chunk_metadata(trans);
   2383
   2384	/*
   2385	 * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
   2386	 * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
   2387	 * make sure that before we commit our superblock, no other task can
   2388	 * start a new transaction and commit a log tree before we commit our
   2389	 * superblock. Anyone trying to commit a log tree locks this mutex before
   2390	 * writing its superblock.
   2391	 */
   2392	mutex_lock(&fs_info->tree_log_mutex);
   2393
   2394	spin_lock(&fs_info->trans_lock);
   2395	cur_trans->state = TRANS_STATE_UNBLOCKED;
   2396	fs_info->running_transaction = NULL;
   2397	spin_unlock(&fs_info->trans_lock);
   2398	mutex_unlock(&fs_info->reloc_mutex);
   2399
   2400	wake_up(&fs_info->transaction_wait);
   2401
   2402	ret = btrfs_write_and_wait_transaction(trans);
   2403	if (ret) {
   2404		btrfs_handle_fs_error(fs_info, ret,
   2405				      "Error while writing out transaction");
   2406		mutex_unlock(&fs_info->tree_log_mutex);
   2407		goto scrub_continue;
   2408	}
   2409
   2410	/*
   2411	 * At this point, we should have written all the tree blocks allocated
   2412	 * in this transaction. So it's now safe to free the redirtied extent
   2413	 * buffers.
   2414	 */
   2415	btrfs_free_redirty_list(cur_trans);
   2416
   2417	ret = write_all_supers(fs_info, 0);
   2418	/*
   2419	 * the super is written, we can safely allow the tree-loggers
   2420	 * to go about their business
   2421	 */
   2422	mutex_unlock(&fs_info->tree_log_mutex);
   2423	if (ret)
   2424		goto scrub_continue;
   2425
   2426	/*
   2427	 * We needn't acquire the lock here because there is no other task
   2428	 * which can change it.
   2429	 */
   2430	cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
   2431	wake_up(&cur_trans->commit_wait);
   2432
   2433	btrfs_finish_extent_commit(trans);
   2434
   2435	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
   2436		btrfs_clear_space_info_full(fs_info);
   2437
   2438	fs_info->last_trans_committed = cur_trans->transid;
   2439	/*
   2440	 * We needn't acquire the lock here because there is no other task
   2441	 * which can change it.
   2442	 */
   2443	cur_trans->state = TRANS_STATE_COMPLETED;
   2444	wake_up(&cur_trans->commit_wait);
   2445
   2446	spin_lock(&fs_info->trans_lock);
   2447	list_del_init(&cur_trans->list);
   2448	spin_unlock(&fs_info->trans_lock);
   2449
   2450	btrfs_put_transaction(cur_trans);
   2451	btrfs_put_transaction(cur_trans);
   2452
   2453	if (trans->type & __TRANS_FREEZABLE)
   2454		sb_end_intwrite(fs_info->sb);
   2455
   2456	trace_btrfs_transaction_commit(fs_info);
   2457
   2458	btrfs_scrub_continue(fs_info);
   2459
   2460	if (current->journal_info == trans)
   2461		current->journal_info = NULL;
   2462
   2463	kmem_cache_free(btrfs_trans_handle_cachep, trans);
   2464
   2465	return ret;
   2466
   2467unlock_reloc:
   2468	mutex_unlock(&fs_info->reloc_mutex);
   2469scrub_continue:
   2470	btrfs_scrub_continue(fs_info);
   2471cleanup_transaction:
   2472	btrfs_trans_release_metadata(trans);
   2473	btrfs_cleanup_pending_block_groups(trans);
   2474	btrfs_trans_release_chunk_metadata(trans);
   2475	trans->block_rsv = NULL;
   2476	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
   2477	if (current->journal_info == trans)
   2478		current->journal_info = NULL;
   2479	cleanup_transaction(trans, ret);
   2480
   2481	return ret;
   2482}
   2483
   2484/*
   2485 * Return < 0 on error,
   2486 * 0 if there are no more dead_roots at the time of the call,
   2487 * 1 if there are more to be processed, call me again.
   2488 *
   2489 * A return value of 1 means there are certainly more snapshots to delete, but
   2490 * if a new one shows up while we are processing, this may return 0. We don't
   2491 * mind, because btrfs_commit_super() will poke the cleaner thread and it will
   2492 * process it a few seconds later.
   2493 */
   2494int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
   2495{
   2496	struct btrfs_root *root;
   2497	int ret;
   2498
   2499	spin_lock(&fs_info->trans_lock);
   2500	if (list_empty(&fs_info->dead_roots)) {
   2501		spin_unlock(&fs_info->trans_lock);
   2502		return 0;
   2503	}
   2504	root = list_first_entry(&fs_info->dead_roots,
   2505			struct btrfs_root, root_list);
   2506	list_del_init(&root->root_list);
   2507	spin_unlock(&fs_info->trans_lock);
   2508
   2509	btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
   2510
   2511	btrfs_kill_all_delayed_nodes(root);
   2512
   2513	if (btrfs_header_backref_rev(root->node) <
   2514			BTRFS_MIXED_BACKREF_REV)
   2515		ret = btrfs_drop_snapshot(root, 0, 0);
   2516	else
   2517		ret = btrfs_drop_snapshot(root, 1, 0);
   2518
   2519	btrfs_put_root(root);
   2520	return (ret < 0) ? 0 : 1;
   2521}
   2522
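       /*
        * Consume the bits accumulated in fs_info->pending_changes; only
        * BTRFS_PENDING_COMMIT is recognized here, any other bit is reported
        * and ignored.
        */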
   2523void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
   2524{
   2525	unsigned long prev;
   2526	unsigned long bit;
   2527
   2528	prev = xchg(&fs_info->pending_changes, 0);
   2529	if (!prev)
   2530		return;
   2531
   2532	bit = 1 << BTRFS_PENDING_COMMIT;
   2533	if (prev & bit)
   2534		btrfs_debug(fs_info, "pending commit done");
   2535	prev &= ~bit;
   2536
   2537	if (prev)
   2538		btrfs_warn(fs_info,
   2539			"unknown pending changes left 0x%lx, ignoring", prev);
   2540}