disk-io.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
disk-io.c (150957B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2007 Oracle.  All rights reserved.
      4 */
      5
      6#include <linux/fs.h>
      7#include <linux/blkdev.h>
      8#include <linux/writeback.h>
      9#include <linux/workqueue.h>
     10#include <linux/kthread.h>
     11#include <linux/slab.h>
     12#include <linux/migrate.h>
     13#include <linux/ratelimit.h>
     14#include <linux/uuid.h>
     15#include <linux/semaphore.h>
     16#include <linux/error-injection.h>
     17#include <linux/crc32c.h>
     18#include <linux/sched/mm.h>
     19#include <asm/unaligned.h>
     20#include <crypto/hash.h>
     21#include "ctree.h"
     22#include "disk-io.h"
     23#include "transaction.h"
     24#include "btrfs_inode.h"
     25#include "volumes.h"
     26#include "print-tree.h"
     27#include "locking.h"
     28#include "tree-log.h"
     29#include "free-space-cache.h"
     30#include "free-space-tree.h"
     31#include "check-integrity.h"
     32#include "rcu-string.h"
     33#include "dev-replace.h"
     34#include "raid56.h"
     35#include "sysfs.h"
     36#include "qgroup.h"
     37#include "compression.h"
     38#include "tree-checker.h"
     39#include "ref-verify.h"
     40#include "block-group.h"
     41#include "discard.h"
     42#include "space-info.h"
     43#include "zoned.h"
     44#include "subpage.h"
     45
/*
 * Mask made of the header/superblock flag bits listed below.
 * NOTE(review): "SUPP" presumably stands for "supported"; the users of this
 * mask are outside this part of the file - confirm there.
 */
#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)
     52
/*
 * Forward declarations of file-local helpers.  end_workqueue_fn() is needed
 * by end_workqueue_bio() below; the definitions themselves are not in this
 * part of the file.
 */
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
     65
/*
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 *
 * One is allocated per bio by btrfs_bio_wq_end_io(), which parks the bio's
 * original completion hooks here and installs end_workqueue_bio() instead.
 */
struct btrfs_end_io_wq {
	/* The bio whose completion handling is being deferred */
	struct bio *bio;
	/* Original bio->bi_end_io, saved by btrfs_bio_wq_end_io() */
	bio_end_io_t *end_io;
	/* Original bio->bi_private, saved by btrfs_bio_wq_end_io() */
	void *private;
	/* Filesystem whose endio workqueues service this bio */
	struct btrfs_fs_info *info;
	/* Copy of bio->bi_status, recorded by end_workqueue_bio() */
	blk_status_t status;
	/* Endio type; end_workqueue_bio() uses it to pick the workqueue */
	enum btrfs_wq_endio_type metadata;
	/* Work item queued by end_workqueue_bio() */
	struct btrfs_work work;
};
     80
/* Slab cache that struct btrfs_end_io_wq objects are allocated from. */
static struct kmem_cache *btrfs_end_io_wq_cache;
     82
     83int __init btrfs_end_io_wq_init(void)
     84{
     85	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
     86					sizeof(struct btrfs_end_io_wq),
     87					0,
     88					SLAB_MEM_SPREAD,
     89					NULL);
     90	if (!btrfs_end_io_wq_cache)
     91		return -ENOMEM;
     92	return 0;
     93}
     94
/* Destroy the slab cache that btrfs_end_io_wq_init() created. */
void __cold btrfs_end_io_wq_exit(void)
{
	kmem_cache_destroy(btrfs_end_io_wq_cache);
}
     99
    100static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
    101{
    102	if (fs_info->csum_shash)
    103		crypto_free_shash(fs_info->csum_shash);
    104}
    105
/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 *
 * Allocated and filled in by btrfs_wq_submit_bio(), freed by
 * run_one_async_free().
 */
struct async_submit_bio {
	/* Inode the bio belongs to; passed to submit_bio_start */
	struct inode *inode;
	/* The bio being submitted */
	struct bio *bio;
	/* Callback invoked from run_one_async_start() */
	extent_submit_bio_start_t *submit_bio_start;
	/* Mirror number handed to btrfs_map_bio() in run_one_async_done() */
	int mirror_num;

	/* Optional parameter for submit_bio_start used by direct io */
	u64 dio_file_offset;
	/* Work item carrying the start/done/free callbacks */
	struct btrfs_work work;
	/* Non-zero if submit_bio_start failed; checked in run_one_async_done() */
	blk_status_t status;
};
    122
    123/*
    124 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
    125 * eb, the lockdep key is determined by the btrfs_root it belongs to and
    126 * the level the eb occupies in the tree.
    127 *
    128 * Different roots are used for different purposes and may nest inside each
    129 * other and they require separate keysets.  As lockdep keys should be
    130 * static, assign keysets according to the purpose of the root as indicated
    131 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
    132 * roots have separate keysets.
    133 *
    134 * Lock-nesting across peer nodes is always done with the immediate parent
    135 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
    136 * subclass to avoid triggering lockdep warning in such cases.
    137 *
    138 * The key is set by the readpage_end_io_hook after the buffer has passed
    139 * csum validation but before the pages are unlocked.  It is also set by
    140 * btrfs_init_new_buffer on freshly allocated blocks.
    141 *
    142 * We also add a check to make sure the highest level of the tree is the
    143 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
    144 * needs update as well.
    145 */
    146#ifdef CONFIG_DEBUG_LOCK_ALLOC
    147# if BTRFS_MAX_LEVEL != 8
    148#  error
    149# endif
    150
/*
 * Designated initializer for the lock class name of one level, e.g.
 * DEFINE_LEVEL("root", 3) sets .names[3] to "btrfs-root-03".
 */
#define DEFINE_LEVEL(stem, level)					\
	.names[level] = "btrfs-" stem "-0" #level,

/* Name initializers for levels 0 through 7 of one keyset. */
#define DEFINE_NAME(stem)						\
	DEFINE_LEVEL(stem, 0)						\
	DEFINE_LEVEL(stem, 1)						\
	DEFINE_LEVEL(stem, 2)						\
	DEFINE_LEVEL(stem, 3)						\
	DEFINE_LEVEL(stem, 4)						\
	DEFINE_LEVEL(stem, 5)						\
	DEFINE_LEVEL(stem, 6)						\
	DEFINE_LEVEL(stem, 7)

/*
 * One keyset per special-purpose root objectid.  The last entry has id 0:
 * it terminates the lookup loop in btrfs_set_buffer_lockdep_class() and is
 * the keyset used when no other entry matches.
 */
static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	/* Longest entry: btrfs-free-space-00 */
	char			names[BTRFS_MAX_LEVEL][20];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	DEFINE_NAME("extent")	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	DEFINE_NAME("chunk")	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	DEFINE_NAME("dev")	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	DEFINE_NAME("csum")	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	DEFINE_NAME("quota")	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	DEFINE_NAME("log")	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	DEFINE_NAME("treloc")	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	DEFINE_NAME("dreloc")	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
	{ .id = 0,				DEFINE_NAME("tree")	},
};

/* The helper macros are only needed to build the table above. */
#undef DEFINE_LEVEL
#undef DEFINE_NAME
    186
    187void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
    188				    int level)
    189{
    190	struct btrfs_lockdep_keyset *ks;
    191
    192	BUG_ON(level >= ARRAY_SIZE(ks->keys));
    193
    194	/* find the matching keyset, id 0 is the default entry */
    195	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
    196		if (ks->id == objectid)
    197			break;
    198
    199	lockdep_set_class_and_name(&eb->lock,
    200				   &ks->keys[level], ks->names[level]);
    201}
    202
    203#endif
    204
    205/*
    206 * Compute the csum of a btree block and store the result to provided buffer.
    207 */
    208static void csum_tree_block(struct extent_buffer *buf, u8 *result)
    209{
    210	struct btrfs_fs_info *fs_info = buf->fs_info;
    211	const int num_pages = num_extent_pages(buf);
    212	const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
    213	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
    214	char *kaddr;
    215	int i;
    216
    217	shash->tfm = fs_info->csum_shash;
    218	crypto_shash_init(shash);
    219	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
    220	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
    221			    first_page_part - BTRFS_CSUM_SIZE);
    222
    223	for (i = 1; i < num_pages; i++) {
    224		kaddr = page_address(buf->pages[i]);
    225		crypto_shash_update(shash, kaddr, PAGE_SIZE);
    226	}
    227	memset(result, 0, BTRFS_CSUM_SIZE);
    228	crypto_shash_final(shash, result);
    229}
    230
    231/*
    232 * we can't consider a given block up to date unless the transid of the
    233 * block matches the transid in the parent node's pointer.  This is how we
    234 * detect blocks that either didn't get written at all or got written
    235 * in the wrong place.
    236 */
    237static int verify_parent_transid(struct extent_io_tree *io_tree,
    238				 struct extent_buffer *eb, u64 parent_transid,
    239				 int atomic)
    240{
    241	struct extent_state *cached_state = NULL;
    242	int ret;
    243
    244	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
    245		return 0;
    246
    247	if (atomic)
    248		return -EAGAIN;
    249
    250	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
    251			 &cached_state);
    252	if (extent_buffer_uptodate(eb) &&
    253	    btrfs_header_generation(eb) == parent_transid) {
    254		ret = 0;
    255		goto out;
    256	}
    257	btrfs_err_rl(eb->fs_info,
    258		"parent transid verify failed on %llu wanted %llu found %llu",
    259			eb->start,
    260			parent_transid, btrfs_header_generation(eb));
    261	ret = 1;
    262	clear_extent_buffer_uptodate(eb);
    263out:
    264	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
    265			     &cached_state);
    266	return ret;
    267}
    268
    269static bool btrfs_supported_super_csum(u16 csum_type)
    270{
    271	switch (csum_type) {
    272	case BTRFS_CSUM_TYPE_CRC32:
    273	case BTRFS_CSUM_TYPE_XXHASH:
    274	case BTRFS_CSUM_TYPE_SHA256:
    275	case BTRFS_CSUM_TYPE_BLAKE2:
    276		return true;
    277	default:
    278		return false;
    279	}
    280}
    281
    282/*
    283 * Return 0 if the superblock checksum type matches the checksum value of that
    284 * algorithm. Pass the raw disk superblock data.
    285 */
    286static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
    287				  char *raw_disk_sb)
    288{
    289	struct btrfs_super_block *disk_sb =
    290		(struct btrfs_super_block *)raw_disk_sb;
    291	char result[BTRFS_CSUM_SIZE];
    292	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
    293
    294	shash->tfm = fs_info->csum_shash;
    295
    296	/*
    297	 * The super_block structure does not span the whole
    298	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
    299	 * filled with zeros and is included in the checksum.
    300	 */
    301	crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
    302			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
    303
    304	if (memcmp(disk_sb->csum, result, fs_info->csum_size))
    305		return 1;
    306
    307	return 0;
    308}
    309
    310int btrfs_verify_level_key(struct extent_buffer *eb, int level,
    311			   struct btrfs_key *first_key, u64 parent_transid)
    312{
    313	struct btrfs_fs_info *fs_info = eb->fs_info;
    314	int found_level;
    315	struct btrfs_key found_key;
    316	int ret;
    317
    318	found_level = btrfs_header_level(eb);
    319	if (found_level != level) {
    320		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
    321		     KERN_ERR "BTRFS: tree level check failed\n");
    322		btrfs_err(fs_info,
    323"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
    324			  eb->start, level, found_level);
    325		return -EIO;
    326	}
    327
    328	if (!first_key)
    329		return 0;
    330
    331	/*
    332	 * For live tree block (new tree blocks in current transaction),
    333	 * we need proper lock context to avoid race, which is impossible here.
    334	 * So we only checks tree blocks which is read from disk, whose
    335	 * generation <= fs_info->last_trans_committed.
    336	 */
    337	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
    338		return 0;
    339
    340	/* We have @first_key, so this @eb must have at least one item */
    341	if (btrfs_header_nritems(eb) == 0) {
    342		btrfs_err(fs_info,
    343		"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
    344			  eb->start);
    345		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
    346		return -EUCLEAN;
    347	}
    348
    349	if (found_level)
    350		btrfs_node_key_to_cpu(eb, &found_key, 0);
    351	else
    352		btrfs_item_key_to_cpu(eb, &found_key, 0);
    353	ret = btrfs_comp_cpu_keys(first_key, &found_key);
    354
    355	if (ret) {
    356		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
    357		     KERN_ERR "BTRFS: tree first key check failed\n");
    358		btrfs_err(fs_info,
    359"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
    360			  eb->start, parent_transid, first_key->objectid,
    361			  first_key->type, first_key->offset,
    362			  found_key.objectid, found_key.type,
    363			  found_key.offset);
    364	}
    365	return ret;
    366}
    367
/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @eb:			tree block to read
 * @parent_transid:	expected transid, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key of first slot, skip check if NULL
 *
 * Return 0 once an attempt reads and verifies cleanly.  Otherwise return the
 * result of the last attempt: the error from read_extent_buffer_pages(),
 * -EIO for a parent transid mismatch or -EUCLEAN for a level/first key
 * mismatch.
 */
int btrfs_read_extent_buffer(struct extent_buffer *eb,
			     u64 parent_transid, int level,
			     struct btrfs_key *first_key)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct extent_io_tree *io_tree;
	int failed = 0;
	int ret;
	int num_copies = 0;
	/*
	 * NOTE(review): the first attempt passes mirror 0, presumably meaning
	 * "no specific mirror requested" - confirm in read_extent_buffer_pages().
	 */
	int mirror_num = 0;
	int failed_mirror = 0;

	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
	while (1) {
		/* Every attempt starts with the corrupt flag cleared */
		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
		if (!ret) {
			/* The read worked, now check what was read */
			if (verify_parent_transid(io_tree, eb,
						   parent_transid, 0))
				ret = -EIO;
			else if (btrfs_verify_level_key(eb, level,
						first_key, parent_transid))
				ret = -EUCLEAN;
			else
				break;
		}

		/* With a single copy there is no other mirror to fall back to */
		num_copies = btrfs_num_copies(fs_info,
					      eb->start, eb->len);
		if (num_copies == 1)
			break;

		/*
		 * Remember the mirror the first failing read came from so it
		 * is skipped below and can be repaired after a later success.
		 */
		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

		/* Advance to the next mirror, stepping over the failed one */
		mirror_num++;
		if (mirror_num == failed_mirror)
			mirror_num++;

		/* All mirrors tried, give up with the last error in ret */
		if (mirror_num > num_copies)
			break;
	}

	/* A retry succeeded after an earlier failure: repair the bad mirror */
	if (failed && !ret && failed_mirror)
		btrfs_repair_eb_io_failure(eb, failed_mirror);

	return ret;
}
    426
    427static int csum_one_extent_buffer(struct extent_buffer *eb)
    428{
    429	struct btrfs_fs_info *fs_info = eb->fs_info;
    430	u8 result[BTRFS_CSUM_SIZE];
    431	int ret;
    432
    433	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
    434				    offsetof(struct btrfs_header, fsid),
    435				    BTRFS_FSID_SIZE) == 0);
    436	csum_tree_block(eb, result);
    437
    438	if (btrfs_header_level(eb))
    439		ret = btrfs_check_node(eb);
    440	else
    441		ret = btrfs_check_leaf_full(eb);
    442
    443	if (ret < 0)
    444		goto error;
    445
    446	/*
    447	 * Also check the generation, the eb reached here must be newer than
    448	 * last committed. Or something seriously wrong happened.
    449	 */
    450	if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
    451		ret = -EUCLEAN;
    452		btrfs_err(fs_info,
    453			"block=%llu bad generation, have %llu expect > %llu",
    454			  eb->start, btrfs_header_generation(eb),
    455			  fs_info->last_trans_committed);
    456		goto error;
    457	}
    458	write_extent_buffer(eb, result, 0, fs_info->csum_size);
    459
    460	return 0;
    461
    462error:
    463	btrfs_print_tree(eb, 0);
    464	btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
    465		  eb->start);
    466	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
    467	return ret;
    468}
    469
    470/* Checksum all dirty extent buffers in one bio_vec */
    471static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
    472				      struct bio_vec *bvec)
    473{
    474	struct page *page = bvec->bv_page;
    475	u64 bvec_start = page_offset(page) + bvec->bv_offset;
    476	u64 cur;
    477	int ret = 0;
    478
    479	for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
    480	     cur += fs_info->nodesize) {
    481		struct extent_buffer *eb;
    482		bool uptodate;
    483
    484		eb = find_extent_buffer(fs_info, cur);
    485		uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
    486						       fs_info->nodesize);
    487
    488		/* A dirty eb shouldn't disappear from extent_buffers */
    489		if (WARN_ON(!eb))
    490			return -EUCLEAN;
    491
    492		if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
    493			free_extent_buffer(eb);
    494			return -EUCLEAN;
    495		}
    496		if (WARN_ON(!uptodate)) {
    497			free_extent_buffer(eb);
    498			return -EUCLEAN;
    499		}
    500
    501		ret = csum_one_extent_buffer(eb);
    502		free_extent_buffer(eb);
    503		if (ret < 0)
    504			return ret;
    505	}
    506	return ret;
    507}
    508
    509/*
    510 * Checksum a dirty tree block before IO.  This has extra checks to make sure
    511 * we only fill in the checksum field in the first page of a multi-page block.
    512 * For subpage extent buffers we need bvec to also read the offset in the page.
    513 */
    514static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
    515{
    516	struct page *page = bvec->bv_page;
    517	u64 start = page_offset(page);
    518	u64 found_start;
    519	struct extent_buffer *eb;
    520
    521	if (fs_info->nodesize < PAGE_SIZE)
    522		return csum_dirty_subpage_buffers(fs_info, bvec);
    523
    524	eb = (struct extent_buffer *)page->private;
    525	if (page != eb->pages[0])
    526		return 0;
    527
    528	found_start = btrfs_header_bytenr(eb);
    529
    530	if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
    531		WARN_ON(found_start != 0);
    532		return 0;
    533	}
    534
    535	/*
    536	 * Please do not consolidate these warnings into a single if.
    537	 * It is useful to know what went wrong.
    538	 */
    539	if (WARN_ON(found_start != start))
    540		return -EUCLEAN;
    541	if (WARN_ON(!PageUptodate(page)))
    542		return -EUCLEAN;
    543
    544	return csum_one_extent_buffer(eb);
    545}
    546
    547static int check_tree_block_fsid(struct extent_buffer *eb)
    548{
    549	struct btrfs_fs_info *fs_info = eb->fs_info;
    550	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
    551	u8 fsid[BTRFS_FSID_SIZE];
    552	u8 *metadata_uuid;
    553
    554	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
    555			   BTRFS_FSID_SIZE);
    556	/*
    557	 * Checking the incompat flag is only valid for the current fs. For
    558	 * seed devices it's forbidden to have their uuid changed so reading
    559	 * ->fsid in this case is fine
    560	 */
    561	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
    562		metadata_uuid = fs_devices->metadata_uuid;
    563	else
    564		metadata_uuid = fs_devices->fsid;
    565
    566	if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
    567		return 0;
    568
    569	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
    570		if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
    571			return 0;
    572
    573	return 1;
    574}
    575
/*
 * Do basic extent buffer checks at read time
 *
 * Checks, in order: the bytenr stored in the header, the fsid, the level
 * range, the checksum, and finally the leaf or node checker.  The buffer is
 * marked uptodate only if all of them pass.
 *
 * Return 0 on success, -EUCLEAN on a checksum mismatch and -EIO for every
 * other failure.
 */
static int validate_extent_buffer(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	u64 found_start;
	const u32 csum_size = fs_info->csum_size;
	u8 found_level;
	u8 result[BTRFS_CSUM_SIZE];
	const u8 *header_csum;
	int ret = 0;

	/* The header must claim the location the block was read from */
	found_start = btrfs_header_bytenr(eb);
	if (found_start != eb->start) {
		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
			     eb->start, found_start);
		ret = -EIO;
		goto out;
	}
	if (check_tree_block_fsid(eb)) {
		btrfs_err_rl(fs_info, "bad fsid on block %llu",
			     eb->start);
		ret = -EIO;
		goto out;
	}
	found_level = btrfs_header_level(eb);
	if (found_level >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "bad tree block level %d on %llu",
			  (int)btrfs_header_level(eb), eb->start);
		ret = -EIO;
		goto out;
	}

	/* Compare the computed checksum with the one stored in the header */
	csum_tree_block(eb, result);
	header_csum = page_address(eb->pages[0]) +
		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));

	if (memcmp(result, header_csum, csum_size) != 0) {
		btrfs_warn_rl(fs_info,
	"checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
			      eb->start,
			      CSUM_FMT_VALUE(csum_size, header_csum),
			      CSUM_FMT_VALUE(csum_size, result),
			      btrfs_header_level(eb));
		ret = -EUCLEAN;
		goto out;
	}

	/*
	 * If this is a leaf block and it is corrupt, set the corrupt bit so
	 * that we don't try and read the other copies of this block, just
	 * return -EIO.
	 */
	if (found_level == 0 && btrfs_check_leaf_full(eb)) {
		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = -EIO;
	}

	/* Nodes get the node checker, without the corrupt bit */
	if (found_level > 0 && btrfs_check_node(eb))
		ret = -EIO;

	/*
	 * Only leaf/node checker failures reach the message below, all the
	 * earlier failures jump straight to out.
	 */
	if (!ret)
		set_extent_buffer_uptodate(eb);
	else
		btrfs_err(fs_info,
			  "block=%llu read time tree block corruption detected",
			  eb->start);
out:
	return ret;
}
    645
/*
 * Read completion validation for the case where nodesize is smaller than the
 * page size, so the extent buffer is looked up by bytenr instead of being
 * taken from page->private.
 *
 * @page:	page the read completed on
 * @start:	first byte of the range that was read
 * @end:	last byte of the range, must be @start + nodesize - 1
 * @mirror:	mirror the data was read from, recorded in eb->read_mirror
 *
 * Return 0 if the buffer validates, -EIO if the buffer carries the read
 * error flag, or the error from validate_extent_buffer().
 */
static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
				   int mirror)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	struct extent_buffer *eb;
	bool reads_done;
	int ret = 0;

	/*
	 * We don't allow bio merge for subpage metadata read, so we should
	 * only get one eb for each endio hook.
	 */
	ASSERT(end == start + fs_info->nodesize - 1);
	ASSERT(PagePrivate(page));

	/*
	 * NOTE(review): find_extent_buffer() presumably returns the eb with a
	 * reference held; both exits below drop one via free_extent_buffer().
	 */
	eb = find_extent_buffer(fs_info, start);
	/*
	 * When we are reading one tree block, eb must have been inserted into
	 * the radix tree. If not, something is wrong.
	 */
	ASSERT(eb);

	reads_done = atomic_dec_and_test(&eb->io_pages);
	/* Subpage read must finish in page read */
	ASSERT(reads_done);

	eb->read_mirror = mirror;
	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
		ret = -EIO;
		goto err;
	}
	ret = validate_extent_buffer(eb);
	if (ret < 0)
		goto err;

	/*
	 * NOTE(review): validate_extent_buffer() already marks the eb
	 * uptodate when it returns 0, so this call repeats that.
	 */
	set_extent_buffer_uptodate(eb);

	free_extent_buffer(eb);
	return ret;
err:
	/*
	 * end_bio_extent_readpage decrements io_pages in case of error,
	 * make sure it has something to decrement.
	 */
	atomic_inc(&eb->io_pages);
	clear_extent_buffer_uptodate(eb);
	free_extent_buffer(eb);
	return ret;
}
    695
/*
 * Validate a metadata extent buffer after a read on one of its pages has
 * completed.
 *
 * @bbio:	not used by this function
 * @page:	page the read completed on, page->private holds the eb
 * @start:	start of the range, only used for the subpage case
 * @end:	end of the range, only used for the subpage case
 * @mirror:	mirror the data was read from, recorded in eb->read_mirror
 *
 * Validation only runs when this was the last pending page of the buffer;
 * for earlier pages 0 is returned without checking anything.
 *
 * Return 0 on success or when pages are still pending, -EIO if the buffer
 * carries the read error flag, or the error from validate_extent_buffer().
 */
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
				   struct page *page, u64 start, u64 end,
				   int mirror)
{
	struct extent_buffer *eb;
	int ret = 0;
	int reads_done;

	ASSERT(page->private);

	/* nodesize smaller than a page: the eb has to be looked up by bytenr */
	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
		return validate_subpage_buffer(page, start, end, mirror);

	eb = (struct extent_buffer *)page->private;

	/*
	 * The pending IO might have been the only thing that kept this buffer
	 * in memory.  Make sure we have a ref for all this other checks
	 */
	atomic_inc(&eb->refs);

	/* ret is still 0 here, so this jump only drops the ref taken above */
	reads_done = atomic_dec_and_test(&eb->io_pages);
	if (!reads_done)
		goto err;

	eb->read_mirror = mirror;
	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
		ret = -EIO;
		goto err;
	}
	ret = validate_extent_buffer(eb);
err:
	if (ret) {
		/*
		 * our io error hook is going to dec the io pages
		 * again, we have to make sure it has something
		 * to decrement
		 */
		atomic_inc(&eb->io_pages);
		clear_extent_buffer_uptodate(eb);
	}
	free_extent_buffer(eb);

	return ret;
}
    741
    742static void end_workqueue_bio(struct bio *bio)
    743{
    744	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
    745	struct btrfs_fs_info *fs_info;
    746	struct btrfs_workqueue *wq;
    747
    748	fs_info = end_io_wq->info;
    749	end_io_wq->status = bio->bi_status;
    750
    751	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
    752		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
    753			wq = fs_info->endio_meta_write_workers;
    754		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
    755			wq = fs_info->endio_freespace_worker;
    756		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
    757			wq = fs_info->endio_raid56_workers;
    758		else
    759			wq = fs_info->endio_write_workers;
    760	} else {
    761		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
    762			wq = fs_info->endio_raid56_workers;
    763		else if (end_io_wq->metadata)
    764			wq = fs_info->endio_meta_workers;
    765		else
    766			wq = fs_info->endio_workers;
    767	}
    768
    769	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
    770	btrfs_queue_work(wq, &end_io_wq->work);
    771}
    772
    773blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
    774			enum btrfs_wq_endio_type metadata)
    775{
    776	struct btrfs_end_io_wq *end_io_wq;
    777
    778	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
    779	if (!end_io_wq)
    780		return BLK_STS_RESOURCE;
    781
    782	end_io_wq->private = bio->bi_private;
    783	end_io_wq->end_io = bio->bi_end_io;
    784	end_io_wq->info = info;
    785	end_io_wq->status = 0;
    786	end_io_wq->bio = bio;
    787	end_io_wq->metadata = metadata;
    788
    789	bio->bi_private = end_io_wq;
    790	bio->bi_end_io = end_workqueue_bio;
    791	return 0;
    792}
    793
    794static void run_one_async_start(struct btrfs_work *work)
    795{
    796	struct async_submit_bio *async;
    797	blk_status_t ret;
    798
    799	async = container_of(work, struct  async_submit_bio, work);
    800	ret = async->submit_bio_start(async->inode, async->bio,
    801				      async->dio_file_offset);
    802	if (ret)
    803		async->status = ret;
    804}
    805
    806/*
    807 * In order to insert checksums into the metadata in large chunks, we wait
    808 * until bio submission time.   All the pages in the bio are checksummed and
    809 * sums are attached onto the ordered extent record.
    810 *
    811 * At IO completion time the csums attached on the ordered extent record are
    812 * inserted into the tree.
    813 */
    814static void run_one_async_done(struct btrfs_work *work)
    815{
    816	struct async_submit_bio *async;
    817	struct inode *inode;
    818	blk_status_t ret;
    819
    820	async = container_of(work, struct  async_submit_bio, work);
    821	inode = async->inode;
    822
    823	/* If an error occurred we just want to clean up the bio and move on */
    824	if (async->status) {
    825		async->bio->bi_status = async->status;
    826		bio_endio(async->bio);
    827		return;
    828	}
    829
    830	/*
    831	 * All of the bios that pass through here are from async helpers.
    832	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
    833	 * This changes nothing when cgroups aren't in use.
    834	 */
    835	async->bio->bi_opf |= REQ_CGROUP_PUNT;
    836	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
    837	if (ret) {
    838		async->bio->bi_status = ret;
    839		bio_endio(async->bio);
    840	}
    841}
    842
    843static void run_one_async_free(struct btrfs_work *work)
    844{
    845	struct async_submit_bio *async;
    846
    847	async = container_of(work, struct  async_submit_bio, work);
    848	kfree(async);
    849}
    850
    851blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
    852				 int mirror_num, u64 dio_file_offset,
    853				 extent_submit_bio_start_t *submit_bio_start)
    854{
    855	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
    856	struct async_submit_bio *async;
    857
    858	async = kmalloc(sizeof(*async), GFP_NOFS);
    859	if (!async)
    860		return BLK_STS_RESOURCE;
    861
    862	async->inode = inode;
    863	async->bio = bio;
    864	async->mirror_num = mirror_num;
    865	async->submit_bio_start = submit_bio_start;
    866
    867	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
    868			run_one_async_free);
    869
    870	async->dio_file_offset = dio_file_offset;
    871
    872	async->status = 0;
    873
    874	if (op_is_sync(bio->bi_opf))
    875		btrfs_queue_work(fs_info->hipri_workers, &async->work);
    876	else
    877		btrfs_queue_work(fs_info->workers, &async->work);
    878	return 0;
    879}
    880
    881static blk_status_t btree_csum_one_bio(struct bio *bio)
    882{
    883	struct bio_vec *bvec;
    884	struct btrfs_root *root;
    885	int ret = 0;
    886	struct bvec_iter_all iter_all;
    887
    888	ASSERT(!bio_flagged(bio, BIO_CLONED));
    889	bio_for_each_segment_all(bvec, bio, iter_all) {
    890		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
    891		ret = csum_dirty_buffer(root->fs_info, bvec);
    892		if (ret)
    893			break;
    894	}
    895
    896	return errno_to_blk_status(ret);
    897}
    898
/*
 * submit_bio_start callback that btrfs_submit_metadata_bio() passes to
 * btrfs_wq_submit_bio() for metadata writes.
 *
 * @inode and @dio_file_offset are not used.
 */
static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
					   u64 dio_file_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Only the checksumming happens here; the bio
	 * is mapped with btrfs_map_bio() afterwards by run_one_async_done().
	 */
	return btree_csum_one_bio(bio);
}
    908
    909static bool should_async_write(struct btrfs_fs_info *fs_info,
    910			     struct btrfs_inode *bi)
    911{
    912	if (btrfs_is_zoned(fs_info))
    913		return false;
    914	if (atomic_read(&bi->sync_writers))
    915		return false;
    916	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
    917		return false;
    918	return true;
    919}
    920
    921void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
    922{
    923	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    924	blk_status_t ret;
    925
    926	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
    927		/*
    928		 * called for a read, do the setup so that checksum validation
    929		 * can happen in the async kernel threads
    930		 */
    931		ret = btrfs_bio_wq_end_io(fs_info, bio,
    932					  BTRFS_WQ_ENDIO_METADATA);
    933		if (!ret)
    934			ret = btrfs_map_bio(fs_info, bio, mirror_num);
    935	} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
    936		ret = btree_csum_one_bio(bio);
    937		if (!ret)
    938			ret = btrfs_map_bio(fs_info, bio, mirror_num);
    939	} else {
    940		/*
    941		 * kthread helpers are used to submit writes so that
    942		 * checksumming can happen in parallel across all CPUs
    943		 */
    944		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
    945					  btree_submit_bio_start);
    946	}
    947
    948	if (ret) {
    949		bio->bi_status = ret;
    950		bio_endio(bio);
    951	}
    952}
    953
    954#ifdef CONFIG_MIGRATION
    955static int btree_migratepage(struct address_space *mapping,
    956			struct page *newpage, struct page *page,
    957			enum migrate_mode mode)
    958{
    959	/*
    960	 * we can't safely write a btree page from here,
    961	 * we haven't done the locking hook
    962	 */
    963	if (PageDirty(page))
    964		return -EAGAIN;
    965	/*
    966	 * Buffers may be managed in a filesystem specific way.
    967	 * We must have no buffers or drop them.
    968	 */
    969	if (page_has_private(page) &&
    970	    !try_to_release_page(page, GFP_KERNEL))
    971		return -EAGAIN;
    972	return migrate_page(mapping, newpage, page, mode);
    973}
    974#endif
    975
    976
    977static int btree_writepages(struct address_space *mapping,
    978			    struct writeback_control *wbc)
    979{
    980	struct btrfs_fs_info *fs_info;
    981	int ret;
    982
    983	if (wbc->sync_mode == WB_SYNC_NONE) {
    984
    985		if (wbc->for_kupdate)
    986			return 0;
    987
    988		fs_info = BTRFS_I(mapping->host)->root->fs_info;
    989		/* this is a bit racy, but that's ok */
    990		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
    991					     BTRFS_DIRTY_METADATA_THRESH,
    992					     fs_info->dirty_metadata_batch);
    993		if (ret < 0)
    994			return 0;
    995	}
    996	return btree_write_cache_pages(mapping, wbc);
    997}
    998
    999static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
   1000{
   1001	if (folio_test_writeback(folio) || folio_test_dirty(folio))
   1002		return false;
   1003
   1004	return try_release_extent_buffer(&folio->page);
   1005}
   1006
   1007static void btree_invalidate_folio(struct folio *folio, size_t offset,
   1008				 size_t length)
   1009{
   1010	struct extent_io_tree *tree;
   1011	tree = &BTRFS_I(folio->mapping->host)->io_tree;
   1012	extent_invalidate_folio(tree, folio, offset);
   1013	btree_release_folio(folio, GFP_NOFS);
   1014	if (folio_get_private(folio)) {
   1015		btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
   1016			   "folio private not zero on folio %llu",
   1017			   (unsigned long long)folio_pos(folio));
   1018		folio_detach_private(folio);
   1019	}
   1020}
   1021
   1022#ifdef DEBUG
   1023static bool btree_dirty_folio(struct address_space *mapping,
   1024		struct folio *folio)
   1025{
   1026	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
   1027	struct btrfs_subpage *subpage;
   1028	struct extent_buffer *eb;
   1029	int cur_bit = 0;
   1030	u64 page_start = folio_pos(folio);
   1031
   1032	if (fs_info->sectorsize == PAGE_SIZE) {
   1033		eb = folio_get_private(folio);
   1034		BUG_ON(!eb);
   1035		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
   1036		BUG_ON(!atomic_read(&eb->refs));
   1037		btrfs_assert_tree_write_locked(eb);
   1038		return filemap_dirty_folio(mapping, folio);
   1039	}
   1040	subpage = folio_get_private(folio);
   1041
   1042	ASSERT(subpage->dirty_bitmap);
   1043	while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
   1044		unsigned long flags;
   1045		u64 cur;
   1046		u16 tmp = (1 << cur_bit);
   1047
   1048		spin_lock_irqsave(&subpage->lock, flags);
   1049		if (!(tmp & subpage->dirty_bitmap)) {
   1050			spin_unlock_irqrestore(&subpage->lock, flags);
   1051			cur_bit++;
   1052			continue;
   1053		}
   1054		spin_unlock_irqrestore(&subpage->lock, flags);
   1055		cur = page_start + cur_bit * fs_info->sectorsize;
   1056
   1057		eb = find_extent_buffer(fs_info, cur);
   1058		ASSERT(eb);
   1059		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
   1060		ASSERT(atomic_read(&eb->refs));
   1061		btrfs_assert_tree_write_locked(eb);
   1062		free_extent_buffer(eb);
   1063
   1064		cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
   1065	}
   1066	return filemap_dirty_folio(mapping, folio);
   1067}
   1068#else
   1069#define btree_dirty_folio filemap_dirty_folio
   1070#endif
   1071
   1072static const struct address_space_operations btree_aops = {
   1073	.writepages	= btree_writepages,
   1074	.release_folio	= btree_release_folio,
   1075	.invalidate_folio = btree_invalidate_folio,
   1076#ifdef CONFIG_MIGRATION
   1077	.migratepage	= btree_migratepage,
   1078#endif
   1079	.dirty_folio = btree_dirty_folio,
   1080};
   1081
   1082struct extent_buffer *btrfs_find_create_tree_block(
   1083						struct btrfs_fs_info *fs_info,
   1084						u64 bytenr, u64 owner_root,
   1085						int level)
   1086{
   1087	if (btrfs_is_testing(fs_info))
   1088		return alloc_test_extent_buffer(fs_info, bytenr);
   1089	return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
   1090}
   1091
   1092/*
   1093 * Read tree block at logical address @bytenr and do variant basic but critical
   1094 * verification.
   1095 *
   1096 * @owner_root:		the objectid of the root owner for this block.
   1097 * @parent_transid:	expected transid of this tree block, skip check if 0
   1098 * @level:		expected level, mandatory check
   1099 * @first_key:		expected key in slot 0, skip check if NULL
   1100 */
   1101struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
   1102				      u64 owner_root, u64 parent_transid,
   1103				      int level, struct btrfs_key *first_key)
   1104{
   1105	struct extent_buffer *buf = NULL;
   1106	int ret;
   1107
   1108	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
   1109	if (IS_ERR(buf))
   1110		return buf;
   1111
   1112	ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
   1113	if (ret) {
   1114		free_extent_buffer_stale(buf);
   1115		return ERR_PTR(ret);
   1116	}
   1117	if (btrfs_check_eb_owner(buf, owner_root)) {
   1118		free_extent_buffer_stale(buf);
   1119		return ERR_PTR(-EUCLEAN);
   1120	}
   1121	return buf;
   1122
   1123}
   1124
   1125void btrfs_clean_tree_block(struct extent_buffer *buf)
   1126{
   1127	struct btrfs_fs_info *fs_info = buf->fs_info;
   1128	if (btrfs_header_generation(buf) ==
   1129	    fs_info->running_transaction->transid) {
   1130		btrfs_assert_tree_write_locked(buf);
   1131
   1132		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
   1133			percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
   1134						 -buf->len,
   1135						 fs_info->dirty_metadata_batch);
   1136			clear_extent_buffer_dirty(buf);
   1137		}
   1138	}
   1139}
   1140
   1141static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
   1142			 u64 objectid)
   1143{
   1144	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
   1145
   1146	memset(&root->root_key, 0, sizeof(root->root_key));
   1147	memset(&root->root_item, 0, sizeof(root->root_item));
   1148	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
   1149	root->fs_info = fs_info;
   1150	root->root_key.objectid = objectid;
   1151	root->node = NULL;
   1152	root->commit_root = NULL;
   1153	root->state = 0;
   1154	RB_CLEAR_NODE(&root->rb_node);
   1155
   1156	root->last_trans = 0;
   1157	root->free_objectid = 0;
   1158	root->nr_delalloc_inodes = 0;
   1159	root->nr_ordered_extents = 0;
   1160	root->inode_tree = RB_ROOT;
   1161	xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
   1162
   1163	btrfs_init_root_block_rsv(root);
   1164
   1165	INIT_LIST_HEAD(&root->dirty_list);
   1166	INIT_LIST_HEAD(&root->root_list);
   1167	INIT_LIST_HEAD(&root->delalloc_inodes);
   1168	INIT_LIST_HEAD(&root->delalloc_root);
   1169	INIT_LIST_HEAD(&root->ordered_extents);
   1170	INIT_LIST_HEAD(&root->ordered_root);
   1171	INIT_LIST_HEAD(&root->reloc_dirty_list);
   1172	INIT_LIST_HEAD(&root->logged_list[0]);
   1173	INIT_LIST_HEAD(&root->logged_list[1]);
   1174	spin_lock_init(&root->inode_lock);
   1175	spin_lock_init(&root->delalloc_lock);
   1176	spin_lock_init(&root->ordered_extent_lock);
   1177	spin_lock_init(&root->accounting_lock);
   1178	spin_lock_init(&root->log_extents_lock[0]);
   1179	spin_lock_init(&root->log_extents_lock[1]);
   1180	spin_lock_init(&root->qgroup_meta_rsv_lock);
   1181	mutex_init(&root->objectid_mutex);
   1182	mutex_init(&root->log_mutex);
   1183	mutex_init(&root->ordered_extent_mutex);
   1184	mutex_init(&root->delalloc_mutex);
   1185	init_waitqueue_head(&root->qgroup_flush_wait);
   1186	init_waitqueue_head(&root->log_writer_wait);
   1187	init_waitqueue_head(&root->log_commit_wait[0]);
   1188	init_waitqueue_head(&root->log_commit_wait[1]);
   1189	INIT_LIST_HEAD(&root->log_ctxs[0]);
   1190	INIT_LIST_HEAD(&root->log_ctxs[1]);
   1191	atomic_set(&root->log_commit[0], 0);
   1192	atomic_set(&root->log_commit[1], 0);
   1193	atomic_set(&root->log_writers, 0);
   1194	atomic_set(&root->log_batch, 0);
   1195	refcount_set(&root->refs, 1);
   1196	atomic_set(&root->snapshot_force_cow, 0);
   1197	atomic_set(&root->nr_swapfiles, 0);
   1198	root->log_transid = 0;
   1199	root->log_transid_committed = -1;
   1200	root->last_log_commit = 0;
   1201	root->anon_dev = 0;
   1202	if (!dummy) {
   1203		extent_io_tree_init(fs_info, &root->dirty_log_pages,
   1204				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
   1205		extent_io_tree_init(fs_info, &root->log_csum_range,
   1206				    IO_TREE_LOG_CSUM_RANGE, NULL);
   1207	}
   1208
   1209	spin_lock_init(&root->root_item_lock);
   1210	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
   1211#ifdef CONFIG_BTRFS_DEBUG
   1212	INIT_LIST_HEAD(&root->leak_list);
   1213	spin_lock(&fs_info->fs_roots_lock);
   1214	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
   1215	spin_unlock(&fs_info->fs_roots_lock);
   1216#endif
   1217}
   1218
   1219static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
   1220					   u64 objectid, gfp_t flags)
   1221{
   1222	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
   1223	if (root)
   1224		__setup_root(root, fs_info, objectid);
   1225	return root;
   1226}
   1227
   1228#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
   1229/* Should only be used by the testing infrastructure */
   1230struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
   1231{
   1232	struct btrfs_root *root;
   1233
   1234	if (!fs_info)
   1235		return ERR_PTR(-EINVAL);
   1236
   1237	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
   1238	if (!root)
   1239		return ERR_PTR(-ENOMEM);
   1240
   1241	/* We don't use the stripesize in selftest, set it as sectorsize */
   1242	root->alloc_bytenr = 0;
   1243
   1244	return root;
   1245}
   1246#endif
   1247
   1248static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
   1249{
   1250	const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
   1251	const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
   1252
   1253	return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
   1254}
   1255
   1256static int global_root_key_cmp(const void *k, const struct rb_node *node)
   1257{
   1258	const struct btrfs_key *key = k;
   1259	const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
   1260
   1261	return btrfs_comp_cpu_keys(key, &root->root_key);
   1262}
   1263
   1264int btrfs_global_root_insert(struct btrfs_root *root)
   1265{
   1266	struct btrfs_fs_info *fs_info = root->fs_info;
   1267	struct rb_node *tmp;
   1268
   1269	write_lock(&fs_info->global_root_lock);
   1270	tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
   1271	write_unlock(&fs_info->global_root_lock);
   1272	ASSERT(!tmp);
   1273
   1274	return tmp ? -EEXIST : 0;
   1275}
   1276
   1277void btrfs_global_root_delete(struct btrfs_root *root)
   1278{
   1279	struct btrfs_fs_info *fs_info = root->fs_info;
   1280
   1281	write_lock(&fs_info->global_root_lock);
   1282	rb_erase(&root->rb_node, &fs_info->global_root_tree);
   1283	write_unlock(&fs_info->global_root_lock);
   1284}
   1285
   1286struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
   1287				     struct btrfs_key *key)
   1288{
   1289	struct rb_node *node;
   1290	struct btrfs_root *root = NULL;
   1291
   1292	read_lock(&fs_info->global_root_lock);
   1293	node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
   1294	if (node)
   1295		root = container_of(node, struct btrfs_root, rb_node);
   1296	read_unlock(&fs_info->global_root_lock);
   1297
   1298	return root;
   1299}
   1300
   1301static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
   1302{
   1303	struct btrfs_block_group *block_group;
   1304	u64 ret;
   1305
   1306	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
   1307		return 0;
   1308
   1309	if (bytenr)
   1310		block_group = btrfs_lookup_block_group(fs_info, bytenr);
   1311	else
   1312		block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
   1313	ASSERT(block_group);
   1314	if (!block_group)
   1315		return 0;
   1316	ret = block_group->global_root_id;
   1317	btrfs_put_block_group(block_group);
   1318
   1319	return ret;
   1320}
   1321
   1322struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
   1323{
   1324	struct btrfs_key key = {
   1325		.objectid = BTRFS_CSUM_TREE_OBJECTID,
   1326		.type = BTRFS_ROOT_ITEM_KEY,
   1327		.offset = btrfs_global_root_id(fs_info, bytenr),
   1328	};
   1329
   1330	return btrfs_global_root(fs_info, &key);
   1331}
   1332
   1333struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
   1334{
   1335	struct btrfs_key key = {
   1336		.objectid = BTRFS_EXTENT_TREE_OBJECTID,
   1337		.type = BTRFS_ROOT_ITEM_KEY,
   1338		.offset = btrfs_global_root_id(fs_info, bytenr),
   1339	};
   1340
   1341	return btrfs_global_root(fs_info, &key);
   1342}
   1343
   1344struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
   1345				     u64 objectid)
   1346{
   1347	struct btrfs_fs_info *fs_info = trans->fs_info;
   1348	struct extent_buffer *leaf;
   1349	struct btrfs_root *tree_root = fs_info->tree_root;
   1350	struct btrfs_root *root;
   1351	struct btrfs_key key;
   1352	unsigned int nofs_flag;
   1353	int ret = 0;
   1354
   1355	/*
   1356	 * We're holding a transaction handle, so use a NOFS memory allocation
   1357	 * context to avoid deadlock if reclaim happens.
   1358	 */
   1359	nofs_flag = memalloc_nofs_save();
   1360	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
   1361	memalloc_nofs_restore(nofs_flag);
   1362	if (!root)
   1363		return ERR_PTR(-ENOMEM);
   1364
   1365	root->root_key.objectid = objectid;
   1366	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
   1367	root->root_key.offset = 0;
   1368
   1369	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
   1370				      BTRFS_NESTING_NORMAL);
   1371	if (IS_ERR(leaf)) {
   1372		ret = PTR_ERR(leaf);
   1373		leaf = NULL;
   1374		goto fail_unlock;
   1375	}
   1376
   1377	root->node = leaf;
   1378	btrfs_mark_buffer_dirty(leaf);
   1379
   1380	root->commit_root = btrfs_root_node(root);
   1381	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
   1382
   1383	btrfs_set_root_flags(&root->root_item, 0);
   1384	btrfs_set_root_limit(&root->root_item, 0);
   1385	btrfs_set_root_bytenr(&root->root_item, leaf->start);
   1386	btrfs_set_root_generation(&root->root_item, trans->transid);
   1387	btrfs_set_root_level(&root->root_item, 0);
   1388	btrfs_set_root_refs(&root->root_item, 1);
   1389	btrfs_set_root_used(&root->root_item, leaf->len);
   1390	btrfs_set_root_last_snapshot(&root->root_item, 0);
   1391	btrfs_set_root_dirid(&root->root_item, 0);
   1392	if (is_fstree(objectid))
   1393		generate_random_guid(root->root_item.uuid);
   1394	else
   1395		export_guid(root->root_item.uuid, &guid_null);
   1396	btrfs_set_root_drop_level(&root->root_item, 0);
   1397
   1398	btrfs_tree_unlock(leaf);
   1399
   1400	key.objectid = objectid;
   1401	key.type = BTRFS_ROOT_ITEM_KEY;
   1402	key.offset = 0;
   1403	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
   1404	if (ret)
   1405		goto fail;
   1406
   1407	return root;
   1408
   1409fail_unlock:
   1410	if (leaf)
   1411		btrfs_tree_unlock(leaf);
   1412fail:
   1413	btrfs_put_root(root);
   1414
   1415	return ERR_PTR(ret);
   1416}
   1417
   1418static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
   1419					 struct btrfs_fs_info *fs_info)
   1420{
   1421	struct btrfs_root *root;
   1422
   1423	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
   1424	if (!root)
   1425		return ERR_PTR(-ENOMEM);
   1426
   1427	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
   1428	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
   1429	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
   1430
   1431	return root;
   1432}
   1433
   1434int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
   1435			      struct btrfs_root *root)
   1436{
   1437	struct extent_buffer *leaf;
   1438
   1439	/*
   1440	 * DON'T set SHAREABLE bit for log trees.
   1441	 *
   1442	 * Log trees are not exposed to user space thus can't be snapshotted,
   1443	 * and they go away before a real commit is actually done.
   1444	 *
   1445	 * They do store pointers to file data extents, and those reference
   1446	 * counts still get updated (along with back refs to the log tree).
   1447	 */
   1448
   1449	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
   1450			NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
   1451	if (IS_ERR(leaf))
   1452		return PTR_ERR(leaf);
   1453
   1454	root->node = leaf;
   1455
   1456	btrfs_mark_buffer_dirty(root->node);
   1457	btrfs_tree_unlock(root->node);
   1458
   1459	return 0;
   1460}
   1461
   1462int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
   1463			     struct btrfs_fs_info *fs_info)
   1464{
   1465	struct btrfs_root *log_root;
   1466
   1467	log_root = alloc_log_tree(trans, fs_info);
   1468	if (IS_ERR(log_root))
   1469		return PTR_ERR(log_root);
   1470
   1471	if (!btrfs_is_zoned(fs_info)) {
   1472		int ret = btrfs_alloc_log_tree_node(trans, log_root);
   1473
   1474		if (ret) {
   1475			btrfs_put_root(log_root);
   1476			return ret;
   1477		}
   1478	}
   1479
   1480	WARN_ON(fs_info->log_root_tree);
   1481	fs_info->log_root_tree = log_root;
   1482	return 0;
   1483}
   1484
   1485int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
   1486		       struct btrfs_root *root)
   1487{
   1488	struct btrfs_fs_info *fs_info = root->fs_info;
   1489	struct btrfs_root *log_root;
   1490	struct btrfs_inode_item *inode_item;
   1491	int ret;
   1492
   1493	log_root = alloc_log_tree(trans, fs_info);
   1494	if (IS_ERR(log_root))
   1495		return PTR_ERR(log_root);
   1496
   1497	ret = btrfs_alloc_log_tree_node(trans, log_root);
   1498	if (ret) {
   1499		btrfs_put_root(log_root);
   1500		return ret;
   1501	}
   1502
   1503	log_root->last_trans = trans->transid;
   1504	log_root->root_key.offset = root->root_key.objectid;
   1505
   1506	inode_item = &log_root->root_item.inode;
   1507	btrfs_set_stack_inode_generation(inode_item, 1);
   1508	btrfs_set_stack_inode_size(inode_item, 3);
   1509	btrfs_set_stack_inode_nlink(inode_item, 1);
   1510	btrfs_set_stack_inode_nbytes(inode_item,
   1511				     fs_info->nodesize);
   1512	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
   1513
   1514	btrfs_set_root_node(&log_root->root_item, log_root->node);
   1515
   1516	WARN_ON(root->log_root);
   1517	root->log_root = log_root;
   1518	root->log_transid = 0;
   1519	root->log_transid_committed = -1;
   1520	root->last_log_commit = 0;
   1521	return 0;
   1522}
   1523
   1524static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
   1525					      struct btrfs_path *path,
   1526					      struct btrfs_key *key)
   1527{
   1528	struct btrfs_root *root;
   1529	struct btrfs_fs_info *fs_info = tree_root->fs_info;
   1530	u64 generation;
   1531	int ret;
   1532	int level;
   1533
   1534	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
   1535	if (!root)
   1536		return ERR_PTR(-ENOMEM);
   1537
   1538	ret = btrfs_find_root(tree_root, key, path,
   1539			      &root->root_item, &root->root_key);
   1540	if (ret) {
   1541		if (ret > 0)
   1542			ret = -ENOENT;
   1543		goto fail;
   1544	}
   1545
   1546	generation = btrfs_root_generation(&root->root_item);
   1547	level = btrfs_root_level(&root->root_item);
   1548	root->node = read_tree_block(fs_info,
   1549				     btrfs_root_bytenr(&root->root_item),
   1550				     key->objectid, generation, level, NULL);
   1551	if (IS_ERR(root->node)) {
   1552		ret = PTR_ERR(root->node);
   1553		root->node = NULL;
   1554		goto fail;
   1555	}
   1556	if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
   1557		ret = -EIO;
   1558		goto fail;
   1559	}
   1560
   1561	/*
   1562	 * For real fs, and not log/reloc trees, root owner must
   1563	 * match its root node owner
   1564	 */
   1565	if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
   1566	    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
   1567	    root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
   1568	    root->root_key.objectid != btrfs_header_owner(root->node)) {
   1569		btrfs_crit(fs_info,
   1570"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
   1571			   root->root_key.objectid, root->node->start,
   1572			   btrfs_header_owner(root->node),
   1573			   root->root_key.objectid);
   1574		ret = -EUCLEAN;
   1575		goto fail;
   1576	}
   1577	root->commit_root = btrfs_root_node(root);
   1578	return root;
   1579fail:
   1580	btrfs_put_root(root);
   1581	return ERR_PTR(ret);
   1582}
   1583
   1584struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
   1585					struct btrfs_key *key)
   1586{
   1587	struct btrfs_root *root;
   1588	struct btrfs_path *path;
   1589
   1590	path = btrfs_alloc_path();
   1591	if (!path)
   1592		return ERR_PTR(-ENOMEM);
   1593	root = read_tree_root_path(tree_root, path, key);
   1594	btrfs_free_path(path);
   1595
   1596	return root;
   1597}
   1598
   1599/*
   1600 * Initialize subvolume root in-memory structure
   1601 *
   1602 * @anon_dev:	anonymous device to attach to the root, if zero, allocate new
   1603 */
   1604static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
   1605{
   1606	int ret;
   1607	unsigned int nofs_flag;
   1608
   1609	/*
   1610	 * We might be called under a transaction (e.g. indirect backref
   1611	 * resolution) which could deadlock if it triggers memory reclaim
   1612	 */
   1613	nofs_flag = memalloc_nofs_save();
   1614	ret = btrfs_drew_lock_init(&root->snapshot_lock);
   1615	memalloc_nofs_restore(nofs_flag);
   1616	if (ret)
   1617		goto fail;
   1618
   1619	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
   1620	    !btrfs_is_data_reloc_root(root)) {
   1621		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
   1622		btrfs_check_and_init_root_item(&root->root_item);
   1623	}
   1624
   1625	/*
   1626	 * Don't assign anonymous block device to roots that are not exposed to
   1627	 * userspace, the id pool is limited to 1M
   1628	 */
   1629	if (is_fstree(root->root_key.objectid) &&
   1630	    btrfs_root_refs(&root->root_item) > 0) {
   1631		if (!anon_dev) {
   1632			ret = get_anon_bdev(&root->anon_dev);
   1633			if (ret)
   1634				goto fail;
   1635		} else {
   1636			root->anon_dev = anon_dev;
   1637		}
   1638	}
   1639
   1640	mutex_lock(&root->objectid_mutex);
   1641	ret = btrfs_init_root_free_objectid(root);
   1642	if (ret) {
   1643		mutex_unlock(&root->objectid_mutex);
   1644		goto fail;
   1645	}
   1646
   1647	ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
   1648
   1649	mutex_unlock(&root->objectid_mutex);
   1650
   1651	return 0;
   1652fail:
   1653	/* The caller is responsible to call btrfs_free_fs_root */
   1654	return ret;
   1655}
   1656
   1657static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
   1658					       u64 root_id)
   1659{
   1660	struct btrfs_root *root;
   1661
   1662	spin_lock(&fs_info->fs_roots_lock);
   1663	root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
   1664	if (root)
   1665		root = btrfs_grab_root(root);
   1666	spin_unlock(&fs_info->fs_roots_lock);
   1667	return root;
   1668}
   1669
   1670static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
   1671						u64 objectid)
   1672{
   1673	struct btrfs_key key = {
   1674		.objectid = objectid,
   1675		.type = BTRFS_ROOT_ITEM_KEY,
   1676		.offset = 0,
   1677	};
   1678
   1679	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
   1680		return btrfs_grab_root(fs_info->tree_root);
   1681	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
   1682		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
   1683	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
   1684		return btrfs_grab_root(fs_info->chunk_root);
   1685	if (objectid == BTRFS_DEV_TREE_OBJECTID)
   1686		return btrfs_grab_root(fs_info->dev_root);
   1687	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
   1688		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
   1689	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
   1690		return btrfs_grab_root(fs_info->quota_root) ?
   1691			fs_info->quota_root : ERR_PTR(-ENOENT);
   1692	if (objectid == BTRFS_UUID_TREE_OBJECTID)
   1693		return btrfs_grab_root(fs_info->uuid_root) ?
   1694			fs_info->uuid_root : ERR_PTR(-ENOENT);
   1695	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
   1696		struct btrfs_root *root = btrfs_global_root(fs_info, &key);
   1697
   1698		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
   1699	}
   1700	return NULL;
   1701}
   1702
   1703int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
   1704			 struct btrfs_root *root)
   1705{
   1706	int ret;
   1707
   1708	spin_lock(&fs_info->fs_roots_lock);
   1709	ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
   1710			root, GFP_NOFS);
   1711	if (ret == 0) {
   1712		btrfs_grab_root(root);
   1713		set_bit(BTRFS_ROOT_REGISTERED, &root->state);
   1714	}
   1715	spin_unlock(&fs_info->fs_roots_lock);
   1716
   1717	return ret;
   1718}
   1719
   1720void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
   1721{
   1722#ifdef CONFIG_BTRFS_DEBUG
   1723	struct btrfs_root *root;
   1724
   1725	while (!list_empty(&fs_info->allocated_roots)) {
   1726		char buf[BTRFS_ROOT_NAME_BUF_LEN];
   1727
   1728		root = list_first_entry(&fs_info->allocated_roots,
   1729					struct btrfs_root, leak_list);
   1730		btrfs_err(fs_info, "leaked root %s refcount %d",
   1731			  btrfs_root_name(&root->root_key, buf),
   1732			  refcount_read(&root->refs));
   1733		while (refcount_read(&root->refs) > 1)
   1734			btrfs_put_root(root);
   1735		btrfs_put_root(root);
   1736	}
   1737#endif
   1738}
   1739
   1740static void free_global_roots(struct btrfs_fs_info *fs_info)
   1741{
   1742	struct btrfs_root *root;
   1743	struct rb_node *node;
   1744
   1745	while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
   1746		root = rb_entry(node, struct btrfs_root, rb_node);
   1747		rb_erase(&root->rb_node, &fs_info->global_root_tree);
   1748		btrfs_put_root(root);
   1749	}
   1750}
   1751
   1752void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
   1753{
   1754	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
   1755	percpu_counter_destroy(&fs_info->delalloc_bytes);
   1756	percpu_counter_destroy(&fs_info->ordered_bytes);
   1757	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
   1758	btrfs_free_csum_hash(fs_info);
   1759	btrfs_free_stripe_hash_table(fs_info);
   1760	btrfs_free_ref_cache(fs_info);
   1761	kfree(fs_info->balance_ctl);
   1762	kfree(fs_info->delayed_root);
   1763	free_global_roots(fs_info);
   1764	btrfs_put_root(fs_info->tree_root);
   1765	btrfs_put_root(fs_info->chunk_root);
   1766	btrfs_put_root(fs_info->dev_root);
   1767	btrfs_put_root(fs_info->quota_root);
   1768	btrfs_put_root(fs_info->uuid_root);
   1769	btrfs_put_root(fs_info->fs_root);
   1770	btrfs_put_root(fs_info->data_reloc_root);
   1771	btrfs_put_root(fs_info->block_group_root);
   1772	btrfs_check_leaked_roots(fs_info);
   1773	btrfs_extent_buffer_leak_debug_check(fs_info);
   1774	kfree(fs_info->super_copy);
   1775	kfree(fs_info->super_for_commit);
   1776	kfree(fs_info->subpage_info);
   1777	kvfree(fs_info);
   1778}
   1779
   1780
   1781/*
   1782 * Get an in-memory reference of a root structure.
   1783 *
   1784 * For essential trees like root/extent tree, we grab it from fs_info directly.
   1785 * For subvolume trees, we check the cached filesystem roots first. If not
   1786 * found, then read it from disk and add it to cached fs roots.
   1787 *
   1788 * Caller should release the root by calling btrfs_put_root() after the usage.
   1789 *
   1790 * NOTE: Reloc and log trees can't be read by this function as they share the
   1791 *	 same root objectid.
   1792 *
   1793 * @objectid:	root id
   1794 * @anon_dev:	preallocated anonymous block device number for new roots,
   1795 * 		pass 0 for new allocation.
   1796 * @check_ref:	whether to check root item references, If true, return -ENOENT
   1797 *		for orphan roots
   1798 */
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
					     u64 objectid, dev_t anon_dev,
					     bool check_ref)
{
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	/*
	 * A hit from btrfs_get_global_root() is returned as is; check_ref is
	 * not applied to it.
	 */
	root = btrfs_get_global_root(fs_info, objectid);
	if (root)
		return root;
again:
	/* Retry point: we come back here after -EEXIST from the insert below. */
	root = btrfs_lookup_fs_root(fs_info, objectid);
	if (root) {
		/* Shouldn't get preallocated anon_dev for cached roots */
		ASSERT(!anon_dev);
		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
			btrfs_put_root(root);
			return ERR_PTR(-ENOENT);
		}
		return root;
	}

	/* Not cached: read the root item for @objectid from the tree root. */
	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = btrfs_read_tree_root(fs_info->tree_root, &key);
	if (IS_ERR(root))
		return root;

	/* Same zero-reference check as for the cached case above. */
	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
		ret = -ENOENT;
		goto fail;
	}

	ret = btrfs_init_fs_root(root, anon_dev);
	if (ret)
		goto fail;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto fail;
	}
	/*
	 * Search the tree root for an orphan item keyed on this root's
	 * objectid. Only the search result is needed, so the path is freed
	 * right after the search.
	 */
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	btrfs_free_path(path);
	if (ret < 0)
		goto fail;
	/* Exact match: remember in the root's state that the item exists. */
	if (ret == 0)
		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);

	ret = btrfs_insert_fs_root(fs_info, root);
	if (ret) {
		/*
		 * -EEXIST: drop the root we just built and restart from the
		 * cache lookup.
		 */
		if (ret == -EEXIST) {
			btrfs_put_root(root);
			goto again;
		}
		goto fail;
	}
	return root;
fail:
	/*
	 * If our caller provided us an anonymous device, then it's the
	 * caller's responsibility to free it in case we fail. So we have to
	 * set our root's anon_dev to 0 to avoid a double free, once by
	 * btrfs_put_root() and once again by our caller.
	 */
	if (anon_dev)
		root->anon_dev = 0;
	btrfs_put_root(root);
	return ERR_PTR(ret);
}
   1876
   1877/*
   1878 * Get in-memory reference of a root structure
   1879 *
   1880 * @objectid:	tree objectid
   1881 * @check_ref:	if set, verify that the tree exists and the item has at least
   1882 *		one reference
   1883 */
   1884struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
   1885				     u64 objectid, bool check_ref)
   1886{
   1887	return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
   1888}
   1889
   1890/*
   1891 * Get in-memory reference of a root structure, created as new, optionally pass
   1892 * the anonymous block device id
   1893 *
   1894 * @objectid:	tree objectid
   1895 * @anon_dev:	if zero, allocate a new anonymous block device or use the
   1896 *		parameter value
   1897 */
   1898struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
   1899					 u64 objectid, dev_t anon_dev)
   1900{
   1901	return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
   1902}
   1903
   1904/*
   1905 * btrfs_get_fs_root_commit_root - return a root for the given objectid
   1906 * @fs_info:	the fs_info
   1907 * @objectid:	the objectid we need to lookup
   1908 *
   1909 * This is exclusively used for backref walking, and exists specifically because
   1910 * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
   1911 * creation time, which means we may have to read the tree_root in order to look
   1912 * up a fs root that is not in memory.  If the root is not in memory we will
   1913 * read the tree root commit root and look up the fs root from there.  This is a
   1914 * temporary root, it will not be inserted into the radix tree as it doesn't
   1915 * have the most uptodate information, it'll simply be discarded once the
   1916 * backref code is finished using the root.
   1917 */
   1918struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
   1919						 struct btrfs_path *path,
   1920						 u64 objectid)
   1921{
   1922	struct btrfs_root *root;
   1923	struct btrfs_key key;
   1924
   1925	ASSERT(path->search_commit_root && path->skip_locking);
   1926
   1927	/*
   1928	 * This can return -ENOENT if we ask for a root that doesn't exist, but
   1929	 * since this is called via the backref walking code we won't be looking
   1930	 * up a root that doesn't exist, unless there's corruption.  So if root
   1931	 * != NULL just return it.
   1932	 */
   1933	root = btrfs_get_global_root(fs_info, objectid);
   1934	if (root)
   1935		return root;
   1936
   1937	root = btrfs_lookup_fs_root(fs_info, objectid);
   1938	if (root)
   1939		return root;
   1940
   1941	key.objectid = objectid;
   1942	key.type = BTRFS_ROOT_ITEM_KEY;
   1943	key.offset = (u64)-1;
   1944	root = read_tree_root_path(fs_info->tree_root, path, &key);
   1945	btrfs_release_path(path);
   1946
   1947	return root;
   1948}
   1949
   1950/*
   1951 * called by the kthread helper functions to finally call the bio end_io
   1952 * functions.  This is where read checksum verification actually happens
   1953 */
   1954static void end_workqueue_fn(struct btrfs_work *work)
   1955{
   1956	struct bio *bio;
   1957	struct btrfs_end_io_wq *end_io_wq;
   1958
   1959	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
   1960	bio = end_io_wq->bio;
   1961
   1962	bio->bi_status = end_io_wq->status;
   1963	bio->bi_private = end_io_wq->private;
   1964	bio->bi_end_io = end_io_wq->end_io;
   1965	bio_endio(bio);
   1966	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
   1967}
   1968
/*
 * Body of the cleaner kernel thread.
 *
 * @arg:	the struct btrfs_fs_info this thread works for
 *
 * Each pass runs delayed iputs and cleans one deleted snapshot under
 * fs_info->cleaner_mutex, then runs inode defrag, deletes unused block groups
 * and reclaims block groups.  BTRFS_FS_CLEANER_RUNNING is set for the
 * duration of a pass.  Returns 0 once kthread_should_stop() is observed.
 */
static int cleaner_kthread(void *arg)
{
	struct btrfs_fs_info *fs_info = arg;
	/* Non-zero when the snapshot cleaner asked for another pass right away. */
	int again;

	while (1) {
		again = 0;

		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

		/* Make the cleaner go to sleep early. */
		if (btrfs_need_cleaner_sleep(fs_info))
			goto sleep;

		/*
		 * Do not do anything if we might cause open_ctree() to block
		 * before we have finished mounting the filesystem.
		 */
		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
			goto sleep;

		/* Never block on the mutex; skip this pass if it is contended. */
		if (!mutex_trylock(&fs_info->cleaner_mutex))
			goto sleep;

		/*
		 * Avoid the problem that we change the status of the fs
		 * during the above check and trylock.
		 */
		if (btrfs_need_cleaner_sleep(fs_info)) {
			mutex_unlock(&fs_info->cleaner_mutex);
			goto sleep;
		}

		btrfs_run_delayed_iputs(fs_info);

		again = btrfs_clean_one_deleted_snapshot(fs_info);
		mutex_unlock(&fs_info->cleaner_mutex);

		/*
		 * The defragger has dealt with the R/O remount and umount,
		 * needn't do anything special here.
		 */
		btrfs_run_defrag_inodes(fs_info);

		/*
		 * Acquires fs_info->reclaim_bgs_lock to avoid racing
		 * with relocation (btrfs_relocate_chunk) and relocation
		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
		 * after acquiring fs_info->reclaim_bgs_lock. So we
		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
		 * unused block groups.
		 */
		btrfs_delete_unused_bgs(fs_info);

		/*
		 * Reclaim block groups in the reclaim_bgs list after we deleted
		 * all unused block_groups. This possibly gives us some more free
		 * space.
		 */
		btrfs_reclaim_bgs(fs_info);
sleep:
		/* End of the pass: clear the running bit and wake its waiters. */
		clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
		if (kthread_should_park())
			kthread_parkme();
		if (kthread_should_stop())
			return 0;
		/* Sleep until woken unless another pass was requested. */
		if (!again) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
}
   2042
/*
 * Body of the transaction kernel thread.
 *
 * @arg:	the struct btrfs_root used to attach to the running transaction
 *
 * Loops until kthread_should_stop().  On each pass the running transaction is
 * committed if BTRFS_FS_COMMIT_TRANS was set, if its state is already at or
 * past TRANS_STATE_COMMIT_START, or if it is at least
 * fs_info->commit_interval seconds old.  The cleaner kthread is woken on
 * every pass.  Always returns 0.
 */
static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
	u64 transid;
	time64_t delta;
	unsigned long delay;
	/* Set when attaching failed with anything other than -ENOENT. */
	bool cannot_commit;

	do {
		cannot_commit = false;
		/* Default sleep: one full commit interval, in jiffies. */
		delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
		mutex_lock(&fs_info->transaction_kthread_mutex);

		spin_lock(&fs_info->trans_lock);
		cur = fs_info->running_transaction;
		if (!cur) {
			spin_unlock(&fs_info->trans_lock);
			goto sleep;
		}

		/* Age of the running transaction in seconds. */
		delta = ktime_get_seconds() - cur->start_time;
		if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
		    cur->state < TRANS_STATE_COMMIT_START &&
		    delta < fs_info->commit_interval) {
			spin_unlock(&fs_info->trans_lock);
			/*
			 * Too young to commit: shorten the sleep by the age of
			 * the transaction, capped at one full interval.
			 */
			delay -= msecs_to_jiffies((delta - 1) * 1000);
			delay = min(delay,
				    msecs_to_jiffies(fs_info->commit_interval * 1000));
			goto sleep;
		}
		/* Remember which transaction we decided to commit. */
		transid = cur->transid;
		spin_unlock(&fs_info->trans_lock);

		/* If the file system is aborted, this will always fail. */
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) != -ENOENT)
				cannot_commit = true;
			goto sleep;
		}
		/*
		 * Commit only if we attached to the transaction sampled above;
		 * otherwise just release the handle.
		 */
		if (transid == trans->transid) {
			btrfs_commit_transaction(trans);
		} else {
			btrfs_end_transaction(trans);
		}
sleep:
		wake_up_process(fs_info->cleaner_kthread);
		mutex_unlock(&fs_info->transaction_kthread_mutex);

		if (BTRFS_FS_ERROR(fs_info))
			btrfs_cleanup_transaction(fs_info);
		/*
		 * Skip the sleep when the transaction is blocked, unless we
		 * could not attach at all (cannot_commit).
		 */
		if (!kthread_should_stop() &&
				(!btrfs_transaction_blocked(fs_info) ||
				 cannot_commit))
			schedule_timeout_interruptible(delay);
	} while (!kthread_should_stop());
	return 0;
}
   2104
   2105/*
   2106 * This will find the highest generation in the array of root backups.  The
   2107 * index of the highest array is returned, or -EINVAL if we can't find
   2108 * anything.
   2109 *
   2110 * We check to make sure the array is valid by comparing the
   2111 * generation of the latest  root in the array with the generation
   2112 * in the super block.  If they don't match we pitch it.
   2113 */
   2114static int find_newest_super_backup(struct btrfs_fs_info *info)
   2115{
   2116	const u64 newest_gen = btrfs_super_generation(info->super_copy);
   2117	u64 cur;
   2118	struct btrfs_root_backup *root_backup;
   2119	int i;
   2120
   2121	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
   2122		root_backup = info->super_copy->super_roots + i;
   2123		cur = btrfs_backup_tree_root_gen(root_backup);
   2124		if (cur == newest_gen)
   2125			return i;
   2126	}
   2127
   2128	return -EINVAL;
   2129}
   2130
   2131/*
   2132 * copy all the root pointers into the super backup array.
   2133 * this will bump the backup pointer by one when it is
   2134 * done
   2135 */
   2136static void backup_super_roots(struct btrfs_fs_info *info)
   2137{
   2138	const int next_backup = info->backup_root_index;
   2139	struct btrfs_root_backup *root_backup;
   2140
   2141	root_backup = info->super_for_commit->super_roots + next_backup;
   2142
   2143	/*
   2144	 * make sure all of our padding and empty slots get zero filled
   2145	 * regardless of which ones we use today
   2146	 */
   2147	memset(root_backup, 0, sizeof(*root_backup));
   2148
   2149	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
   2150
   2151	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
   2152	btrfs_set_backup_tree_root_gen(root_backup,
   2153			       btrfs_header_generation(info->tree_root->node));
   2154
   2155	btrfs_set_backup_tree_root_level(root_backup,
   2156			       btrfs_header_level(info->tree_root->node));
   2157
   2158	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
   2159	btrfs_set_backup_chunk_root_gen(root_backup,
   2160			       btrfs_header_generation(info->chunk_root->node));
   2161	btrfs_set_backup_chunk_root_level(root_backup,
   2162			       btrfs_header_level(info->chunk_root->node));
   2163
   2164	if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
   2165		btrfs_set_backup_block_group_root(root_backup,
   2166					info->block_group_root->node->start);
   2167		btrfs_set_backup_block_group_root_gen(root_backup,
   2168			btrfs_header_generation(info->block_group_root->node));
   2169		btrfs_set_backup_block_group_root_level(root_backup,
   2170			btrfs_header_level(info->block_group_root->node));
   2171	} else {
   2172		struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
   2173		struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
   2174
   2175		btrfs_set_backup_extent_root(root_backup,
   2176					     extent_root->node->start);
   2177		btrfs_set_backup_extent_root_gen(root_backup,
   2178				btrfs_header_generation(extent_root->node));
   2179		btrfs_set_backup_extent_root_level(root_backup,
   2180					btrfs_header_level(extent_root->node));
   2181
   2182		btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
   2183		btrfs_set_backup_csum_root_gen(root_backup,
   2184					       btrfs_header_generation(csum_root->node));
   2185		btrfs_set_backup_csum_root_level(root_backup,
   2186						 btrfs_header_level(csum_root->node));
   2187	}
   2188
   2189	/*
   2190	 * we might commit during log recovery, which happens before we set
   2191	 * the fs_root.  Make sure it is valid before we fill it in.
   2192	 */
   2193	if (info->fs_root && info->fs_root->node) {
   2194		btrfs_set_backup_fs_root(root_backup,
   2195					 info->fs_root->node->start);
   2196		btrfs_set_backup_fs_root_gen(root_backup,
   2197			       btrfs_header_generation(info->fs_root->node));
   2198		btrfs_set_backup_fs_root_level(root_backup,
   2199			       btrfs_header_level(info->fs_root->node));
   2200	}
   2201
   2202	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
   2203	btrfs_set_backup_dev_root_gen(root_backup,
   2204			       btrfs_header_generation(info->dev_root->node));
   2205	btrfs_set_backup_dev_root_level(root_backup,
   2206				       btrfs_header_level(info->dev_root->node));
   2207
   2208	btrfs_set_backup_total_bytes(root_backup,
   2209			     btrfs_super_total_bytes(info->super_copy));
   2210	btrfs_set_backup_bytes_used(root_backup,
   2211			     btrfs_super_bytes_used(info->super_copy));
   2212	btrfs_set_backup_num_devices(root_backup,
   2213			     btrfs_super_num_devices(info->super_copy));
   2214
   2215	/*
   2216	 * if we don't copy this out to the super_copy, it won't get remembered
   2217	 * for the next commit
   2218	 */
   2219	memcpy(&info->super_copy->super_roots,
   2220	       &info->super_for_commit->super_roots,
   2221	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
   2222}
   2223
   2224/*
   2225 * read_backup_root - Reads a backup root based on the passed priority. Prio 0
   2226 * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
   2227 *
   2228 * fs_info - filesystem whose backup roots need to be read
   2229 * priority - priority of backup root required
   2230 *
   2231 * Returns backup root index on success and -EINVAL otherwise.
   2232 */
   2233static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
   2234{
   2235	int backup_index = find_newest_super_backup(fs_info);
   2236	struct btrfs_super_block *super = fs_info->super_copy;
   2237	struct btrfs_root_backup *root_backup;
   2238
   2239	if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
   2240		if (priority == 0)
   2241			return backup_index;
   2242
   2243		backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
   2244		backup_index %= BTRFS_NUM_BACKUP_ROOTS;
   2245	} else {
   2246		return -EINVAL;
   2247	}
   2248
   2249	root_backup = super->super_roots + backup_index;
   2250
   2251	btrfs_set_super_generation(super,
   2252				   btrfs_backup_tree_root_gen(root_backup));
   2253	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
   2254	btrfs_set_super_root_level(super,
   2255				   btrfs_backup_tree_root_level(root_backup));
   2256	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
   2257
   2258	/*
   2259	 * Fixme: the total bytes and num_devices need to match or we should
   2260	 * need a fsck
   2261	 */
   2262	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
   2263	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
   2264
   2265	return backup_index;
   2266}
   2267
   2268/* helper to cleanup workers */
   2269static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
   2270{
   2271	btrfs_destroy_workqueue(fs_info->fixup_workers);
   2272	btrfs_destroy_workqueue(fs_info->delalloc_workers);
   2273	btrfs_destroy_workqueue(fs_info->hipri_workers);
   2274	btrfs_destroy_workqueue(fs_info->workers);
   2275	btrfs_destroy_workqueue(fs_info->endio_workers);
   2276	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
   2277	if (fs_info->rmw_workers)
   2278		destroy_workqueue(fs_info->rmw_workers);
   2279	btrfs_destroy_workqueue(fs_info->endio_write_workers);
   2280	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
   2281	btrfs_destroy_workqueue(fs_info->delayed_workers);
   2282	btrfs_destroy_workqueue(fs_info->caching_workers);
   2283	btrfs_destroy_workqueue(fs_info->flush_workers);
   2284	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
   2285	if (fs_info->discard_ctl.discard_workers)
   2286		destroy_workqueue(fs_info->discard_ctl.discard_workers);
   2287	/*
   2288	 * Now that all other work queues are destroyed, we can safely destroy
   2289	 * the queues used for metadata I/O, since tasks from those other work
   2290	 * queues can do metadata I/O operations.
   2291	 */
   2292	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
   2293	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
   2294}
   2295
   2296static void free_root_extent_buffers(struct btrfs_root *root)
   2297{
   2298	if (root) {
   2299		free_extent_buffer(root->node);
   2300		free_extent_buffer(root->commit_root);
   2301		root->node = NULL;
   2302		root->commit_root = NULL;
   2303	}
   2304}
   2305
   2306static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
   2307{
   2308	struct btrfs_root *root, *tmp;
   2309
   2310	rbtree_postorder_for_each_entry_safe(root, tmp,
   2311					     &fs_info->global_root_tree,
   2312					     rb_node)
   2313		free_root_extent_buffers(root);
   2314}
   2315
   2316/* helper to cleanup tree roots */
   2317static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
   2318{
   2319	free_root_extent_buffers(info->tree_root);
   2320
   2321	free_global_root_pointers(info);
   2322	free_root_extent_buffers(info->dev_root);
   2323	free_root_extent_buffers(info->quota_root);
   2324	free_root_extent_buffers(info->uuid_root);
   2325	free_root_extent_buffers(info->fs_root);
   2326	free_root_extent_buffers(info->data_reloc_root);
   2327	free_root_extent_buffers(info->block_group_root);
   2328	if (free_chunk_root)
   2329		free_root_extent_buffers(info->chunk_root);
   2330}
   2331
   2332void btrfs_put_root(struct btrfs_root *root)
   2333{
   2334	if (!root)
   2335		return;
   2336
   2337	if (refcount_dec_and_test(&root->refs)) {
   2338		WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
   2339		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
   2340		if (root->anon_dev)
   2341			free_anon_bdev(root->anon_dev);
   2342		btrfs_drew_lock_destroy(&root->snapshot_lock);
   2343		free_root_extent_buffers(root);
   2344#ifdef CONFIG_BTRFS_DEBUG
   2345		spin_lock(&root->fs_info->fs_roots_lock);
   2346		list_del_init(&root->leak_list);
   2347		spin_unlock(&root->fs_info->fs_roots_lock);
   2348#endif
   2349		kfree(root);
   2350	}
   2351}
   2352
   2353void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
   2354{
   2355	struct btrfs_root *root;
   2356	unsigned long index = 0;
   2357
   2358	while (!list_empty(&fs_info->dead_roots)) {
   2359		root = list_entry(fs_info->dead_roots.next,
   2360				  struct btrfs_root, root_list);
   2361		list_del(&root->root_list);
   2362
   2363		if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
   2364			btrfs_drop_and_free_fs_root(fs_info, root);
   2365		btrfs_put_root(root);
   2366	}
   2367
   2368	xa_for_each(&fs_info->fs_roots, index, root) {
   2369		btrfs_drop_and_free_fs_root(fs_info, root);
   2370	}
   2371}
   2372
   2373static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
   2374{
   2375	mutex_init(&fs_info->scrub_lock);
   2376	atomic_set(&fs_info->scrubs_running, 0);
   2377	atomic_set(&fs_info->scrub_pause_req, 0);
   2378	atomic_set(&fs_info->scrubs_paused, 0);
   2379	atomic_set(&fs_info->scrub_cancel_req, 0);
   2380	init_waitqueue_head(&fs_info->scrub_pause_wait);
   2381	refcount_set(&fs_info->scrub_workers_refcnt, 0);
   2382}
   2383
   2384static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
   2385{
   2386	spin_lock_init(&fs_info->balance_lock);
   2387	mutex_init(&fs_info->balance_mutex);
   2388	atomic_set(&fs_info->balance_pause_req, 0);
   2389	atomic_set(&fs_info->balance_cancel_req, 0);
   2390	fs_info->balance_ctl = NULL;
   2391	init_waitqueue_head(&fs_info->balance_wait_q);
   2392	atomic_set(&fs_info->reloc_cancel_req, 0);
   2393}
   2394
   2395static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
   2396{
   2397	struct inode *inode = fs_info->btree_inode;
   2398
   2399	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
   2400	set_nlink(inode, 1);
   2401	/*
   2402	 * we set the i_size on the btree inode to the max possible int.
   2403	 * the real end of the address space is determined by all of
   2404	 * the devices in the system
   2405	 */
   2406	inode->i_size = OFFSET_MAX;
   2407	inode->i_mapping->a_ops = &btree_aops;
   2408
   2409	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
   2410	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
   2411			    IO_TREE_BTREE_INODE_IO, inode);
   2412	BTRFS_I(inode)->io_tree.track_uptodate = false;
   2413	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
   2414
   2415	BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
   2416	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
   2417	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
   2418	btrfs_insert_inode_hash(inode);
   2419}
   2420
   2421static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
   2422{
   2423	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
   2424	init_rwsem(&fs_info->dev_replace.rwsem);
   2425	init_waitqueue_head(&fs_info->dev_replace.replace_wait);
   2426}
   2427
   2428static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
   2429{
   2430	spin_lock_init(&fs_info->qgroup_lock);
   2431	mutex_init(&fs_info->qgroup_ioctl_lock);
   2432	fs_info->qgroup_tree = RB_ROOT;
   2433	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
   2434	fs_info->qgroup_seq = 1;
   2435	fs_info->qgroup_ulist = NULL;
   2436	fs_info->qgroup_rescan_running = false;
   2437	mutex_init(&fs_info->qgroup_rescan_lock);
   2438}
   2439
   2440static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
   2441{
   2442	u32 max_active = fs_info->thread_pool_size;
   2443	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
   2444
   2445	fs_info->workers =
   2446		btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
   2447	fs_info->hipri_workers =
   2448		btrfs_alloc_workqueue(fs_info, "worker-high",
   2449				      flags | WQ_HIGHPRI, max_active, 16);
   2450
   2451	fs_info->delalloc_workers =
   2452		btrfs_alloc_workqueue(fs_info, "delalloc",
   2453				      flags, max_active, 2);
   2454
   2455	fs_info->flush_workers =
   2456		btrfs_alloc_workqueue(fs_info, "flush_delalloc",
   2457				      flags, max_active, 0);
   2458
   2459	fs_info->caching_workers =
   2460		btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
   2461
   2462	fs_info->fixup_workers =
   2463		btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
   2464
   2465	/*
   2466	 * endios are largely parallel and should have a very
   2467	 * low idle thresh
   2468	 */
   2469	fs_info->endio_workers =
   2470		btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
   2471	fs_info->endio_meta_workers =
   2472		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
   2473				      max_active, 4);
   2474	fs_info->endio_meta_write_workers =
   2475		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
   2476				      max_active, 2);
   2477	fs_info->endio_raid56_workers =
   2478		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
   2479				      max_active, 4);
   2480	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
   2481	fs_info->endio_write_workers =
   2482		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
   2483				      max_active, 2);
   2484	fs_info->endio_freespace_worker =
   2485		btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
   2486				      max_active, 0);
   2487	fs_info->delayed_workers =
   2488		btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
   2489				      max_active, 0);
   2490	fs_info->qgroup_rescan_workers =
   2491		btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
   2492	fs_info->discard_ctl.discard_workers =
   2493		alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
   2494
   2495	if (!(fs_info->workers && fs_info->hipri_workers &&
   2496	      fs_info->delalloc_workers && fs_info->flush_workers &&
   2497	      fs_info->endio_workers && fs_info->endio_meta_workers &&
   2498	      fs_info->endio_meta_write_workers &&
   2499	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
   2500	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
   2501	      fs_info->caching_workers && fs_info->fixup_workers &&
   2502	      fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
   2503	      fs_info->discard_ctl.discard_workers)) {
   2504		return -ENOMEM;
   2505	}
   2506
   2507	return 0;
   2508}
   2509
   2510static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
   2511{
   2512	struct crypto_shash *csum_shash;
   2513	const char *csum_driver = btrfs_super_csum_driver(csum_type);
   2514
   2515	csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
   2516
   2517	if (IS_ERR(csum_shash)) {
   2518		btrfs_err(fs_info, "error allocating %s hash for checksum",
   2519			  csum_driver);
   2520		return PTR_ERR(csum_shash);
   2521	}
   2522
   2523	fs_info->csum_shash = csum_shash;
   2524
   2525	return 0;
   2526}
   2527
   2528static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
   2529			    struct btrfs_fs_devices *fs_devices)
   2530{
   2531	int ret;
   2532	struct btrfs_root *log_tree_root;
   2533	struct btrfs_super_block *disk_super = fs_info->super_copy;
   2534	u64 bytenr = btrfs_super_log_root(disk_super);
   2535	int level = btrfs_super_log_root_level(disk_super);
   2536
   2537	if (fs_devices->rw_devices == 0) {
   2538		btrfs_warn(fs_info, "log replay required on RO media");
   2539		return -EIO;
   2540	}
   2541
   2542	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
   2543					 GFP_KERNEL);
   2544	if (!log_tree_root)
   2545		return -ENOMEM;
   2546
   2547	log_tree_root->node = read_tree_block(fs_info, bytenr,
   2548					      BTRFS_TREE_LOG_OBJECTID,
   2549					      fs_info->generation + 1, level,
   2550					      NULL);
   2551	if (IS_ERR(log_tree_root->node)) {
   2552		btrfs_warn(fs_info, "failed to read log tree");
   2553		ret = PTR_ERR(log_tree_root->node);
   2554		log_tree_root->node = NULL;
   2555		btrfs_put_root(log_tree_root);
   2556		return ret;
   2557	}
   2558	if (!extent_buffer_uptodate(log_tree_root->node)) {
   2559		btrfs_err(fs_info, "failed to read log tree");
   2560		btrfs_put_root(log_tree_root);
   2561		return -EIO;
   2562	}
   2563
   2564	/* returns with log_tree_root freed on success */
   2565	ret = btrfs_recover_log_trees(log_tree_root);
   2566	if (ret) {
   2567		btrfs_handle_fs_error(fs_info, ret,
   2568				      "Failed to recover log tree");
   2569		btrfs_put_root(log_tree_root);
   2570		return ret;
   2571	}
   2572
   2573	if (sb_rdonly(fs_info->sb)) {
   2574		ret = btrfs_commit_super(fs_info);
   2575		if (ret)
   2576			return ret;
   2577	}
   2578
   2579	return 0;
   2580}
   2581
/*
 * Load every root item whose key objectid is @objectid from @tree_root and
 * register each resulting root with btrfs_global_root_insert().
 *
 * @tree_root:	root that is searched for the root items
 * @path:	caller-provided path used for the searches; released again
 *		after the search loop
 * @objectid:	objectid of the roots to load
 * @name:	tree name used in the error message
 *
 * Returns 0 on success or a negative errno.  If no root was found, or an
 * error occurred, an error is logged and the result is -ENOENT (or the
 * error) unless the IGNOREBADROOTS option is set, in which case 0 is
 * returned.
 */
static int load_global_roots_objectid(struct btrfs_root *tree_root,
				      struct btrfs_path *path, u64 objectid,
				      const char *name)
{
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
	struct btrfs_root *root;
	u64 max_global_id = 0;
	int ret;
	/* Search key; its offset is advanced after each loaded root. */
	struct btrfs_key key = {
		.objectid = objectid,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = 0,
	};
	bool found = false;

	/* If we have IGNOREDATACSUMS skip loading these roots. */
	if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
	    btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
		set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
		return 0;
	}

	while (1) {
		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
		if (ret < 0)
			break;

		/* Slot is past the last item of this leaf: move to the next one. */
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(tree_root, path);
			if (ret) {
				/* A positive return just ends the scan, without error. */
				if (ret > 0)
					ret = 0;
				break;
			}
		}
		ret = 0;

		/* Stop at the first item that belongs to a different objectid. */
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != objectid)
			break;
		btrfs_release_path(path);

		/*
		 * Just worry about this for extent tree, it'll be the same for
		 * everybody.
		 */
		if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
			max_global_id = max(max_global_id, key.offset);

		found = true;
		root = read_tree_root_path(tree_root, path, &key);
		if (IS_ERR(root)) {
			/* With IGNOREBADROOTS the failure ends the loop with ret == 0. */
			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
				ret = PTR_ERR(root);
			break;
		}
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		ret = btrfs_global_root_insert(root);
		if (ret) {
			btrfs_put_root(root);
			break;
		}
		/* Continue the search just past the key that was loaded. */
		key.offset++;
	}
	btrfs_release_path(path);

	/* The global root count is derived from the highest extent root offset. */
	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
		fs_info->nr_global_roots = max_global_id + 1;

	if (!found || ret) {
		/* No usable csum root: record that checksums are unavailable. */
		if (objectid == BTRFS_CSUM_TREE_OBJECTID)
			set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);

		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
			ret = ret ? ret : -ENOENT;
		else
			ret = 0;
		btrfs_err(fs_info, "failed to load root %s", name);
	}
	return ret;
}
   2663
   2664static int load_global_roots(struct btrfs_root *tree_root)
   2665{
   2666	struct btrfs_path *path;
   2667	int ret = 0;
   2668
   2669	path = btrfs_alloc_path();
   2670	if (!path)
   2671		return -ENOMEM;
   2672
   2673	ret = load_global_roots_objectid(tree_root, path,
   2674					 BTRFS_EXTENT_TREE_OBJECTID, "extent");
   2675	if (ret)
   2676		goto out;
   2677	ret = load_global_roots_objectid(tree_root, path,
   2678					 BTRFS_CSUM_TREE_OBJECTID, "csum");
   2679	if (ret)
   2680		goto out;
   2681	if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
   2682		goto out;
   2683	ret = load_global_roots_objectid(tree_root, path,
   2684					 BTRFS_FREE_SPACE_TREE_OBJECTID,
   2685					 "free space");
   2686out:
   2687	btrfs_free_path(path);
   2688	return ret;
   2689}
   2690
   2691static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
   2692{
   2693	struct btrfs_root *tree_root = fs_info->tree_root;
   2694	struct btrfs_root *root;
   2695	struct btrfs_key location;
   2696	int ret;
   2697
   2698	BUG_ON(!fs_info->tree_root);
   2699
   2700	ret = load_global_roots(tree_root);
   2701	if (ret)
   2702		return ret;
   2703
   2704	location.objectid = BTRFS_DEV_TREE_OBJECTID;
   2705	location.type = BTRFS_ROOT_ITEM_KEY;
   2706	location.offset = 0;
   2707
   2708	root = btrfs_read_tree_root(tree_root, &location);
   2709	if (IS_ERR(root)) {
   2710		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
   2711			ret = PTR_ERR(root);
   2712			goto out;
   2713		}
   2714	} else {
   2715		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
   2716		fs_info->dev_root = root;
   2717	}
   2718	/* Initialize fs_info for all devices in any case */
   2719	btrfs_init_devices_late(fs_info);
   2720
   2721	/*
   2722	 * This tree can share blocks with some other fs tree during relocation
   2723	 * and we need a proper setup by btrfs_get_fs_root
   2724	 */
   2725	root = btrfs_get_fs_root(tree_root->fs_info,
   2726				 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
   2727	if (IS_ERR(root)) {
   2728		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
   2729			ret = PTR_ERR(root);
   2730			goto out;
   2731		}
   2732	} else {
   2733		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
   2734		fs_info->data_reloc_root = root;
   2735	}
   2736
   2737	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
   2738	root = btrfs_read_tree_root(tree_root, &location);
   2739	if (!IS_ERR(root)) {
   2740		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
   2741		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
   2742		fs_info->quota_root = root;
   2743	}
   2744
   2745	location.objectid = BTRFS_UUID_TREE_OBJECTID;
   2746	root = btrfs_read_tree_root(tree_root, &location);
   2747	if (IS_ERR(root)) {
   2748		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
   2749			ret = PTR_ERR(root);
   2750			if (ret != -ENOENT)
   2751				goto out;
   2752		}
   2753	} else {
   2754		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
   2755		fs_info->uuid_root = root;
   2756	}
   2757
   2758	return 0;
   2759out:
   2760	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
   2761		   location.objectid, ret);
   2762	return ret;
   2763}
   2764
   2765/*
   2766 * Real super block validation
   2767 * NOTE: super csum type and incompat features will not be checked here.
   2768 *
   2769 * @sb:		super block to check
   2770 * @mirror_num:	the super block number to check its bytenr:
   2771 * 		0	the primary (1st) sb
   2772 * 		1, 2	2nd and 3rd backup copy
   2773 * 	       -1	skip bytenr check
   2774 */
   2775static int validate_super(struct btrfs_fs_info *fs_info,
   2776			    struct btrfs_super_block *sb, int mirror_num)
   2777{
   2778	u64 nodesize = btrfs_super_nodesize(sb);
   2779	u64 sectorsize = btrfs_super_sectorsize(sb);
   2780	int ret = 0;
   2781
   2782	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
   2783		btrfs_err(fs_info, "no valid FS found");
   2784		ret = -EINVAL;
   2785	}
   2786	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
   2787		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
   2788				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
   2789		ret = -EINVAL;
   2790	}
   2791	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
   2792		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
   2793				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
   2794		ret = -EINVAL;
   2795	}
   2796	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
   2797		btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
   2798				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
   2799		ret = -EINVAL;
   2800	}
   2801	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
   2802		btrfs_err(fs_info, "log_root level too big: %d >= %d",
   2803				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
   2804		ret = -EINVAL;
   2805	}
   2806
   2807	/*
   2808	 * Check sectorsize and nodesize first, other check will need it.
   2809	 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
   2810	 */
   2811	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
   2812	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
   2813		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
   2814		ret = -EINVAL;
   2815	}
   2816
   2817	/*
   2818	 * We only support at most two sectorsizes: 4K and PAGE_SIZE.
   2819	 *
   2820	 * We can support 16K sectorsize with 64K page size without problem,
   2821	 * but such sectorsize/pagesize combination doesn't make much sense.
   2822	 * 4K will be our future standard, PAGE_SIZE is supported from the very
   2823	 * beginning.
   2824	 */
   2825	if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
   2826		btrfs_err(fs_info,
   2827			"sectorsize %llu not yet supported for page size %lu",
   2828			sectorsize, PAGE_SIZE);
   2829		ret = -EINVAL;
   2830	}
   2831
   2832	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
   2833	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
   2834		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
   2835		ret = -EINVAL;
   2836	}
   2837	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
   2838		btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
   2839			  le32_to_cpu(sb->__unused_leafsize), nodesize);
   2840		ret = -EINVAL;
   2841	}
   2842
   2843	/* Root alignment check */
   2844	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
   2845		btrfs_warn(fs_info, "tree_root block unaligned: %llu",
   2846			   btrfs_super_root(sb));
   2847		ret = -EINVAL;
   2848	}
   2849	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
   2850		btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
   2851			   btrfs_super_chunk_root(sb));
   2852		ret = -EINVAL;
   2853	}
   2854	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
   2855		btrfs_warn(fs_info, "log_root block unaligned: %llu",
   2856			   btrfs_super_log_root(sb));
   2857		ret = -EINVAL;
   2858	}
   2859
   2860	if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
   2861		   BTRFS_FSID_SIZE)) {
   2862		btrfs_err(fs_info,
   2863		"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
   2864			fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
   2865		ret = -EINVAL;
   2866	}
   2867
   2868	if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
   2869	    memcmp(fs_info->fs_devices->metadata_uuid,
   2870		   fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
   2871		btrfs_err(fs_info,
   2872"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
   2873			fs_info->super_copy->metadata_uuid,
   2874			fs_info->fs_devices->metadata_uuid);
   2875		ret = -EINVAL;
   2876	}
   2877
   2878	if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
   2879		   BTRFS_FSID_SIZE) != 0) {
   2880		btrfs_err(fs_info,
   2881			"dev_item UUID does not match metadata fsid: %pU != %pU",
   2882			fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
   2883		ret = -EINVAL;
   2884	}
   2885
   2886	/*
   2887	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
   2888	 * done later
   2889	 */
   2890	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
   2891		btrfs_err(fs_info, "bytes_used is too small %llu",
   2892			  btrfs_super_bytes_used(sb));
   2893		ret = -EINVAL;
   2894	}
   2895	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
   2896		btrfs_err(fs_info, "invalid stripesize %u",
   2897			  btrfs_super_stripesize(sb));
   2898		ret = -EINVAL;
   2899	}
   2900	if (btrfs_super_num_devices(sb) > (1UL << 31))
   2901		btrfs_warn(fs_info, "suspicious number of devices: %llu",
   2902			   btrfs_super_num_devices(sb));
   2903	if (btrfs_super_num_devices(sb) == 0) {
   2904		btrfs_err(fs_info, "number of devices is 0");
   2905		ret = -EINVAL;
   2906	}
   2907
   2908	if (mirror_num >= 0 &&
   2909	    btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
   2910		btrfs_err(fs_info, "super offset mismatch %llu != %u",
   2911			  btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
   2912		ret = -EINVAL;
   2913	}
   2914
   2915	/*
   2916	 * Obvious sys_chunk_array corruptions, it must hold at least one key
   2917	 * and one chunk
   2918	 */
   2919	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
   2920		btrfs_err(fs_info, "system chunk array too big %u > %u",
   2921			  btrfs_super_sys_array_size(sb),
   2922			  BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
   2923		ret = -EINVAL;
   2924	}
   2925	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
   2926			+ sizeof(struct btrfs_chunk)) {
   2927		btrfs_err(fs_info, "system chunk array too small %u < %zu",
   2928			  btrfs_super_sys_array_size(sb),
   2929			  sizeof(struct btrfs_disk_key)
   2930			  + sizeof(struct btrfs_chunk));
   2931		ret = -EINVAL;
   2932	}
   2933
   2934	/*
   2935	 * The generation is a global counter, we'll trust it more than the others
   2936	 * but it's still possible that it's the one that's wrong.
   2937	 */
   2938	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
   2939		btrfs_warn(fs_info,
   2940			"suspicious: generation < chunk_root_generation: %llu < %llu",
   2941			btrfs_super_generation(sb),
   2942			btrfs_super_chunk_root_generation(sb));
   2943	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
   2944	    && btrfs_super_cache_generation(sb) != (u64)-1)
   2945		btrfs_warn(fs_info,
   2946			"suspicious: generation < cache_generation: %llu < %llu",
   2947			btrfs_super_generation(sb),
   2948			btrfs_super_cache_generation(sb));
   2949
   2950	return ret;
   2951}
   2952
   2953/*
   2954 * Validation of super block at mount time.
   2955 * Some checks already done early at mount time, like csum type and incompat
   2956 * flags will be skipped.
   2957 */
   2958static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
   2959{
   2960	return validate_super(fs_info, fs_info->super_copy, 0);
   2961}
   2962
   2963/*
   2964 * Validation of super block at write time.
   2965 * Some checks like bytenr check will be skipped as their values will be
   2966 * overwritten soon.
   2967 * Extra checks like csum type and incompat flags will be done here.
   2968 */
   2969static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
   2970				      struct btrfs_super_block *sb)
   2971{
   2972	int ret;
   2973
   2974	ret = validate_super(fs_info, sb, -1);
   2975	if (ret < 0)
   2976		goto out;
   2977	if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
   2978		ret = -EUCLEAN;
   2979		btrfs_err(fs_info, "invalid csum type, has %u want %u",
   2980			  btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
   2981		goto out;
   2982	}
   2983	if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
   2984		ret = -EUCLEAN;
   2985		btrfs_err(fs_info,
   2986		"invalid incompat flags, has 0x%llx valid mask 0x%llx",
   2987			  btrfs_super_incompat_flags(sb),
   2988			  (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
   2989		goto out;
   2990	}
   2991out:
   2992	if (ret < 0)
   2993		btrfs_err(fs_info,
   2994		"super block corruption detected before writing it to disk");
   2995	return ret;
   2996}
   2997
   2998static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
   2999{
   3000	int ret = 0;
   3001
   3002	root->node = read_tree_block(root->fs_info, bytenr,
   3003				     root->root_key.objectid, gen, level, NULL);
   3004	if (IS_ERR(root->node)) {
   3005		ret = PTR_ERR(root->node);
   3006		root->node = NULL;
   3007		return ret;
   3008	}
   3009	if (!extent_buffer_uptodate(root->node)) {
   3010		free_extent_buffer(root->node);
   3011		root->node = NULL;
   3012		return -EIO;
   3013	}
   3014
   3015	btrfs_set_root_node(&root->root_item, root->node);
   3016	root->commit_root = btrfs_root_node(root);
   3017	btrfs_set_root_refs(&root->root_item, 1);
   3018	return ret;
   3019}
   3020
   3021static int load_important_roots(struct btrfs_fs_info *fs_info)
   3022{
   3023	struct btrfs_super_block *sb = fs_info->super_copy;
   3024	u64 gen, bytenr;
   3025	int level, ret;
   3026
   3027	bytenr = btrfs_super_root(sb);
   3028	gen = btrfs_super_generation(sb);
   3029	level = btrfs_super_root_level(sb);
   3030	ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
   3031	if (ret) {
   3032		btrfs_warn(fs_info, "couldn't read tree root");
   3033		return ret;
   3034	}
   3035
   3036	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
   3037		return 0;
   3038
   3039	bytenr = btrfs_super_block_group_root(sb);
   3040	gen = btrfs_super_block_group_root_generation(sb);
   3041	level = btrfs_super_block_group_root_level(sb);
   3042	ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
   3043	if (ret)
   3044		btrfs_warn(fs_info, "couldn't read block group root");
   3045	return ret;
   3046}
   3047
   3048static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
   3049{
   3050	int backup_index = find_newest_super_backup(fs_info);
   3051	struct btrfs_super_block *sb = fs_info->super_copy;
   3052	struct btrfs_root *tree_root = fs_info->tree_root;
   3053	bool handle_error = false;
   3054	int ret = 0;
   3055	int i;
   3056
   3057	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
   3058		struct btrfs_root *root;
   3059
   3060		root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
   3061					GFP_KERNEL);
   3062		if (!root)
   3063			return -ENOMEM;
   3064		fs_info->block_group_root = root;
   3065	}
   3066
   3067	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
   3068		if (handle_error) {
   3069			if (!IS_ERR(tree_root->node))
   3070				free_extent_buffer(tree_root->node);
   3071			tree_root->node = NULL;
   3072
   3073			if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
   3074				break;
   3075
   3076			free_root_pointers(fs_info, 0);
   3077
   3078			/*
   3079			 * Don't use the log in recovery mode, it won't be
   3080			 * valid
   3081			 */
   3082			btrfs_set_super_log_root(sb, 0);
   3083
   3084			/* We can't trust the free space cache either */
   3085			btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
   3086
   3087			ret = read_backup_root(fs_info, i);
   3088			backup_index = ret;
   3089			if (ret < 0)
   3090				return ret;
   3091		}
   3092
   3093		ret = load_important_roots(fs_info);
   3094		if (ret) {
   3095			handle_error = true;
   3096			continue;
   3097		}
   3098
   3099		/*
   3100		 * No need to hold btrfs_root::objectid_mutex since the fs
   3101		 * hasn't been fully initialised and we are the only user
   3102		 */
   3103		ret = btrfs_init_root_free_objectid(tree_root);
   3104		if (ret < 0) {
   3105			handle_error = true;
   3106			continue;
   3107		}
   3108
   3109		ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
   3110
   3111		ret = btrfs_read_roots(fs_info);
   3112		if (ret < 0) {
   3113			handle_error = true;
   3114			continue;
   3115		}
   3116
   3117		/* All successful */
   3118		fs_info->generation = btrfs_header_generation(tree_root->node);
   3119		fs_info->last_trans_committed = fs_info->generation;
   3120		fs_info->last_reloc_trans = 0;
   3121
   3122		/* Always begin writing backup roots after the one being used */
   3123		if (backup_index < 0) {
   3124			fs_info->backup_root_index = 0;
   3125		} else {
   3126			fs_info->backup_root_index = backup_index + 1;
   3127			fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
   3128		}
   3129		break;
   3130	}
   3131
   3132	return ret;
   3133}
   3134
   3135void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
   3136{
       	/*
       	 * Set up the in-memory state of @fs_info: xarrays, list heads,
       	 * locks, rb-tree roots, block reserves, counters, wait queues and
       	 * default values for tunables.  Returns nothing.
       	 */
   3137	xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
   3138	xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
   3139	INIT_LIST_HEAD(&fs_info->trans_list);
   3140	INIT_LIST_HEAD(&fs_info->dead_roots);
   3141	INIT_LIST_HEAD(&fs_info->delayed_iputs);
   3142	INIT_LIST_HEAD(&fs_info->delalloc_roots);
   3143	INIT_LIST_HEAD(&fs_info->caching_block_groups);
   3144	spin_lock_init(&fs_info->delalloc_root_lock);
   3145	spin_lock_init(&fs_info->trans_lock);
   3146	spin_lock_init(&fs_info->fs_roots_lock);
   3147	spin_lock_init(&fs_info->delayed_iput_lock);
   3148	spin_lock_init(&fs_info->defrag_inodes_lock);
   3149	spin_lock_init(&fs_info->super_lock);
   3150	spin_lock_init(&fs_info->buffer_lock);
   3151	spin_lock_init(&fs_info->unused_bgs_lock);
   3152	spin_lock_init(&fs_info->treelog_bg_lock);
   3153	spin_lock_init(&fs_info->zone_active_bgs_lock);
   3154	spin_lock_init(&fs_info->relocation_bg_lock);
   3155	rwlock_init(&fs_info->tree_mod_log_lock);
   3156	rwlock_init(&fs_info->global_root_lock);
   3157	mutex_init(&fs_info->unused_bg_unpin_mutex);
   3158	mutex_init(&fs_info->reclaim_bgs_lock);
   3159	mutex_init(&fs_info->reloc_mutex);
   3160	mutex_init(&fs_info->delalloc_root_mutex);
   3161	mutex_init(&fs_info->zoned_meta_io_lock);
   3162	mutex_init(&fs_info->zoned_data_reloc_io_lock);
   3163	seqlock_init(&fs_info->profiles_lock);
   3164
   3165	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
   3166	INIT_LIST_HEAD(&fs_info->space_info);
   3167	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
   3168	INIT_LIST_HEAD(&fs_info->unused_bgs);
   3169	INIT_LIST_HEAD(&fs_info->reclaim_bgs);
   3170	INIT_LIST_HEAD(&fs_info->zone_active_bgs);
       	/* Tracking lists and lock that only exist in debug builds */
   3171#ifdef CONFIG_BTRFS_DEBUG
   3172	INIT_LIST_HEAD(&fs_info->allocated_roots);
   3173	INIT_LIST_HEAD(&fs_info->allocated_ebs);
   3174	spin_lock_init(&fs_info->eb_leak_lock);
   3175#endif
   3176	extent_map_tree_init(&fs_info->mapping_tree);
       	/* One block reserve per reservation type */
   3177	btrfs_init_block_rsv(&fs_info->global_block_rsv,
   3178			     BTRFS_BLOCK_RSV_GLOBAL);
   3179	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
   3180	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
   3181	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
   3182	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
   3183			     BTRFS_BLOCK_RSV_DELOPS);
   3184	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
   3185			     BTRFS_BLOCK_RSV_DELREFS);
   3186
       	/* Counters start at zero, trees start empty, tunables at defaults */
   3187	atomic_set(&fs_info->async_delalloc_pages, 0);
   3188	atomic_set(&fs_info->defrag_running, 0);
   3189	atomic_set(&fs_info->nr_delayed_iputs, 0);
   3190	atomic64_set(&fs_info->tree_mod_seq, 0);
   3191	fs_info->global_root_tree = RB_ROOT;
   3192	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
   3193	fs_info->metadata_ratio = 0;
   3194	fs_info->defrag_inodes = RB_ROOT;
   3195	atomic64_set(&fs_info->free_chunk_space, 0);
   3196	fs_info->tree_mod_log = RB_ROOT;
   3197	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
   3198	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
   3199	btrfs_init_ref_verify(fs_info);
   3200
       	/* Number of online CPUs plus two, capped at 8 */
   3201	fs_info->thread_pool_size = min_t(unsigned long,
   3202					  num_online_cpus() + 2, 8);
   3203
   3204	INIT_LIST_HEAD(&fs_info->ordered_roots);
   3205	spin_lock_init(&fs_info->ordered_root_lock);
   3206
   3207	btrfs_init_scrub(fs_info);
   3208#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
   3209	fs_info->check_integrity_print_mask = 0;
   3210#endif
   3211	btrfs_init_balance(fs_info);
   3212	btrfs_init_async_reclaim_work(fs_info);
   3213
   3214	rwlock_init(&fs_info->block_group_cache_lock);
   3215	fs_info->block_group_cache_tree = RB_ROOT_CACHED;
   3216
   3217	extent_io_tree_init(fs_info, &fs_info->excluded_extents,
   3218			    IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
   3219
   3220	mutex_init(&fs_info->ordered_operations_mutex);
   3221	mutex_init(&fs_info->tree_log_mutex);
   3222	mutex_init(&fs_info->chunk_mutex);
   3223	mutex_init(&fs_info->transaction_kthread_mutex);
   3224	mutex_init(&fs_info->cleaner_mutex);
   3225	mutex_init(&fs_info->ro_block_group_mutex);
   3226	init_rwsem(&fs_info->commit_root_sem);
   3227	init_rwsem(&fs_info->cleanup_work_sem);
   3228	init_rwsem(&fs_info->subvol_sem);
       	/* Initial count of 1: the first down() succeeds without waiting */
   3229	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
   3230
   3231	btrfs_init_dev_replace_locks(fs_info);
   3232	btrfs_init_qgroup(fs_info);
   3233	btrfs_discard_init(fs_info);
   3234
   3235	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
   3236	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
   3237
   3238	init_waitqueue_head(&fs_info->transaction_throttle);
   3239	init_waitqueue_head(&fs_info->transaction_wait);
   3240	init_waitqueue_head(&fs_info->transaction_blocked_wait);
   3241	init_waitqueue_head(&fs_info->async_submit_wait);
   3242	init_waitqueue_head(&fs_info->delayed_iputs_wait);
   3243
   3244	/* Usable values until the real ones are cached from the superblock */
   3245	fs_info->nodesize = 4096;
   3246	fs_info->sectorsize = 4096;
   3247	fs_info->sectorsize_bits = ilog2(4096);
   3248	fs_info->stripesize = 4096;
   3249
   3250	spin_lock_init(&fs_info->swapfile_pins_lock);
   3251	fs_info->swapfile_pins = RB_ROOT;
   3252
   3253	fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
   3254	INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
   3255}
   3256
   3257static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
   3258{
   3259	int ret;
   3260
   3261	fs_info->sb = sb;
   3262	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
   3263	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
   3264
   3265	ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
   3266	if (ret)
   3267		return ret;
   3268
   3269	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
   3270	if (ret)
   3271		return ret;
   3272
   3273	fs_info->dirty_metadata_batch = PAGE_SIZE *
   3274					(1 + ilog2(nr_cpu_ids));
   3275
   3276	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
   3277	if (ret)
   3278		return ret;
   3279
   3280	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
   3281			GFP_KERNEL);
   3282	if (ret)
   3283		return ret;
   3284
   3285	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
   3286					GFP_KERNEL);
   3287	if (!fs_info->delayed_root)
   3288		return -ENOMEM;
   3289	btrfs_init_delayed_root(fs_info->delayed_root);
   3290
   3291	if (sb_rdonly(sb))
   3292		set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
   3293
   3294	return btrfs_alloc_stripe_hash_table(fs_info);
   3295}
   3296
   3297static int btrfs_uuid_rescan_kthread(void *data)
   3298{
   3299	struct btrfs_fs_info *fs_info = data;
   3300	int ret;
   3301
   3302	/*
   3303	 * 1st step is to iterate through the existing UUID tree and
   3304	 * to delete all entries that contain outdated data.
   3305	 * 2nd step is to add all missing entries to the UUID tree.
   3306	 */
   3307	ret = btrfs_uuid_tree_iterate(fs_info);
   3308	if (ret < 0) {
   3309		if (ret != -EINTR)
   3310			btrfs_warn(fs_info, "iterating uuid_tree failed %d",
   3311				   ret);
   3312		up(&fs_info->uuid_tree_rescan_sem);
   3313		return ret;
   3314	}
   3315	return btrfs_uuid_scan_kthread(data);
   3316}
   3317
   3318static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
   3319{
   3320	struct task_struct *task;
   3321
   3322	down(&fs_info->uuid_tree_rescan_sem);
   3323	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
   3324	if (IS_ERR(task)) {
   3325		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
   3326		btrfs_warn(fs_info, "failed to start uuid_rescan task");
   3327		up(&fs_info->uuid_tree_rescan_sem);
   3328		return PTR_ERR(task);
   3329	}
   3330
   3331	return 0;
   3332}
   3333
   3334/*
   3335 * Some options only have meaning at mount time and shouldn't persist across
   3336 * remounts, or be displayed. Clear these at the end of mount and remount
   3337 * code paths.
   3338 */
   3339void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
   3340{
   3341	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
   3342	btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
   3343}
   3344
   3345/*
   3346 * Mounting logic specific to read-write file systems. Shared by open_ctree
   3347 * and btrfs_remount when remounting from read-only to read-write.
   3348 */
   3349int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
   3350{
   3351	int ret;
   3352	const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
   3353	bool clear_free_space_tree = false;
   3354
   3355	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
   3356	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
   3357		clear_free_space_tree = true;
   3358	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
   3359		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
   3360		btrfs_warn(fs_info, "free space tree is invalid");
   3361		clear_free_space_tree = true;
   3362	}
   3363
   3364	if (clear_free_space_tree) {
   3365		btrfs_info(fs_info, "clearing free space tree");
   3366		ret = btrfs_clear_free_space_tree(fs_info);
   3367		if (ret) {
   3368			btrfs_warn(fs_info,
   3369				   "failed to clear free space tree: %d", ret);
   3370			goto out;
   3371		}
   3372	}
   3373
   3374	/*
   3375	 * btrfs_find_orphan_roots() is responsible for finding all the dead
   3376	 * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
   3377	 * them into the fs_info->fs_roots. This must be done before
   3378	 * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
   3379	 * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
   3380	 * item before the root's tree is deleted - this means that if we unmount
   3381	 * or crash before the deletion completes, on the next mount we will not
   3382	 * delete what remains of the tree because the orphan item does not
   3383	 * exists anymore, which is what tells us we have a pending deletion.
   3384	 */
   3385	ret = btrfs_find_orphan_roots(fs_info);
   3386	if (ret)
   3387		goto out;
   3388
   3389	ret = btrfs_cleanup_fs_roots(fs_info);
   3390	if (ret)
   3391		goto out;
   3392
   3393	down_read(&fs_info->cleanup_work_sem);
   3394	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
   3395	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
   3396		up_read(&fs_info->cleanup_work_sem);
   3397		goto out;
   3398	}
   3399	up_read(&fs_info->cleanup_work_sem);
   3400
   3401	mutex_lock(&fs_info->cleaner_mutex);
   3402	ret = btrfs_recover_relocation(fs_info);
   3403	mutex_unlock(&fs_info->cleaner_mutex);
   3404	if (ret < 0) {
   3405		btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
   3406		goto out;
   3407	}
   3408
   3409	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
   3410	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
   3411		btrfs_info(fs_info, "creating free space tree");
   3412		ret = btrfs_create_free_space_tree(fs_info);
   3413		if (ret) {
   3414			btrfs_warn(fs_info,
   3415				"failed to create free space tree: %d", ret);
   3416			goto out;
   3417		}
   3418	}
   3419
   3420	if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
   3421		ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
   3422		if (ret)
   3423			goto out;
   3424	}
   3425
   3426	ret = btrfs_resume_balance_async(fs_info);
   3427	if (ret)
   3428		goto out;
   3429
   3430	ret = btrfs_resume_dev_replace_async(fs_info);
   3431	if (ret) {
   3432		btrfs_warn(fs_info, "failed to resume dev_replace");
   3433		goto out;
   3434	}
   3435
   3436	btrfs_qgroup_rescan_resume(fs_info);
   3437
   3438	if (!fs_info->uuid_root) {
   3439		btrfs_info(fs_info, "creating UUID tree");
   3440		ret = btrfs_create_uuid_tree(fs_info);
   3441		if (ret) {
   3442			btrfs_warn(fs_info,
   3443				   "failed to create the UUID tree %d", ret);
   3444			goto out;
   3445		}
   3446	}
   3447
   3448out:
   3449	return ret;
   3450}
   3451
   3452int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
   3453		      char *options)
   3454{
   3455	u32 sectorsize;
   3456	u32 nodesize;
   3457	u32 stripesize;
   3458	u64 generation;
   3459	u64 features;
   3460	u16 csum_type;
   3461	struct btrfs_super_block *disk_super;
   3462	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
   3463	struct btrfs_root *tree_root;
   3464	struct btrfs_root *chunk_root;
   3465	int ret;
   3466	int err = -EINVAL;
   3467	int level;
   3468
   3469	ret = init_mount_fs_info(fs_info, sb);
   3470	if (ret) {
   3471		err = ret;
   3472		goto fail;
   3473	}
   3474
   3475	/* These need to be init'ed before we start creating inodes and such. */
   3476	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
   3477				     GFP_KERNEL);
   3478	fs_info->tree_root = tree_root;
   3479	chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
   3480				      GFP_KERNEL);
   3481	fs_info->chunk_root = chunk_root;
   3482	if (!tree_root || !chunk_root) {
   3483		err = -ENOMEM;
   3484		goto fail;
   3485	}
   3486
   3487	fs_info->btree_inode = new_inode(sb);
   3488	if (!fs_info->btree_inode) {
   3489		err = -ENOMEM;
   3490		goto fail;
   3491	}
   3492	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
   3493	btrfs_init_btree_inode(fs_info);
   3494
   3495	invalidate_bdev(fs_devices->latest_dev->bdev);
   3496
   3497	/*
   3498	 * Read super block and check the signature bytes only
   3499	 */
   3500	disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
   3501	if (IS_ERR(disk_super)) {
   3502		err = PTR_ERR(disk_super);
   3503		goto fail_alloc;
   3504	}
   3505
   3506	/*
   3507	 * Verify the type first, if that or the checksum value are
   3508	 * corrupted, we'll find out
   3509	 */
   3510	csum_type = btrfs_super_csum_type(disk_super);
   3511	if (!btrfs_supported_super_csum(csum_type)) {
   3512		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
   3513			  csum_type);
   3514		err = -EINVAL;
   3515		btrfs_release_disk_super(disk_super);
   3516		goto fail_alloc;
   3517	}
   3518
   3519	fs_info->csum_size = btrfs_super_csum_size(disk_super);
   3520
   3521	ret = btrfs_init_csum_hash(fs_info, csum_type);
   3522	if (ret) {
   3523		err = ret;
   3524		btrfs_release_disk_super(disk_super);
   3525		goto fail_alloc;
   3526	}
   3527
   3528	/*
   3529	 * We want to check superblock checksum, the type is stored inside.
   3530	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
   3531	 */
   3532	if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
   3533		btrfs_err(fs_info, "superblock checksum mismatch");
   3534		err = -EINVAL;
   3535		btrfs_release_disk_super(disk_super);
   3536		goto fail_alloc;
   3537	}
   3538
   3539	/*
   3540	 * super_copy is zeroed at allocation time and we never touch the
   3541	 * following bytes up to INFO_SIZE, the checksum is calculated from
   3542	 * the whole block of INFO_SIZE
   3543	 */
   3544	memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
   3545	btrfs_release_disk_super(disk_super);
   3546
   3547	disk_super = fs_info->super_copy;
   3548
   3549
   3550	features = btrfs_super_flags(disk_super);
   3551	if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
   3552		features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
   3553		btrfs_set_super_flags(disk_super, features);
   3554		btrfs_info(fs_info,
   3555			"found metadata UUID change in progress flag, clearing");
   3556	}
   3557
   3558	memcpy(fs_info->super_for_commit, fs_info->super_copy,
   3559	       sizeof(*fs_info->super_for_commit));
   3560
   3561	ret = btrfs_validate_mount_super(fs_info);
   3562	if (ret) {
   3563		btrfs_err(fs_info, "superblock contains fatal errors");
   3564		err = -EINVAL;
   3565		goto fail_alloc;
   3566	}
   3567
   3568	if (!btrfs_super_root(disk_super))
   3569		goto fail_alloc;
   3570
   3571	/* check FS state, whether FS is broken. */
   3572	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
   3573		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
   3574
   3575	/*
   3576	 * In the long term, we'll store the compression type in the super
   3577	 * block, and it'll be used for per file compression control.
   3578	 */
   3579	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
   3580
   3581	/*
   3582	 * Flag our filesystem as having big metadata blocks if they are bigger
   3583	 * than the page size.
   3584	 */
   3585	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
   3586		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
   3587			btrfs_info(fs_info,
   3588				"flagging fs with big metadata feature");
   3589		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
   3590	}
   3591
   3592	/* Set up fs_info before parsing mount options */
   3593	nodesize = btrfs_super_nodesize(disk_super);
   3594	sectorsize = btrfs_super_sectorsize(disk_super);
   3595	stripesize = sectorsize;
   3596	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
   3597	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
   3598
   3599	fs_info->nodesize = nodesize;
   3600	fs_info->sectorsize = sectorsize;
   3601	fs_info->sectorsize_bits = ilog2(sectorsize);
   3602	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
   3603	fs_info->stripesize = stripesize;
   3604
   3605	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
   3606	if (ret) {
   3607		err = ret;
   3608		goto fail_alloc;
   3609	}
   3610
   3611	features = btrfs_super_incompat_flags(disk_super) &
   3612		~BTRFS_FEATURE_INCOMPAT_SUPP;
   3613	if (features) {
   3614		btrfs_err(fs_info,
   3615		    "cannot mount because of unsupported optional features (0x%llx)",
   3616		    features);
   3617		err = -EINVAL;
   3618		goto fail_alloc;
   3619	}
   3620
   3621	features = btrfs_super_incompat_flags(disk_super);
   3622	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
   3623	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
   3624		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
   3625	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
   3626		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
   3627
   3628	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
   3629		btrfs_info(fs_info, "has skinny extents");
   3630
   3631	/*
   3632	 * mixed block groups end up with duplicate but slightly offset
   3633	 * extent buffers for the same range.  It leads to corruptions
   3634	 */
   3635	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
   3636	    (sectorsize != nodesize)) {
   3637		btrfs_err(fs_info,
   3638"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
   3639			nodesize, sectorsize);
   3640		goto fail_alloc;
   3641	}
   3642
   3643	/*
   3644	 * Needn't use the lock because there is no other task which will
   3645	 * update the flag.
   3646	 */
   3647	btrfs_set_super_incompat_flags(disk_super, features);
   3648
   3649	features = btrfs_super_compat_ro_flags(disk_super) &
   3650		~BTRFS_FEATURE_COMPAT_RO_SUPP;
   3651	if (!sb_rdonly(sb) && features) {
   3652		btrfs_err(fs_info,
   3653	"cannot mount read-write because of unsupported optional features (0x%llx)",
   3654		       features);
   3655		err = -EINVAL;
   3656		goto fail_alloc;
   3657	}
   3658
   3659	if (sectorsize < PAGE_SIZE) {
   3660		struct btrfs_subpage_info *subpage_info;
   3661
   3662		/*
   3663		 * V1 space cache has some hardcoded PAGE_SIZE usage, and is
   3664		 * going to be deprecated.
   3665		 *
   3666		 * Force to use v2 cache for subpage case.
   3667		 */
   3668		btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
   3669		btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
   3670			"forcing free space tree for sector size %u with page size %lu",
   3671			sectorsize, PAGE_SIZE);
   3672
   3673		btrfs_warn(fs_info,
   3674		"read-write for sector size %u with page size %lu is experimental",
   3675			   sectorsize, PAGE_SIZE);
   3676		subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
   3677		if (!subpage_info)
   3678			goto fail_alloc;
   3679		btrfs_init_subpage_info(subpage_info, sectorsize);
   3680		fs_info->subpage_info = subpage_info;
   3681	}
   3682
   3683	ret = btrfs_init_workqueues(fs_info);
   3684	if (ret) {
   3685		err = ret;
   3686		goto fail_sb_buffer;
   3687	}
   3688
   3689	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
   3690	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
   3691
   3692	sb->s_blocksize = sectorsize;
   3693	sb->s_blocksize_bits = blksize_bits(sectorsize);
   3694	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
   3695
   3696	mutex_lock(&fs_info->chunk_mutex);
   3697	ret = btrfs_read_sys_array(fs_info);
   3698	mutex_unlock(&fs_info->chunk_mutex);
   3699	if (ret) {
   3700		btrfs_err(fs_info, "failed to read the system array: %d", ret);
   3701		goto fail_sb_buffer;
   3702	}
   3703
   3704	generation = btrfs_super_chunk_root_generation(disk_super);
   3705	level = btrfs_super_chunk_root_level(disk_super);
   3706	ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
   3707			      generation, level);
   3708	if (ret) {
   3709		btrfs_err(fs_info, "failed to read chunk root");
   3710		goto fail_tree_roots;
   3711	}
   3712
   3713	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
   3714			   offsetof(struct btrfs_header, chunk_tree_uuid),
   3715			   BTRFS_UUID_SIZE);
   3716
   3717	ret = btrfs_read_chunk_tree(fs_info);
   3718	if (ret) {
   3719		btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
   3720		goto fail_tree_roots;
   3721	}
   3722
   3723	/*
   3724	 * At this point we know all the devices that make this filesystem,
   3725	 * including the seed devices but we don't know yet if the replace
   3726	 * target is required. So free devices that are not part of this
   3727	 * filesystem but skip the replace target device which is checked
   3728	 * below in btrfs_init_dev_replace().
   3729	 */
   3730	btrfs_free_extra_devids(fs_devices);
   3731	if (!fs_devices->latest_dev->bdev) {
   3732		btrfs_err(fs_info, "failed to read devices");
   3733		goto fail_tree_roots;
   3734	}
   3735
   3736	ret = init_tree_roots(fs_info);
   3737	if (ret)
   3738		goto fail_tree_roots;
   3739
   3740	/*
   3741	 * Get zone type information of zoned block devices. This will also
   3742	 * handle emulation of a zoned filesystem if a regular device has the
   3743	 * zoned incompat feature flag set.
   3744	 */
   3745	ret = btrfs_get_dev_zone_info_all_devices(fs_info);
   3746	if (ret) {
   3747		btrfs_err(fs_info,
   3748			  "zoned: failed to read device zone info: %d",
   3749			  ret);
   3750		goto fail_block_groups;
   3751	}
   3752
   3753	/*
   3754	 * If we have a uuid root and we're not being told to rescan we need to
   3755	 * check the generation here so we can set the
   3756	 * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
   3757	 * transaction during a balance or the log replay without updating the
   3758	 * uuid generation, and then if we crash we would rescan the uuid tree,
   3759	 * even though it was perfectly fine.
   3760	 */
   3761	if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
   3762	    fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
   3763		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
   3764
   3765	ret = btrfs_verify_dev_extents(fs_info);
   3766	if (ret) {
   3767		btrfs_err(fs_info,
   3768			  "failed to verify dev extents against chunks: %d",
   3769			  ret);
   3770		goto fail_block_groups;
   3771	}
   3772	ret = btrfs_recover_balance(fs_info);
   3773	if (ret) {
   3774		btrfs_err(fs_info, "failed to recover balance: %d", ret);
   3775		goto fail_block_groups;
   3776	}
   3777
   3778	ret = btrfs_init_dev_stats(fs_info);
   3779	if (ret) {
   3780		btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
   3781		goto fail_block_groups;
   3782	}
   3783
   3784	ret = btrfs_init_dev_replace(fs_info);
   3785	if (ret) {
   3786		btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
   3787		goto fail_block_groups;
   3788	}
   3789
   3790	ret = btrfs_check_zoned_mode(fs_info);
   3791	if (ret) {
   3792		btrfs_err(fs_info, "failed to initialize zoned mode: %d",
   3793			  ret);
   3794		goto fail_block_groups;
   3795	}
   3796
   3797	ret = btrfs_sysfs_add_fsid(fs_devices);
   3798	if (ret) {
   3799		btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
   3800				ret);
   3801		goto fail_block_groups;
   3802	}
   3803
   3804	ret = btrfs_sysfs_add_mounted(fs_info);
   3805	if (ret) {
   3806		btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
   3807		goto fail_fsdev_sysfs;
   3808	}
   3809
   3810	ret = btrfs_init_space_info(fs_info);
   3811	if (ret) {
   3812		btrfs_err(fs_info, "failed to initialize space info: %d", ret);
   3813		goto fail_sysfs;
   3814	}
   3815
   3816	ret = btrfs_read_block_groups(fs_info);
   3817	if (ret) {
   3818		btrfs_err(fs_info, "failed to read block groups: %d", ret);
   3819		goto fail_sysfs;
   3820	}
   3821
   3822	btrfs_free_zone_cache(fs_info);
   3823
   3824	if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
   3825	    !btrfs_check_rw_degradable(fs_info, NULL)) {
   3826		btrfs_warn(fs_info,
   3827		"writable mount is not allowed due to too many missing devices");
   3828		goto fail_sysfs;
   3829	}
   3830
   3831	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
   3832					       "btrfs-cleaner");
   3833	if (IS_ERR(fs_info->cleaner_kthread))
   3834		goto fail_sysfs;
   3835
   3836	fs_info->transaction_kthread = kthread_run(transaction_kthread,
   3837						   tree_root,
   3838						   "btrfs-transaction");
   3839	if (IS_ERR(fs_info->transaction_kthread))
   3840		goto fail_cleaner;
   3841
   3842	if (!btrfs_test_opt(fs_info, NOSSD) &&
   3843	    !fs_info->fs_devices->rotating) {
   3844		btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
   3845	}
   3846
   3847	/*
   3848	 * Mount does not set all options immediately, we can do it now and do
   3849	 * not have to wait for transaction commit
   3850	 */
   3851	btrfs_apply_pending_changes(fs_info);
   3852
   3853#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
   3854	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
   3855		ret = btrfsic_mount(fs_info, fs_devices,
   3856				    btrfs_test_opt(fs_info,
   3857					CHECK_INTEGRITY_DATA) ? 1 : 0,
   3858				    fs_info->check_integrity_print_mask);
   3859		if (ret)
   3860			btrfs_warn(fs_info,
   3861				"failed to initialize integrity check module: %d",
   3862				ret);
   3863	}
   3864#endif
   3865	ret = btrfs_read_qgroup_config(fs_info);
   3866	if (ret)
   3867		goto fail_trans_kthread;
   3868
   3869	if (btrfs_build_ref_tree(fs_info))
   3870		btrfs_err(fs_info, "couldn't build ref tree");
   3871
   3872	/* do not make disk changes in broken FS or nologreplay is given */
   3873	if (btrfs_super_log_root(disk_super) != 0 &&
   3874	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
   3875		btrfs_info(fs_info, "start tree-log replay");
   3876		ret = btrfs_replay_log(fs_info, fs_devices);
   3877		if (ret) {
   3878			err = ret;
   3879			goto fail_qgroup;
   3880		}
   3881	}
   3882
   3883	fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
   3884	if (IS_ERR(fs_info->fs_root)) {
   3885		err = PTR_ERR(fs_info->fs_root);
   3886		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
   3887		fs_info->fs_root = NULL;
   3888		goto fail_qgroup;
   3889	}
   3890
   3891	if (sb_rdonly(sb))
   3892		goto clear_oneshot;
   3893
   3894	ret = btrfs_start_pre_rw_mount(fs_info);
   3895	if (ret) {
   3896		close_ctree(fs_info);
   3897		return ret;
   3898	}
   3899	btrfs_discard_resume(fs_info);
   3900
   3901	if (fs_info->uuid_root &&
   3902	    (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
   3903	     fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
   3904		btrfs_info(fs_info, "checking UUID tree");
   3905		ret = btrfs_check_uuid_tree(fs_info);
   3906		if (ret) {
   3907			btrfs_warn(fs_info,
   3908				"failed to check the UUID tree: %d", ret);
   3909			close_ctree(fs_info);
   3910			return ret;
   3911		}
   3912	}
   3913
   3914	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
   3915
   3916	/* Kick the cleaner thread so it'll start deleting snapshots. */
   3917	if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
   3918		wake_up_process(fs_info->cleaner_kthread);
   3919
   3920clear_oneshot:
   3921	btrfs_clear_oneshot_options(fs_info);
   3922	return 0;
   3923
   3924fail_qgroup:
   3925	btrfs_free_qgroup_config(fs_info);
   3926fail_trans_kthread:
   3927	kthread_stop(fs_info->transaction_kthread);
   3928	btrfs_cleanup_transaction(fs_info);
   3929	btrfs_free_fs_roots(fs_info);
   3930fail_cleaner:
   3931	kthread_stop(fs_info->cleaner_kthread);
   3932
   3933	/*
   3934	 * make sure we're done with the btree inode before we stop our
   3935	 * kthreads
   3936	 */
   3937	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
   3938
   3939fail_sysfs:
   3940	btrfs_sysfs_remove_mounted(fs_info);
   3941
   3942fail_fsdev_sysfs:
   3943	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
   3944
   3945fail_block_groups:
   3946	btrfs_put_block_group_cache(fs_info);
   3947
   3948fail_tree_roots:
   3949	if (fs_info->data_reloc_root)
   3950		btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
   3951	free_root_pointers(fs_info, true);
   3952	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
   3953
   3954fail_sb_buffer:
   3955	btrfs_stop_all_workers(fs_info);
   3956	btrfs_free_block_groups(fs_info);
   3957fail_alloc:
   3958	btrfs_mapping_tree_free(&fs_info->mapping_tree);
   3959
   3960	iput(fs_info->btree_inode);
   3961fail:
   3962	btrfs_close_devices(fs_info->fs_devices);
   3963	return err;
   3964}
   3965ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
   3966
   3967static void btrfs_end_super_write(struct bio *bio)
   3968{
   3969	struct btrfs_device *device = bio->bi_private;
   3970	struct bio_vec *bvec;
   3971	struct bvec_iter_all iter_all;
   3972	struct page *page;
   3973
   3974	bio_for_each_segment_all(bvec, bio, iter_all) {
   3975		page = bvec->bv_page;
   3976
   3977		if (bio->bi_status) {
   3978			btrfs_warn_rl_in_rcu(device->fs_info,
   3979				"lost page write due to IO error on %s (%d)",
   3980				rcu_str_deref(device->name),
   3981				blk_status_to_errno(bio->bi_status));
   3982			ClearPageUptodate(page);
   3983			SetPageError(page);
   3984			btrfs_dev_stat_inc_and_print(device,
   3985						     BTRFS_DEV_STAT_WRITE_ERRS);
   3986		} else {
   3987			SetPageUptodate(page);
   3988		}
   3989
   3990		put_page(page);
   3991		unlock_page(page);
   3992	}
   3993
   3994	bio_put(bio);
   3995}
   3996
   3997struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
   3998						   int copy_num)
   3999{
   4000	struct btrfs_super_block *super;
   4001	struct page *page;
   4002	u64 bytenr, bytenr_orig;
   4003	struct address_space *mapping = bdev->bd_inode->i_mapping;
   4004	int ret;
   4005
   4006	bytenr_orig = btrfs_sb_offset(copy_num);
   4007	ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
   4008	if (ret == -ENOENT)
   4009		return ERR_PTR(-EINVAL);
   4010	else if (ret)
   4011		return ERR_PTR(ret);
   4012
   4013	if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
   4014		return ERR_PTR(-EINVAL);
   4015
   4016	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
   4017	if (IS_ERR(page))
   4018		return ERR_CAST(page);
   4019
   4020	super = page_address(page);
   4021	if (btrfs_super_magic(super) != BTRFS_MAGIC) {
   4022		btrfs_release_disk_super(super);
   4023		return ERR_PTR(-ENODATA);
   4024	}
   4025
   4026	if (btrfs_super_bytenr(super) != bytenr_orig) {
   4027		btrfs_release_disk_super(super);
   4028		return ERR_PTR(-EINVAL);
   4029	}
   4030
   4031	return super;
   4032}
   4033
   4034
   4035struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
   4036{
   4037	struct btrfs_super_block *super, *latest = NULL;
   4038	int i;
   4039	u64 transid = 0;
   4040
   4041	/* we would like to check all the supers, but that would make
   4042	 * a btrfs mount succeed after a mkfs from a different FS.
   4043	 * So, we need to add a special mount option to scan for
   4044	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
   4045	 */
   4046	for (i = 0; i < 1; i++) {
   4047		super = btrfs_read_dev_one_super(bdev, i);
   4048		if (IS_ERR(super))
   4049			continue;
   4050
   4051		if (!latest || btrfs_super_generation(super) > transid) {
   4052			if (latest)
   4053				btrfs_release_disk_super(super);
   4054
   4055			latest = super;
   4056			transid = btrfs_super_generation(super);
   4057		}
   4058	}
   4059
   4060	return super;
   4061}
   4062
/*
 * Write superblock @sb to the @device. Do not wait for completion, all the
 * pages we use for writing are locked.
 *
 * Write @max_mirrors copies of the superblock, where 0 means default that fit
 * the expected device size at commit time. Note that max_mirrors must be
 * same for write and wait phases.
 *
 * Errors are counted when a mirror location cannot be determined, when the
 * page is not found, or when btrfs_advance_sb_log() fails.
 *
 * Return 0 if the error count is lower than the mirror index the loop stopped
 * at, -1 otherwise (this includes the case where no mirror was attempted).
 */
static int write_dev_supers(struct btrfs_device *device,
			    struct btrfs_super_block *sb, int max_mirrors)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	int i;
	int errors = 0;
	int ret;
	u64 bytenr, bytenr_orig;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < max_mirrors; i++) {
		struct page *page;
		struct bio *bio;
		struct btrfs_super_block *disk_super;

		/*
		 * bytenr_orig is the value recorded inside the superblock,
		 * bytenr is where this copy is actually written.
		 */
		bytenr_orig = btrfs_sb_offset(i);
		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
		if (ret == -ENOENT) {
			/* No location for this mirror: skip it, not an error. */
			continue;
		} else if (ret < 0) {
			btrfs_err(device->fs_info,
				"couldn't get super block location for mirror %d",
				i);
			errors++;
			continue;
		}
		/* Stop at the first copy that does not fit on the device. */
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		btrfs_set_super_bytenr(sb, bytenr_orig);

		/* Checksum everything after the csum field, into sb->csum. */
		crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
				    sb->csum);

		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
					   GFP_NOFS);
		if (!page) {
			btrfs_err(device->fs_info,
			    "couldn't get super block page for bytenr %llu",
			    bytenr);
			errors++;
			continue;
		}

		/* Bump the refcount for wait_dev_supers() */
		get_page(page);

		disk_super = page_address(page);
		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);

		/*
		 * Directly use bios here instead of relying on the page cache
		 * to do I/O, so we don't lose the ability to do integrity
		 * checking.
		 */
		bio = bio_alloc(device->bdev, 1,
				REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
				GFP_NOFS);
		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
		bio->bi_private = device;
		bio->bi_end_io = btrfs_end_super_write;
		__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
			       offset_in_page(bytenr));

		/*
		 * We FUA only the first super block.  The others we allow to
		 * go down lazy and there's a short window where the on-disk
		 * copies might still contain the older version.
		 */
		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
			bio->bi_opf |= REQ_FUA;

		btrfsic_check_bio(bio);
		submit_bio(bio);

		if (btrfs_advance_sb_log(device, i))
			errors++;
	}
	/* i is the index the loop stopped at, not a count of submitted bios. */
	return errors < i ? 0 : -1;
}
   4161
   4162/*
   4163 * Wait for write completion of superblocks done by write_dev_supers,
   4164 * @max_mirrors same for write and wait phases.
   4165 *
   4166 * Return number of errors when page is not found or not marked up to
   4167 * date.
   4168 */
   4169static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
   4170{
   4171	int i;
   4172	int errors = 0;
   4173	bool primary_failed = false;
   4174	int ret;
   4175	u64 bytenr;
   4176
   4177	if (max_mirrors == 0)
   4178		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
   4179
   4180	for (i = 0; i < max_mirrors; i++) {
   4181		struct page *page;
   4182
   4183		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
   4184		if (ret == -ENOENT) {
   4185			break;
   4186		} else if (ret < 0) {
   4187			errors++;
   4188			if (i == 0)
   4189				primary_failed = true;
   4190			continue;
   4191		}
   4192		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
   4193		    device->commit_total_bytes)
   4194			break;
   4195
   4196		page = find_get_page(device->bdev->bd_inode->i_mapping,
   4197				     bytenr >> PAGE_SHIFT);
   4198		if (!page) {
   4199			errors++;
   4200			if (i == 0)
   4201				primary_failed = true;
   4202			continue;
   4203		}
   4204		/* Page is submitted locked and unlocked once the IO completes */
   4205		wait_on_page_locked(page);
   4206		if (PageError(page)) {
   4207			errors++;
   4208			if (i == 0)
   4209				primary_failed = true;
   4210		}
   4211
   4212		/* Drop our reference */
   4213		put_page(page);
   4214
   4215		/* Drop the reference from the writing run */
   4216		put_page(page);
   4217	}
   4218
   4219	/* log error, force error return */
   4220	if (primary_failed) {
   4221		btrfs_err(device->fs_info, "error writing primary super block to device %llu",
   4222			  device->devid);
   4223		return -1;
   4224	}
   4225
   4226	return errors < i ? 0 : -1;
   4227}
   4228
   4229/*
   4230 * endio for the write_dev_flush, this will wake anyone waiting
   4231 * for the barrier when it is done
   4232 */
   4233static void btrfs_end_empty_barrier(struct bio *bio)
   4234{
   4235	bio_uninit(bio);
   4236	complete(bio->bi_private);
   4237}
   4238
   4239/*
   4240 * Submit a flush request to the device if it supports it. Error handling is
   4241 * done in the waiting counterpart.
   4242 */
   4243static void write_dev_flush(struct btrfs_device *device)
   4244{
   4245	struct bio *bio = &device->flush_bio;
   4246
   4247#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
   4248	/*
   4249	 * When a disk has write caching disabled, we skip submission of a bio
   4250	 * with flush and sync requests before writing the superblock, since
   4251	 * it's not needed. However when the integrity checker is enabled, this
   4252	 * results in reports that there are metadata blocks referred by a
   4253	 * superblock that were not properly flushed. So don't skip the bio
   4254	 * submission only when the integrity checker is enabled for the sake
   4255	 * of simplicity, since this is a debug tool and not meant for use in
   4256	 * non-debug builds.
   4257	 */
   4258	if (!bdev_write_cache(device->bdev))
   4259		return;
   4260#endif
   4261
   4262	bio_init(bio, device->bdev, NULL, 0,
   4263		 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
   4264	bio->bi_end_io = btrfs_end_empty_barrier;
   4265	init_completion(&device->flush_wait);
   4266	bio->bi_private = &device->flush_wait;
   4267
   4268	btrfsic_check_bio(bio);
   4269	submit_bio(bio);
   4270	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
   4271}
   4272
   4273/*
   4274 * If the flush bio has been submitted by write_dev_flush, wait for it.
   4275 */
   4276static blk_status_t wait_dev_flush(struct btrfs_device *device)
   4277{
   4278	struct bio *bio = &device->flush_bio;
   4279
   4280	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
   4281		return BLK_STS_OK;
   4282
   4283	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
   4284	wait_for_completion_io(&device->flush_wait);
   4285
   4286	return bio->bi_status;
   4287}
   4288
   4289static int check_barrier_error(struct btrfs_fs_info *fs_info)
   4290{
   4291	if (!btrfs_check_rw_degradable(fs_info, NULL))
   4292		return -EIO;
   4293	return 0;
   4294}
   4295
   4296/*
   4297 * send an empty flush down to each device in parallel,
   4298 * then wait for them
   4299 */
   4300static int barrier_all_devices(struct btrfs_fs_info *info)
   4301{
   4302	struct list_head *head;
   4303	struct btrfs_device *dev;
   4304	int errors_wait = 0;
   4305	blk_status_t ret;
   4306
   4307	lockdep_assert_held(&info->fs_devices->device_list_mutex);
   4308	/* send down all the barriers */
   4309	head = &info->fs_devices->devices;
   4310	list_for_each_entry(dev, head, dev_list) {
   4311		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
   4312			continue;
   4313		if (!dev->bdev)
   4314			continue;
   4315		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
   4316		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
   4317			continue;
   4318
   4319		write_dev_flush(dev);
   4320		dev->last_flush_error = BLK_STS_OK;
   4321	}
   4322
   4323	/* wait for all the barriers */
   4324	list_for_each_entry(dev, head, dev_list) {
   4325		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
   4326			continue;
   4327		if (!dev->bdev) {
   4328			errors_wait++;
   4329			continue;
   4330		}
   4331		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
   4332		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
   4333			continue;
   4334
   4335		ret = wait_dev_flush(dev);
   4336		if (ret) {
   4337			dev->last_flush_error = ret;
   4338			btrfs_dev_stat_inc_and_print(dev,
   4339					BTRFS_DEV_STAT_FLUSH_ERRS);
   4340			errors_wait++;
   4341		}
   4342	}
   4343
   4344	if (errors_wait) {
   4345		/*
   4346		 * At some point we need the status of all disks
   4347		 * to arrive at the volume status. So error checking
   4348		 * is being pushed to a separate loop.
   4349		 */
   4350		return check_barrier_error(info);
   4351	}
   4352	return 0;
   4353}
   4354
   4355int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
   4356{
   4357	int raid_type;
   4358	int min_tolerated = INT_MAX;
   4359
   4360	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
   4361	    (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
   4362		min_tolerated = min_t(int, min_tolerated,
   4363				    btrfs_raid_array[BTRFS_RAID_SINGLE].
   4364				    tolerated_failures);
   4365
   4366	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
   4367		if (raid_type == BTRFS_RAID_SINGLE)
   4368			continue;
   4369		if (!(flags & btrfs_raid_array[raid_type].bg_flag))
   4370			continue;
   4371		min_tolerated = min_t(int, min_tolerated,
   4372				    btrfs_raid_array[raid_type].
   4373				    tolerated_failures);
   4374	}
   4375
   4376	if (min_tolerated == INT_MAX) {
   4377		pr_warn("BTRFS: unknown raid flag: %llu", flags);
   4378		min_tolerated = 0;
   4379	}
   4380
   4381	return min_tolerated;
   4382}
   4383
/*
 * Write the commit superblock (fs_info->super_for_commit) to the devices on
 * the fs_devices list and wait for the writes to complete.
 *
 * @max_mirrors is passed through to write_dev_supers() and wait_dev_supers();
 * a value of 0 additionally runs backup_super_roots() first.
 *
 * Unless the NOBARRIER mount option is set, barrier_all_devices() is run
 * before any superblock is written.
 *
 * Return 0 on success, the barrier_all_devices() error, -EUCLEAN if
 * btrfs_validate_write_super() rejects the superblock, or -EIO when more than
 * btrfs_super_num_devices(super_copy) - 1 devices failed in the write phase
 * or in the wait phase.  Every error return also calls
 * btrfs_handle_fs_error().
 */
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
	struct list_head *head;
	struct btrfs_device *dev;
	struct btrfs_super_block *sb;
	struct btrfs_dev_item *dev_item;
	int ret;
	int do_barriers;
	int max_errors;
	int total_errors = 0;
	u64 flags;

	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);

	/*
	 * max_mirrors == 0 indicates we're from commit_transaction,
	 * not from fsync where the tree roots in fs_info have not
	 * been consistent on disk.
	 */
	if (max_mirrors == 0)
		backup_super_roots(fs_info);

	sb = fs_info->super_for_commit;
	/* The dev_item embedded in sb is rewritten for each device below. */
	dev_item = &sb->dev_item;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	head = &fs_info->fs_devices->devices;
	/* Failures are tolerated on all but one device. */
	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;

	if (do_barriers) {
		ret = barrier_all_devices(fs_info);
		if (ret) {
			mutex_unlock(
				&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, ret,
					      "errors while submitting device barriers.");
			return ret;
		}
	}

	/* Write phase: submit the superblock copies of every eligible device. */
	list_for_each_entry(dev, head, dev_list) {
		/* A device without a bdev counts as a failed write. */
		if (!dev->bdev) {
			total_errors++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		btrfs_set_stack_device_generation(dev_item, 0);
		btrfs_set_stack_device_type(dev_item, dev->type);
		btrfs_set_stack_device_id(dev_item, dev->devid);
		btrfs_set_stack_device_total_bytes(dev_item,
						   dev->commit_total_bytes);
		btrfs_set_stack_device_bytes_used(dev_item,
						  dev->commit_bytes_used);
		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
		       BTRFS_FSID_SIZE);

		flags = btrfs_super_flags(sb);
		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);

		/* Refuse to write a superblock that fails validation. */
		ret = btrfs_validate_write_super(fs_info, sb);
		if (ret < 0) {
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, -EUCLEAN,
				"unexpected superblock corruption detected");
			return -EUCLEAN;
		}

		ret = write_dev_supers(dev, sb, max_mirrors);
		if (ret)
			total_errors++;
	}
	if (total_errors > max_errors) {
		btrfs_err(fs_info, "%d errors while writing supers",
			  total_errors);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		/* FUA is masked off if unsupported and can't be the reason */
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}

	/* Wait phase: the error count starts over from zero. */
	total_errors = 0;
	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		ret = wait_dev_supers(dev, max_mirrors);
		if (ret)
			total_errors++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	if (total_errors > max_errors) {
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}
	return 0;
}
   4495
   4496/* Drop a fs root from the radix tree and free it. */
   4497void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
   4498				  struct btrfs_root *root)
   4499{
   4500	bool drop_ref = false;
   4501
   4502	spin_lock(&fs_info->fs_roots_lock);
   4503	xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
   4504	if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
   4505		drop_ref = true;
   4506	spin_unlock(&fs_info->fs_roots_lock);
   4507
   4508	if (BTRFS_FS_ERROR(fs_info)) {
   4509		ASSERT(root->log_root == NULL);
   4510		if (root->reloc_root) {
   4511			btrfs_put_root(root->reloc_root);
   4512			root->reloc_root = NULL;
   4513		}
   4514	}
   4515
   4516	if (drop_ref)
   4517		btrfs_put_root(root);
   4518}
   4519
/*
 * Run btrfs_orphan_cleanup() on the roots stored in fs_info->fs_roots.
 *
 * The roots are processed in batches of up to ARRAY_SIZE(roots): a batch is
 * collected (and each root grabbed) under fs_roots_lock, then cleaned up with
 * the lock dropped.
 *
 * Return 0 once no further entry is found, or the first error returned by
 * btrfs_orphan_cleanup().
 */
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *roots[8];
	unsigned long index = 0;
	int i;
	int err = 0;
	int grabbed;

	while (1) {
		struct btrfs_root *root;

		spin_lock(&fs_info->fs_roots_lock);
		/* Nothing present at or after index: all roots were visited. */
		if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
			spin_unlock(&fs_info->fs_roots_lock);
			return err;
		}

		grabbed = 0;
		xa_for_each_start(&fs_info->fs_roots, index, root, index) {
			/* Avoid grabbing roots in dead_roots */
			if (btrfs_root_refs(&root->root_item) > 0)
				roots[grabbed++] = btrfs_grab_root(root);
			if (grabbed >= ARRAY_SIZE(roots))
				break;
		}
		spin_unlock(&fs_info->fs_roots_lock);

		for (i = 0; i < grabbed; i++) {
			/* Slots may hold NULL; those are skipped. */
			if (!roots[i])
				continue;
			/* Remember how far we got so the next batch resumes after it. */
			index = roots[i]->root_key.objectid;
			err = btrfs_orphan_cleanup(roots[i]);
			if (err)
				goto out;
			btrfs_put_root(roots[i]);
		}
		index++;
	}

out:
	/*
	 * Release the roots that remain uncleaned due to error; i still points
	 * at the root whose cleanup failed, so it is released here too.
	 */
	for (; i < grabbed; i++) {
		if (roots[i])
			btrfs_put_root(roots[i]);
	}
	return err;
}
   4567
   4568int btrfs_commit_super(struct btrfs_fs_info *fs_info)
   4569{
   4570	struct btrfs_root *root = fs_info->tree_root;
   4571	struct btrfs_trans_handle *trans;
   4572
   4573	mutex_lock(&fs_info->cleaner_mutex);
   4574	btrfs_run_delayed_iputs(fs_info);
   4575	mutex_unlock(&fs_info->cleaner_mutex);
   4576	wake_up_process(fs_info->cleaner_kthread);
   4577
   4578	/* wait until ongoing cleanup work done */
   4579	down_write(&fs_info->cleanup_work_sem);
   4580	up_write(&fs_info->cleanup_work_sem);
   4581
   4582	trans = btrfs_join_transaction(root);
   4583	if (IS_ERR(trans))
   4584		return PTR_ERR(trans);
   4585	return btrfs_commit_transaction(trans);
   4586}
   4587
   4588static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
   4589{
   4590	struct btrfs_transaction *trans;
   4591	struct btrfs_transaction *tmp;
   4592	bool found = false;
   4593
   4594	if (list_empty(&fs_info->trans_list))
   4595		return;
   4596
   4597	/*
   4598	 * This function is only called at the very end of close_ctree(),
   4599	 * thus no other running transaction, no need to take trans_lock.
   4600	 */
   4601	ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
   4602	list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
   4603		struct extent_state *cached = NULL;
   4604		u64 dirty_bytes = 0;
   4605		u64 cur = 0;
   4606		u64 found_start;
   4607		u64 found_end;
   4608
   4609		found = true;
   4610		while (!find_first_extent_bit(&trans->dirty_pages, cur,
   4611			&found_start, &found_end, EXTENT_DIRTY, &cached)) {
   4612			dirty_bytes += found_end + 1 - found_start;
   4613			cur = found_end + 1;
   4614		}
   4615		btrfs_warn(fs_info,
   4616	"transaction %llu (with %llu dirty metadata bytes) is not committed",
   4617			   trans->transid, dirty_bytes);
   4618		btrfs_cleanup_one_transaction(trans, fs_info);
   4619
   4620		if (trans == fs_info->running_transaction)
   4621			fs_info->running_transaction = NULL;
   4622		list_del_init(&trans->list);
   4623
   4624		btrfs_put_transaction(trans);
   4625		trace_btrfs_transaction_commit(fs_info);
   4626	}
   4627	ASSERT(!found);
   4628}
   4629
   4630void __cold close_ctree(struct btrfs_fs_info *fs_info)
   4631{
   4632	int ret;
   4633
   4634	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
   4635
   4636	/*
   4637	 * We may have the reclaim task running and relocating a data block group,
   4638	 * in which case it may create delayed iputs. So stop it before we park
   4639	 * the cleaner kthread otherwise we can get new delayed iputs after
   4640	 * parking the cleaner, and that can make the async reclaim task to hang
   4641	 * if it's waiting for delayed iputs to complete, since the cleaner is
   4642	 * parked and can not run delayed iputs - this will make us hang when
   4643	 * trying to stop the async reclaim task.
   4644	 */
   4645	cancel_work_sync(&fs_info->reclaim_bgs_work);
   4646	/*
   4647	 * We don't want the cleaner to start new transactions, add more delayed
   4648	 * iputs, etc. while we're closing. We can't use kthread_stop() yet
   4649	 * because that frees the task_struct, and the transaction kthread might
   4650	 * still try to wake up the cleaner.
   4651	 */
   4652	kthread_park(fs_info->cleaner_kthread);
   4653
   4654	/*
   4655	 * If we had UNFINISHED_DROPS we could still be processing them, so
   4656	 * clear that bit and wake up relocation so it can stop.
   4657	 */
   4658	btrfs_wake_unfinished_drop(fs_info);
   4659
   4660	/* wait for the qgroup rescan worker to stop */
   4661	btrfs_qgroup_wait_for_completion(fs_info, false);
   4662
   4663	/* wait for the uuid_scan task to finish */
   4664	down(&fs_info->uuid_tree_rescan_sem);
   4665	/* avoid complains from lockdep et al., set sem back to initial state */
   4666	up(&fs_info->uuid_tree_rescan_sem);
   4667
   4668	/* pause restriper - we want to resume on mount */
   4669	btrfs_pause_balance(fs_info);
   4670
   4671	btrfs_dev_replace_suspend_for_unmount(fs_info);
   4672
   4673	btrfs_scrub_cancel(fs_info);
   4674
   4675	/* wait for any defraggers to finish */
   4676	wait_event(fs_info->transaction_wait,
   4677		   (atomic_read(&fs_info->defrag_running) == 0));
   4678
   4679	/* clear out the rbtree of defraggable inodes */
   4680	btrfs_cleanup_defrag_inodes(fs_info);
   4681
   4682	cancel_work_sync(&fs_info->async_reclaim_work);
   4683	cancel_work_sync(&fs_info->async_data_reclaim_work);
   4684	cancel_work_sync(&fs_info->preempt_reclaim_work);
   4685
   4686	/* Cancel or finish ongoing discard work */
   4687	btrfs_discard_cleanup(fs_info);
   4688
   4689	if (!sb_rdonly(fs_info->sb)) {
   4690		/*
   4691		 * The cleaner kthread is stopped, so do one final pass over
   4692		 * unused block groups.
   4693		 */
   4694		btrfs_delete_unused_bgs(fs_info);
   4695
   4696		/*
   4697		 * There might be existing delayed inode workers still running
   4698		 * and holding an empty delayed inode item. We must wait for
   4699		 * them to complete first because they can create a transaction.
   4700		 * This happens when someone calls btrfs_balance_delayed_items()
   4701		 * and then a transaction commit runs the same delayed nodes
   4702		 * before any delayed worker has done something with the nodes.
   4703		 * We must wait for any worker here and not at transaction
   4704		 * commit time since that could cause a deadlock.
   4705		 * This is a very rare case.
   4706		 */
   4707		btrfs_flush_workqueue(fs_info->delayed_workers);
   4708
   4709		ret = btrfs_commit_super(fs_info);
   4710		if (ret)
   4711			btrfs_err(fs_info, "commit super ret %d", ret);
   4712	}
   4713
   4714	if (BTRFS_FS_ERROR(fs_info))
   4715		btrfs_error_commit_super(fs_info);
   4716
   4717	kthread_stop(fs_info->transaction_kthread);
   4718	kthread_stop(fs_info->cleaner_kthread);
   4719
   4720	ASSERT(list_empty(&fs_info->delayed_iputs));
   4721	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
   4722
   4723	if (btrfs_check_quota_leak(fs_info)) {
   4724		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
   4725		btrfs_err(fs_info, "qgroup reserved space leaked");
   4726	}
   4727
   4728	btrfs_free_qgroup_config(fs_info);
   4729	ASSERT(list_empty(&fs_info->delalloc_roots));
   4730
   4731	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
   4732		btrfs_info(fs_info, "at unmount delalloc count %lld",
   4733		       percpu_counter_sum(&fs_info->delalloc_bytes));
   4734	}
   4735
   4736	if (percpu_counter_sum(&fs_info->ordered_bytes))
   4737		btrfs_info(fs_info, "at unmount dio bytes count %lld",
   4738			   percpu_counter_sum(&fs_info->ordered_bytes));
   4739
   4740	btrfs_sysfs_remove_mounted(fs_info);
   4741	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
   4742
   4743	btrfs_put_block_group_cache(fs_info);
   4744
   4745	/*
   4746	 * we must make sure there is not any read request to
   4747	 * submit after we stopping all workers.
   4748	 */
   4749	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
   4750	btrfs_stop_all_workers(fs_info);
   4751
   4752	/* We shouldn't have any transaction open at this point */
   4753	warn_about_uncommitted_trans(fs_info);
   4754
   4755	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
   4756	free_root_pointers(fs_info, true);
   4757	btrfs_free_fs_roots(fs_info);
   4758
   4759	/*
   4760	 * We must free the block groups after dropping the fs_roots as we could
   4761	 * have had an IO error and have left over tree log blocks that aren't
   4762	 * cleaned up until the fs roots are freed.  This makes the block group
   4763	 * accounting appear to be wrong because there's pending reserved bytes,
   4764	 * so make sure we do the block group cleanup afterwards.
   4765	 */
   4766	btrfs_free_block_groups(fs_info);
   4767
   4768	iput(fs_info->btree_inode);
   4769
   4770#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
   4771	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
   4772		btrfsic_unmount(fs_info->fs_devices);
   4773#endif
   4774
   4775	btrfs_mapping_tree_free(&fs_info->mapping_tree);
   4776	btrfs_close_devices(fs_info->fs_devices);
   4777}
   4778
   4779int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
   4780			  int atomic)
   4781{
   4782	int ret;
   4783	struct inode *btree_inode = buf->pages[0]->mapping->host;
   4784
   4785	ret = extent_buffer_uptodate(buf);
   4786	if (!ret)
   4787		return ret;
   4788
   4789	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
   4790				    parent_transid, atomic);
   4791	if (ret == -EAGAIN)
   4792		return ret;
   4793	return !ret;
   4794}
   4795
   4796void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
   4797{
   4798	struct btrfs_fs_info *fs_info = buf->fs_info;
   4799	u64 transid = btrfs_header_generation(buf);
   4800	int was_dirty;
   4801
   4802#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
   4803	/*
   4804	 * This is a fast path so only do this check if we have sanity tests
   4805	 * enabled.  Normal people shouldn't be using unmapped buffers as dirty
   4806	 * outside of the sanity tests.
   4807	 */
   4808	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
   4809		return;
   4810#endif
   4811	btrfs_assert_tree_write_locked(buf);
   4812	if (transid != fs_info->generation)
   4813		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
   4814			buf->start, transid, fs_info->generation);
   4815	was_dirty = set_extent_buffer_dirty(buf);
   4816	if (!was_dirty)
   4817		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
   4818					 buf->len,
   4819					 fs_info->dirty_metadata_batch);
   4820#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
   4821	/*
   4822	 * Since btrfs_mark_buffer_dirty() can be called with item pointer set
   4823	 * but item data not updated.
   4824	 * So here we should only check item pointers, not item data.
   4825	 */
   4826	if (btrfs_header_level(buf) == 0 &&
   4827	    btrfs_check_leaf_relaxed(buf)) {
   4828		btrfs_print_leaf(buf);
   4829		ASSERT(0);
   4830	}
   4831#endif
   4832}
   4833
   4834static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
   4835					int flush_delayed)
   4836{
   4837	/*
   4838	 * looks as though older kernels can get into trouble with
   4839	 * this code, they end up stuck in balance_dirty_pages forever
   4840	 */
   4841	int ret;
   4842
   4843	if (current->flags & PF_MEMALLOC)
   4844		return;
   4845
   4846	if (flush_delayed)
   4847		btrfs_balance_delayed_items(fs_info);
   4848
   4849	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
   4850				     BTRFS_DIRTY_METADATA_THRESH,
   4851				     fs_info->dirty_metadata_batch);
   4852	if (ret > 0) {
   4853		balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
   4854	}
   4855}
   4856
   4857void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
   4858{
   4859	__btrfs_btree_balance_dirty(fs_info, 1);
   4860}
   4861
   4862void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
   4863{
   4864	__btrfs_btree_balance_dirty(fs_info, 0);
   4865}
   4866
   4867static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
   4868{
   4869	/* cleanup FS via transaction */
   4870	btrfs_cleanup_transaction(fs_info);
   4871
   4872	mutex_lock(&fs_info->cleaner_mutex);
   4873	btrfs_run_delayed_iputs(fs_info);
   4874	mutex_unlock(&fs_info->cleaner_mutex);
   4875
   4876	down_write(&fs_info->cleanup_work_sem);
   4877	up_write(&fs_info->cleanup_work_sem);
   4878}
   4879
   4880static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
   4881{
   4882	unsigned long index = 0;
   4883	int grabbed = 0;
   4884	struct btrfs_root *roots[8];
   4885
   4886	spin_lock(&fs_info->fs_roots_lock);
   4887	while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
   4888				     ULONG_MAX, 8, XA_PRESENT))) {
   4889		for (int i = 0; i < grabbed; i++)
   4890			roots[i] = btrfs_grab_root(roots[i]);
   4891		spin_unlock(&fs_info->fs_roots_lock);
   4892
   4893		for (int i = 0; i < grabbed; i++) {
   4894			if (!roots[i])
   4895				continue;
   4896			index = roots[i]->root_key.objectid;
   4897			btrfs_free_log(NULL, roots[i]);
   4898			btrfs_put_root(roots[i]);
   4899		}
   4900		index++;
   4901		spin_lock(&fs_info->fs_roots_lock);
   4902	}
   4903	spin_unlock(&fs_info->fs_roots_lock);
   4904	btrfs_free_log_root_tree(NULL, fs_info);
   4905}
   4906
   4907static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
   4908{
   4909	struct btrfs_ordered_extent *ordered;
   4910
   4911	spin_lock(&root->ordered_extent_lock);
   4912	/*
   4913	 * This will just short circuit the ordered completion stuff which will
   4914	 * make sure the ordered extent gets properly cleaned up.
   4915	 */
   4916	list_for_each_entry(ordered, &root->ordered_extents,
   4917			    root_extent_list)
   4918		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
   4919	spin_unlock(&root->ordered_extent_lock);
   4920}
   4921
   4922static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
   4923{
   4924	struct btrfs_root *root;
   4925	struct list_head splice;
   4926
   4927	INIT_LIST_HEAD(&splice);
   4928
   4929	spin_lock(&fs_info->ordered_root_lock);
   4930	list_splice_init(&fs_info->ordered_roots, &splice);
   4931	while (!list_empty(&splice)) {
   4932		root = list_first_entry(&splice, struct btrfs_root,
   4933					ordered_root);
   4934		list_move_tail(&root->ordered_root,
   4935			       &fs_info->ordered_roots);
   4936
   4937		spin_unlock(&fs_info->ordered_root_lock);
   4938		btrfs_destroy_ordered_extents(root);
   4939
   4940		cond_resched();
   4941		spin_lock(&fs_info->ordered_root_lock);
   4942	}
   4943	spin_unlock(&fs_info->ordered_root_lock);
   4944
   4945	/*
   4946	 * We need this here because if we've been flipped read-only we won't
   4947	 * get sync() from the umount, so we need to make sure any ordered
   4948	 * extents that haven't had their dirty pages IO start writeout yet
   4949	 * actually get run and error out properly.
   4950	 */
   4951	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
   4952}
   4953
   4954static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
   4955				      struct btrfs_fs_info *fs_info)
   4956{
   4957	struct rb_node *node;
   4958	struct btrfs_delayed_ref_root *delayed_refs;
   4959	struct btrfs_delayed_ref_node *ref;
   4960	int ret = 0;
   4961
   4962	delayed_refs = &trans->delayed_refs;
   4963
   4964	spin_lock(&delayed_refs->lock);
   4965	if (atomic_read(&delayed_refs->num_entries) == 0) {
   4966		spin_unlock(&delayed_refs->lock);
   4967		btrfs_debug(fs_info, "delayed_refs has NO entry");
   4968		return ret;
   4969	}
   4970
   4971	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
   4972		struct btrfs_delayed_ref_head *head;
   4973		struct rb_node *n;
   4974		bool pin_bytes = false;
   4975
   4976		head = rb_entry(node, struct btrfs_delayed_ref_head,
   4977				href_node);
   4978		if (btrfs_delayed_ref_lock(delayed_refs, head))
   4979			continue;
   4980
   4981		spin_lock(&head->lock);
   4982		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
   4983			ref = rb_entry(n, struct btrfs_delayed_ref_node,
   4984				       ref_node);
   4985			ref->in_tree = 0;
   4986			rb_erase_cached(&ref->ref_node, &head->ref_tree);
   4987			RB_CLEAR_NODE(&ref->ref_node);
   4988			if (!list_empty(&ref->add_list))
   4989				list_del(&ref->add_list);
   4990			atomic_dec(&delayed_refs->num_entries);
   4991			btrfs_put_delayed_ref(ref);
   4992		}
   4993		if (head->must_insert_reserved)
   4994			pin_bytes = true;
   4995		btrfs_free_delayed_extent_op(head->extent_op);
   4996		btrfs_delete_ref_head(delayed_refs, head);
   4997		spin_unlock(&head->lock);
   4998		spin_unlock(&delayed_refs->lock);
   4999		mutex_unlock(&head->mutex);
   5000
   5001		if (pin_bytes) {
   5002			struct btrfs_block_group *cache;
   5003
   5004			cache = btrfs_lookup_block_group(fs_info, head->bytenr);
   5005			BUG_ON(!cache);
   5006
   5007			spin_lock(&cache->space_info->lock);
   5008			spin_lock(&cache->lock);
   5009			cache->pinned += head->num_bytes;
   5010			btrfs_space_info_update_bytes_pinned(fs_info,
   5011				cache->space_info, head->num_bytes);
   5012			cache->reserved -= head->num_bytes;
   5013			cache->space_info->bytes_reserved -= head->num_bytes;
   5014			spin_unlock(&cache->lock);
   5015			spin_unlock(&cache->space_info->lock);
   5016
   5017			btrfs_put_block_group(cache);
   5018
   5019			btrfs_error_unpin_extent_range(fs_info, head->bytenr,
   5020				head->bytenr + head->num_bytes - 1);
   5021		}
   5022		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
   5023		btrfs_put_delayed_ref_head(head);
   5024		cond_resched();
   5025		spin_lock(&delayed_refs->lock);
   5026	}
   5027	btrfs_qgroup_destroy_extent_records(trans);
   5028
   5029	spin_unlock(&delayed_refs->lock);
   5030
   5031	return ret;
   5032}
   5033
   5034static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
   5035{
   5036	struct btrfs_inode *btrfs_inode;
   5037	struct list_head splice;
   5038
   5039	INIT_LIST_HEAD(&splice);
   5040
   5041	spin_lock(&root->delalloc_lock);
   5042	list_splice_init(&root->delalloc_inodes, &splice);
   5043
   5044	while (!list_empty(&splice)) {
   5045		struct inode *inode = NULL;
   5046		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
   5047					       delalloc_inodes);
   5048		__btrfs_del_delalloc_inode(root, btrfs_inode);
   5049		spin_unlock(&root->delalloc_lock);
   5050
   5051		/*
   5052		 * Make sure we get a live inode and that it'll not disappear
   5053		 * meanwhile.
   5054		 */
   5055		inode = igrab(&btrfs_inode->vfs_inode);
   5056		if (inode) {
   5057			invalidate_inode_pages2(inode->i_mapping);
   5058			iput(inode);
   5059		}
   5060		spin_lock(&root->delalloc_lock);
   5061	}
   5062	spin_unlock(&root->delalloc_lock);
   5063}
   5064
   5065static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
   5066{
   5067	struct btrfs_root *root;
   5068	struct list_head splice;
   5069
   5070	INIT_LIST_HEAD(&splice);
   5071
   5072	spin_lock(&fs_info->delalloc_root_lock);
   5073	list_splice_init(&fs_info->delalloc_roots, &splice);
   5074	while (!list_empty(&splice)) {
   5075		root = list_first_entry(&splice, struct btrfs_root,
   5076					 delalloc_root);
   5077		root = btrfs_grab_root(root);
   5078		BUG_ON(!root);
   5079		spin_unlock(&fs_info->delalloc_root_lock);
   5080
   5081		btrfs_destroy_delalloc_inodes(root);
   5082		btrfs_put_root(root);
   5083
   5084		spin_lock(&fs_info->delalloc_root_lock);
   5085	}
   5086	spin_unlock(&fs_info->delalloc_root_lock);
   5087}
   5088
   5089static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
   5090					struct extent_io_tree *dirty_pages,
   5091					int mark)
   5092{
   5093	int ret;
   5094	struct extent_buffer *eb;
   5095	u64 start = 0;
   5096	u64 end;
   5097
   5098	while (1) {
   5099		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
   5100					    mark, NULL);
   5101		if (ret)
   5102			break;
   5103
   5104		clear_extent_bits(dirty_pages, start, end, mark);
   5105		while (start <= end) {
   5106			eb = find_extent_buffer(fs_info, start);
   5107			start += fs_info->nodesize;
   5108			if (!eb)
   5109				continue;
   5110			wait_on_extent_buffer_writeback(eb);
   5111
   5112			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
   5113					       &eb->bflags))
   5114				clear_extent_buffer_dirty(eb);
   5115			free_extent_buffer_stale(eb);
   5116		}
   5117	}
   5118
   5119	return ret;
   5120}
   5121
   5122static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
   5123				       struct extent_io_tree *unpin)
   5124{
   5125	u64 start;
   5126	u64 end;
   5127	int ret;
   5128
   5129	while (1) {
   5130		struct extent_state *cached_state = NULL;
   5131
   5132		/*
   5133		 * The btrfs_finish_extent_commit() may get the same range as
   5134		 * ours between find_first_extent_bit and clear_extent_dirty.
   5135		 * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
   5136		 * the same extent range.
   5137		 */
   5138		mutex_lock(&fs_info->unused_bg_unpin_mutex);
   5139		ret = find_first_extent_bit(unpin, 0, &start, &end,
   5140					    EXTENT_DIRTY, &cached_state);
   5141		if (ret) {
   5142			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
   5143			break;
   5144		}
   5145
   5146		clear_extent_dirty(unpin, start, end, &cached_state);
   5147		free_extent_state(cached_state);
   5148		btrfs_error_unpin_extent_range(fs_info, start, end);
   5149		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
   5150		cond_resched();
   5151	}
   5152
   5153	return 0;
   5154}
   5155
   5156static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
   5157{
   5158	struct inode *inode;
   5159
   5160	inode = cache->io_ctl.inode;
   5161	if (inode) {
   5162		invalidate_inode_pages2(inode->i_mapping);
   5163		BTRFS_I(inode)->generation = 0;
   5164		cache->io_ctl.inode = NULL;
   5165		iput(inode);
   5166	}
   5167	ASSERT(cache->io_ctl.pages == NULL);
   5168	btrfs_put_block_group(cache);
   5169}
   5170
   5171void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
   5172			     struct btrfs_fs_info *fs_info)
   5173{
   5174	struct btrfs_block_group *cache;
   5175
   5176	spin_lock(&cur_trans->dirty_bgs_lock);
   5177	while (!list_empty(&cur_trans->dirty_bgs)) {
   5178		cache = list_first_entry(&cur_trans->dirty_bgs,
   5179					 struct btrfs_block_group,
   5180					 dirty_list);
   5181
   5182		if (!list_empty(&cache->io_list)) {
   5183			spin_unlock(&cur_trans->dirty_bgs_lock);
   5184			list_del_init(&cache->io_list);
   5185			btrfs_cleanup_bg_io(cache);
   5186			spin_lock(&cur_trans->dirty_bgs_lock);
   5187		}
   5188
   5189		list_del_init(&cache->dirty_list);
   5190		spin_lock(&cache->lock);
   5191		cache->disk_cache_state = BTRFS_DC_ERROR;
   5192		spin_unlock(&cache->lock);
   5193
   5194		spin_unlock(&cur_trans->dirty_bgs_lock);
   5195		btrfs_put_block_group(cache);
   5196		btrfs_delayed_refs_rsv_release(fs_info, 1);
   5197		spin_lock(&cur_trans->dirty_bgs_lock);
   5198	}
   5199	spin_unlock(&cur_trans->dirty_bgs_lock);
   5200
   5201	/*
   5202	 * Refer to the definition of io_bgs member for details why it's safe
   5203	 * to use it without any locking
   5204	 */
   5205	while (!list_empty(&cur_trans->io_bgs)) {
   5206		cache = list_first_entry(&cur_trans->io_bgs,
   5207					 struct btrfs_block_group,
   5208					 io_list);
   5209
   5210		list_del_init(&cache->io_list);
   5211		spin_lock(&cache->lock);
   5212		cache->disk_cache_state = BTRFS_DC_ERROR;
   5213		spin_unlock(&cache->lock);
   5214		btrfs_cleanup_bg_io(cache);
   5215	}
   5216}
   5217
   5218void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
   5219				   struct btrfs_fs_info *fs_info)
   5220{
   5221	struct btrfs_device *dev, *tmp;
   5222
   5223	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
   5224	ASSERT(list_empty(&cur_trans->dirty_bgs));
   5225	ASSERT(list_empty(&cur_trans->io_bgs));
   5226
   5227	list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
   5228				 post_commit_list) {
   5229		list_del_init(&dev->post_commit_list);
   5230	}
   5231
   5232	btrfs_destroy_delayed_refs(cur_trans, fs_info);
   5233
   5234	cur_trans->state = TRANS_STATE_COMMIT_START;
   5235	wake_up(&fs_info->transaction_blocked_wait);
   5236
   5237	cur_trans->state = TRANS_STATE_UNBLOCKED;
   5238	wake_up(&fs_info->transaction_wait);
   5239
   5240	btrfs_destroy_delayed_inodes(fs_info);
   5241
   5242	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
   5243				     EXTENT_DIRTY);
   5244	btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
   5245
   5246	btrfs_free_redirty_list(cur_trans);
   5247
   5248	cur_trans->state =TRANS_STATE_COMPLETED;
   5249	wake_up(&cur_trans->commit_wait);
   5250}
   5251
   5252static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
   5253{
   5254	struct btrfs_transaction *t;
   5255
   5256	mutex_lock(&fs_info->transaction_kthread_mutex);
   5257
   5258	spin_lock(&fs_info->trans_lock);
   5259	while (!list_empty(&fs_info->trans_list)) {
   5260		t = list_first_entry(&fs_info->trans_list,
   5261				     struct btrfs_transaction, list);
   5262		if (t->state >= TRANS_STATE_COMMIT_START) {
   5263			refcount_inc(&t->use_count);
   5264			spin_unlock(&fs_info->trans_lock);
   5265			btrfs_wait_for_commit(fs_info, t->transid);
   5266			btrfs_put_transaction(t);
   5267			spin_lock(&fs_info->trans_lock);
   5268			continue;
   5269		}
   5270		if (t == fs_info->running_transaction) {
   5271			t->state = TRANS_STATE_COMMIT_DOING;
   5272			spin_unlock(&fs_info->trans_lock);
   5273			/*
   5274			 * We wait for 0 num_writers since we don't hold a trans
   5275			 * handle open currently for this transaction.
   5276			 */
   5277			wait_event(t->writer_wait,
   5278				   atomic_read(&t->num_writers) == 0);
   5279		} else {
   5280			spin_unlock(&fs_info->trans_lock);
   5281		}
   5282		btrfs_cleanup_one_transaction(t, fs_info);
   5283
   5284		spin_lock(&fs_info->trans_lock);
   5285		if (t == fs_info->running_transaction)
   5286			fs_info->running_transaction = NULL;
   5287		list_del_init(&t->list);
   5288		spin_unlock(&fs_info->trans_lock);
   5289
   5290		btrfs_put_transaction(t);
   5291		trace_btrfs_transaction_commit(fs_info);
   5292		spin_lock(&fs_info->trans_lock);
   5293	}
   5294	spin_unlock(&fs_info->trans_lock);
   5295	btrfs_destroy_all_ordered_extents(fs_info);
   5296	btrfs_destroy_delayed_inodes(fs_info);
   5297	btrfs_assert_delayed_root_empty(fs_info);
   5298	btrfs_destroy_all_delalloc_inodes(fs_info);
   5299	btrfs_drop_all_logs(fs_info);
   5300	mutex_unlock(&fs_info->transaction_kthread_mutex);
   5301
   5302	return 0;
   5303}
   5304
   5305int btrfs_init_root_free_objectid(struct btrfs_root *root)
   5306{
   5307	struct btrfs_path *path;
   5308	int ret;
   5309	struct extent_buffer *l;
   5310	struct btrfs_key search_key;
   5311	struct btrfs_key found_key;
   5312	int slot;
   5313
   5314	path = btrfs_alloc_path();
   5315	if (!path)
   5316		return -ENOMEM;
   5317
   5318	search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
   5319	search_key.type = -1;
   5320	search_key.offset = (u64)-1;
   5321	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
   5322	if (ret < 0)
   5323		goto error;
   5324	BUG_ON(ret == 0); /* Corruption */
   5325	if (path->slots[0] > 0) {
   5326		slot = path->slots[0] - 1;
   5327		l = path->nodes[0];
   5328		btrfs_item_key_to_cpu(l, &found_key, slot);
   5329		root->free_objectid = max_t(u64, found_key.objectid + 1,
   5330					    BTRFS_FIRST_FREE_OBJECTID);
   5331	} else {
   5332		root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
   5333	}
   5334	ret = 0;
   5335error:
   5336	btrfs_free_path(path);
   5337	return ret;
   5338}
   5339
   5340int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
   5341{
   5342	int ret;
   5343	mutex_lock(&root->objectid_mutex);
   5344
   5345	if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
   5346		btrfs_warn(root->fs_info,
   5347			   "the objectid of root %llu reaches its highest value",
   5348			   root->root_key.objectid);
   5349		ret = -ENOSPC;
   5350		goto out;
   5351	}
   5352
   5353	*objectid = root->free_objectid++;
   5354	ret = 0;
   5355out:
   5356	mutex_unlock(&root->objectid_mutex);
   5357	return ret;
   5358}