block-group.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
block-group.c (125861B)
      1// SPDX-License-Identifier: GPL-2.0
      2
      3#include <linux/list_sort.h>
      4#include "misc.h"
      5#include "ctree.h"
      6#include "block-group.h"
      7#include "space-info.h"
      8#include "disk-io.h"
      9#include "free-space-cache.h"
     10#include "free-space-tree.h"
     11#include "volumes.h"
     12#include "transaction.h"
     13#include "ref-verify.h"
     14#include "sysfs.h"
     15#include "tree-log.h"
     16#include "delalloc-space.h"
     17#include "discard.h"
     18#include "raid56.h"
     19#include "zoned.h"
     20
     21/*
     22 * Return target flags in extended format or 0 if restripe for this chunk_type
     23 * is not in progress
     24 *
     25 * Should be called with balance_lock held
     26 */
     27static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
     28{
     29	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
     30	u64 target = 0;
     31
     32	if (!bctl)
     33		return 0;
     34
     35	if (flags & BTRFS_BLOCK_GROUP_DATA &&
     36	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
     37		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
     38	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
     39		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
     40		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
     41	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
     42		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
     43		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
     44	}
     45
     46	return target;
     47}
     48
     49/*
     50 * @flags: available profiles in extended format (see ctree.h)
     51 *
     52 * Return reduced profile in chunk format.  If profile changing is in progress
     53 * (either running or paused) picks the target profile (if it's already
     54 * available), otherwise falls back to plain reducing.
     55 */
     56static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
     57{
     58	u64 num_devices = fs_info->fs_devices->rw_devices;
     59	u64 target;
     60	u64 raid_type;
     61	u64 allowed = 0;
     62
     63	/*
     64	 * See if restripe for this chunk_type is in progress, if so try to
     65	 * reduce to the target profile
     66	 */
     67	spin_lock(&fs_info->balance_lock);
     68	target = get_restripe_target(fs_info, flags);
     69	if (target) {
     70		spin_unlock(&fs_info->balance_lock);
     71		return extended_to_chunk(target);
     72	}
     73	spin_unlock(&fs_info->balance_lock);
     74
     75	/* First, mask out the RAID levels which aren't possible */
     76	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
     77		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
     78			allowed |= btrfs_raid_array[raid_type].bg_flag;
     79	}
     80	allowed &= flags;
     81
     82	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
     83		allowed = BTRFS_BLOCK_GROUP_RAID6;
     84	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
     85		allowed = BTRFS_BLOCK_GROUP_RAID5;
     86	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
     87		allowed = BTRFS_BLOCK_GROUP_RAID10;
     88	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
     89		allowed = BTRFS_BLOCK_GROUP_RAID1;
     90	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
     91		allowed = BTRFS_BLOCK_GROUP_RAID0;
     92
     93	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
     94
     95	return extended_to_chunk(flags | allowed);
     96}
     97
     98u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
     99{
    100	unsigned seq;
    101	u64 flags;
    102
    103	do {
    104		flags = orig_flags;
    105		seq = read_seqbegin(&fs_info->profiles_lock);
    106
    107		if (flags & BTRFS_BLOCK_GROUP_DATA)
    108			flags |= fs_info->avail_data_alloc_bits;
    109		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    110			flags |= fs_info->avail_system_alloc_bits;
    111		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
    112			flags |= fs_info->avail_metadata_alloc_bits;
    113	} while (read_seqretry(&fs_info->profiles_lock, seq));
    114
    115	return btrfs_reduce_alloc_profile(fs_info, flags);
    116}
    117
    118void btrfs_get_block_group(struct btrfs_block_group *cache)
    119{
    120	refcount_inc(&cache->refs);
    121}
    122
    123void btrfs_put_block_group(struct btrfs_block_group *cache)
    124{
    125	if (refcount_dec_and_test(&cache->refs)) {
    126		WARN_ON(cache->pinned > 0);
    127		/*
    128		 * If there was a failure to cleanup a log tree, very likely due
    129		 * to an IO failure on a writeback attempt of one or more of its
    130		 * extent buffers, we could not do proper (and cheap) unaccounting
    131		 * of their reserved space, so don't warn on reserved > 0 in that
    132		 * case.
    133		 */
    134		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
    135		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
    136			WARN_ON(cache->reserved > 0);
    137
    138		/*
    139		 * A block_group shouldn't be on the discard_list anymore.
    140		 * Remove the block_group from the discard_list to prevent us
    141		 * from causing a panic due to NULL pointer dereference.
    142		 */
    143		if (WARN_ON(!list_empty(&cache->discard_list)))
    144			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
    145						  cache);
    146
    147		/*
    148		 * If not empty, someone is still holding mutex of
    149		 * full_stripe_lock, which can only be released by caller.
    150		 * And it will definitely cause use-after-free when caller
    151		 * tries to release full stripe lock.
    152		 *
    153		 * No better way to resolve, but only to warn.
    154		 */
    155		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
    156		kfree(cache->free_space_ctl);
    157		kfree(cache->physical_map);
    158		kfree(cache);
    159	}
    160}
    161
    162/*
    163 * This adds the block group to the fs_info rb tree for the block group cache
    164 */
    165static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
    166				       struct btrfs_block_group *block_group)
    167{
    168	struct rb_node **p;
    169	struct rb_node *parent = NULL;
    170	struct btrfs_block_group *cache;
    171	bool leftmost = true;
    172
    173	ASSERT(block_group->length != 0);
    174
    175	write_lock(&info->block_group_cache_lock);
    176	p = &info->block_group_cache_tree.rb_root.rb_node;
    177
    178	while (*p) {
    179		parent = *p;
    180		cache = rb_entry(parent, struct btrfs_block_group, cache_node);
    181		if (block_group->start < cache->start) {
    182			p = &(*p)->rb_left;
    183		} else if (block_group->start > cache->start) {
    184			p = &(*p)->rb_right;
    185			leftmost = false;
    186		} else {
    187			write_unlock(&info->block_group_cache_lock);
    188			return -EEXIST;
    189		}
    190	}
    191
    192	rb_link_node(&block_group->cache_node, parent, p);
    193	rb_insert_color_cached(&block_group->cache_node,
    194			       &info->block_group_cache_tree, leftmost);
    195
    196	write_unlock(&info->block_group_cache_lock);
    197
    198	return 0;
    199}
    200
    201/*
    202 * This will return the block group at or after bytenr if contains is 0, else
    203 * it will return the block group that contains the bytenr
    204 */
    205static struct btrfs_block_group *block_group_cache_tree_search(
    206		struct btrfs_fs_info *info, u64 bytenr, int contains)
    207{
    208	struct btrfs_block_group *cache, *ret = NULL;
    209	struct rb_node *n;
    210	u64 end, start;
    211
    212	read_lock(&info->block_group_cache_lock);
    213	n = info->block_group_cache_tree.rb_root.rb_node;
    214
    215	while (n) {
    216		cache = rb_entry(n, struct btrfs_block_group, cache_node);
    217		end = cache->start + cache->length - 1;
    218		start = cache->start;
    219
    220		if (bytenr < start) {
    221			if (!contains && (!ret || start < ret->start))
    222				ret = cache;
    223			n = n->rb_left;
    224		} else if (bytenr > start) {
    225			if (contains && bytenr <= end) {
    226				ret = cache;
    227				break;
    228			}
    229			n = n->rb_right;
    230		} else {
    231			ret = cache;
    232			break;
    233		}
    234	}
    235	if (ret)
    236		btrfs_get_block_group(ret);
    237	read_unlock(&info->block_group_cache_lock);
    238
    239	return ret;
    240}
    241
    242/*
    243 * Return the block group that starts at or after bytenr
    244 */
    245struct btrfs_block_group *btrfs_lookup_first_block_group(
    246		struct btrfs_fs_info *info, u64 bytenr)
    247{
    248	return block_group_cache_tree_search(info, bytenr, 0);
    249}
    250
    251/*
    252 * Return the block group that contains the given bytenr
    253 */
    254struct btrfs_block_group *btrfs_lookup_block_group(
    255		struct btrfs_fs_info *info, u64 bytenr)
    256{
    257	return block_group_cache_tree_search(info, bytenr, 1);
    258}
    259
    260struct btrfs_block_group *btrfs_next_block_group(
    261		struct btrfs_block_group *cache)
    262{
    263	struct btrfs_fs_info *fs_info = cache->fs_info;
    264	struct rb_node *node;
    265
    266	read_lock(&fs_info->block_group_cache_lock);
    267
    268	/* If our block group was removed, we need a full search. */
    269	if (RB_EMPTY_NODE(&cache->cache_node)) {
    270		const u64 next_bytenr = cache->start + cache->length;
    271
    272		read_unlock(&fs_info->block_group_cache_lock);
    273		btrfs_put_block_group(cache);
    274		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
    275	}
    276	node = rb_next(&cache->cache_node);
    277	btrfs_put_block_group(cache);
    278	if (node) {
    279		cache = rb_entry(node, struct btrfs_block_group, cache_node);
    280		btrfs_get_block_group(cache);
    281	} else
    282		cache = NULL;
    283	read_unlock(&fs_info->block_group_cache_lock);
    284	return cache;
    285}
    286
    287/**
    288 * Check if we can do a NOCOW write for a given extent.
    289 *
    290 * @fs_info:       The filesystem information object.
    291 * @bytenr:        Logical start address of the extent.
    292 *
    293 * Check if we can do a NOCOW write for the given extent, and increments the
    294 * number of NOCOW writers in the block group that contains the extent, as long
    295 * as the block group exists and it's currently not in read-only mode.
    296 *
    297 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
    298 *          is responsible for calling btrfs_dec_nocow_writers() later.
    299 *
    300 *          Or NULL if we can not do a NOCOW write
    301 */
    302struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
    303						  u64 bytenr)
    304{
    305	struct btrfs_block_group *bg;
    306	bool can_nocow = true;
    307
    308	bg = btrfs_lookup_block_group(fs_info, bytenr);
    309	if (!bg)
    310		return NULL;
    311
    312	spin_lock(&bg->lock);
    313	if (bg->ro)
    314		can_nocow = false;
    315	else
    316		atomic_inc(&bg->nocow_writers);
    317	spin_unlock(&bg->lock);
    318
    319	if (!can_nocow) {
    320		btrfs_put_block_group(bg);
    321		return NULL;
    322	}
    323
    324	/* No put on block group, done by btrfs_dec_nocow_writers(). */
    325	return bg;
    326}
    327
    328/**
    329 * Decrement the number of NOCOW writers in a block group.
    330 *
    331 * @bg:       The block group.
    332 *
    333 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
    334 * and on the block group returned by that call. Typically this is called after
    335 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
    336 * relocation.
    337 *
    338 * After this call, the caller should not use the block group anymore. It it wants
    339 * to use it, then it should get a reference on it before calling this function.
    340 */
    341void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
    342{
    343	if (atomic_dec_and_test(&bg->nocow_writers))
    344		wake_up_var(&bg->nocow_writers);
    345
    346	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
    347	btrfs_put_block_group(bg);
    348}
    349
    350void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
    351{
    352	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
    353}
    354
    355void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
    356					const u64 start)
    357{
    358	struct btrfs_block_group *bg;
    359
    360	bg = btrfs_lookup_block_group(fs_info, start);
    361	ASSERT(bg);
    362	if (atomic_dec_and_test(&bg->reservations))
    363		wake_up_var(&bg->reservations);
    364	btrfs_put_block_group(bg);
    365}
    366
    367void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
    368{
    369	struct btrfs_space_info *space_info = bg->space_info;
    370
    371	ASSERT(bg->ro);
    372
    373	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
    374		return;
    375
    376	/*
    377	 * Our block group is read only but before we set it to read only,
    378	 * some task might have had allocated an extent from it already, but it
    379	 * has not yet created a respective ordered extent (and added it to a
    380	 * root's list of ordered extents).
    381	 * Therefore wait for any task currently allocating extents, since the
    382	 * block group's reservations counter is incremented while a read lock
    383	 * on the groups' semaphore is held and decremented after releasing
    384	 * the read access on that semaphore and creating the ordered extent.
    385	 */
    386	down_write(&space_info->groups_sem);
    387	up_write(&space_info->groups_sem);
    388
    389	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
    390}
    391
    392struct btrfs_caching_control *btrfs_get_caching_control(
    393		struct btrfs_block_group *cache)
    394{
    395	struct btrfs_caching_control *ctl;
    396
    397	spin_lock(&cache->lock);
    398	if (!cache->caching_ctl) {
    399		spin_unlock(&cache->lock);
    400		return NULL;
    401	}
    402
    403	ctl = cache->caching_ctl;
    404	refcount_inc(&ctl->count);
    405	spin_unlock(&cache->lock);
    406	return ctl;
    407}
    408
    409void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
    410{
    411	if (refcount_dec_and_test(&ctl->count))
    412		kfree(ctl);
    413}
    414
    415/*
    416 * When we wait for progress in the block group caching, its because our
    417 * allocation attempt failed at least once.  So, we must sleep and let some
    418 * progress happen before we try again.
    419 *
    420 * This function will sleep at least once waiting for new free space to show
    421 * up, and then it will check the block group free space numbers for our min
    422 * num_bytes.  Another option is to have it go ahead and look in the rbtree for
    423 * a free extent of a given size, but this is a good start.
    424 *
    425 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
    426 * any of the information in this block group.
    427 */
    428void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
    429					   u64 num_bytes)
    430{
    431	struct btrfs_caching_control *caching_ctl;
    432
    433	caching_ctl = btrfs_get_caching_control(cache);
    434	if (!caching_ctl)
    435		return;
    436
    437	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
    438		   (cache->free_space_ctl->free_space >= num_bytes));
    439
    440	btrfs_put_caching_control(caching_ctl);
    441}
    442
    443int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
    444{
    445	struct btrfs_caching_control *caching_ctl;
    446	int ret = 0;
    447
    448	caching_ctl = btrfs_get_caching_control(cache);
    449	if (!caching_ctl)
    450		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
    451
    452	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
    453	if (cache->cached == BTRFS_CACHE_ERROR)
    454		ret = -EIO;
    455	btrfs_put_caching_control(caching_ctl);
    456	return ret;
    457}
    458
    459static bool space_cache_v1_done(struct btrfs_block_group *cache)
    460{
    461	bool ret;
    462
    463	spin_lock(&cache->lock);
    464	ret = cache->cached != BTRFS_CACHE_FAST;
    465	spin_unlock(&cache->lock);
    466
    467	return ret;
    468}
    469
    470void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
    471				struct btrfs_caching_control *caching_ctl)
    472{
    473	wait_event(caching_ctl->wait, space_cache_v1_done(cache));
    474}
    475
    476#ifdef CONFIG_BTRFS_DEBUG
    477static void fragment_free_space(struct btrfs_block_group *block_group)
    478{
    479	struct btrfs_fs_info *fs_info = block_group->fs_info;
    480	u64 start = block_group->start;
    481	u64 len = block_group->length;
    482	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
    483		fs_info->nodesize : fs_info->sectorsize;
    484	u64 step = chunk << 1;
    485
    486	while (len > chunk) {
    487		btrfs_remove_free_space(block_group, start, chunk);
    488		start += step;
    489		if (len < step)
    490			len = 0;
    491		else
    492			len -= step;
    493	}
    494}
    495#endif
    496
    497/*
    498 * This is only called by btrfs_cache_block_group, since we could have freed
    499 * extents we need to check the pinned_extents for any extents that can't be
    500 * used yet since their free space will be released as soon as the transaction
    501 * commits.
    502 */
    503u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
    504{
    505	struct btrfs_fs_info *info = block_group->fs_info;
    506	u64 extent_start, extent_end, size, total_added = 0;
    507	int ret;
    508
    509	while (start < end) {
    510		ret = find_first_extent_bit(&info->excluded_extents, start,
    511					    &extent_start, &extent_end,
    512					    EXTENT_DIRTY | EXTENT_UPTODATE,
    513					    NULL);
    514		if (ret)
    515			break;
    516
    517		if (extent_start <= start) {
    518			start = extent_end + 1;
    519		} else if (extent_start > start && extent_start < end) {
    520			size = extent_start - start;
    521			total_added += size;
    522			ret = btrfs_add_free_space_async_trimmed(block_group,
    523								 start, size);
    524			BUG_ON(ret); /* -ENOMEM or logic error */
    525			start = extent_end + 1;
    526		} else {
    527			break;
    528		}
    529	}
    530
    531	if (start < end) {
    532		size = end - start;
    533		total_added += size;
    534		ret = btrfs_add_free_space_async_trimmed(block_group, start,
    535							 size);
    536		BUG_ON(ret); /* -ENOMEM or logic error */
    537	}
    538
    539	return total_added;
    540}
    541
    542static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
    543{
    544	struct btrfs_block_group *block_group = caching_ctl->block_group;
    545	struct btrfs_fs_info *fs_info = block_group->fs_info;
    546	struct btrfs_root *extent_root;
    547	struct btrfs_path *path;
    548	struct extent_buffer *leaf;
    549	struct btrfs_key key;
    550	u64 total_found = 0;
    551	u64 last = 0;
    552	u32 nritems;
    553	int ret;
    554	bool wakeup = true;
    555
    556	path = btrfs_alloc_path();
    557	if (!path)
    558		return -ENOMEM;
    559
    560	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
    561	extent_root = btrfs_extent_root(fs_info, last);
    562
    563#ifdef CONFIG_BTRFS_DEBUG
    564	/*
    565	 * If we're fragmenting we don't want to make anybody think we can
    566	 * allocate from this block group until we've had a chance to fragment
    567	 * the free space.
    568	 */
    569	if (btrfs_should_fragment_free_space(block_group))
    570		wakeup = false;
    571#endif
    572	/*
    573	 * We don't want to deadlock with somebody trying to allocate a new
    574	 * extent for the extent root while also trying to search the extent
    575	 * root to add free space.  So we skip locking and search the commit
    576	 * root, since its read-only
    577	 */
    578	path->skip_locking = 1;
    579	path->search_commit_root = 1;
    580	path->reada = READA_FORWARD;
    581
    582	key.objectid = last;
    583	key.offset = 0;
    584	key.type = BTRFS_EXTENT_ITEM_KEY;
    585
    586next:
    587	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
    588	if (ret < 0)
    589		goto out;
    590
    591	leaf = path->nodes[0];
    592	nritems = btrfs_header_nritems(leaf);
    593
    594	while (1) {
    595		if (btrfs_fs_closing(fs_info) > 1) {
    596			last = (u64)-1;
    597			break;
    598		}
    599
    600		if (path->slots[0] < nritems) {
    601			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    602		} else {
    603			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
    604			if (ret)
    605				break;
    606
    607			if (need_resched() ||
    608			    rwsem_is_contended(&fs_info->commit_root_sem)) {
    609				if (wakeup)
    610					caching_ctl->progress = last;
    611				btrfs_release_path(path);
    612				up_read(&fs_info->commit_root_sem);
    613				mutex_unlock(&caching_ctl->mutex);
    614				cond_resched();
    615				mutex_lock(&caching_ctl->mutex);
    616				down_read(&fs_info->commit_root_sem);
    617				goto next;
    618			}
    619
    620			ret = btrfs_next_leaf(extent_root, path);
    621			if (ret < 0)
    622				goto out;
    623			if (ret)
    624				break;
    625			leaf = path->nodes[0];
    626			nritems = btrfs_header_nritems(leaf);
    627			continue;
    628		}
    629
    630		if (key.objectid < last) {
    631			key.objectid = last;
    632			key.offset = 0;
    633			key.type = BTRFS_EXTENT_ITEM_KEY;
    634
    635			if (wakeup)
    636				caching_ctl->progress = last;
    637			btrfs_release_path(path);
    638			goto next;
    639		}
    640
    641		if (key.objectid < block_group->start) {
    642			path->slots[0]++;
    643			continue;
    644		}
    645
    646		if (key.objectid >= block_group->start + block_group->length)
    647			break;
    648
    649		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
    650		    key.type == BTRFS_METADATA_ITEM_KEY) {
    651			total_found += add_new_free_space(block_group, last,
    652							  key.objectid);
    653			if (key.type == BTRFS_METADATA_ITEM_KEY)
    654				last = key.objectid +
    655					fs_info->nodesize;
    656			else
    657				last = key.objectid + key.offset;
    658
    659			if (total_found > CACHING_CTL_WAKE_UP) {
    660				total_found = 0;
    661				if (wakeup)
    662					wake_up(&caching_ctl->wait);
    663			}
    664		}
    665		path->slots[0]++;
    666	}
    667	ret = 0;
    668
    669	total_found += add_new_free_space(block_group, last,
    670				block_group->start + block_group->length);
    671	caching_ctl->progress = (u64)-1;
    672
    673out:
    674	btrfs_free_path(path);
    675	return ret;
    676}
    677
    678static noinline void caching_thread(struct btrfs_work *work)
    679{
    680	struct btrfs_block_group *block_group;
    681	struct btrfs_fs_info *fs_info;
    682	struct btrfs_caching_control *caching_ctl;
    683	int ret;
    684
    685	caching_ctl = container_of(work, struct btrfs_caching_control, work);
    686	block_group = caching_ctl->block_group;
    687	fs_info = block_group->fs_info;
    688
    689	mutex_lock(&caching_ctl->mutex);
    690	down_read(&fs_info->commit_root_sem);
    691
    692	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
    693		ret = load_free_space_cache(block_group);
    694		if (ret == 1) {
    695			ret = 0;
    696			goto done;
    697		}
    698
    699		/*
    700		 * We failed to load the space cache, set ourselves to
    701		 * CACHE_STARTED and carry on.
    702		 */
    703		spin_lock(&block_group->lock);
    704		block_group->cached = BTRFS_CACHE_STARTED;
    705		spin_unlock(&block_group->lock);
    706		wake_up(&caching_ctl->wait);
    707	}
    708
    709	/*
    710	 * If we are in the transaction that populated the free space tree we
    711	 * can't actually cache from the free space tree as our commit root and
    712	 * real root are the same, so we could change the contents of the blocks
    713	 * while caching.  Instead do the slow caching in this case, and after
    714	 * the transaction has committed we will be safe.
    715	 */
    716	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
    717	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
    718		ret = load_free_space_tree(caching_ctl);
    719	else
    720		ret = load_extent_tree_free(caching_ctl);
    721done:
    722	spin_lock(&block_group->lock);
    723	block_group->caching_ctl = NULL;
    724	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
    725	spin_unlock(&block_group->lock);
    726
    727#ifdef CONFIG_BTRFS_DEBUG
    728	if (btrfs_should_fragment_free_space(block_group)) {
    729		u64 bytes_used;
    730
    731		spin_lock(&block_group->space_info->lock);
    732		spin_lock(&block_group->lock);
    733		bytes_used = block_group->length - block_group->used;
    734		block_group->space_info->bytes_used += bytes_used >> 1;
    735		spin_unlock(&block_group->lock);
    736		spin_unlock(&block_group->space_info->lock);
    737		fragment_free_space(block_group);
    738	}
    739#endif
    740
    741	caching_ctl->progress = (u64)-1;
    742
    743	up_read(&fs_info->commit_root_sem);
    744	btrfs_free_excluded_extents(block_group);
    745	mutex_unlock(&caching_ctl->mutex);
    746
    747	wake_up(&caching_ctl->wait);
    748
    749	btrfs_put_caching_control(caching_ctl);
    750	btrfs_put_block_group(block_group);
    751}
    752
    753int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
    754{
    755	DEFINE_WAIT(wait);
    756	struct btrfs_fs_info *fs_info = cache->fs_info;
    757	struct btrfs_caching_control *caching_ctl = NULL;
    758	int ret = 0;
    759
    760	/* Allocator for zoned filesystems does not use the cache at all */
    761	if (btrfs_is_zoned(fs_info))
    762		return 0;
    763
    764	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
    765	if (!caching_ctl)
    766		return -ENOMEM;
    767
    768	INIT_LIST_HEAD(&caching_ctl->list);
    769	mutex_init(&caching_ctl->mutex);
    770	init_waitqueue_head(&caching_ctl->wait);
    771	caching_ctl->block_group = cache;
    772	caching_ctl->progress = cache->start;
    773	refcount_set(&caching_ctl->count, 2);
    774	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
    775
    776	spin_lock(&cache->lock);
    777	if (cache->cached != BTRFS_CACHE_NO) {
    778		kfree(caching_ctl);
    779
    780		caching_ctl = cache->caching_ctl;
    781		if (caching_ctl)
    782			refcount_inc(&caching_ctl->count);
    783		spin_unlock(&cache->lock);
    784		goto out;
    785	}
    786	WARN_ON(cache->caching_ctl);
    787	cache->caching_ctl = caching_ctl;
    788	if (btrfs_test_opt(fs_info, SPACE_CACHE))
    789		cache->cached = BTRFS_CACHE_FAST;
    790	else
    791		cache->cached = BTRFS_CACHE_STARTED;
    792	cache->has_caching_ctl = 1;
    793	spin_unlock(&cache->lock);
    794
    795	write_lock(&fs_info->block_group_cache_lock);
    796	refcount_inc(&caching_ctl->count);
    797	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
    798	write_unlock(&fs_info->block_group_cache_lock);
    799
    800	btrfs_get_block_group(cache);
    801
    802	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
    803out:
    804	if (load_cache_only && caching_ctl)
    805		btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
    806	if (caching_ctl)
    807		btrfs_put_caching_control(caching_ctl);
    808
    809	return ret;
    810}
    811
    812static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
    813{
    814	u64 extra_flags = chunk_to_extended(flags) &
    815				BTRFS_EXTENDED_PROFILE_MASK;
    816
    817	write_seqlock(&fs_info->profiles_lock);
    818	if (flags & BTRFS_BLOCK_GROUP_DATA)
    819		fs_info->avail_data_alloc_bits &= ~extra_flags;
    820	if (flags & BTRFS_BLOCK_GROUP_METADATA)
    821		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
    822	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    823		fs_info->avail_system_alloc_bits &= ~extra_flags;
    824	write_sequnlock(&fs_info->profiles_lock);
    825}
    826
    827/*
    828 * Clear incompat bits for the following feature(s):
    829 *
    830 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
    831 *            in the whole filesystem
    832 *
    833 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
    834 */
    835static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
    836{
    837	bool found_raid56 = false;
    838	bool found_raid1c34 = false;
    839
    840	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
    841	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
    842	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
    843		struct list_head *head = &fs_info->space_info;
    844		struct btrfs_space_info *sinfo;
    845
    846		list_for_each_entry_rcu(sinfo, head, list) {
    847			down_read(&sinfo->groups_sem);
    848			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
    849				found_raid56 = true;
    850			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
    851				found_raid56 = true;
    852			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
    853				found_raid1c34 = true;
    854			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
    855				found_raid1c34 = true;
    856			up_read(&sinfo->groups_sem);
    857		}
    858		if (!found_raid56)
    859			btrfs_clear_fs_incompat(fs_info, RAID56);
    860		if (!found_raid1c34)
    861			btrfs_clear_fs_incompat(fs_info, RAID1C34);
    862	}
    863}
    864
    865static int remove_block_group_item(struct btrfs_trans_handle *trans,
    866				   struct btrfs_path *path,
    867				   struct btrfs_block_group *block_group)
    868{
    869	struct btrfs_fs_info *fs_info = trans->fs_info;
    870	struct btrfs_root *root;
    871	struct btrfs_key key;
    872	int ret;
    873
    874	root = btrfs_block_group_root(fs_info);
    875	key.objectid = block_group->start;
    876	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    877	key.offset = block_group->length;
    878
    879	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    880	if (ret > 0)
    881		ret = -ENOENT;
    882	if (ret < 0)
    883		return ret;
    884
    885	ret = btrfs_del_item(trans, root, path);
    886	return ret;
    887}
    888
    889int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
    890			     u64 group_start, struct extent_map *em)
    891{
    892	struct btrfs_fs_info *fs_info = trans->fs_info;
    893	struct btrfs_path *path;
    894	struct btrfs_block_group *block_group;
    895	struct btrfs_free_cluster *cluster;
    896	struct inode *inode;
    897	struct kobject *kobj = NULL;
    898	int ret;
    899	int index;
    900	int factor;
    901	struct btrfs_caching_control *caching_ctl = NULL;
    902	bool remove_em;
    903	bool remove_rsv = false;
    904
    905	block_group = btrfs_lookup_block_group(fs_info, group_start);
    906	BUG_ON(!block_group);
    907	BUG_ON(!block_group->ro);
    908
    909	trace_btrfs_remove_block_group(block_group);
    910	/*
    911	 * Free the reserved super bytes from this block group before
    912	 * remove it.
    913	 */
    914	btrfs_free_excluded_extents(block_group);
    915	btrfs_free_ref_tree_range(fs_info, block_group->start,
    916				  block_group->length);
    917
    918	index = btrfs_bg_flags_to_raid_index(block_group->flags);
    919	factor = btrfs_bg_type_to_factor(block_group->flags);
    920
    921	/* make sure this block group isn't part of an allocation cluster */
    922	cluster = &fs_info->data_alloc_cluster;
    923	spin_lock(&cluster->refill_lock);
    924	btrfs_return_cluster_to_free_space(block_group, cluster);
    925	spin_unlock(&cluster->refill_lock);
    926
    927	/*
    928	 * make sure this block group isn't part of a metadata
    929	 * allocation cluster
    930	 */
    931	cluster = &fs_info->meta_alloc_cluster;
    932	spin_lock(&cluster->refill_lock);
    933	btrfs_return_cluster_to_free_space(block_group, cluster);
    934	spin_unlock(&cluster->refill_lock);
    935
    936	btrfs_clear_treelog_bg(block_group);
    937	btrfs_clear_data_reloc_bg(block_group);
    938
    939	path = btrfs_alloc_path();
    940	if (!path) {
    941		ret = -ENOMEM;
    942		goto out;
    943	}
    944
    945	/*
    946	 * get the inode first so any iput calls done for the io_list
    947	 * aren't the final iput (no unlinks allowed now)
    948	 */
    949	inode = lookup_free_space_inode(block_group, path);
    950
    951	mutex_lock(&trans->transaction->cache_write_mutex);
    952	/*
    953	 * Make sure our free space cache IO is done before removing the
    954	 * free space inode
    955	 */
    956	spin_lock(&trans->transaction->dirty_bgs_lock);
    957	if (!list_empty(&block_group->io_list)) {
    958		list_del_init(&block_group->io_list);
    959
    960		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
    961
    962		spin_unlock(&trans->transaction->dirty_bgs_lock);
    963		btrfs_wait_cache_io(trans, block_group, path);
    964		btrfs_put_block_group(block_group);
    965		spin_lock(&trans->transaction->dirty_bgs_lock);
    966	}
    967
    968	if (!list_empty(&block_group->dirty_list)) {
    969		list_del_init(&block_group->dirty_list);
    970		remove_rsv = true;
    971		btrfs_put_block_group(block_group);
    972	}
    973	spin_unlock(&trans->transaction->dirty_bgs_lock);
    974	mutex_unlock(&trans->transaction->cache_write_mutex);
    975
    976	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
    977	if (ret)
    978		goto out;
    979
    980	write_lock(&fs_info->block_group_cache_lock);
    981	rb_erase_cached(&block_group->cache_node,
    982			&fs_info->block_group_cache_tree);
    983	RB_CLEAR_NODE(&block_group->cache_node);
    984
    985	/* Once for the block groups rbtree */
    986	btrfs_put_block_group(block_group);
    987
    988	write_unlock(&fs_info->block_group_cache_lock);
    989
    990	down_write(&block_group->space_info->groups_sem);
    991	/*
    992	 * we must use list_del_init so people can check to see if they
    993	 * are still on the list after taking the semaphore
    994	 */
    995	list_del_init(&block_group->list);
    996	if (list_empty(&block_group->space_info->block_groups[index])) {
    997		kobj = block_group->space_info->block_group_kobjs[index];
    998		block_group->space_info->block_group_kobjs[index] = NULL;
    999		clear_avail_alloc_bits(fs_info, block_group->flags);
   1000	}
   1001	up_write(&block_group->space_info->groups_sem);
   1002	clear_incompat_bg_bits(fs_info, block_group->flags);
   1003	if (kobj) {
   1004		kobject_del(kobj);
   1005		kobject_put(kobj);
   1006	}
   1007
   1008	if (block_group->has_caching_ctl)
   1009		caching_ctl = btrfs_get_caching_control(block_group);
   1010	if (block_group->cached == BTRFS_CACHE_STARTED)
   1011		btrfs_wait_block_group_cache_done(block_group);
   1012	if (block_group->has_caching_ctl) {
   1013		write_lock(&fs_info->block_group_cache_lock);
   1014		if (!caching_ctl) {
   1015			struct btrfs_caching_control *ctl;
   1016
   1017			list_for_each_entry(ctl,
   1018				    &fs_info->caching_block_groups, list)
   1019				if (ctl->block_group == block_group) {
   1020					caching_ctl = ctl;
   1021					refcount_inc(&caching_ctl->count);
   1022					break;
   1023				}
   1024		}
   1025		if (caching_ctl)
   1026			list_del_init(&caching_ctl->list);
   1027		write_unlock(&fs_info->block_group_cache_lock);
   1028		if (caching_ctl) {
   1029			/* Once for the caching bgs list and once for us. */
   1030			btrfs_put_caching_control(caching_ctl);
   1031			btrfs_put_caching_control(caching_ctl);
   1032		}
   1033	}
   1034
   1035	spin_lock(&trans->transaction->dirty_bgs_lock);
   1036	WARN_ON(!list_empty(&block_group->dirty_list));
   1037	WARN_ON(!list_empty(&block_group->io_list));
   1038	spin_unlock(&trans->transaction->dirty_bgs_lock);
   1039
   1040	btrfs_remove_free_space_cache(block_group);
   1041
   1042	spin_lock(&block_group->space_info->lock);
   1043	list_del_init(&block_group->ro_list);
   1044
   1045	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
   1046		WARN_ON(block_group->space_info->total_bytes
   1047			< block_group->length);
   1048		WARN_ON(block_group->space_info->bytes_readonly
   1049			< block_group->length - block_group->zone_unusable);
   1050		WARN_ON(block_group->space_info->bytes_zone_unusable
   1051			< block_group->zone_unusable);
   1052		WARN_ON(block_group->space_info->disk_total
   1053			< block_group->length * factor);
   1054	}
   1055	block_group->space_info->total_bytes -= block_group->length;
   1056	block_group->space_info->bytes_readonly -=
   1057		(block_group->length - block_group->zone_unusable);
   1058	block_group->space_info->bytes_zone_unusable -=
   1059		block_group->zone_unusable;
   1060	block_group->space_info->disk_total -= block_group->length * factor;
   1061
   1062	spin_unlock(&block_group->space_info->lock);
   1063
   1064	/*
   1065	 * Remove the free space for the block group from the free space tree
   1066	 * and the block group's item from the extent tree before marking the
   1067	 * block group as removed. This is to prevent races with tasks that
   1068	 * freeze and unfreeze a block group, this task and another task
   1069	 * allocating a new block group - the unfreeze task ends up removing
   1070	 * the block group's extent map before the task calling this function
   1071	 * deletes the block group item from the extent tree, allowing for
   1072	 * another task to attempt to create another block group with the same
   1073	 * item key (and failing with -EEXIST and a transaction abort).
   1074	 */
   1075	ret = remove_block_group_free_space(trans, block_group);
   1076	if (ret)
   1077		goto out;
   1078
   1079	ret = remove_block_group_item(trans, path, block_group);
   1080	if (ret < 0)
   1081		goto out;
   1082
   1083	spin_lock(&block_group->lock);
   1084	block_group->removed = 1;
   1085	/*
   1086	 * At this point trimming or scrub can't start on this block group,
   1087	 * because we removed the block group from the rbtree
   1088	 * fs_info->block_group_cache_tree so no one can't find it anymore and
   1089	 * even if someone already got this block group before we removed it
   1090	 * from the rbtree, they have already incremented block_group->frozen -
   1091	 * if they didn't, for the trimming case they won't find any free space
   1092	 * entries because we already removed them all when we called
   1093	 * btrfs_remove_free_space_cache().
   1094	 *
   1095	 * And we must not remove the extent map from the fs_info->mapping_tree
   1096	 * to prevent the same logical address range and physical device space
   1097	 * ranges from being reused for a new block group. This is needed to
   1098	 * avoid races with trimming and scrub.
   1099	 *
   1100	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
   1101	 * completely transactionless, so while it is trimming a range the
   1102	 * currently running transaction might finish and a new one start,
   1103	 * allowing for new block groups to be created that can reuse the same
   1104	 * physical device locations unless we take this special care.
   1105	 *
   1106	 * There may also be an implicit trim operation if the file system
   1107	 * is mounted with -odiscard. The same protections must remain
   1108	 * in place until the extents have been discarded completely when
   1109	 * the transaction commit has completed.
   1110	 */
   1111	remove_em = (atomic_read(&block_group->frozen) == 0);
   1112	spin_unlock(&block_group->lock);
   1113
   1114	if (remove_em) {
   1115		struct extent_map_tree *em_tree;
   1116
   1117		em_tree = &fs_info->mapping_tree;
   1118		write_lock(&em_tree->lock);
   1119		remove_extent_mapping(em_tree, em);
   1120		write_unlock(&em_tree->lock);
   1121		/* once for the tree */
   1122		free_extent_map(em);
   1123	}
   1124
   1125out:
   1126	/* Once for the lookup reference */
   1127	btrfs_put_block_group(block_group);
   1128	if (remove_rsv)
   1129		btrfs_delayed_refs_rsv_release(fs_info, 1);
   1130	btrfs_free_path(path);
   1131	return ret;
   1132}
   1133
   1134struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
   1135		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
   1136{
   1137	struct btrfs_root *root = btrfs_block_group_root(fs_info);
   1138	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
   1139	struct extent_map *em;
   1140	struct map_lookup *map;
   1141	unsigned int num_items;
   1142
   1143	read_lock(&em_tree->lock);
   1144	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
   1145	read_unlock(&em_tree->lock);
   1146	ASSERT(em && em->start == chunk_offset);
   1147
   1148	/*
   1149	 * We need to reserve 3 + N units from the metadata space info in order
   1150	 * to remove a block group (done at btrfs_remove_chunk() and at
   1151	 * btrfs_remove_block_group()), which are used for:
   1152	 *
   1153	 * 1 unit for adding the free space inode's orphan (located in the tree
   1154	 * of tree roots).
   1155	 * 1 unit for deleting the block group item (located in the extent
   1156	 * tree).
   1157	 * 1 unit for deleting the free space item (located in tree of tree
   1158	 * roots).
   1159	 * N units for deleting N device extent items corresponding to each
   1160	 * stripe (located in the device tree).
   1161	 *
   1162	 * In order to remove a block group we also need to reserve units in the
   1163	 * system space info in order to update the chunk tree (update one or
   1164	 * more device items and remove one chunk item), but this is done at
   1165	 * btrfs_remove_chunk() through a call to check_system_chunk().
   1166	 */
   1167	map = em->map_lookup;
   1168	num_items = 3 + map->num_stripes;
   1169	free_extent_map(em);
   1170
   1171	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
   1172}
   1173
   1174/*
   1175 * Mark block group @cache read-only, so later write won't happen to block
   1176 * group @cache.
   1177 *
   1178 * If @force is not set, this function will only mark the block group readonly
   1179 * if we have enough free space (1M) in other metadata/system block groups.
   1180 * If @force is not set, this function will mark the block group readonly
   1181 * without checking free space.
   1182 *
   1183 * NOTE: This function doesn't care if other block groups can contain all the
   1184 * data in this block group. That check should be done by relocation routine,
   1185 * not this function.
   1186 */
   1187static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
   1188{
   1189	struct btrfs_space_info *sinfo = cache->space_info;
   1190	u64 num_bytes;
   1191	int ret = -ENOSPC;
   1192
   1193	spin_lock(&sinfo->lock);
   1194	spin_lock(&cache->lock);
   1195
   1196	if (cache->swap_extents) {
   1197		ret = -ETXTBSY;
   1198		goto out;
   1199	}
   1200
   1201	if (cache->ro) {
   1202		cache->ro++;
   1203		ret = 0;
   1204		goto out;
   1205	}
   1206
   1207	num_bytes = cache->length - cache->reserved - cache->pinned -
   1208		    cache->bytes_super - cache->zone_unusable - cache->used;
   1209
   1210	/*
   1211	 * Data never overcommits, even in mixed mode, so do just the straight
   1212	 * check of left over space in how much we have allocated.
   1213	 */
   1214	if (force) {
   1215		ret = 0;
   1216	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
   1217		u64 sinfo_used = btrfs_space_info_used(sinfo, true);
   1218
   1219		/*
   1220		 * Here we make sure if we mark this bg RO, we still have enough
   1221		 * free space as buffer.
   1222		 */
   1223		if (sinfo_used + num_bytes <= sinfo->total_bytes)
   1224			ret = 0;
   1225	} else {
   1226		/*
   1227		 * We overcommit metadata, so we need to do the
   1228		 * btrfs_can_overcommit check here, and we need to pass in
   1229		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
   1230		 * leeway to allow us to mark this block group as read only.
   1231		 */
   1232		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
   1233					 BTRFS_RESERVE_NO_FLUSH))
   1234			ret = 0;
   1235	}
   1236
   1237	if (!ret) {
   1238		sinfo->bytes_readonly += num_bytes;
   1239		if (btrfs_is_zoned(cache->fs_info)) {
   1240			/* Migrate zone_unusable bytes to readonly */
   1241			sinfo->bytes_readonly += cache->zone_unusable;
   1242			sinfo->bytes_zone_unusable -= cache->zone_unusable;
   1243			cache->zone_unusable = 0;
   1244		}
   1245		cache->ro++;
   1246		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
   1247	}
   1248out:
   1249	spin_unlock(&cache->lock);
   1250	spin_unlock(&sinfo->lock);
   1251	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
   1252		btrfs_info(cache->fs_info,
   1253			"unable to make block group %llu ro", cache->start);
   1254		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
   1255	}
   1256	return ret;
   1257}
   1258
   1259static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
   1260				 struct btrfs_block_group *bg)
   1261{
   1262	struct btrfs_fs_info *fs_info = bg->fs_info;
   1263	struct btrfs_transaction *prev_trans = NULL;
   1264	const u64 start = bg->start;
   1265	const u64 end = start + bg->length - 1;
   1266	int ret;
   1267
   1268	spin_lock(&fs_info->trans_lock);
   1269	if (trans->transaction->list.prev != &fs_info->trans_list) {
   1270		prev_trans = list_last_entry(&trans->transaction->list,
   1271					     struct btrfs_transaction, list);
   1272		refcount_inc(&prev_trans->use_count);
   1273	}
   1274	spin_unlock(&fs_info->trans_lock);
   1275
   1276	/*
   1277	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
   1278	 * btrfs_finish_extent_commit(). If we are at transaction N, another
   1279	 * task might be running finish_extent_commit() for the previous
   1280	 * transaction N - 1, and have seen a range belonging to the block
   1281	 * group in pinned_extents before we were able to clear the whole block
   1282	 * group range from pinned_extents. This means that task can lookup for
   1283	 * the block group after we unpinned it from pinned_extents and removed
   1284	 * it, leading to a BUG_ON() at unpin_extent_range().
   1285	 */
   1286	mutex_lock(&fs_info->unused_bg_unpin_mutex);
   1287	if (prev_trans) {
   1288		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
   1289					EXTENT_DIRTY);
   1290		if (ret)
   1291			goto out;
   1292	}
   1293
   1294	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
   1295				EXTENT_DIRTY);
   1296out:
   1297	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
   1298	if (prev_trans)
   1299		btrfs_put_transaction(prev_trans);
   1300
   1301	return ret == 0;
   1302}
   1303
   1304/*
   1305 * Process the unused_bgs list and remove any that don't have any allocated
   1306 * space inside of them.
   1307 */
   1308void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
   1309{
   1310	struct btrfs_block_group *block_group;
   1311	struct btrfs_space_info *space_info;
   1312	struct btrfs_trans_handle *trans;
   1313	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
   1314	int ret = 0;
   1315
   1316	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
   1317		return;
   1318
   1319	/*
   1320	 * Long running balances can keep us blocked here for eternity, so
   1321	 * simply skip deletion if we're unable to get the mutex.
   1322	 */
   1323	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
   1324		return;
   1325
   1326	spin_lock(&fs_info->unused_bgs_lock);
   1327	while (!list_empty(&fs_info->unused_bgs)) {
   1328		int trimming;
   1329
   1330		block_group = list_first_entry(&fs_info->unused_bgs,
   1331					       struct btrfs_block_group,
   1332					       bg_list);
   1333		list_del_init(&block_group->bg_list);
   1334
   1335		space_info = block_group->space_info;
   1336
   1337		if (ret || btrfs_mixed_space_info(space_info)) {
   1338			btrfs_put_block_group(block_group);
   1339			continue;
   1340		}
   1341		spin_unlock(&fs_info->unused_bgs_lock);
   1342
   1343		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
   1344
   1345		/* Don't want to race with allocators so take the groups_sem */
   1346		down_write(&space_info->groups_sem);
   1347
   1348		/*
   1349		 * Async discard moves the final block group discard to be prior
   1350		 * to the unused_bgs code path.  Therefore, if it's not fully
   1351		 * trimmed, punt it back to the async discard lists.
   1352		 */
   1353		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
   1354		    !btrfs_is_free_space_trimmed(block_group)) {
   1355			trace_btrfs_skip_unused_block_group(block_group);
   1356			up_write(&space_info->groups_sem);
   1357			/* Requeue if we failed because of async discard */
   1358			btrfs_discard_queue_work(&fs_info->discard_ctl,
   1359						 block_group);
   1360			goto next;
   1361		}
   1362
   1363		spin_lock(&block_group->lock);
   1364		if (block_group->reserved || block_group->pinned ||
   1365		    block_group->used || block_group->ro ||
   1366		    list_is_singular(&block_group->list)) {
   1367			/*
   1368			 * We want to bail if we made new allocations or have
   1369			 * outstanding allocations in this block group.  We do
   1370			 * the ro check in case balance is currently acting on
   1371			 * this block group.
   1372			 */
   1373			trace_btrfs_skip_unused_block_group(block_group);
   1374			spin_unlock(&block_group->lock);
   1375			up_write(&space_info->groups_sem);
   1376			goto next;
   1377		}
   1378		spin_unlock(&block_group->lock);
   1379
   1380		/* We don't want to force the issue, only flip if it's ok. */
   1381		ret = inc_block_group_ro(block_group, 0);
   1382		up_write(&space_info->groups_sem);
   1383		if (ret < 0) {
   1384			ret = 0;
   1385			goto next;
   1386		}
   1387
   1388		ret = btrfs_zone_finish(block_group);
   1389		if (ret < 0) {
   1390			btrfs_dec_block_group_ro(block_group);
   1391			if (ret == -EAGAIN)
   1392				ret = 0;
   1393			goto next;
   1394		}
   1395
   1396		/*
   1397		 * Want to do this before we do anything else so we can recover
   1398		 * properly if we fail to join the transaction.
   1399		 */
   1400		trans = btrfs_start_trans_remove_block_group(fs_info,
   1401						     block_group->start);
   1402		if (IS_ERR(trans)) {
   1403			btrfs_dec_block_group_ro(block_group);
   1404			ret = PTR_ERR(trans);
   1405			goto next;
   1406		}
   1407
   1408		/*
   1409		 * We could have pending pinned extents for this block group,
   1410		 * just delete them, we don't care about them anymore.
   1411		 */
   1412		if (!clean_pinned_extents(trans, block_group)) {
   1413			btrfs_dec_block_group_ro(block_group);
   1414			goto end_trans;
   1415		}
   1416
   1417		/*
   1418		 * At this point, the block_group is read only and should fail
   1419		 * new allocations.  However, btrfs_finish_extent_commit() can
   1420		 * cause this block_group to be placed back on the discard
   1421		 * lists because now the block_group isn't fully discarded.
   1422		 * Bail here and try again later after discarding everything.
   1423		 */
   1424		spin_lock(&fs_info->discard_ctl.lock);
   1425		if (!list_empty(&block_group->discard_list)) {
   1426			spin_unlock(&fs_info->discard_ctl.lock);
   1427			btrfs_dec_block_group_ro(block_group);
   1428			btrfs_discard_queue_work(&fs_info->discard_ctl,
   1429						 block_group);
   1430			goto end_trans;
   1431		}
   1432		spin_unlock(&fs_info->discard_ctl.lock);
   1433
   1434		/* Reset pinned so btrfs_put_block_group doesn't complain */
   1435		spin_lock(&space_info->lock);
   1436		spin_lock(&block_group->lock);
   1437
   1438		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
   1439						     -block_group->pinned);
   1440		space_info->bytes_readonly += block_group->pinned;
   1441		block_group->pinned = 0;
   1442
   1443		spin_unlock(&block_group->lock);
   1444		spin_unlock(&space_info->lock);
   1445
   1446		/*
   1447		 * The normal path here is an unused block group is passed here,
   1448		 * then trimming is handled in the transaction commit path.
   1449		 * Async discard interposes before this to do the trimming
   1450		 * before coming down the unused block group path as trimming
   1451		 * will no longer be done later in the transaction commit path.
   1452		 */
   1453		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
   1454			goto flip_async;
   1455
   1456		/*
   1457		 * DISCARD can flip during remount. On zoned filesystems, we
   1458		 * need to reset sequential-required zones.
   1459		 */
   1460		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
   1461				btrfs_is_zoned(fs_info);
   1462
   1463		/* Implicit trim during transaction commit. */
   1464		if (trimming)
   1465			btrfs_freeze_block_group(block_group);
   1466
   1467		/*
   1468		 * Btrfs_remove_chunk will abort the transaction if things go
   1469		 * horribly wrong.
   1470		 */
   1471		ret = btrfs_remove_chunk(trans, block_group->start);
   1472
   1473		if (ret) {
   1474			if (trimming)
   1475				btrfs_unfreeze_block_group(block_group);
   1476			goto end_trans;
   1477		}
   1478
   1479		/*
   1480		 * If we're not mounted with -odiscard, we can just forget
   1481		 * about this block group. Otherwise we'll need to wait
   1482		 * until transaction commit to do the actual discard.
   1483		 */
   1484		if (trimming) {
   1485			spin_lock(&fs_info->unused_bgs_lock);
   1486			/*
   1487			 * A concurrent scrub might have added us to the list
   1488			 * fs_info->unused_bgs, so use a list_move operation
   1489			 * to add the block group to the deleted_bgs list.
   1490			 */
   1491			list_move(&block_group->bg_list,
   1492				  &trans->transaction->deleted_bgs);
   1493			spin_unlock(&fs_info->unused_bgs_lock);
   1494			btrfs_get_block_group(block_group);
   1495		}
   1496end_trans:
   1497		btrfs_end_transaction(trans);
   1498next:
   1499		btrfs_put_block_group(block_group);
   1500		spin_lock(&fs_info->unused_bgs_lock);
   1501	}
   1502	spin_unlock(&fs_info->unused_bgs_lock);
   1503	mutex_unlock(&fs_info->reclaim_bgs_lock);
   1504	return;
   1505
   1506flip_async:
   1507	btrfs_end_transaction(trans);
   1508	mutex_unlock(&fs_info->reclaim_bgs_lock);
   1509	btrfs_put_block_group(block_group);
   1510	btrfs_discard_punt_unused_bgs_list(fs_info);
   1511}
   1512
   1513void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
   1514{
   1515	struct btrfs_fs_info *fs_info = bg->fs_info;
   1516
   1517	spin_lock(&fs_info->unused_bgs_lock);
   1518	if (list_empty(&bg->bg_list)) {
   1519		btrfs_get_block_group(bg);
   1520		trace_btrfs_add_unused_block_group(bg);
   1521		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
   1522	}
   1523	spin_unlock(&fs_info->unused_bgs_lock);
   1524}
   1525
   1526/*
   1527 * We want block groups with a low number of used bytes to be in the beginning
   1528 * of the list, so they will get reclaimed first.
   1529 */
   1530static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
   1531			   const struct list_head *b)
   1532{
   1533	const struct btrfs_block_group *bg1, *bg2;
   1534
   1535	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
   1536	bg2 = list_entry(b, struct btrfs_block_group, bg_list);
   1537
   1538	return bg1->used > bg2->used;
   1539}
   1540
   1541static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
   1542{
   1543	if (btrfs_is_zoned(fs_info))
   1544		return btrfs_zoned_should_reclaim(fs_info);
   1545	return true;
   1546}
   1547
   1548void btrfs_reclaim_bgs_work(struct work_struct *work)
   1549{
   1550	struct btrfs_fs_info *fs_info =
   1551		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
   1552	struct btrfs_block_group *bg;
   1553	struct btrfs_space_info *space_info;
   1554
   1555	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
   1556		return;
   1557
   1558	if (!btrfs_should_reclaim(fs_info))
   1559		return;
   1560
   1561	sb_start_write(fs_info->sb);
   1562
   1563	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
   1564		sb_end_write(fs_info->sb);
   1565		return;
   1566	}
   1567
   1568	/*
   1569	 * Long running balances can keep us blocked here for eternity, so
   1570	 * simply skip reclaim if we're unable to get the mutex.
   1571	 */
   1572	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
   1573		btrfs_exclop_finish(fs_info);
   1574		sb_end_write(fs_info->sb);
   1575		return;
   1576	}
   1577
   1578	spin_lock(&fs_info->unused_bgs_lock);
   1579	/*
   1580	 * Sort happens under lock because we can't simply splice it and sort.
   1581	 * The block groups might still be in use and reachable via bg_list,
   1582	 * and their presence in the reclaim_bgs list must be preserved.
   1583	 */
   1584	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
   1585	while (!list_empty(&fs_info->reclaim_bgs)) {
   1586		u64 zone_unusable;
   1587		int ret = 0;
   1588
   1589		bg = list_first_entry(&fs_info->reclaim_bgs,
   1590				      struct btrfs_block_group,
   1591				      bg_list);
   1592		list_del_init(&bg->bg_list);
   1593
   1594		space_info = bg->space_info;
   1595		spin_unlock(&fs_info->unused_bgs_lock);
   1596
   1597		/* Don't race with allocators so take the groups_sem */
   1598		down_write(&space_info->groups_sem);
   1599
   1600		spin_lock(&bg->lock);
   1601		if (bg->reserved || bg->pinned || bg->ro) {
   1602			/*
   1603			 * We want to bail if we made new allocations or have
   1604			 * outstanding allocations in this block group.  We do
   1605			 * the ro check in case balance is currently acting on
   1606			 * this block group.
   1607			 */
   1608			spin_unlock(&bg->lock);
   1609			up_write(&space_info->groups_sem);
   1610			goto next;
   1611		}
   1612		spin_unlock(&bg->lock);
   1613
   1614		/* Get out fast, in case we're unmounting the filesystem */
   1615		if (btrfs_fs_closing(fs_info)) {
   1616			up_write(&space_info->groups_sem);
   1617			goto next;
   1618		}
   1619
   1620		/*
   1621		 * Cache the zone_unusable value before turning the block group
   1622		 * to read only. As soon as the blog group is read only it's
   1623		 * zone_unusable value gets moved to the block group's read-only
   1624		 * bytes and isn't available for calculations anymore.
   1625		 */
   1626		zone_unusable = bg->zone_unusable;
   1627		ret = inc_block_group_ro(bg, 0);
   1628		up_write(&space_info->groups_sem);
   1629		if (ret < 0)
   1630			goto next;
   1631
   1632		btrfs_info(fs_info,
   1633			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
   1634				bg->start, div_u64(bg->used * 100, bg->length),
   1635				div64_u64(zone_unusable * 100, bg->length));
   1636		trace_btrfs_reclaim_block_group(bg);
   1637		ret = btrfs_relocate_chunk(fs_info, bg->start);
   1638		if (ret)
   1639			btrfs_err(fs_info, "error relocating chunk %llu",
   1640				  bg->start);
   1641
   1642next:
   1643		btrfs_put_block_group(bg);
   1644		spin_lock(&fs_info->unused_bgs_lock);
   1645	}
   1646	spin_unlock(&fs_info->unused_bgs_lock);
   1647	mutex_unlock(&fs_info->reclaim_bgs_lock);
   1648	btrfs_exclop_finish(fs_info);
   1649	sb_end_write(fs_info->sb);
   1650}
   1651
   1652void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
   1653{
   1654	spin_lock(&fs_info->unused_bgs_lock);
   1655	if (!list_empty(&fs_info->reclaim_bgs))
   1656		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
   1657	spin_unlock(&fs_info->unused_bgs_lock);
   1658}
   1659
   1660void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
   1661{
   1662	struct btrfs_fs_info *fs_info = bg->fs_info;
   1663
   1664	spin_lock(&fs_info->unused_bgs_lock);
   1665	if (list_empty(&bg->bg_list)) {
   1666		btrfs_get_block_group(bg);
   1667		trace_btrfs_add_reclaim_block_group(bg);
   1668		list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
   1669	}
   1670	spin_unlock(&fs_info->unused_bgs_lock);
   1671}
   1672
   1673static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
   1674			   struct btrfs_path *path)
   1675{
   1676	struct extent_map_tree *em_tree;
   1677	struct extent_map *em;
   1678	struct btrfs_block_group_item bg;
   1679	struct extent_buffer *leaf;
   1680	int slot;
   1681	u64 flags;
   1682	int ret = 0;
   1683
   1684	slot = path->slots[0];
   1685	leaf = path->nodes[0];
   1686
   1687	em_tree = &fs_info->mapping_tree;
   1688	read_lock(&em_tree->lock);
   1689	em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
   1690	read_unlock(&em_tree->lock);
   1691	if (!em) {
   1692		btrfs_err(fs_info,
   1693			  "logical %llu len %llu found bg but no related chunk",
   1694			  key->objectid, key->offset);
   1695		return -ENOENT;
   1696	}
   1697
   1698	if (em->start != key->objectid || em->len != key->offset) {
   1699		btrfs_err(fs_info,
   1700			"block group %llu len %llu mismatch with chunk %llu len %llu",
   1701			key->objectid, key->offset, em->start, em->len);
   1702		ret = -EUCLEAN;
   1703		goto out_free_em;
   1704	}
   1705
   1706	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
   1707			   sizeof(bg));
   1708	flags = btrfs_stack_block_group_flags(&bg) &
   1709		BTRFS_BLOCK_GROUP_TYPE_MASK;
   1710
   1711	if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
   1712		btrfs_err(fs_info,
   1713"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
   1714			  key->objectid, key->offset, flags,
   1715			  (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
   1716		ret = -EUCLEAN;
   1717	}
   1718
   1719out_free_em:
   1720	free_extent_map(em);
   1721	return ret;
   1722}
   1723
   1724static int find_first_block_group(struct btrfs_fs_info *fs_info,
   1725				  struct btrfs_path *path,
   1726				  struct btrfs_key *key)
   1727{
   1728	struct btrfs_root *root = btrfs_block_group_root(fs_info);
   1729	int ret;
   1730	struct btrfs_key found_key;
   1731
   1732	btrfs_for_each_slot(root, key, &found_key, path, ret) {
   1733		if (found_key.objectid >= key->objectid &&
   1734		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
   1735			return read_bg_from_eb(fs_info, &found_key, path);
   1736		}
   1737	}
   1738	return ret;
   1739}
   1740
   1741static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
   1742{
   1743	u64 extra_flags = chunk_to_extended(flags) &
   1744				BTRFS_EXTENDED_PROFILE_MASK;
   1745
   1746	write_seqlock(&fs_info->profiles_lock);
   1747	if (flags & BTRFS_BLOCK_GROUP_DATA)
   1748		fs_info->avail_data_alloc_bits |= extra_flags;
   1749	if (flags & BTRFS_BLOCK_GROUP_METADATA)
   1750		fs_info->avail_metadata_alloc_bits |= extra_flags;
   1751	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
   1752		fs_info->avail_system_alloc_bits |= extra_flags;
   1753	write_sequnlock(&fs_info->profiles_lock);
   1754}
   1755
   1756/**
   1757 * Map a physical disk address to a list of logical addresses
   1758 *
   1759 * @fs_info:       the filesystem
   1760 * @chunk_start:   logical address of block group
   1761 * @bdev:	   physical device to resolve, can be NULL to indicate any device
   1762 * @physical:	   physical address to map to logical addresses
   1763 * @logical:	   return array of logical addresses which map to @physical
   1764 * @naddrs:	   length of @logical
   1765 * @stripe_len:    size of IO stripe for the given block group
   1766 *
   1767 * Maps a particular @physical disk address to a list of @logical addresses.
   1768 * Used primarily to exclude those portions of a block group that contain super
   1769 * block copies.
   1770 */
   1771int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
   1772		     struct block_device *bdev, u64 physical, u64 **logical,
   1773		     int *naddrs, int *stripe_len)
   1774{
   1775	struct extent_map *em;
   1776	struct map_lookup *map;
   1777	u64 *buf;
   1778	u64 bytenr;
   1779	u64 data_stripe_length;
   1780	u64 io_stripe_size;
   1781	int i, nr = 0;
   1782	int ret = 0;
   1783
   1784	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
   1785	if (IS_ERR(em))
   1786		return -EIO;
   1787
   1788	map = em->map_lookup;
   1789	data_stripe_length = em->orig_block_len;
   1790	io_stripe_size = map->stripe_len;
   1791	chunk_start = em->start;
   1792
   1793	/* For RAID5/6 adjust to a full IO stripe length */
   1794	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
   1795		io_stripe_size = map->stripe_len * nr_data_stripes(map);
   1796
   1797	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
   1798	if (!buf) {
   1799		ret = -ENOMEM;
   1800		goto out;
   1801	}
   1802
   1803	for (i = 0; i < map->num_stripes; i++) {
   1804		bool already_inserted = false;
   1805		u64 stripe_nr;
   1806		u64 offset;
   1807		int j;
   1808
   1809		if (!in_range(physical, map->stripes[i].physical,
   1810			      data_stripe_length))
   1811			continue;
   1812
   1813		if (bdev && map->stripes[i].dev->bdev != bdev)
   1814			continue;
   1815
   1816		stripe_nr = physical - map->stripes[i].physical;
   1817		stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
   1818
   1819		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
   1820			stripe_nr = stripe_nr * map->num_stripes + i;
   1821			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
   1822		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
   1823			stripe_nr = stripe_nr * map->num_stripes + i;
   1824		}
   1825		/*
   1826		 * The remaining case would be for RAID56, multiply by
   1827		 * nr_data_stripes().  Alternatively, just use rmap_len below
   1828		 * instead of map->stripe_len
   1829		 */
   1830
   1831		bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
   1832
   1833		/* Ensure we don't add duplicate addresses */
   1834		for (j = 0; j < nr; j++) {
   1835			if (buf[j] == bytenr) {
   1836				already_inserted = true;
   1837				break;
   1838			}
   1839		}
   1840
   1841		if (!already_inserted)
   1842			buf[nr++] = bytenr;
   1843	}
   1844
   1845	*logical = buf;
   1846	*naddrs = nr;
   1847	*stripe_len = io_stripe_size;
   1848out:
   1849	free_extent_map(em);
   1850	return ret;
   1851}
   1852
   1853static int exclude_super_stripes(struct btrfs_block_group *cache)
   1854{
   1855	struct btrfs_fs_info *fs_info = cache->fs_info;
   1856	const bool zoned = btrfs_is_zoned(fs_info);
   1857	u64 bytenr;
   1858	u64 *logical;
   1859	int stripe_len;
   1860	int i, nr, ret;
   1861
   1862	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
   1863		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
   1864		cache->bytes_super += stripe_len;
   1865		ret = btrfs_add_excluded_extent(fs_info, cache->start,
   1866						stripe_len);
   1867		if (ret)
   1868			return ret;
   1869	}
   1870
   1871	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
   1872		bytenr = btrfs_sb_offset(i);
   1873		ret = btrfs_rmap_block(fs_info, cache->start, NULL,
   1874				       bytenr, &logical, &nr, &stripe_len);
   1875		if (ret)
   1876			return ret;
   1877
   1878		/* Shouldn't have super stripes in sequential zones */
   1879		if (zoned && nr) {
   1880			btrfs_err(fs_info,
   1881			"zoned: block group %llu must not contain super block",
   1882				  cache->start);
   1883			return -EUCLEAN;
   1884		}
   1885
   1886		while (nr--) {
   1887			u64 len = min_t(u64, stripe_len,
   1888				cache->start + cache->length - logical[nr]);
   1889
   1890			cache->bytes_super += len;
   1891			ret = btrfs_add_excluded_extent(fs_info, logical[nr],
   1892							len);
   1893			if (ret) {
   1894				kfree(logical);
   1895				return ret;
   1896			}
   1897		}
   1898
   1899		kfree(logical);
   1900	}
   1901	return 0;
   1902}
   1903
   1904static void link_block_group(struct btrfs_block_group *cache)
   1905{
   1906	struct btrfs_space_info *space_info = cache->space_info;
   1907	int index = btrfs_bg_flags_to_raid_index(cache->flags);
   1908
   1909	down_write(&space_info->groups_sem);
   1910	list_add_tail(&cache->list, &space_info->block_groups[index]);
   1911	up_write(&space_info->groups_sem);
   1912}
   1913
   1914static struct btrfs_block_group *btrfs_create_block_group_cache(
   1915		struct btrfs_fs_info *fs_info, u64 start)
   1916{
   1917	struct btrfs_block_group *cache;
   1918
   1919	cache = kzalloc(sizeof(*cache), GFP_NOFS);
   1920	if (!cache)
   1921		return NULL;
   1922
   1923	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
   1924					GFP_NOFS);
   1925	if (!cache->free_space_ctl) {
   1926		kfree(cache);
   1927		return NULL;
   1928	}
   1929
   1930	cache->start = start;
   1931
   1932	cache->fs_info = fs_info;
   1933	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
   1934
   1935	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
   1936
   1937	refcount_set(&cache->refs, 1);
   1938	spin_lock_init(&cache->lock);
   1939	init_rwsem(&cache->data_rwsem);
   1940	INIT_LIST_HEAD(&cache->list);
   1941	INIT_LIST_HEAD(&cache->cluster_list);
   1942	INIT_LIST_HEAD(&cache->bg_list);
   1943	INIT_LIST_HEAD(&cache->ro_list);
   1944	INIT_LIST_HEAD(&cache->discard_list);
   1945	INIT_LIST_HEAD(&cache->dirty_list);
   1946	INIT_LIST_HEAD(&cache->io_list);
   1947	INIT_LIST_HEAD(&cache->active_bg_list);
   1948	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
   1949	atomic_set(&cache->frozen, 0);
   1950	mutex_init(&cache->free_space_lock);
   1951	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
   1952
   1953	return cache;
   1954}
   1955
   1956/*
   1957 * Iterate all chunks and verify that each of them has the corresponding block
   1958 * group
   1959 */
   1960static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
   1961{
   1962	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
   1963	struct extent_map *em;
   1964	struct btrfs_block_group *bg;
   1965	u64 start = 0;
   1966	int ret = 0;
   1967
   1968	while (1) {
   1969		read_lock(&map_tree->lock);
   1970		/*
   1971		 * lookup_extent_mapping will return the first extent map
   1972		 * intersecting the range, so setting @len to 1 is enough to
   1973		 * get the first chunk.
   1974		 */
   1975		em = lookup_extent_mapping(map_tree, start, 1);
   1976		read_unlock(&map_tree->lock);
   1977		if (!em)
   1978			break;
   1979
   1980		bg = btrfs_lookup_block_group(fs_info, em->start);
   1981		if (!bg) {
   1982			btrfs_err(fs_info,
   1983	"chunk start=%llu len=%llu doesn't have corresponding block group",
   1984				     em->start, em->len);
   1985			ret = -EUCLEAN;
   1986			free_extent_map(em);
   1987			break;
   1988		}
   1989		if (bg->start != em->start || bg->length != em->len ||
   1990		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
   1991		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
   1992			btrfs_err(fs_info,
   1993"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
   1994				em->start, em->len,
   1995				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
   1996				bg->start, bg->length,
   1997				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
   1998			ret = -EUCLEAN;
   1999			free_extent_map(em);
   2000			btrfs_put_block_group(bg);
   2001			break;
   2002		}
   2003		start = em->start + em->len;
   2004		free_extent_map(em);
   2005		btrfs_put_block_group(bg);
   2006	}
   2007	return ret;
   2008}
   2009
   2010static int read_one_block_group(struct btrfs_fs_info *info,
   2011				struct btrfs_block_group_item *bgi,
   2012				const struct btrfs_key *key,
   2013				int need_clear)
   2014{
   2015	struct btrfs_block_group *cache;
   2016	struct btrfs_space_info *space_info;
   2017	const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
   2018	int ret;
   2019
   2020	ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
   2021
   2022	cache = btrfs_create_block_group_cache(info, key->objectid);
   2023	if (!cache)
   2024		return -ENOMEM;
   2025
   2026	cache->length = key->offset;
   2027	cache->used = btrfs_stack_block_group_used(bgi);
   2028	cache->flags = btrfs_stack_block_group_flags(bgi);
   2029	cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
   2030
   2031	set_free_space_tree_thresholds(cache);
   2032
   2033	if (need_clear) {
   2034		/*
   2035		 * When we mount with old space cache, we need to
   2036		 * set BTRFS_DC_CLEAR and set dirty flag.
   2037		 *
   2038		 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
   2039		 *    truncate the old free space cache inode and
   2040		 *    setup a new one.
   2041		 * b) Setting 'dirty flag' makes sure that we flush
   2042		 *    the new space cache info onto disk.
   2043		 */
   2044		if (btrfs_test_opt(info, SPACE_CACHE))
   2045			cache->disk_cache_state = BTRFS_DC_CLEAR;
   2046	}
   2047	if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
   2048	    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
   2049			btrfs_err(info,
   2050"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
   2051				  cache->start);
   2052			ret = -EINVAL;
   2053			goto error;
   2054	}
   2055
   2056	ret = btrfs_load_block_group_zone_info(cache, false);
   2057	if (ret) {
   2058		btrfs_err(info, "zoned: failed to load zone info of bg %llu",
   2059			  cache->start);
   2060		goto error;
   2061	}
   2062
   2063	/*
   2064	 * We need to exclude the super stripes now so that the space info has
   2065	 * super bytes accounted for, otherwise we'll think we have more space
   2066	 * than we actually do.
   2067	 */
   2068	ret = exclude_super_stripes(cache);
   2069	if (ret) {
   2070		/* We may have excluded something, so call this just in case. */
   2071		btrfs_free_excluded_extents(cache);
   2072		goto error;
   2073	}
   2074
   2075	/*
   2076	 * For zoned filesystem, space after the allocation offset is the only
   2077	 * free space for a block group. So, we don't need any caching work.
   2078	 * btrfs_calc_zone_unusable() will set the amount of free space and
   2079	 * zone_unusable space.
   2080	 *
   2081	 * For regular filesystem, check for two cases, either we are full, and
   2082	 * therefore don't need to bother with the caching work since we won't
   2083	 * find any space, or we are empty, and we can just add all the space
   2084	 * in and be done with it.  This saves us _a_lot_ of time, particularly
   2085	 * in the full case.
   2086	 */
   2087	if (btrfs_is_zoned(info)) {
   2088		btrfs_calc_zone_unusable(cache);
   2089		/* Should not have any excluded extents. Just in case, though. */
   2090		btrfs_free_excluded_extents(cache);
   2091	} else if (cache->length == cache->used) {
   2092		cache->last_byte_to_unpin = (u64)-1;
   2093		cache->cached = BTRFS_CACHE_FINISHED;
   2094		btrfs_free_excluded_extents(cache);
   2095	} else if (cache->used == 0) {
   2096		cache->last_byte_to_unpin = (u64)-1;
   2097		cache->cached = BTRFS_CACHE_FINISHED;
   2098		add_new_free_space(cache, cache->start,
   2099				   cache->start + cache->length);
   2100		btrfs_free_excluded_extents(cache);
   2101	}
   2102
   2103	ret = btrfs_add_block_group_cache(info, cache);
   2104	if (ret) {
   2105		btrfs_remove_free_space_cache(cache);
   2106		goto error;
   2107	}
   2108	trace_btrfs_add_block_group(info, cache, 0);
   2109	btrfs_update_space_info(info, cache->flags, cache->length,
   2110				cache->used, cache->bytes_super,
   2111				cache->zone_unusable, &space_info);
   2112
   2113	cache->space_info = space_info;
   2114
   2115	link_block_group(cache);
   2116
   2117	set_avail_alloc_bits(info, cache->flags);
   2118	if (btrfs_chunk_writeable(info, cache->start)) {
   2119		if (cache->used == 0) {
   2120			ASSERT(list_empty(&cache->bg_list));
   2121			if (btrfs_test_opt(info, DISCARD_ASYNC))
   2122				btrfs_discard_queue_work(&info->discard_ctl, cache);
   2123			else
   2124				btrfs_mark_bg_unused(cache);
   2125		}
   2126	} else {
   2127		inc_block_group_ro(cache, 1);
   2128	}
   2129
   2130	return 0;
   2131error:
   2132	btrfs_put_block_group(cache);
   2133	return ret;
   2134}
   2135
   2136static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
   2137{
   2138	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
   2139	struct btrfs_space_info *space_info;
   2140	struct rb_node *node;
   2141	int ret = 0;
   2142
   2143	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
   2144		struct extent_map *em;
   2145		struct map_lookup *map;
   2146		struct btrfs_block_group *bg;
   2147
   2148		em = rb_entry(node, struct extent_map, rb_node);
   2149		map = em->map_lookup;
   2150		bg = btrfs_create_block_group_cache(fs_info, em->start);
   2151		if (!bg) {
   2152			ret = -ENOMEM;
   2153			break;
   2154		}
   2155
   2156		/* Fill dummy cache as FULL */
   2157		bg->length = em->len;
   2158		bg->flags = map->type;
   2159		bg->last_byte_to_unpin = (u64)-1;
   2160		bg->cached = BTRFS_CACHE_FINISHED;
   2161		bg->used = em->len;
   2162		bg->flags = map->type;
   2163		ret = btrfs_add_block_group_cache(fs_info, bg);
   2164		/*
   2165		 * We may have some valid block group cache added already, in
   2166		 * that case we skip to the next one.
   2167		 */
   2168		if (ret == -EEXIST) {
   2169			ret = 0;
   2170			btrfs_put_block_group(bg);
   2171			continue;
   2172		}
   2173
   2174		if (ret) {
   2175			btrfs_remove_free_space_cache(bg);
   2176			btrfs_put_block_group(bg);
   2177			break;
   2178		}
   2179
   2180		btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
   2181					0, 0, &space_info);
   2182		bg->space_info = space_info;
   2183		link_block_group(bg);
   2184
   2185		set_avail_alloc_bits(fs_info, bg->flags);
   2186	}
   2187	if (!ret)
   2188		btrfs_init_global_block_rsv(fs_info);
   2189	return ret;
   2190}
   2191
   2192int btrfs_read_block_groups(struct btrfs_fs_info *info)
   2193{
   2194	struct btrfs_root *root = btrfs_block_group_root(info);
   2195	struct btrfs_path *path;
   2196	int ret;
   2197	struct btrfs_block_group *cache;
   2198	struct btrfs_space_info *space_info;
   2199	struct btrfs_key key;
   2200	int need_clear = 0;
   2201	u64 cache_gen;
   2202
   2203	if (!root)
   2204		return fill_dummy_bgs(info);
   2205
   2206	key.objectid = 0;
   2207	key.offset = 0;
   2208	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
   2209	path = btrfs_alloc_path();
   2210	if (!path)
   2211		return -ENOMEM;
   2212
   2213	cache_gen = btrfs_super_cache_generation(info->super_copy);
   2214	if (btrfs_test_opt(info, SPACE_CACHE) &&
   2215	    btrfs_super_generation(info->super_copy) != cache_gen)
   2216		need_clear = 1;
   2217	if (btrfs_test_opt(info, CLEAR_CACHE))
   2218		need_clear = 1;
   2219
   2220	while (1) {
   2221		struct btrfs_block_group_item bgi;
   2222		struct extent_buffer *leaf;
   2223		int slot;
   2224
   2225		ret = find_first_block_group(info, path, &key);
   2226		if (ret > 0)
   2227			break;
   2228		if (ret != 0)
   2229			goto error;
   2230
   2231		leaf = path->nodes[0];
   2232		slot = path->slots[0];
   2233
   2234		read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
   2235				   sizeof(bgi));
   2236
   2237		btrfs_item_key_to_cpu(leaf, &key, slot);
   2238		btrfs_release_path(path);
   2239		ret = read_one_block_group(info, &bgi, &key, need_clear);
   2240		if (ret < 0)
   2241			goto error;
   2242		key.objectid += key.offset;
   2243		key.offset = 0;
   2244	}
   2245	btrfs_release_path(path);
   2246
   2247	list_for_each_entry(space_info, &info->space_info, list) {
   2248		int i;
   2249
   2250		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
   2251			if (list_empty(&space_info->block_groups[i]))
   2252				continue;
   2253			cache = list_first_entry(&space_info->block_groups[i],
   2254						 struct btrfs_block_group,
   2255						 list);
   2256			btrfs_sysfs_add_block_group_type(cache);
   2257		}
   2258
   2259		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
   2260		      (BTRFS_BLOCK_GROUP_RAID10 |
   2261		       BTRFS_BLOCK_GROUP_RAID1_MASK |
   2262		       BTRFS_BLOCK_GROUP_RAID56_MASK |
   2263		       BTRFS_BLOCK_GROUP_DUP)))
   2264			continue;
   2265		/*
   2266		 * Avoid allocating from un-mirrored block group if there are
   2267		 * mirrored block groups.
   2268		 */
   2269		list_for_each_entry(cache,
   2270				&space_info->block_groups[BTRFS_RAID_RAID0],
   2271				list)
   2272			inc_block_group_ro(cache, 1);
   2273		list_for_each_entry(cache,
   2274				&space_info->block_groups[BTRFS_RAID_SINGLE],
   2275				list)
   2276			inc_block_group_ro(cache, 1);
   2277	}
   2278
   2279	btrfs_init_global_block_rsv(info);
   2280	ret = check_chunk_block_group_mappings(info);
   2281error:
   2282	btrfs_free_path(path);
   2283	/*
   2284	 * We've hit some error while reading the extent tree, and have
   2285	 * rescue=ibadroots mount option.
   2286	 * Try to fill the tree using dummy block groups so that the user can
   2287	 * continue to mount and grab their data.
   2288	 */
   2289	if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
   2290		ret = fill_dummy_bgs(info);
   2291	return ret;
   2292}
   2293
   2294/*
   2295 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
   2296 * allocation.
   2297 *
   2298 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
   2299 * phases.
   2300 */
   2301static int insert_block_group_item(struct btrfs_trans_handle *trans,
   2302				   struct btrfs_block_group *block_group)
   2303{
   2304	struct btrfs_fs_info *fs_info = trans->fs_info;
   2305	struct btrfs_block_group_item bgi;
   2306	struct btrfs_root *root = btrfs_block_group_root(fs_info);
   2307	struct btrfs_key key;
   2308
   2309	spin_lock(&block_group->lock);
   2310	btrfs_set_stack_block_group_used(&bgi, block_group->used);
   2311	btrfs_set_stack_block_group_chunk_objectid(&bgi,
   2312						   block_group->global_root_id);
   2313	btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
   2314	key.objectid = block_group->start;
   2315	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
   2316	key.offset = block_group->length;
   2317	spin_unlock(&block_group->lock);
   2318
   2319	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
   2320}
   2321
   2322static int insert_dev_extent(struct btrfs_trans_handle *trans,
   2323			    struct btrfs_device *device, u64 chunk_offset,
   2324			    u64 start, u64 num_bytes)
   2325{
   2326	struct btrfs_fs_info *fs_info = device->fs_info;
   2327	struct btrfs_root *root = fs_info->dev_root;
   2328	struct btrfs_path *path;
   2329	struct btrfs_dev_extent *extent;
   2330	struct extent_buffer *leaf;
   2331	struct btrfs_key key;
   2332	int ret;
   2333
   2334	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
   2335	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
   2336	path = btrfs_alloc_path();
   2337	if (!path)
   2338		return -ENOMEM;
   2339
   2340	key.objectid = device->devid;
   2341	key.type = BTRFS_DEV_EXTENT_KEY;
   2342	key.offset = start;
   2343	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
   2344	if (ret)
   2345		goto out;
   2346
   2347	leaf = path->nodes[0];
   2348	extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
   2349	btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
   2350	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
   2351					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
   2352	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
   2353
   2354	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
   2355	btrfs_mark_buffer_dirty(leaf);
   2356out:
   2357	btrfs_free_path(path);
   2358	return ret;
   2359}
   2360
   2361/*
   2362 * This function belongs to phase 2.
   2363 *
   2364 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
   2365 * phases.
   2366 */
   2367static int insert_dev_extents(struct btrfs_trans_handle *trans,
   2368				   u64 chunk_offset, u64 chunk_size)
   2369{
   2370	struct btrfs_fs_info *fs_info = trans->fs_info;
   2371	struct btrfs_device *device;
   2372	struct extent_map *em;
   2373	struct map_lookup *map;
   2374	u64 dev_offset;
   2375	u64 stripe_size;
   2376	int i;
   2377	int ret = 0;
   2378
   2379	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
   2380	if (IS_ERR(em))
   2381		return PTR_ERR(em);
   2382
   2383	map = em->map_lookup;
   2384	stripe_size = em->orig_block_len;
   2385
   2386	/*
   2387	 * Take the device list mutex to prevent races with the final phase of
   2388	 * a device replace operation that replaces the device object associated
   2389	 * with the map's stripes, because the device object's id can change
   2390	 * at any time during that final phase of the device replace operation
   2391	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
   2392	 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
   2393	 * resulting in persisting a device extent item with such ID.
   2394	 */
   2395	mutex_lock(&fs_info->fs_devices->device_list_mutex);
   2396	for (i = 0; i < map->num_stripes; i++) {
   2397		device = map->stripes[i].dev;
   2398		dev_offset = map->stripes[i].physical;
   2399
   2400		ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
   2401				       stripe_size);
   2402		if (ret)
   2403			break;
   2404	}
   2405	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   2406
   2407	free_extent_map(em);
   2408	return ret;
   2409}
   2410
   2411/*
   2412 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
   2413 * chunk allocation.
   2414 *
   2415 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
   2416 * phases.
   2417 */
   2418void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
   2419{
   2420	struct btrfs_fs_info *fs_info = trans->fs_info;
   2421	struct btrfs_block_group *block_group;
   2422	int ret = 0;
   2423
   2424	while (!list_empty(&trans->new_bgs)) {
   2425		int index;
   2426
   2427		block_group = list_first_entry(&trans->new_bgs,
   2428					       struct btrfs_block_group,
   2429					       bg_list);
   2430		if (ret)
   2431			goto next;
   2432
   2433		index = btrfs_bg_flags_to_raid_index(block_group->flags);
   2434
   2435		ret = insert_block_group_item(trans, block_group);
   2436		if (ret)
   2437			btrfs_abort_transaction(trans, ret);
   2438		if (!block_group->chunk_item_inserted) {
   2439			mutex_lock(&fs_info->chunk_mutex);
   2440			ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
   2441			mutex_unlock(&fs_info->chunk_mutex);
   2442			if (ret)
   2443				btrfs_abort_transaction(trans, ret);
   2444		}
   2445		ret = insert_dev_extents(trans, block_group->start,
   2446					 block_group->length);
   2447		if (ret)
   2448			btrfs_abort_transaction(trans, ret);
   2449		add_block_group_free_space(trans, block_group);
   2450
   2451		/*
   2452		 * If we restriped during balance, we may have added a new raid
   2453		 * type, so now add the sysfs entries when it is safe to do so.
   2454		 * We don't have to worry about locking here as it's handled in
   2455		 * btrfs_sysfs_add_block_group_type.
   2456		 */
   2457		if (block_group->space_info->block_group_kobjs[index] == NULL)
   2458			btrfs_sysfs_add_block_group_type(block_group);
   2459
   2460		/* Already aborted the transaction if it failed. */
   2461next:
   2462		btrfs_delayed_refs_rsv_release(fs_info, 1);
   2463		list_del_init(&block_group->bg_list);
   2464	}
   2465	btrfs_trans_release_chunk_metadata(trans);
   2466}
   2467
   2468/*
   2469 * For extent tree v2 we use the block_group_item->chunk_offset to point at our
   2470 * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
   2471 */
   2472static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
   2473{
   2474	u64 div = SZ_1G;
   2475	u64 index;
   2476
   2477	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
   2478		return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
   2479
   2480	/* If we have a smaller fs index based on 128MiB. */
   2481	if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
   2482		div = SZ_128M;
   2483
   2484	offset = div64_u64(offset, div);
   2485	div64_u64_rem(offset, fs_info->nr_global_roots, &index);
   2486	return index;
   2487}
   2488
   2489struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
   2490						 u64 bytes_used, u64 type,
   2491						 u64 chunk_offset, u64 size)
   2492{
   2493	struct btrfs_fs_info *fs_info = trans->fs_info;
   2494	struct btrfs_block_group *cache;
   2495	int ret;
   2496
   2497	btrfs_set_log_full_commit(trans);
   2498
   2499	cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
   2500	if (!cache)
   2501		return ERR_PTR(-ENOMEM);
   2502
   2503	cache->length = size;
   2504	set_free_space_tree_thresholds(cache);
   2505	cache->used = bytes_used;
   2506	cache->flags = type;
   2507	cache->last_byte_to_unpin = (u64)-1;
   2508	cache->cached = BTRFS_CACHE_FINISHED;
   2509	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
   2510
   2511	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
   2512		cache->needs_free_space = 1;
   2513
   2514	ret = btrfs_load_block_group_zone_info(cache, true);
   2515	if (ret) {
   2516		btrfs_put_block_group(cache);
   2517		return ERR_PTR(ret);
   2518	}
   2519
   2520	ret = exclude_super_stripes(cache);
   2521	if (ret) {
   2522		/* We may have excluded something, so call this just in case */
   2523		btrfs_free_excluded_extents(cache);
   2524		btrfs_put_block_group(cache);
   2525		return ERR_PTR(ret);
   2526	}
   2527
   2528	add_new_free_space(cache, chunk_offset, chunk_offset + size);
   2529
   2530	btrfs_free_excluded_extents(cache);
   2531
   2532#ifdef CONFIG_BTRFS_DEBUG
   2533	if (btrfs_should_fragment_free_space(cache)) {
   2534		u64 new_bytes_used = size - bytes_used;
   2535
   2536		bytes_used += new_bytes_used >> 1;
   2537		fragment_free_space(cache);
   2538	}
   2539#endif
   2540	/*
   2541	 * Ensure the corresponding space_info object is created and
   2542	 * assigned to our block group. We want our bg to be added to the rbtree
   2543	 * with its ->space_info set.
   2544	 */
   2545	cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
   2546	ASSERT(cache->space_info);
   2547
   2548	ret = btrfs_add_block_group_cache(fs_info, cache);
   2549	if (ret) {
   2550		btrfs_remove_free_space_cache(cache);
   2551		btrfs_put_block_group(cache);
   2552		return ERR_PTR(ret);
   2553	}
   2554
   2555	/*
   2556	 * Now that our block group has its ->space_info set and is inserted in
   2557	 * the rbtree, update the space info's counters.
   2558	 */
   2559	trace_btrfs_add_block_group(fs_info, cache, 1);
   2560	btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
   2561				cache->bytes_super, cache->zone_unusable,
   2562				&cache->space_info);
   2563	btrfs_update_global_block_rsv(fs_info);
   2564
   2565	link_block_group(cache);
   2566
   2567	list_add_tail(&cache->bg_list, &trans->new_bgs);
   2568	trans->delayed_ref_updates++;
   2569	btrfs_update_delayed_refs_rsv(trans);
   2570
   2571	set_avail_alloc_bits(fs_info, type);
   2572	return cache;
   2573}
   2574
   2575/*
   2576 * Mark one block group RO, can be called several times for the same block
   2577 * group.
   2578 *
   2579 * @cache:		the destination block group
   2580 * @do_chunk_alloc:	whether need to do chunk pre-allocation, this is to
   2581 * 			ensure we still have some free space after marking this
   2582 * 			block group RO.
   2583 */
   2584int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
   2585			     bool do_chunk_alloc)
   2586{
   2587	struct btrfs_fs_info *fs_info = cache->fs_info;
   2588	struct btrfs_trans_handle *trans;
   2589	struct btrfs_root *root = btrfs_block_group_root(fs_info);
   2590	u64 alloc_flags;
   2591	int ret;
   2592	bool dirty_bg_running;
   2593
   2594	/*
   2595	 * This can only happen when we are doing read-only scrub on read-only
   2596	 * mount.
   2597	 * In that case we should not start a new transaction on read-only fs.
   2598	 * Thus here we skip all chunk allocations.
   2599	 */
   2600	if (sb_rdonly(fs_info->sb)) {
   2601		mutex_lock(&fs_info->ro_block_group_mutex);
   2602		ret = inc_block_group_ro(cache, 0);
   2603		mutex_unlock(&fs_info->ro_block_group_mutex);
   2604		return ret;
   2605	}
   2606
   2607	do {
   2608		trans = btrfs_join_transaction(root);
   2609		if (IS_ERR(trans))
   2610			return PTR_ERR(trans);
   2611
   2612		dirty_bg_running = false;
   2613
   2614		/*
   2615		 * We're not allowed to set block groups readonly after the dirty
   2616		 * block group cache has started writing.  If it already started,
   2617		 * back off and let this transaction commit.
   2618		 */
   2619		mutex_lock(&fs_info->ro_block_group_mutex);
   2620		if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
   2621			u64 transid = trans->transid;
   2622
   2623			mutex_unlock(&fs_info->ro_block_group_mutex);
   2624			btrfs_end_transaction(trans);
   2625
   2626			ret = btrfs_wait_for_commit(fs_info, transid);
   2627			if (ret)
   2628				return ret;
   2629			dirty_bg_running = true;
   2630		}
   2631	} while (dirty_bg_running);
   2632
   2633	if (do_chunk_alloc) {
   2634		/*
   2635		 * If we are changing raid levels, try to allocate a
   2636		 * corresponding block group with the new raid level.
   2637		 */
   2638		alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
   2639		if (alloc_flags != cache->flags) {
   2640			ret = btrfs_chunk_alloc(trans, alloc_flags,
   2641						CHUNK_ALLOC_FORCE);
   2642			/*
   2643			 * ENOSPC is allowed here, we may have enough space
   2644			 * already allocated at the new raid level to carry on
   2645			 */
   2646			if (ret == -ENOSPC)
   2647				ret = 0;
   2648			if (ret < 0)
   2649				goto out;
   2650		}
   2651	}
   2652
   2653	ret = inc_block_group_ro(cache, 0);
   2654	if (!do_chunk_alloc || ret == -ETXTBSY)
   2655		goto unlock_out;
   2656	if (!ret)
   2657		goto out;
   2658	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
   2659	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
   2660	if (ret < 0)
   2661		goto out;
   2662	ret = inc_block_group_ro(cache, 0);
   2663	if (ret == -ETXTBSY)
   2664		goto unlock_out;
   2665out:
   2666	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
   2667		alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
   2668		mutex_lock(&fs_info->chunk_mutex);
   2669		check_system_chunk(trans, alloc_flags);
   2670		mutex_unlock(&fs_info->chunk_mutex);
   2671	}
   2672unlock_out:
   2673	mutex_unlock(&fs_info->ro_block_group_mutex);
   2674
   2675	btrfs_end_transaction(trans);
   2676	return ret;
   2677}
   2678
   2679void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
   2680{
   2681	struct btrfs_space_info *sinfo = cache->space_info;
   2682	u64 num_bytes;
   2683
   2684	BUG_ON(!cache->ro);
   2685
   2686	spin_lock(&sinfo->lock);
   2687	spin_lock(&cache->lock);
   2688	if (!--cache->ro) {
   2689		if (btrfs_is_zoned(cache->fs_info)) {
   2690			/* Migrate zone_unusable bytes back */
   2691			cache->zone_unusable =
   2692				(cache->alloc_offset - cache->used) +
   2693				(cache->length - cache->zone_capacity);
   2694			sinfo->bytes_zone_unusable += cache->zone_unusable;
   2695			sinfo->bytes_readonly -= cache->zone_unusable;
   2696		}
   2697		num_bytes = cache->length - cache->reserved -
   2698			    cache->pinned - cache->bytes_super -
   2699			    cache->zone_unusable - cache->used;
   2700		sinfo->bytes_readonly -= num_bytes;
   2701		list_del_init(&cache->ro_list);
   2702	}
   2703	spin_unlock(&cache->lock);
   2704	spin_unlock(&sinfo->lock);
   2705}
   2706
   2707static int update_block_group_item(struct btrfs_trans_handle *trans,
   2708				   struct btrfs_path *path,
   2709				   struct btrfs_block_group *cache)
   2710{
   2711	struct btrfs_fs_info *fs_info = trans->fs_info;
   2712	int ret;
   2713	struct btrfs_root *root = btrfs_block_group_root(fs_info);
   2714	unsigned long bi;
   2715	struct extent_buffer *leaf;
   2716	struct btrfs_block_group_item bgi;
   2717	struct btrfs_key key;
   2718
   2719	key.objectid = cache->start;
   2720	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
   2721	key.offset = cache->length;
   2722
   2723	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
   2724	if (ret) {
   2725		if (ret > 0)
   2726			ret = -ENOENT;
   2727		goto fail;
   2728	}
   2729
   2730	leaf = path->nodes[0];
   2731	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
   2732	btrfs_set_stack_block_group_used(&bgi, cache->used);
   2733	btrfs_set_stack_block_group_chunk_objectid(&bgi,
   2734						   cache->global_root_id);
   2735	btrfs_set_stack_block_group_flags(&bgi, cache->flags);
   2736	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
   2737	btrfs_mark_buffer_dirty(leaf);
   2738fail:
   2739	btrfs_release_path(path);
   2740	return ret;
   2741
   2742}
   2743
   2744static int cache_save_setup(struct btrfs_block_group *block_group,
   2745			    struct btrfs_trans_handle *trans,
   2746			    struct btrfs_path *path)
   2747{
   2748	struct btrfs_fs_info *fs_info = block_group->fs_info;
   2749	struct btrfs_root *root = fs_info->tree_root;
   2750	struct inode *inode = NULL;
   2751	struct extent_changeset *data_reserved = NULL;
   2752	u64 alloc_hint = 0;
   2753	int dcs = BTRFS_DC_ERROR;
   2754	u64 cache_size = 0;
   2755	int retries = 0;
   2756	int ret = 0;
   2757
   2758	if (!btrfs_test_opt(fs_info, SPACE_CACHE))
   2759		return 0;
   2760
   2761	/*
   2762	 * If this block group is smaller than 100 megs don't bother caching the
   2763	 * block group.
   2764	 */
   2765	if (block_group->length < (100 * SZ_1M)) {
   2766		spin_lock(&block_group->lock);
   2767		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
   2768		spin_unlock(&block_group->lock);
   2769		return 0;
   2770	}
   2771
   2772	if (TRANS_ABORTED(trans))
   2773		return 0;
   2774again:
   2775	inode = lookup_free_space_inode(block_group, path);
   2776	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
   2777		ret = PTR_ERR(inode);
   2778		btrfs_release_path(path);
   2779		goto out;
   2780	}
   2781
   2782	if (IS_ERR(inode)) {
   2783		BUG_ON(retries);
   2784		retries++;
   2785
   2786		if (block_group->ro)
   2787			goto out_free;
   2788
   2789		ret = create_free_space_inode(trans, block_group, path);
   2790		if (ret)
   2791			goto out_free;
   2792		goto again;
   2793	}
   2794
   2795	/*
   2796	 * We want to set the generation to 0, that way if anything goes wrong
   2797	 * from here on out we know not to trust this cache when we load up next
   2798	 * time.
   2799	 */
   2800	BTRFS_I(inode)->generation = 0;
   2801	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   2802	if (ret) {
   2803		/*
   2804		 * So theoretically we could recover from this, simply set the
   2805		 * super cache generation to 0 so we know to invalidate the
   2806		 * cache, but then we'd have to keep track of the block groups
   2807		 * that fail this way so we know we _have_ to reset this cache
   2808		 * before the next commit or risk reading stale cache.  So to
   2809		 * limit our exposure to horrible edge cases lets just abort the
   2810		 * transaction, this only happens in really bad situations
   2811		 * anyway.
   2812		 */
   2813		btrfs_abort_transaction(trans, ret);
   2814		goto out_put;
   2815	}
   2816	WARN_ON(ret);
   2817
   2818	/* We've already setup this transaction, go ahead and exit */
   2819	if (block_group->cache_generation == trans->transid &&
   2820	    i_size_read(inode)) {
   2821		dcs = BTRFS_DC_SETUP;
   2822		goto out_put;
   2823	}
   2824
   2825	if (i_size_read(inode) > 0) {
   2826		ret = btrfs_check_trunc_cache_free_space(fs_info,
   2827					&fs_info->global_block_rsv);
   2828		if (ret)
   2829			goto out_put;
   2830
   2831		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
   2832		if (ret)
   2833			goto out_put;
   2834	}
   2835
   2836	spin_lock(&block_group->lock);
   2837	if (block_group->cached != BTRFS_CACHE_FINISHED ||
   2838	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
   2839		/*
   2840		 * don't bother trying to write stuff out _if_
   2841		 * a) we're not cached,
   2842		 * b) we're with nospace_cache mount option,
   2843		 * c) we're with v2 space_cache (FREE_SPACE_TREE).
   2844		 */
   2845		dcs = BTRFS_DC_WRITTEN;
   2846		spin_unlock(&block_group->lock);
   2847		goto out_put;
   2848	}
   2849	spin_unlock(&block_group->lock);
   2850
   2851	/*
   2852	 * We hit an ENOSPC when setting up the cache in this transaction, just
   2853	 * skip doing the setup, we've already cleared the cache so we're safe.
   2854	 */
   2855	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
   2856		ret = -ENOSPC;
   2857		goto out_put;
   2858	}
   2859
   2860	/*
   2861	 * Try to preallocate enough space based on how big the block group is.
   2862	 * Keep in mind this has to include any pinned space which could end up
   2863	 * taking up quite a bit since it's not folded into the other space
   2864	 * cache.
   2865	 */
   2866	cache_size = div_u64(block_group->length, SZ_256M);
   2867	if (!cache_size)
   2868		cache_size = 1;
   2869
   2870	cache_size *= 16;
   2871	cache_size *= fs_info->sectorsize;
   2872
   2873	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
   2874					  cache_size);
   2875	if (ret)
   2876		goto out_put;
   2877
   2878	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
   2879					      cache_size, cache_size,
   2880					      &alloc_hint);
   2881	/*
   2882	 * Our cache requires contiguous chunks so that we don't modify a bunch
   2883	 * of metadata or split extents when writing the cache out, which means
   2884	 * we can enospc if we are heavily fragmented in addition to just normal
   2885	 * out of space conditions.  So if we hit this just skip setting up any
   2886	 * other block groups for this transaction, maybe we'll unpin enough
   2887	 * space the next time around.
   2888	 */
   2889	if (!ret)
   2890		dcs = BTRFS_DC_SETUP;
   2891	else if (ret == -ENOSPC)
   2892		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
   2893
   2894out_put:
   2895	iput(inode);
   2896out_free:
   2897	btrfs_release_path(path);
   2898out:
   2899	spin_lock(&block_group->lock);
   2900	if (!ret && dcs == BTRFS_DC_SETUP)
   2901		block_group->cache_generation = trans->transid;
   2902	block_group->disk_cache_state = dcs;
   2903	spin_unlock(&block_group->lock);
   2904
   2905	extent_changeset_free(data_reserved);
   2906	return ret;
   2907}
   2908
   2909int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
   2910{
   2911	struct btrfs_fs_info *fs_info = trans->fs_info;
   2912	struct btrfs_block_group *cache, *tmp;
   2913	struct btrfs_transaction *cur_trans = trans->transaction;
   2914	struct btrfs_path *path;
   2915
   2916	if (list_empty(&cur_trans->dirty_bgs) ||
   2917	    !btrfs_test_opt(fs_info, SPACE_CACHE))
   2918		return 0;
   2919
   2920	path = btrfs_alloc_path();
   2921	if (!path)
   2922		return -ENOMEM;
   2923
   2924	/* Could add new block groups, use _safe just in case */
   2925	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
   2926				 dirty_list) {
   2927		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
   2928			cache_save_setup(cache, trans, path);
   2929	}
   2930
   2931	btrfs_free_path(path);
   2932	return 0;
   2933}
   2934
   2935/*
   2936 * Transaction commit does final block group cache writeback during a critical
   2937 * section where nothing is allowed to change the FS.  This is required in
   2938 * order for the cache to actually match the block group, but can introduce a
   2939 * lot of latency into the commit.
   2940 *
   2941 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
   2942 * There's a chance we'll have to redo some of it if the block group changes
   2943 * again during the commit, but it greatly reduces the commit latency by
   2944 * getting rid of the easy block groups while we're still allowing others to
   2945 * join the commit.
   2946 */
   2947int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
   2948{
   2949	struct btrfs_fs_info *fs_info = trans->fs_info;
   2950	struct btrfs_block_group *cache;
   2951	struct btrfs_transaction *cur_trans = trans->transaction;
   2952	int ret = 0;
   2953	int should_put;
   2954	struct btrfs_path *path = NULL;
   2955	LIST_HEAD(dirty);
   2956	struct list_head *io = &cur_trans->io_bgs;
   2957	int loops = 0;
   2958
   2959	spin_lock(&cur_trans->dirty_bgs_lock);
   2960	if (list_empty(&cur_trans->dirty_bgs)) {
   2961		spin_unlock(&cur_trans->dirty_bgs_lock);
   2962		return 0;
   2963	}
   2964	list_splice_init(&cur_trans->dirty_bgs, &dirty);
   2965	spin_unlock(&cur_trans->dirty_bgs_lock);
   2966
   2967again:
   2968	/* Make sure all the block groups on our dirty list actually exist */
   2969	btrfs_create_pending_block_groups(trans);
   2970
   2971	if (!path) {
   2972		path = btrfs_alloc_path();
   2973		if (!path) {
   2974			ret = -ENOMEM;
   2975			goto out;
   2976		}
   2977	}
   2978
   2979	/*
   2980	 * cache_write_mutex is here only to save us from balance or automatic
   2981	 * removal of empty block groups deleting this block group while we are
   2982	 * writing out the cache
   2983	 */
   2984	mutex_lock(&trans->transaction->cache_write_mutex);
   2985	while (!list_empty(&dirty)) {
   2986		bool drop_reserve = true;
   2987
   2988		cache = list_first_entry(&dirty, struct btrfs_block_group,
   2989					 dirty_list);
   2990		/*
   2991		 * This can happen if something re-dirties a block group that
   2992		 * is already under IO.  Just wait for it to finish and then do
   2993		 * it all again
   2994		 */
   2995		if (!list_empty(&cache->io_list)) {
   2996			list_del_init(&cache->io_list);
   2997			btrfs_wait_cache_io(trans, cache, path);
   2998			btrfs_put_block_group(cache);
   2999		}
   3000
   3001
   3002		/*
   3003		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
   3004		 * it should update the cache_state.  Don't delete until after
   3005		 * we wait.
   3006		 *
   3007		 * Since we're not running in the commit critical section
   3008		 * we need the dirty_bgs_lock to protect from update_block_group
   3009		 */
   3010		spin_lock(&cur_trans->dirty_bgs_lock);
   3011		list_del_init(&cache->dirty_list);
   3012		spin_unlock(&cur_trans->dirty_bgs_lock);
   3013
   3014		should_put = 1;
   3015
   3016		cache_save_setup(cache, trans, path);
   3017
   3018		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
   3019			cache->io_ctl.inode = NULL;
   3020			ret = btrfs_write_out_cache(trans, cache, path);
   3021			if (ret == 0 && cache->io_ctl.inode) {
   3022				should_put = 0;
   3023
   3024				/*
   3025				 * The cache_write_mutex is protecting the
   3026				 * io_list, also refer to the definition of
   3027				 * btrfs_transaction::io_bgs for more details
   3028				 */
   3029				list_add_tail(&cache->io_list, io);
   3030			} else {
   3031				/*
   3032				 * If we failed to write the cache, the
   3033				 * generation will be bad and life goes on
   3034				 */
   3035				ret = 0;
   3036			}
   3037		}
   3038		if (!ret) {
   3039			ret = update_block_group_item(trans, path, cache);
   3040			/*
   3041			 * Our block group might still be attached to the list
   3042			 * of new block groups in the transaction handle of some
   3043			 * other task (struct btrfs_trans_handle->new_bgs). This
   3044			 * means its block group item isn't yet in the extent
   3045			 * tree. If this happens ignore the error, as we will
   3046			 * try again later in the critical section of the
   3047			 * transaction commit.
   3048			 */
   3049			if (ret == -ENOENT) {
   3050				ret = 0;
   3051				spin_lock(&cur_trans->dirty_bgs_lock);
   3052				if (list_empty(&cache->dirty_list)) {
   3053					list_add_tail(&cache->dirty_list,
   3054						      &cur_trans->dirty_bgs);
   3055					btrfs_get_block_group(cache);
   3056					drop_reserve = false;
   3057				}
   3058				spin_unlock(&cur_trans->dirty_bgs_lock);
   3059			} else if (ret) {
   3060				btrfs_abort_transaction(trans, ret);
   3061			}
   3062		}
   3063
   3064		/* If it's not on the io list, we need to put the block group */
   3065		if (should_put)
   3066			btrfs_put_block_group(cache);
   3067		if (drop_reserve)
   3068			btrfs_delayed_refs_rsv_release(fs_info, 1);
   3069		/*
   3070		 * Avoid blocking other tasks for too long. It might even save
   3071		 * us from writing caches for block groups that are going to be
   3072		 * removed.
   3073		 */
   3074		mutex_unlock(&trans->transaction->cache_write_mutex);
   3075		if (ret)
   3076			goto out;
   3077		mutex_lock(&trans->transaction->cache_write_mutex);
   3078	}
   3079	mutex_unlock(&trans->transaction->cache_write_mutex);
   3080
   3081	/*
   3082	 * Go through delayed refs for all the stuff we've just kicked off
   3083	 * and then loop back (just once)
   3084	 */
   3085	if (!ret)
   3086		ret = btrfs_run_delayed_refs(trans, 0);
   3087	if (!ret && loops == 0) {
   3088		loops++;
   3089		spin_lock(&cur_trans->dirty_bgs_lock);
   3090		list_splice_init(&cur_trans->dirty_bgs, &dirty);
   3091		/*
   3092		 * dirty_bgs_lock protects us from concurrent block group
   3093		 * deletes too (not just cache_write_mutex).
   3094		 */
   3095		if (!list_empty(&dirty)) {
   3096			spin_unlock(&cur_trans->dirty_bgs_lock);
   3097			goto again;
   3098		}
   3099		spin_unlock(&cur_trans->dirty_bgs_lock);
   3100	}
   3101out:
   3102	if (ret < 0) {
   3103		spin_lock(&cur_trans->dirty_bgs_lock);
   3104		list_splice_init(&dirty, &cur_trans->dirty_bgs);
   3105		spin_unlock(&cur_trans->dirty_bgs_lock);
   3106		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
   3107	}
   3108
   3109	btrfs_free_path(path);
   3110	return ret;
   3111}
   3112
   3113int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
   3114{
   3115	struct btrfs_fs_info *fs_info = trans->fs_info;
   3116	struct btrfs_block_group *cache;
   3117	struct btrfs_transaction *cur_trans = trans->transaction;
   3118	int ret = 0;
   3119	int should_put;
   3120	struct btrfs_path *path;
   3121	struct list_head *io = &cur_trans->io_bgs;
   3122
   3123	path = btrfs_alloc_path();
   3124	if (!path)
   3125		return -ENOMEM;
   3126
   3127	/*
   3128	 * Even though we are in the critical section of the transaction commit,
   3129	 * we can still have concurrent tasks adding elements to this
   3130	 * transaction's list of dirty block groups. These tasks correspond to
   3131	 * endio free space workers started when writeback finishes for a
   3132	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
   3133	 * allocate new block groups as a result of COWing nodes of the root
   3134	 * tree when updating the free space inode. The writeback for the space
   3135	 * caches is triggered by an earlier call to
   3136	 * btrfs_start_dirty_block_groups() and iterations of the following
   3137	 * loop.
   3138	 * Also we want to do the cache_save_setup first and then run the
   3139	 * delayed refs to make sure we have the best chance at doing this all
   3140	 * in one shot.
   3141	 */
   3142	spin_lock(&cur_trans->dirty_bgs_lock);
   3143	while (!list_empty(&cur_trans->dirty_bgs)) {
   3144		cache = list_first_entry(&cur_trans->dirty_bgs,
   3145					 struct btrfs_block_group,
   3146					 dirty_list);
   3147
   3148		/*
   3149		 * This can happen if cache_save_setup re-dirties a block group
   3150		 * that is already under IO.  Just wait for it to finish and
   3151		 * then do it all again
   3152		 */
   3153		if (!list_empty(&cache->io_list)) {
   3154			spin_unlock(&cur_trans->dirty_bgs_lock);
   3155			list_del_init(&cache->io_list);
   3156			btrfs_wait_cache_io(trans, cache, path);
   3157			btrfs_put_block_group(cache);
   3158			spin_lock(&cur_trans->dirty_bgs_lock);
   3159		}
   3160
   3161		/*
   3162		 * Don't remove from the dirty list until after we've waited on
   3163		 * any pending IO
   3164		 */
   3165		list_del_init(&cache->dirty_list);
   3166		spin_unlock(&cur_trans->dirty_bgs_lock);
   3167		should_put = 1;
   3168
   3169		cache_save_setup(cache, trans, path);
   3170
   3171		if (!ret)
   3172			ret = btrfs_run_delayed_refs(trans,
   3173						     (unsigned long) -1);
   3174
   3175		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
   3176			cache->io_ctl.inode = NULL;
   3177			ret = btrfs_write_out_cache(trans, cache, path);
   3178			if (ret == 0 && cache->io_ctl.inode) {
   3179				should_put = 0;
   3180				list_add_tail(&cache->io_list, io);
   3181			} else {
   3182				/*
   3183				 * If we failed to write the cache, the
   3184				 * generation will be bad and life goes on
   3185				 */
   3186				ret = 0;
   3187			}
   3188		}
   3189		if (!ret) {
   3190			ret = update_block_group_item(trans, path, cache);
   3191			/*
   3192			 * One of the free space endio workers might have
   3193			 * created a new block group while updating a free space
   3194			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
   3195			 * and hasn't released its transaction handle yet, in
   3196			 * which case the new block group is still attached to
   3197			 * its transaction handle and its creation has not
   3198			 * finished yet (no block group item in the extent tree
   3199			 * yet, etc). If this is the case, wait for all free
   3200			 * space endio workers to finish and retry. This is a
   3201			 * very rare case so no need for a more efficient and
   3202			 * complex approach.
   3203			 */
   3204			if (ret == -ENOENT) {
   3205				wait_event(cur_trans->writer_wait,
   3206				   atomic_read(&cur_trans->num_writers) == 1);
   3207				ret = update_block_group_item(trans, path, cache);
   3208			}
   3209			if (ret)
   3210				btrfs_abort_transaction(trans, ret);
   3211		}
   3212
   3213		/* If its not on the io list, we need to put the block group */
   3214		if (should_put)
   3215			btrfs_put_block_group(cache);
   3216		btrfs_delayed_refs_rsv_release(fs_info, 1);
   3217		spin_lock(&cur_trans->dirty_bgs_lock);
   3218	}
   3219	spin_unlock(&cur_trans->dirty_bgs_lock);
   3220
   3221	/*
   3222	 * Refer to the definition of io_bgs member for details why it's safe
   3223	 * to use it without any locking
   3224	 */
   3225	while (!list_empty(io)) {
   3226		cache = list_first_entry(io, struct btrfs_block_group,
   3227					 io_list);
   3228		list_del_init(&cache->io_list);
   3229		btrfs_wait_cache_io(trans, cache, path);
   3230		btrfs_put_block_group(cache);
   3231	}
   3232
   3233	btrfs_free_path(path);
   3234	return ret;
   3235}
   3236
   3237static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
   3238					      u64 bytes_freed)
   3239{
   3240	const struct btrfs_space_info *space_info = bg->space_info;
   3241	const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
   3242	const u64 new_val = bg->used;
   3243	const u64 old_val = new_val + bytes_freed;
   3244	u64 thresh;
   3245
   3246	if (reclaim_thresh == 0)
   3247		return false;
   3248
   3249	thresh = div_factor_fine(bg->length, reclaim_thresh);
   3250
   3251	/*
   3252	 * If we were below the threshold before don't reclaim, we are likely a
   3253	 * brand new block group and we don't want to relocate new block groups.
   3254	 */
   3255	if (old_val < thresh)
   3256		return false;
   3257	if (new_val >= thresh)
   3258		return false;
   3259	return true;
   3260}
   3261
   3262int btrfs_update_block_group(struct btrfs_trans_handle *trans,
   3263			     u64 bytenr, u64 num_bytes, bool alloc)
   3264{
   3265	struct btrfs_fs_info *info = trans->fs_info;
   3266	struct btrfs_block_group *cache = NULL;
   3267	u64 total = num_bytes;
   3268	u64 old_val;
   3269	u64 byte_in_group;
   3270	int factor;
   3271	int ret = 0;
   3272
   3273	/* Block accounting for super block */
   3274	spin_lock(&info->delalloc_root_lock);
   3275	old_val = btrfs_super_bytes_used(info->super_copy);
   3276	if (alloc)
   3277		old_val += num_bytes;
   3278	else
   3279		old_val -= num_bytes;
   3280	btrfs_set_super_bytes_used(info->super_copy, old_val);
   3281	spin_unlock(&info->delalloc_root_lock);
   3282
   3283	while (total) {
   3284		bool reclaim;
   3285
   3286		cache = btrfs_lookup_block_group(info, bytenr);
   3287		if (!cache) {
   3288			ret = -ENOENT;
   3289			break;
   3290		}
   3291		factor = btrfs_bg_type_to_factor(cache->flags);
   3292
   3293		/*
   3294		 * If this block group has free space cache written out, we
   3295		 * need to make sure to load it if we are removing space.  This
   3296		 * is because we need the unpinning stage to actually add the
   3297		 * space back to the block group, otherwise we will leak space.
   3298		 */
   3299		if (!alloc && !btrfs_block_group_done(cache))
   3300			btrfs_cache_block_group(cache, 1);
   3301
   3302		byte_in_group = bytenr - cache->start;
   3303		WARN_ON(byte_in_group > cache->length);
   3304
   3305		spin_lock(&cache->space_info->lock);
   3306		spin_lock(&cache->lock);
   3307
   3308		if (btrfs_test_opt(info, SPACE_CACHE) &&
   3309		    cache->disk_cache_state < BTRFS_DC_CLEAR)
   3310			cache->disk_cache_state = BTRFS_DC_CLEAR;
   3311
   3312		old_val = cache->used;
   3313		num_bytes = min(total, cache->length - byte_in_group);
   3314		if (alloc) {
   3315			old_val += num_bytes;
   3316			cache->used = old_val;
   3317			cache->reserved -= num_bytes;
   3318			cache->space_info->bytes_reserved -= num_bytes;
   3319			cache->space_info->bytes_used += num_bytes;
   3320			cache->space_info->disk_used += num_bytes * factor;
   3321			spin_unlock(&cache->lock);
   3322			spin_unlock(&cache->space_info->lock);
   3323		} else {
   3324			old_val -= num_bytes;
   3325			cache->used = old_val;
   3326			cache->pinned += num_bytes;
   3327			btrfs_space_info_update_bytes_pinned(info,
   3328					cache->space_info, num_bytes);
   3329			cache->space_info->bytes_used -= num_bytes;
   3330			cache->space_info->disk_used -= num_bytes * factor;
   3331
   3332			reclaim = should_reclaim_block_group(cache, num_bytes);
   3333			spin_unlock(&cache->lock);
   3334			spin_unlock(&cache->space_info->lock);
   3335
   3336			set_extent_dirty(&trans->transaction->pinned_extents,
   3337					 bytenr, bytenr + num_bytes - 1,
   3338					 GFP_NOFS | __GFP_NOFAIL);
   3339		}
   3340
   3341		spin_lock(&trans->transaction->dirty_bgs_lock);
   3342		if (list_empty(&cache->dirty_list)) {
   3343			list_add_tail(&cache->dirty_list,
   3344				      &trans->transaction->dirty_bgs);
   3345			trans->delayed_ref_updates++;
   3346			btrfs_get_block_group(cache);
   3347		}
   3348		spin_unlock(&trans->transaction->dirty_bgs_lock);
   3349
   3350		/*
   3351		 * No longer have used bytes in this block group, queue it for
   3352		 * deletion. We do this after adding the block group to the
   3353		 * dirty list to avoid races between cleaner kthread and space
   3354		 * cache writeout.
   3355		 */
   3356		if (!alloc && old_val == 0) {
   3357			if (!btrfs_test_opt(info, DISCARD_ASYNC))
   3358				btrfs_mark_bg_unused(cache);
   3359		} else if (!alloc && reclaim) {
   3360			btrfs_mark_bg_to_reclaim(cache);
   3361		}
   3362
   3363		btrfs_put_block_group(cache);
   3364		total -= num_bytes;
   3365		bytenr += num_bytes;
   3366	}
   3367
   3368	/* Modified block groups are accounted for in the delayed_refs_rsv. */
   3369	btrfs_update_delayed_refs_rsv(trans);
   3370	return ret;
   3371}
   3372
   3373/**
   3374 * btrfs_add_reserved_bytes - update the block_group and space info counters
   3375 * @cache:	The cache we are manipulating
   3376 * @ram_bytes:  The number of bytes of file content, and will be same to
   3377 *              @num_bytes except for the compress path.
   3378 * @num_bytes:	The number of bytes in question
   3379 * @delalloc:   The blocks are allocated for the delalloc write
   3380 *
   3381 * This is called by the allocator when it reserves space. If this is a
   3382 * reservation and the block group has become read only we cannot make the
   3383 * reservation and return -EAGAIN, otherwise this function always succeeds.
   3384 */
   3385int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
   3386			     u64 ram_bytes, u64 num_bytes, int delalloc)
   3387{
   3388	struct btrfs_space_info *space_info = cache->space_info;
   3389	int ret = 0;
   3390
   3391	spin_lock(&space_info->lock);
   3392	spin_lock(&cache->lock);
   3393	if (cache->ro) {
   3394		ret = -EAGAIN;
   3395	} else {
   3396		cache->reserved += num_bytes;
   3397		space_info->bytes_reserved += num_bytes;
   3398		trace_btrfs_space_reservation(cache->fs_info, "space_info",
   3399					      space_info->flags, num_bytes, 1);
   3400		btrfs_space_info_update_bytes_may_use(cache->fs_info,
   3401						      space_info, -ram_bytes);
   3402		if (delalloc)
   3403			cache->delalloc_bytes += num_bytes;
   3404
   3405		/*
   3406		 * Compression can use less space than we reserved, so wake
   3407		 * tickets if that happens
   3408		 */
   3409		if (num_bytes < ram_bytes)
   3410			btrfs_try_granting_tickets(cache->fs_info, space_info);
   3411	}
   3412	spin_unlock(&cache->lock);
   3413	spin_unlock(&space_info->lock);
   3414	return ret;
   3415}
   3416
   3417/**
   3418 * btrfs_free_reserved_bytes - update the block_group and space info counters
   3419 * @cache:      The cache we are manipulating
   3420 * @num_bytes:  The number of bytes in question
   3421 * @delalloc:   The blocks are allocated for the delalloc write
   3422 *
   3423 * This is called by somebody who is freeing space that was never actually used
   3424 * on disk.  For example if you reserve some space for a new leaf in transaction
   3425 * A and before transaction A commits you free that leaf, you call this with
   3426 * reserve set to 0 in order to clear the reservation.
   3427 */
   3428void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
   3429			       u64 num_bytes, int delalloc)
   3430{
   3431	struct btrfs_space_info *space_info = cache->space_info;
   3432
   3433	spin_lock(&space_info->lock);
   3434	spin_lock(&cache->lock);
   3435	if (cache->ro)
   3436		space_info->bytes_readonly += num_bytes;
   3437	cache->reserved -= num_bytes;
   3438	space_info->bytes_reserved -= num_bytes;
   3439	space_info->max_extent_size = 0;
   3440
   3441	if (delalloc)
   3442		cache->delalloc_bytes -= num_bytes;
   3443	spin_unlock(&cache->lock);
   3444
   3445	btrfs_try_granting_tickets(cache->fs_info, space_info);
   3446	spin_unlock(&space_info->lock);
   3447}
   3448
   3449static void force_metadata_allocation(struct btrfs_fs_info *info)
   3450{
   3451	struct list_head *head = &info->space_info;
   3452	struct btrfs_space_info *found;
   3453
   3454	list_for_each_entry(found, head, list) {
   3455		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
   3456			found->force_alloc = CHUNK_ALLOC_FORCE;
   3457	}
   3458}
   3459
   3460static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
   3461			      struct btrfs_space_info *sinfo, int force)
   3462{
   3463	u64 bytes_used = btrfs_space_info_used(sinfo, false);
   3464	u64 thresh;
   3465
   3466	if (force == CHUNK_ALLOC_FORCE)
   3467		return 1;
   3468
   3469	/*
   3470	 * in limited mode, we want to have some free space up to
   3471	 * about 1% of the FS size.
   3472	 */
   3473	if (force == CHUNK_ALLOC_LIMITED) {
   3474		thresh = btrfs_super_total_bytes(fs_info->super_copy);
   3475		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
   3476
   3477		if (sinfo->total_bytes - bytes_used < thresh)
   3478			return 1;
   3479	}
   3480
   3481	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
   3482		return 0;
   3483	return 1;
   3484}
   3485
   3486int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
   3487{
   3488	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
   3489
   3490	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
   3491}
   3492
   3493static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
   3494{
   3495	struct btrfs_block_group *bg;
   3496	int ret;
   3497
   3498	/*
   3499	 * Check if we have enough space in the system space info because we
   3500	 * will need to update device items in the chunk btree and insert a new
   3501	 * chunk item in the chunk btree as well. This will allocate a new
   3502	 * system block group if needed.
   3503	 */
   3504	check_system_chunk(trans, flags);
   3505
   3506	bg = btrfs_create_chunk(trans, flags);
   3507	if (IS_ERR(bg)) {
   3508		ret = PTR_ERR(bg);
   3509		goto out;
   3510	}
   3511
   3512	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
   3513	/*
   3514	 * Normally we are not expected to fail with -ENOSPC here, since we have
   3515	 * previously reserved space in the system space_info and allocated one
   3516	 * new system chunk if necessary. However there are three exceptions:
   3517	 *
   3518	 * 1) We may have enough free space in the system space_info but all the
   3519	 *    existing system block groups have a profile which can not be used
   3520	 *    for extent allocation.
   3521	 *
   3522	 *    This happens when mounting in degraded mode. For example we have a
   3523	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
   3524	 *    using the other device in degraded mode. If we then allocate a chunk,
   3525	 *    we may have enough free space in the existing system space_info, but
   3526	 *    none of the block groups can be used for extent allocation since they
   3527	 *    have a RAID1 profile, and because we are in degraded mode with a
   3528	 *    single device, we are forced to allocate a new system chunk with a
   3529	 *    SINGLE profile. Making check_system_chunk() iterate over all system
   3530	 *    block groups and check if they have a usable profile and enough space
   3531	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
   3532	 *    try again after forcing allocation of a new system chunk. Like this
   3533	 *    we avoid paying the cost of that search in normal circumstances, when
   3534	 *    we were not mounted in degraded mode;
   3535	 *
   3536	 * 2) We had enough free space info the system space_info, and one suitable
   3537	 *    block group to allocate from when we called check_system_chunk()
   3538	 *    above. However right after we called it, the only system block group
   3539	 *    with enough free space got turned into RO mode by a running scrub,
   3540	 *    and in this case we have to allocate a new one and retry. We only
   3541	 *    need do this allocate and retry once, since we have a transaction
   3542	 *    handle and scrub uses the commit root to search for block groups;
   3543	 *
   3544	 * 3) We had one system block group with enough free space when we called
   3545	 *    check_system_chunk(), but after that, right before we tried to
   3546	 *    allocate the last extent buffer we needed, a discard operation came
   3547	 *    in and it temporarily removed the last free space entry from the
   3548	 *    block group (discard removes a free space entry, discards it, and
   3549	 *    then adds back the entry to the block group cache).
   3550	 */
   3551	if (ret == -ENOSPC) {
   3552		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
   3553		struct btrfs_block_group *sys_bg;
   3554
   3555		sys_bg = btrfs_create_chunk(trans, sys_flags);
   3556		if (IS_ERR(sys_bg)) {
   3557			ret = PTR_ERR(sys_bg);
   3558			btrfs_abort_transaction(trans, ret);
   3559			goto out;
   3560		}
   3561
   3562		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
   3563		if (ret) {
   3564			btrfs_abort_transaction(trans, ret);
   3565			goto out;
   3566		}
   3567
   3568		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
   3569		if (ret) {
   3570			btrfs_abort_transaction(trans, ret);
   3571			goto out;
   3572		}
   3573	} else if (ret) {
   3574		btrfs_abort_transaction(trans, ret);
   3575		goto out;
   3576	}
   3577out:
   3578	btrfs_trans_release_chunk_metadata(trans);
   3579
   3580	if (ret)
   3581		return ERR_PTR(ret);
   3582
   3583	btrfs_get_block_group(bg);
   3584	return bg;
   3585}
   3586
   3587/*
   3588 * Chunk allocation is done in 2 phases:
   3589 *
   3590 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
   3591 *    the chunk, the chunk mapping, create its block group and add the items
   3592 *    that belong in the chunk btree to it - more specifically, we need to
   3593 *    update device items in the chunk btree and add a new chunk item to it.
   3594 *
   3595 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
   3596 *    group item to the extent btree and the device extent items to the devices
   3597 *    btree.
   3598 *
   3599 * This is done to prevent deadlocks. For example when COWing a node from the
   3600 * extent btree we are holding a write lock on the node's parent and if we
   3601 * trigger chunk allocation and attempted to insert the new block group item
   3602 * in the extent btree right way, we could deadlock because the path for the
   3603 * insertion can include that parent node. At first glance it seems impossible
   3604 * to trigger chunk allocation after starting a transaction since tasks should
   3605 * reserve enough transaction units (metadata space), however while that is true
   3606 * most of the time, chunk allocation may still be triggered for several reasons:
   3607 *
   3608 * 1) When reserving metadata, we check if there is enough free space in the
   3609 *    metadata space_info and therefore don't trigger allocation of a new chunk.
   3610 *    However later when the task actually tries to COW an extent buffer from
   3611 *    the extent btree or from the device btree for example, it is forced to
   3612 *    allocate a new block group (chunk) because the only one that had enough
   3613 *    free space was just turned to RO mode by a running scrub for example (or
   3614 *    device replace, block group reclaim thread, etc), so we can not use it
   3615 *    for allocating an extent and end up being forced to allocate a new one;
   3616 *
   3617 * 2) Because we only check that the metadata space_info has enough free bytes,
   3618 *    we end up not allocating a new metadata chunk in that case. However if
   3619 *    the filesystem was mounted in degraded mode, none of the existing block
   3620 *    groups might be suitable for extent allocation due to their incompatible
   3621 *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
   3622 *    use a RAID1 profile, in degraded mode using a single device). In this case
   3623 *    when the task attempts to COW some extent buffer of the extent btree for
   3624 *    example, it will trigger allocation of a new metadata block group with a
   3625 *    suitable profile (SINGLE profile in the example of the degraded mount of
   3626 *    the RAID1 filesystem);
   3627 *
   3628 * 3) The task has reserved enough transaction units / metadata space, but when
   3629 *    it attempts to COW an extent buffer from the extent or device btree for
   3630 *    example, it does not find any free extent in any metadata block group,
   3631 *    therefore forced to try to allocate a new metadata block group.
   3632 *    This is because some other task allocated all available extents in the
   3633 *    meanwhile - this typically happens with tasks that don't reserve space
   3634 *    properly, either intentionally or as a bug. One example where this is
   3635 *    done intentionally is fsync, as it does not reserve any transaction units
   3636 *    and ends up allocating a variable number of metadata extents for log
   3637 *    tree extent buffers;
   3638 *
   3639 * 4) The task has reserved enough transaction units / metadata space, but right
   3640 *    before it tries to allocate the last extent buffer it needs, a discard
   3641 *    operation comes in and, temporarily, removes the last free space entry from
   3642 *    the only metadata block group that had free space (discard starts by
   3643 *    removing a free space entry from a block group, then does the discard
   3644 *    operation and, once it's done, it adds back the free space entry to the
   3645 *    block group).
   3646 *
   3647 * We also need this 2 phases setup when adding a device to a filesystem with
   3648 * a seed device - we must create new metadata and system chunks without adding
   3649 * any of the block group items to the chunk, extent and device btrees. If we
   3650 * did not do it this way, we would get ENOSPC when attempting to update those
   3651 * btrees, since all the chunks from the seed device are read-only.
   3652 *
   3653 * Phase 1 does the updates and insertions to the chunk btree because if we had
   3654 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
   3655 * parallel, we risk having too many system chunks allocated by many tasks if
   3656 * many tasks reach phase 1 without the previous ones completing phase 2. In the
   3657 * extreme case this leads to exhaustion of the system chunk array in the
   3658 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
   3659 * and with RAID filesystems (so we have more device items in the chunk btree).
   3660 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
   3661 * the system chunk array due to concurrent allocations") provides more details.
   3662 *
   3663 * Allocation of system chunks does not happen through this function. A task that
   3664 * needs to update the chunk btree (the only btree that uses system chunks), must
   3665 * preallocate chunk space by calling either check_system_chunk() or
   3666 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
   3667 * metadata chunk or when removing a chunk, while the later is used before doing
   3668 * a modification to the chunk btree - use cases for the later are adding,
   3669 * removing and resizing a device as well as relocation of a system chunk.
   3670 * See the comment below for more details.
   3671 *
   3672 * The reservation of system space, done through check_system_chunk(), as well
   3673 * as all the updates and insertions into the chunk btree must be done while
   3674 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
   3675 * an extent buffer from the chunks btree we never trigger allocation of a new
   3676 * system chunk, which would result in a deadlock (trying to lock twice an
   3677 * extent buffer of the chunk btree, first time before triggering the chunk
   3678 * allocation and the second time during chunk allocation while attempting to
   3679 * update the chunks btree). The system chunk array is also updated while holding
   3680 * that mutex. The same logic applies to removing chunks - we must reserve system
   3681 * space, update the chunk btree and the system chunk array in the superblock
   3682 * while holding fs_info->chunk_mutex.
   3683 *
   3684 * This function, btrfs_chunk_alloc(), belongs to phase 1.
   3685 *
   3686 * If @force is CHUNK_ALLOC_FORCE:
   3687 *    - return 1 if it successfully allocates a chunk,
   3688 *    - return errors including -ENOSPC otherwise.
   3689 * If @force is NOT CHUNK_ALLOC_FORCE:
   3690 *    - return 0 if it doesn't need to allocate a new chunk,
   3691 *    - return 1 if it successfully allocates a chunk,
   3692 *    - return errors including -ENOSPC otherwise.
   3693 */
   3694int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
   3695		      enum btrfs_chunk_alloc_enum force)
   3696{
   3697	struct btrfs_fs_info *fs_info = trans->fs_info;
   3698	struct btrfs_space_info *space_info;
   3699	struct btrfs_block_group *ret_bg;
   3700	bool wait_for_alloc = false;
   3701	bool should_alloc = false;
   3702	bool from_extent_allocation = false;
   3703	int ret = 0;
   3704
   3705	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
   3706		from_extent_allocation = true;
   3707		force = CHUNK_ALLOC_FORCE;
   3708	}
   3709
   3710	/* Don't re-enter if we're already allocating a chunk */
   3711	if (trans->allocating_chunk)
   3712		return -ENOSPC;
   3713	/*
   3714	 * Allocation of system chunks can not happen through this path, as we
   3715	 * could end up in a deadlock if we are allocating a data or metadata
   3716	 * chunk and there is another task modifying the chunk btree.
   3717	 *
   3718	 * This is because while we are holding the chunk mutex, we will attempt
   3719	 * to add the new chunk item to the chunk btree or update an existing
   3720	 * device item in the chunk btree, while the other task that is modifying
   3721	 * the chunk btree is attempting to COW an extent buffer while holding a
   3722	 * lock on it and on its parent - if the COW operation triggers a system
   3723	 * chunk allocation, then we can deadlock because we are holding the
   3724	 * chunk mutex and we may need to access that extent buffer or its parent
   3725	 * in order to add the chunk item or update a device item.
   3726	 *
   3727	 * Tasks that want to modify the chunk tree should reserve system space
   3728	 * before updating the chunk btree, by calling either
   3729	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
   3730	 * It's possible that after a task reserves the space, it still ends up
   3731	 * here - this happens in the cases described above at do_chunk_alloc().
   3732	 * The task will have to either retry or fail.
   3733	 */
   3734	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
   3735		return -ENOSPC;
   3736
   3737	space_info = btrfs_find_space_info(fs_info, flags);
   3738	ASSERT(space_info);
   3739
   3740	do {
   3741		spin_lock(&space_info->lock);
   3742		if (force < space_info->force_alloc)
   3743			force = space_info->force_alloc;
   3744		should_alloc = should_alloc_chunk(fs_info, space_info, force);
   3745		if (space_info->full) {
   3746			/* No more free physical space */
   3747			if (should_alloc)
   3748				ret = -ENOSPC;
   3749			else
   3750				ret = 0;
   3751			spin_unlock(&space_info->lock);
   3752			return ret;
   3753		} else if (!should_alloc) {
   3754			spin_unlock(&space_info->lock);
   3755			return 0;
   3756		} else if (space_info->chunk_alloc) {
   3757			/*
   3758			 * Someone is already allocating, so we need to block
   3759			 * until this someone is finished and then loop to
   3760			 * recheck if we should continue with our allocation
   3761			 * attempt.
   3762			 */
   3763			wait_for_alloc = true;
   3764			spin_unlock(&space_info->lock);
   3765			mutex_lock(&fs_info->chunk_mutex);
   3766			mutex_unlock(&fs_info->chunk_mutex);
   3767		} else {
   3768			/* Proceed with allocation */
   3769			space_info->chunk_alloc = 1;
   3770			wait_for_alloc = false;
   3771			spin_unlock(&space_info->lock);
   3772		}
   3773
   3774		cond_resched();
   3775	} while (wait_for_alloc);
   3776
   3777	mutex_lock(&fs_info->chunk_mutex);
   3778	trans->allocating_chunk = true;
   3779
   3780	/*
   3781	 * If we have mixed data/metadata chunks we want to make sure we keep
   3782	 * allocating mixed chunks instead of individual chunks.
   3783	 */
   3784	if (btrfs_mixed_space_info(space_info))
   3785		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
   3786
   3787	/*
   3788	 * if we're doing a data chunk, go ahead and make sure that
   3789	 * we keep a reasonable number of metadata chunks allocated in the
   3790	 * FS as well.
   3791	 */
   3792	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
   3793		fs_info->data_chunk_allocations++;
   3794		if (!(fs_info->data_chunk_allocations %
   3795		      fs_info->metadata_ratio))
   3796			force_metadata_allocation(fs_info);
   3797	}
   3798
   3799	ret_bg = do_chunk_alloc(trans, flags);
   3800	trans->allocating_chunk = false;
   3801
   3802	if (IS_ERR(ret_bg)) {
   3803		ret = PTR_ERR(ret_bg);
   3804	} else if (from_extent_allocation) {
   3805		/*
   3806		 * New block group is likely to be used soon. Try to activate
   3807		 * it now. Failure is OK for now.
   3808		 */
   3809		btrfs_zone_activate(ret_bg);
   3810	}
   3811
   3812	if (!ret)
   3813		btrfs_put_block_group(ret_bg);
   3814
   3815	spin_lock(&space_info->lock);
   3816	if (ret < 0) {
   3817		if (ret == -ENOSPC)
   3818			space_info->full = 1;
   3819		else
   3820			goto out;
   3821	} else {
   3822		ret = 1;
   3823		space_info->max_extent_size = 0;
   3824	}
   3825
   3826	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
   3827out:
   3828	space_info->chunk_alloc = 0;
   3829	spin_unlock(&space_info->lock);
   3830	mutex_unlock(&fs_info->chunk_mutex);
   3831
   3832	return ret;
   3833}
   3834
   3835static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
   3836{
   3837	u64 num_dev;
   3838
   3839	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
   3840	if (!num_dev)
   3841		num_dev = fs_info->fs_devices->rw_devices;
   3842
   3843	return num_dev;
   3844}
   3845
   3846static void reserve_chunk_space(struct btrfs_trans_handle *trans,
   3847				u64 bytes,
   3848				u64 type)
   3849{
   3850	struct btrfs_fs_info *fs_info = trans->fs_info;
   3851	struct btrfs_space_info *info;
   3852	u64 left;
   3853	int ret = 0;
   3854
   3855	/*
   3856	 * Needed because we can end up allocating a system chunk and for an
   3857	 * atomic and race free space reservation in the chunk block reserve.
   3858	 */
   3859	lockdep_assert_held(&fs_info->chunk_mutex);
   3860
   3861	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
   3862	spin_lock(&info->lock);
   3863	left = info->total_bytes - btrfs_space_info_used(info, true);
   3864	spin_unlock(&info->lock);
   3865
   3866	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
   3867		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
   3868			   left, bytes, type);
   3869		btrfs_dump_space_info(fs_info, info, 0, 0);
   3870	}
   3871
   3872	if (left < bytes) {
   3873		u64 flags = btrfs_system_alloc_profile(fs_info);
   3874		struct btrfs_block_group *bg;
   3875
   3876		/*
   3877		 * Ignore failure to create system chunk. We might end up not
   3878		 * needing it, as we might not need to COW all nodes/leafs from
   3879		 * the paths we visit in the chunk tree (they were already COWed
   3880		 * or created in the current transaction for example).
   3881		 */
   3882		bg = btrfs_create_chunk(trans, flags);
   3883		if (IS_ERR(bg)) {
   3884			ret = PTR_ERR(bg);
   3885		} else {
   3886			/*
   3887			 * If we fail to add the chunk item here, we end up
   3888			 * trying again at phase 2 of chunk allocation, at
   3889			 * btrfs_create_pending_block_groups(). So ignore
   3890			 * any error here. An ENOSPC here could happen, due to
   3891			 * the cases described at do_chunk_alloc() - the system
   3892			 * block group we just created was just turned into RO
   3893			 * mode by a scrub for example, or a running discard
   3894			 * temporarily removed its free space entries, etc.
   3895			 */
   3896			btrfs_chunk_alloc_add_chunk_item(trans, bg);
   3897		}
   3898	}
   3899
   3900	if (!ret) {
   3901		ret = btrfs_block_rsv_add(fs_info,
   3902					  &fs_info->chunk_block_rsv,
   3903					  bytes, BTRFS_RESERVE_NO_FLUSH);
   3904		if (!ret)
   3905			trans->chunk_bytes_reserved += bytes;
   3906	}
   3907}
   3908
   3909/*
   3910 * Reserve space in the system space for allocating or removing a chunk.
   3911 * The caller must be holding fs_info->chunk_mutex.
   3912 */
   3913void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
   3914{
   3915	struct btrfs_fs_info *fs_info = trans->fs_info;
   3916	const u64 num_devs = get_profile_num_devs(fs_info, type);
   3917	u64 bytes;
   3918
   3919	/* num_devs device items to update and 1 chunk item to add or remove. */
   3920	bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
   3921		btrfs_calc_insert_metadata_size(fs_info, 1);
   3922
   3923	reserve_chunk_space(trans, bytes, type);
   3924}
   3925
   3926/*
   3927 * Reserve space in the system space, if needed, for doing a modification to the
   3928 * chunk btree.
   3929 *
   3930 * @trans:		A transaction handle.
   3931 * @is_item_insertion:	Indicate if the modification is for inserting a new item
   3932 *			in the chunk btree or if it's for the deletion or update
   3933 *			of an existing item.
   3934 *
   3935 * This is used in a context where we need to update the chunk btree outside
   3936 * block group allocation and removal, to avoid a deadlock with a concurrent
   3937 * task that is allocating a metadata or data block group and therefore needs to
   3938 * update the chunk btree while holding the chunk mutex. After the update to the
   3939 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
   3940 *
   3941 */
   3942void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
   3943				  bool is_item_insertion)
   3944{
   3945	struct btrfs_fs_info *fs_info = trans->fs_info;
   3946	u64 bytes;
   3947
   3948	if (is_item_insertion)
   3949		bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
   3950	else
   3951		bytes = btrfs_calc_metadata_size(fs_info, 1);
   3952
   3953	mutex_lock(&fs_info->chunk_mutex);
   3954	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
   3955	mutex_unlock(&fs_info->chunk_mutex);
   3956}
   3957
   3958void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
   3959{
   3960	struct btrfs_block_group *block_group;
   3961	u64 last = 0;
   3962
   3963	while (1) {
   3964		struct inode *inode;
   3965
   3966		block_group = btrfs_lookup_first_block_group(info, last);
   3967		while (block_group) {
   3968			btrfs_wait_block_group_cache_done(block_group);
   3969			spin_lock(&block_group->lock);
   3970			if (block_group->iref)
   3971				break;
   3972			spin_unlock(&block_group->lock);
   3973			block_group = btrfs_next_block_group(block_group);
   3974		}
   3975		if (!block_group) {
   3976			if (last == 0)
   3977				break;
   3978			last = 0;
   3979			continue;
   3980		}
   3981
   3982		inode = block_group->inode;
   3983		block_group->iref = 0;
   3984		block_group->inode = NULL;
   3985		spin_unlock(&block_group->lock);
   3986		ASSERT(block_group->io_ctl.inode == NULL);
   3987		iput(inode);
   3988		last = block_group->start + block_group->length;
   3989		btrfs_put_block_group(block_group);
   3990	}
   3991}
   3992
   3993/*
   3994 * Must be called only after stopping all workers, since we could have block
   3995 * group caching kthreads running, and therefore they could race with us if we
   3996 * freed the block groups before stopping them.
   3997 */
   3998int btrfs_free_block_groups(struct btrfs_fs_info *info)
   3999{
   4000	struct btrfs_block_group *block_group;
   4001	struct btrfs_space_info *space_info;
   4002	struct btrfs_caching_control *caching_ctl;
   4003	struct rb_node *n;
   4004
   4005	write_lock(&info->block_group_cache_lock);
   4006	while (!list_empty(&info->caching_block_groups)) {
   4007		caching_ctl = list_entry(info->caching_block_groups.next,
   4008					 struct btrfs_caching_control, list);
   4009		list_del(&caching_ctl->list);
   4010		btrfs_put_caching_control(caching_ctl);
   4011	}
   4012	write_unlock(&info->block_group_cache_lock);
   4013
   4014	spin_lock(&info->unused_bgs_lock);
   4015	while (!list_empty(&info->unused_bgs)) {
   4016		block_group = list_first_entry(&info->unused_bgs,
   4017					       struct btrfs_block_group,
   4018					       bg_list);
   4019		list_del_init(&block_group->bg_list);
   4020		btrfs_put_block_group(block_group);
   4021	}
   4022
   4023	while (!list_empty(&info->reclaim_bgs)) {
   4024		block_group = list_first_entry(&info->reclaim_bgs,
   4025					       struct btrfs_block_group,
   4026					       bg_list);
   4027		list_del_init(&block_group->bg_list);
   4028		btrfs_put_block_group(block_group);
   4029	}
   4030	spin_unlock(&info->unused_bgs_lock);
   4031
   4032	spin_lock(&info->zone_active_bgs_lock);
   4033	while (!list_empty(&info->zone_active_bgs)) {
   4034		block_group = list_first_entry(&info->zone_active_bgs,
   4035					       struct btrfs_block_group,
   4036					       active_bg_list);
   4037		list_del_init(&block_group->active_bg_list);
   4038		btrfs_put_block_group(block_group);
   4039	}
   4040	spin_unlock(&info->zone_active_bgs_lock);
   4041
   4042	write_lock(&info->block_group_cache_lock);
   4043	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
   4044		block_group = rb_entry(n, struct btrfs_block_group,
   4045				       cache_node);
   4046		rb_erase_cached(&block_group->cache_node,
   4047				&info->block_group_cache_tree);
   4048		RB_CLEAR_NODE(&block_group->cache_node);
   4049		write_unlock(&info->block_group_cache_lock);
   4050
   4051		down_write(&block_group->space_info->groups_sem);
   4052		list_del(&block_group->list);
   4053		up_write(&block_group->space_info->groups_sem);
   4054
   4055		/*
   4056		 * We haven't cached this block group, which means we could
   4057		 * possibly have excluded extents on this block group.
   4058		 */
   4059		if (block_group->cached == BTRFS_CACHE_NO ||
   4060		    block_group->cached == BTRFS_CACHE_ERROR)
   4061			btrfs_free_excluded_extents(block_group);
   4062
   4063		btrfs_remove_free_space_cache(block_group);
   4064		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
   4065		ASSERT(list_empty(&block_group->dirty_list));
   4066		ASSERT(list_empty(&block_group->io_list));
   4067		ASSERT(list_empty(&block_group->bg_list));
   4068		ASSERT(refcount_read(&block_group->refs) == 1);
   4069		ASSERT(block_group->swap_extents == 0);
   4070		btrfs_put_block_group(block_group);
   4071
   4072		write_lock(&info->block_group_cache_lock);
   4073	}
   4074	write_unlock(&info->block_group_cache_lock);
   4075
   4076	btrfs_release_global_block_rsv(info);
   4077
   4078	while (!list_empty(&info->space_info)) {
   4079		space_info = list_entry(info->space_info.next,
   4080					struct btrfs_space_info,
   4081					list);
   4082
   4083		/*
   4084		 * Do not hide this behind enospc_debug, this is actually
   4085		 * important and indicates a real bug if this happens.
   4086		 */
   4087		if (WARN_ON(space_info->bytes_pinned > 0 ||
   4088			    space_info->bytes_may_use > 0))
   4089			btrfs_dump_space_info(info, space_info, 0, 0);
   4090
   4091		/*
   4092		 * If there was a failure to cleanup a log tree, very likely due
   4093		 * to an IO failure on a writeback attempt of one or more of its
   4094		 * extent buffers, we could not do proper (and cheap) unaccounting
   4095		 * of their reserved space, so don't warn on bytes_reserved > 0 in
   4096		 * that case.
   4097		 */
   4098		if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
   4099		    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
   4100			if (WARN_ON(space_info->bytes_reserved > 0))
   4101				btrfs_dump_space_info(info, space_info, 0, 0);
   4102		}
   4103
   4104		WARN_ON(space_info->reclaim_size > 0);
   4105		list_del(&space_info->list);
   4106		btrfs_sysfs_remove_space_info(space_info);
   4107	}
   4108	return 0;
   4109}
   4110
   4111void btrfs_freeze_block_group(struct btrfs_block_group *cache)
   4112{
   4113	atomic_inc(&cache->frozen);
   4114}
   4115
   4116void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
   4117{
   4118	struct btrfs_fs_info *fs_info = block_group->fs_info;
   4119	struct extent_map_tree *em_tree;
   4120	struct extent_map *em;
   4121	bool cleanup;
   4122
   4123	spin_lock(&block_group->lock);
   4124	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
   4125		   block_group->removed);
   4126	spin_unlock(&block_group->lock);
   4127
   4128	if (cleanup) {
   4129		em_tree = &fs_info->mapping_tree;
   4130		write_lock(&em_tree->lock);
   4131		em = lookup_extent_mapping(em_tree, block_group->start,
   4132					   1);
   4133		BUG_ON(!em); /* logic error, can't happen */
   4134		remove_extent_mapping(em_tree, em);
   4135		write_unlock(&em_tree->lock);
   4136
   4137		/* once for us and once for the tree */
   4138		free_extent_map(em);
   4139		free_extent_map(em);
   4140
   4141		/*
   4142		 * We may have left one free space entry and other possible
   4143		 * tasks trimming this block group have left 1 entry each one.
   4144		 * Free them if any.
   4145		 */
   4146		__btrfs_remove_free_space_cache(block_group->free_space_ctl);
   4147	}
   4148}
   4149
   4150bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
   4151{
   4152	bool ret = true;
   4153
   4154	spin_lock(&bg->lock);
   4155	if (bg->ro)
   4156		ret = false;
   4157	else
   4158		bg->swap_extents++;
   4159	spin_unlock(&bg->lock);
   4160
   4161	return ret;
   4162}
   4163
   4164void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
   4165{
   4166	spin_lock(&bg->lock);
   4167	ASSERT(!bg->ro);
   4168	ASSERT(bg->swap_extents >= amount);
   4169	bg->swap_extents -= amount;
   4170	spin_unlock(&bg->lock);
   4171}