cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

free-space-cache.c (115376B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2008 Red Hat.  All rights reserved.
      4 */
      5
      6#include <linux/pagemap.h>
      7#include <linux/sched.h>
      8#include <linux/sched/signal.h>
      9#include <linux/slab.h>
     10#include <linux/math64.h>
     11#include <linux/ratelimit.h>
     12#include <linux/error-injection.h>
     13#include <linux/sched/mm.h>
     14#include "misc.h"
     15#include "ctree.h"
     16#include "free-space-cache.h"
     17#include "transaction.h"
     18#include "disk-io.h"
     19#include "extent_io.h"
     20#include "volumes.h"
     21#include "space-info.h"
     22#include "delalloc-space.h"
     23#include "block-group.h"
     24#include "discard.h"
     25#include "subpage.h"
     26#include "inode-item.h"
     27
     28#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
     29#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
     30#define FORCE_EXTENT_THRESHOLD	SZ_1M
     31
     32struct btrfs_trim_range {
     33	u64 start;
     34	u64 bytes;
     35	struct list_head list;
     36};
     37
     38static int link_free_space(struct btrfs_free_space_ctl *ctl,
     39			   struct btrfs_free_space *info);
     40static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
     41			      struct btrfs_free_space *info, bool update_stat);
     42static int search_bitmap(struct btrfs_free_space_ctl *ctl,
     43			 struct btrfs_free_space *bitmap_info, u64 *offset,
     44			 u64 *bytes, bool for_alloc);
     45static void free_bitmap(struct btrfs_free_space_ctl *ctl,
     46			struct btrfs_free_space *bitmap_info);
     47static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
     48			      struct btrfs_free_space *info, u64 offset,
     49			      u64 bytes, bool update_stats);
     50
     51static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
     52					       struct btrfs_path *path,
     53					       u64 offset)
     54{
     55	struct btrfs_fs_info *fs_info = root->fs_info;
     56	struct btrfs_key key;
     57	struct btrfs_key location;
     58	struct btrfs_disk_key disk_key;
     59	struct btrfs_free_space_header *header;
     60	struct extent_buffer *leaf;
     61	struct inode *inode = NULL;
     62	unsigned nofs_flag;
     63	int ret;
     64
     65	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
     66	key.offset = offset;
     67	key.type = 0;
     68
     69	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
     70	if (ret < 0)
     71		return ERR_PTR(ret);
     72	if (ret > 0) {
     73		btrfs_release_path(path);
     74		return ERR_PTR(-ENOENT);
     75	}
     76
     77	leaf = path->nodes[0];
     78	header = btrfs_item_ptr(leaf, path->slots[0],
     79				struct btrfs_free_space_header);
     80	btrfs_free_space_key(leaf, header, &disk_key);
     81	btrfs_disk_key_to_cpu(&location, &disk_key);
     82	btrfs_release_path(path);
     83
     84	/*
     85	 * We are often under a trans handle at this point, so we need to make
     86	 * sure NOFS is set to keep us from deadlocking.
     87	 */
     88	nofs_flag = memalloc_nofs_save();
     89	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
     90	btrfs_release_path(path);
     91	memalloc_nofs_restore(nofs_flag);
     92	if (IS_ERR(inode))
     93		return inode;
     94
     95	mapping_set_gfp_mask(inode->i_mapping,
     96			mapping_gfp_constraint(inode->i_mapping,
     97			~(__GFP_FS | __GFP_HIGHMEM)));
     98
     99	return inode;
    100}
    101
    102struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
    103		struct btrfs_path *path)
    104{
    105	struct btrfs_fs_info *fs_info = block_group->fs_info;
    106	struct inode *inode = NULL;
    107	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
    108
    109	spin_lock(&block_group->lock);
    110	if (block_group->inode)
    111		inode = igrab(block_group->inode);
    112	spin_unlock(&block_group->lock);
    113	if (inode)
    114		return inode;
    115
    116	inode = __lookup_free_space_inode(fs_info->tree_root, path,
    117					  block_group->start);
    118	if (IS_ERR(inode))
    119		return inode;
    120
    121	spin_lock(&block_group->lock);
    122	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
    123		btrfs_info(fs_info, "Old style space inode found, converting.");
    124		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
    125			BTRFS_INODE_NODATACOW;
    126		block_group->disk_cache_state = BTRFS_DC_CLEAR;
    127	}
    128
    129	if (!block_group->iref) {
    130		block_group->inode = igrab(inode);
    131		block_group->iref = 1;
    132	}
    133	spin_unlock(&block_group->lock);
    134
    135	return inode;
    136}
    137
    138static int __create_free_space_inode(struct btrfs_root *root,
    139				     struct btrfs_trans_handle *trans,
    140				     struct btrfs_path *path,
    141				     u64 ino, u64 offset)
    142{
    143	struct btrfs_key key;
    144	struct btrfs_disk_key disk_key;
    145	struct btrfs_free_space_header *header;
    146	struct btrfs_inode_item *inode_item;
    147	struct extent_buffer *leaf;
    148	/* We inline CRCs for the free disk space cache */
    149	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
    150			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
    151	int ret;
    152
    153	ret = btrfs_insert_empty_inode(trans, root, path, ino);
    154	if (ret)
    155		return ret;
    156
    157	leaf = path->nodes[0];
    158	inode_item = btrfs_item_ptr(leaf, path->slots[0],
    159				    struct btrfs_inode_item);
    160	btrfs_item_key(leaf, &disk_key, path->slots[0]);
    161	memzero_extent_buffer(leaf, (unsigned long)inode_item,
    162			     sizeof(*inode_item));
    163	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
    164	btrfs_set_inode_size(leaf, inode_item, 0);
    165	btrfs_set_inode_nbytes(leaf, inode_item, 0);
    166	btrfs_set_inode_uid(leaf, inode_item, 0);
    167	btrfs_set_inode_gid(leaf, inode_item, 0);
    168	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
    169	btrfs_set_inode_flags(leaf, inode_item, flags);
    170	btrfs_set_inode_nlink(leaf, inode_item, 1);
    171	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
    172	btrfs_set_inode_block_group(leaf, inode_item, offset);
    173	btrfs_mark_buffer_dirty(leaf);
    174	btrfs_release_path(path);
    175
    176	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
    177	key.offset = offset;
    178	key.type = 0;
    179	ret = btrfs_insert_empty_item(trans, root, path, &key,
    180				      sizeof(struct btrfs_free_space_header));
    181	if (ret < 0) {
    182		btrfs_release_path(path);
    183		return ret;
    184	}
    185
    186	leaf = path->nodes[0];
    187	header = btrfs_item_ptr(leaf, path->slots[0],
    188				struct btrfs_free_space_header);
    189	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
    190	btrfs_set_free_space_key(leaf, header, &disk_key);
    191	btrfs_mark_buffer_dirty(leaf);
    192	btrfs_release_path(path);
    193
    194	return 0;
    195}
    196
    197int create_free_space_inode(struct btrfs_trans_handle *trans,
    198			    struct btrfs_block_group *block_group,
    199			    struct btrfs_path *path)
    200{
    201	int ret;
    202	u64 ino;
    203
    204	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
    205	if (ret < 0)
    206		return ret;
    207
    208	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
    209					 ino, block_group->start);
    210}
    211
    212/*
    213 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
    214 * handles lookup, otherwise it takes ownership and iputs the inode.
    215 * Don't reuse an inode pointer after passing it into this function.
    216 */
    217int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
    218				  struct inode *inode,
    219				  struct btrfs_block_group *block_group)
    220{
    221	struct btrfs_path *path;
    222	struct btrfs_key key;
    223	int ret = 0;
    224
    225	path = btrfs_alloc_path();
    226	if (!path)
    227		return -ENOMEM;
    228
    229	if (!inode)
    230		inode = lookup_free_space_inode(block_group, path);
    231	if (IS_ERR(inode)) {
    232		if (PTR_ERR(inode) != -ENOENT)
    233			ret = PTR_ERR(inode);
    234		goto out;
    235	}
    236	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
    237	if (ret) {
    238		btrfs_add_delayed_iput(inode);
    239		goto out;
    240	}
    241	clear_nlink(inode);
     242	/* One for the block group's ref */
    243	spin_lock(&block_group->lock);
    244	if (block_group->iref) {
    245		block_group->iref = 0;
    246		block_group->inode = NULL;
    247		spin_unlock(&block_group->lock);
    248		iput(inode);
    249	} else {
    250		spin_unlock(&block_group->lock);
    251	}
    252	/* One for the lookup ref */
    253	btrfs_add_delayed_iput(inode);
    254
    255	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
    256	key.type = 0;
    257	key.offset = block_group->start;
    258	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
    259				-1, 1);
    260	if (ret) {
    261		if (ret > 0)
    262			ret = 0;
    263		goto out;
    264	}
    265	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
    266out:
    267	btrfs_free_path(path);
    268	return ret;
    269}
    270
    271int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
    272				       struct btrfs_block_rsv *rsv)
    273{
    274	u64 needed_bytes;
    275	int ret;
    276
    277	/* 1 for slack space, 1 for updating the inode */
    278	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
    279		btrfs_calc_metadata_size(fs_info, 1);
    280
    281	spin_lock(&rsv->lock);
    282	if (rsv->reserved < needed_bytes)
    283		ret = -ENOSPC;
    284	else
    285		ret = 0;
    286	spin_unlock(&rsv->lock);
    287	return ret;
    288}
    289
    290int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
    291				    struct btrfs_block_group *block_group,
    292				    struct inode *vfs_inode)
    293{
    294	struct btrfs_truncate_control control = {
    295		.inode = BTRFS_I(vfs_inode),
    296		.new_size = 0,
    297		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
    298		.min_type = BTRFS_EXTENT_DATA_KEY,
    299		.clear_extent_range = true,
    300	};
    301	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
    302	struct btrfs_root *root = inode->root;
    303	struct extent_state *cached_state = NULL;
    304	int ret = 0;
    305	bool locked = false;
    306
    307	if (block_group) {
    308		struct btrfs_path *path = btrfs_alloc_path();
    309
    310		if (!path) {
    311			ret = -ENOMEM;
    312			goto fail;
    313		}
    314		locked = true;
    315		mutex_lock(&trans->transaction->cache_write_mutex);
    316		if (!list_empty(&block_group->io_list)) {
    317			list_del_init(&block_group->io_list);
    318
    319			btrfs_wait_cache_io(trans, block_group, path);
    320			btrfs_put_block_group(block_group);
    321		}
    322
    323		/*
     324		 * now that we've truncated the cache away, it's no longer
     325		 * set up or written
    326		 */
    327		spin_lock(&block_group->lock);
    328		block_group->disk_cache_state = BTRFS_DC_CLEAR;
    329		spin_unlock(&block_group->lock);
    330		btrfs_free_path(path);
    331	}
    332
    333	btrfs_i_size_write(inode, 0);
    334	truncate_pagecache(vfs_inode, 0);
    335
    336	lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
    337	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
    338
    339	/*
    340	 * We skip the throttling logic for free space cache inodes, so we don't
    341	 * need to check for -EAGAIN.
    342	 */
    343	ret = btrfs_truncate_inode_items(trans, root, &control);
    344
    345	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
    346	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
    347
    348	unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
    349	if (ret)
    350		goto fail;
    351
    352	ret = btrfs_update_inode(trans, root, inode);
    353
    354fail:
    355	if (locked)
    356		mutex_unlock(&trans->transaction->cache_write_mutex);
    357	if (ret)
    358		btrfs_abort_transaction(trans, ret);
    359
    360	return ret;
    361}
    362
    363static void readahead_cache(struct inode *inode)
    364{
    365	struct file_ra_state ra;
    366	unsigned long last_index;
    367
    368	file_ra_state_init(&ra, inode->i_mapping);
    369	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
    370
    371	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
    372}
    373
    374static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
    375		       int write)
    376{
    377	int num_pages;
    378
    379	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
    380
    381	/* Make sure we can fit our crcs and generation into the first page */
    382	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
    383		return -ENOSPC;
    384
    385	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
    386
    387	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
    388	if (!io_ctl->pages)
    389		return -ENOMEM;
    390
    391	io_ctl->num_pages = num_pages;
    392	io_ctl->fs_info = btrfs_sb(inode->i_sb);
    393	io_ctl->inode = inode;
    394
    395	return 0;
    396}
    397ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
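
A standalone userspace sketch of the capacity check above (not from the kernel sources; it assumes 4 KiB pages): the first cache page stores one u32 crc per page plus a u64 generation, so io_ctl_init() refuses a write once the cache would need more than (4096 - 8) / 4 = 1022 pages, i.e. roughly 4 MiB of cache file per block group.

#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_PAGE_SIZE 4096UL	/* assumed page size */

int main(void)
{
	/* First page layout: one u32 crc per page followed by a u64 generation. */
	unsigned long max_pages =
		(EXAMPLE_PAGE_SIZE - sizeof(uint64_t)) / sizeof(uint32_t);

	printf("max cache pages: %lu (~%lu KiB of cache file)\n",
	       max_pages, max_pages * EXAMPLE_PAGE_SIZE / 1024);
	return 0;
}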
    398
    399static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
    400{
    401	kfree(io_ctl->pages);
    402	io_ctl->pages = NULL;
    403}
    404
    405static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
    406{
    407	if (io_ctl->cur) {
    408		io_ctl->cur = NULL;
    409		io_ctl->orig = NULL;
    410	}
    411}
    412
    413static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
    414{
    415	ASSERT(io_ctl->index < io_ctl->num_pages);
    416	io_ctl->page = io_ctl->pages[io_ctl->index++];
    417	io_ctl->cur = page_address(io_ctl->page);
    418	io_ctl->orig = io_ctl->cur;
    419	io_ctl->size = PAGE_SIZE;
    420	if (clear)
    421		clear_page(io_ctl->cur);
    422}
    423
    424static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
    425{
    426	int i;
    427
    428	io_ctl_unmap_page(io_ctl);
    429
    430	for (i = 0; i < io_ctl->num_pages; i++) {
    431		if (io_ctl->pages[i]) {
    432			btrfs_page_clear_checked(io_ctl->fs_info,
    433					io_ctl->pages[i],
    434					page_offset(io_ctl->pages[i]),
    435					PAGE_SIZE);
    436			unlock_page(io_ctl->pages[i]);
    437			put_page(io_ctl->pages[i]);
    438		}
    439	}
    440}
    441
    442static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
    443{
    444	struct page *page;
    445	struct inode *inode = io_ctl->inode;
    446	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
    447	int i;
    448
    449	for (i = 0; i < io_ctl->num_pages; i++) {
    450		int ret;
    451
    452		page = find_or_create_page(inode->i_mapping, i, mask);
    453		if (!page) {
    454			io_ctl_drop_pages(io_ctl);
    455			return -ENOMEM;
    456		}
    457
    458		ret = set_page_extent_mapped(page);
    459		if (ret < 0) {
    460			unlock_page(page);
    461			put_page(page);
    462			io_ctl_drop_pages(io_ctl);
    463			return ret;
    464		}
    465
    466		io_ctl->pages[i] = page;
    467		if (uptodate && !PageUptodate(page)) {
    468			btrfs_read_folio(NULL, page_folio(page));
    469			lock_page(page);
    470			if (page->mapping != inode->i_mapping) {
    471				btrfs_err(BTRFS_I(inode)->root->fs_info,
    472					  "free space cache page truncated");
    473				io_ctl_drop_pages(io_ctl);
    474				return -EIO;
    475			}
    476			if (!PageUptodate(page)) {
    477				btrfs_err(BTRFS_I(inode)->root->fs_info,
    478					   "error reading free space cache");
    479				io_ctl_drop_pages(io_ctl);
    480				return -EIO;
    481			}
    482		}
    483	}
    484
    485	for (i = 0; i < io_ctl->num_pages; i++)
    486		clear_page_dirty_for_io(io_ctl->pages[i]);
    487
    488	return 0;
    489}
    490
    491static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
    492{
    493	io_ctl_map_page(io_ctl, 1);
    494
    495	/*
    496	 * Skip the csum areas.  If we don't check crcs then we just have a
    497	 * 64bit chunk at the front of the first page.
    498	 */
    499	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
    500	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
    501
    502	put_unaligned_le64(generation, io_ctl->cur);
    503	io_ctl->cur += sizeof(u64);
    504}
    505
    506static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
    507{
    508	u64 cache_gen;
    509
    510	/*
    511	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
    512	 * chunk at the front of the first page.
    513	 */
    514	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
    515	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
    516
    517	cache_gen = get_unaligned_le64(io_ctl->cur);
    518	if (cache_gen != generation) {
    519		btrfs_err_rl(io_ctl->fs_info,
    520			"space cache generation (%llu) does not match inode (%llu)",
    521				cache_gen, generation);
    522		io_ctl_unmap_page(io_ctl);
    523		return -EIO;
    524	}
    525	io_ctl->cur += sizeof(u64);
    526	return 0;
    527}
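
For orientation, the first-page layout that io_ctl_set_generation() and io_ctl_check_generation() step over looks like this (a sketch inferred from the code above, not a definition taken from the kernel headers):

/*
 * Page 0:  [ u32 crc[num_pages] ][ u64 generation ][ entries ... ]
 * Page 1+: [ entries, or one full bitmap per page ]
 *
 * io_ctl_set_crc()/io_ctl_check_crc() below checksum each page, skipping
 * the crc array itself on page 0 (hence the index == 0 offset special case).
 */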
    528
    529static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
    530{
    531	u32 *tmp;
    532	u32 crc = ~(u32)0;
    533	unsigned offset = 0;
    534
    535	if (index == 0)
    536		offset = sizeof(u32) * io_ctl->num_pages;
    537
    538	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
    539	btrfs_crc32c_final(crc, (u8 *)&crc);
    540	io_ctl_unmap_page(io_ctl);
    541	tmp = page_address(io_ctl->pages[0]);
    542	tmp += index;
    543	*tmp = crc;
    544}
    545
    546static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
    547{
    548	u32 *tmp, val;
    549	u32 crc = ~(u32)0;
    550	unsigned offset = 0;
    551
    552	if (index == 0)
    553		offset = sizeof(u32) * io_ctl->num_pages;
    554
    555	tmp = page_address(io_ctl->pages[0]);
    556	tmp += index;
    557	val = *tmp;
    558
    559	io_ctl_map_page(io_ctl, 0);
    560	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
    561	btrfs_crc32c_final(crc, (u8 *)&crc);
    562	if (val != crc) {
    563		btrfs_err_rl(io_ctl->fs_info,
    564			"csum mismatch on free space cache");
    565		io_ctl_unmap_page(io_ctl);
    566		return -EIO;
    567	}
    568
    569	return 0;
    570}
    571
    572static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
    573			    void *bitmap)
    574{
    575	struct btrfs_free_space_entry *entry;
    576
    577	if (!io_ctl->cur)
    578		return -ENOSPC;
    579
    580	entry = io_ctl->cur;
    581	put_unaligned_le64(offset, &entry->offset);
    582	put_unaligned_le64(bytes, &entry->bytes);
    583	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
    584		BTRFS_FREE_SPACE_EXTENT;
    585	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
    586	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
    587
    588	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
    589		return 0;
    590
    591	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
    592
    593	/* No more pages to map */
    594	if (io_ctl->index >= io_ctl->num_pages)
    595		return 0;
    596
    597	/* map the next page */
    598	io_ctl_map_page(io_ctl, 1);
    599	return 0;
    600}
    601
    602static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
    603{
    604	if (!io_ctl->cur)
    605		return -ENOSPC;
    606
    607	/*
    608	 * If we aren't at the start of the current page, unmap this one and
    609	 * map the next one if there is any left.
    610	 */
    611	if (io_ctl->cur != io_ctl->orig) {
    612		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
    613		if (io_ctl->index >= io_ctl->num_pages)
    614			return -ENOSPC;
    615		io_ctl_map_page(io_ctl, 0);
    616	}
    617
    618	copy_page(io_ctl->cur, bitmap);
    619	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
    620	if (io_ctl->index < io_ctl->num_pages)
    621		io_ctl_map_page(io_ctl, 0);
    622	return 0;
    623}
    624
    625static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
    626{
    627	/*
    628	 * If we're not on the boundary we know we've modified the page and we
    629	 * need to crc the page.
    630	 */
    631	if (io_ctl->cur != io_ctl->orig)
    632		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
    633	else
    634		io_ctl_unmap_page(io_ctl);
    635
    636	while (io_ctl->index < io_ctl->num_pages) {
    637		io_ctl_map_page(io_ctl, 1);
    638		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
    639	}
    640}
    641
    642static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
    643			    struct btrfs_free_space *entry, u8 *type)
    644{
    645	struct btrfs_free_space_entry *e;
    646	int ret;
    647
    648	if (!io_ctl->cur) {
    649		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
    650		if (ret)
    651			return ret;
    652	}
    653
    654	e = io_ctl->cur;
    655	entry->offset = get_unaligned_le64(&e->offset);
    656	entry->bytes = get_unaligned_le64(&e->bytes);
    657	*type = e->type;
    658	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
    659	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
    660
    661	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
    662		return 0;
    663
    664	io_ctl_unmap_page(io_ctl);
    665
    666	return 0;
    667}
    668
    669static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
    670			      struct btrfs_free_space *entry)
    671{
    672	int ret;
    673
    674	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
    675	if (ret)
    676		return ret;
    677
    678	copy_page(entry->bitmap, io_ctl->cur);
    679	io_ctl_unmap_page(io_ctl);
    680
    681	return 0;
    682}
    683
    684static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
    685{
    686	struct btrfs_block_group *block_group = ctl->block_group;
    687	u64 max_bytes;
    688	u64 bitmap_bytes;
    689	u64 extent_bytes;
    690	u64 size = block_group->length;
    691	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
    692	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
    693
    694	max_bitmaps = max_t(u64, max_bitmaps, 1);
    695
    696	ASSERT(ctl->total_bitmaps <= max_bitmaps);
    697
    698	/*
    699	 * We are trying to keep the total amount of memory used per 1GiB of
    700	 * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
    701	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
    702	 * bitmaps, we may end up using more memory than this.
    703	 */
    704	if (size < SZ_1G)
    705		max_bytes = MAX_CACHE_BYTES_PER_GIG;
    706	else
    707		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
    708
    709	bitmap_bytes = ctl->total_bitmaps * ctl->unit;
    710
    711	/*
     712	 * We want extent entries to use at most half of max_bytes, or
     713	 * whatever is left over after the bitmaps, whichever is smaller.
    714	 */
    715	extent_bytes = max_bytes - bitmap_bytes;
    716	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
    717
    718	ctl->extents_thresh =
    719		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
    720}
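
A standalone sketch of the sizing arithmetic above (not from the kernel sources; it assumes ctl->unit == 4096, and the 96-byte entry size is only a made-up stand-in for sizeof(struct btrfs_free_space)):

#include <stdio.h>
#include <stdint.h>

#define SZ_1G			(1024ULL * 1024 * 1024)
#define MAX_CACHE_BYTES_PER_GIG	(64ULL * 1024)		/* SZ_64K */

int main(void)
{
	uint64_t size = SZ_1G;		/* block group length (made up) */
	uint64_t unit = 4096;		/* assumed ctl->unit (one page) */
	uint64_t total_bitmaps = 2;	/* pretend two bitmaps are in use */
	uint64_t entry_size = 96;	/* stand-in for sizeof(struct btrfs_free_space) */
	uint64_t max_bytes, bitmap_bytes, extent_bytes;

	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * (size / SZ_1G);

	bitmap_bytes = total_bitmaps * unit;
	extent_bytes = max_bytes - bitmap_bytes;
	if (extent_bytes > max_bytes / 2)
		extent_bytes = max_bytes / 2;

	printf("extents_thresh = %llu entries\n",
	       (unsigned long long)(extent_bytes / entry_size));
	return 0;
}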
    721
    722static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
    723				   struct btrfs_free_space_ctl *ctl,
    724				   struct btrfs_path *path, u64 offset)
    725{
    726	struct btrfs_fs_info *fs_info = root->fs_info;
    727	struct btrfs_free_space_header *header;
    728	struct extent_buffer *leaf;
    729	struct btrfs_io_ctl io_ctl;
    730	struct btrfs_key key;
    731	struct btrfs_free_space *e, *n;
    732	LIST_HEAD(bitmaps);
    733	u64 num_entries;
    734	u64 num_bitmaps;
    735	u64 generation;
    736	u8 type;
    737	int ret = 0;
    738
    739	/* Nothing in the space cache, goodbye */
    740	if (!i_size_read(inode))
    741		return 0;
    742
    743	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
    744	key.offset = offset;
    745	key.type = 0;
    746
    747	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    748	if (ret < 0)
    749		return 0;
    750	else if (ret > 0) {
    751		btrfs_release_path(path);
    752		return 0;
    753	}
    754
    755	ret = -1;
    756
    757	leaf = path->nodes[0];
    758	header = btrfs_item_ptr(leaf, path->slots[0],
    759				struct btrfs_free_space_header);
    760	num_entries = btrfs_free_space_entries(leaf, header);
    761	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
    762	generation = btrfs_free_space_generation(leaf, header);
    763	btrfs_release_path(path);
    764
    765	if (!BTRFS_I(inode)->generation) {
    766		btrfs_info(fs_info,
    767			   "the free space cache file (%llu) is invalid, skip it",
    768			   offset);
    769		return 0;
    770	}
    771
    772	if (BTRFS_I(inode)->generation != generation) {
    773		btrfs_err(fs_info,
    774			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
    775			  BTRFS_I(inode)->generation, generation);
    776		return 0;
    777	}
    778
    779	if (!num_entries)
    780		return 0;
    781
    782	ret = io_ctl_init(&io_ctl, inode, 0);
    783	if (ret)
    784		return ret;
    785
    786	readahead_cache(inode);
    787
    788	ret = io_ctl_prepare_pages(&io_ctl, true);
    789	if (ret)
    790		goto out;
    791
    792	ret = io_ctl_check_crc(&io_ctl, 0);
    793	if (ret)
    794		goto free_cache;
    795
    796	ret = io_ctl_check_generation(&io_ctl, generation);
    797	if (ret)
    798		goto free_cache;
    799
    800	while (num_entries) {
    801		e = kmem_cache_zalloc(btrfs_free_space_cachep,
    802				      GFP_NOFS);
    803		if (!e) {
    804			ret = -ENOMEM;
    805			goto free_cache;
    806		}
    807
    808		ret = io_ctl_read_entry(&io_ctl, e, &type);
    809		if (ret) {
    810			kmem_cache_free(btrfs_free_space_cachep, e);
    811			goto free_cache;
    812		}
    813
    814		if (!e->bytes) {
    815			ret = -1;
    816			kmem_cache_free(btrfs_free_space_cachep, e);
    817			goto free_cache;
    818		}
    819
    820		if (type == BTRFS_FREE_SPACE_EXTENT) {
    821			spin_lock(&ctl->tree_lock);
    822			ret = link_free_space(ctl, e);
    823			spin_unlock(&ctl->tree_lock);
    824			if (ret) {
    825				btrfs_err(fs_info,
    826					"Duplicate entries in free space cache, dumping");
    827				kmem_cache_free(btrfs_free_space_cachep, e);
    828				goto free_cache;
    829			}
    830		} else {
    831			ASSERT(num_bitmaps);
    832			num_bitmaps--;
    833			e->bitmap = kmem_cache_zalloc(
    834					btrfs_free_space_bitmap_cachep, GFP_NOFS);
    835			if (!e->bitmap) {
    836				ret = -ENOMEM;
    837				kmem_cache_free(
    838					btrfs_free_space_cachep, e);
    839				goto free_cache;
    840			}
    841			spin_lock(&ctl->tree_lock);
    842			ret = link_free_space(ctl, e);
    843			ctl->total_bitmaps++;
    844			recalculate_thresholds(ctl);
    845			spin_unlock(&ctl->tree_lock);
    846			if (ret) {
    847				btrfs_err(fs_info,
    848					"Duplicate entries in free space cache, dumping");
    849				kmem_cache_free(btrfs_free_space_cachep, e);
    850				goto free_cache;
    851			}
    852			list_add_tail(&e->list, &bitmaps);
    853		}
    854
    855		num_entries--;
    856	}
    857
    858	io_ctl_unmap_page(&io_ctl);
    859
    860	/*
     861	 * The bitmap payloads are stored after all the entries, so read them
     862	 * back now, in the same order their entries were added to the cache.
    863	 */
    864	list_for_each_entry_safe(e, n, &bitmaps, list) {
    865		list_del_init(&e->list);
    866		ret = io_ctl_read_bitmap(&io_ctl, e);
    867		if (ret)
    868			goto free_cache;
    869	}
    870
    871	io_ctl_drop_pages(&io_ctl);
    872	ret = 1;
    873out:
    874	io_ctl_free(&io_ctl);
    875	return ret;
    876free_cache:
    877	io_ctl_drop_pages(&io_ctl);
    878	__btrfs_remove_free_space_cache(ctl);
    879	goto out;
    880}
    881
    882static int copy_free_space_cache(struct btrfs_block_group *block_group,
    883				 struct btrfs_free_space_ctl *ctl)
    884{
    885	struct btrfs_free_space *info;
    886	struct rb_node *n;
    887	int ret = 0;
    888
    889	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
    890		info = rb_entry(n, struct btrfs_free_space, offset_index);
    891		if (!info->bitmap) {
    892			unlink_free_space(ctl, info, true);
    893			ret = btrfs_add_free_space(block_group, info->offset,
    894						   info->bytes);
    895			kmem_cache_free(btrfs_free_space_cachep, info);
    896		} else {
    897			u64 offset = info->offset;
    898			u64 bytes = ctl->unit;
    899
    900			while (search_bitmap(ctl, info, &offset, &bytes,
    901					     false) == 0) {
    902				ret = btrfs_add_free_space(block_group, offset,
    903							   bytes);
    904				if (ret)
    905					break;
    906				bitmap_clear_bits(ctl, info, offset, bytes, true);
    907				offset = info->offset;
    908				bytes = ctl->unit;
    909			}
    910			free_bitmap(ctl, info);
    911		}
    912		cond_resched();
    913	}
    914	return ret;
    915}
    916
    917int load_free_space_cache(struct btrfs_block_group *block_group)
    918{
    919	struct btrfs_fs_info *fs_info = block_group->fs_info;
    920	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
    921	struct btrfs_free_space_ctl tmp_ctl = {};
    922	struct inode *inode;
    923	struct btrfs_path *path;
    924	int ret = 0;
    925	bool matched;
    926	u64 used = block_group->used;
    927
    928	/*
    929	 * Because we could potentially discard our loaded free space, we want
    930	 * to load everything into a temporary structure first, and then if it's
    931	 * valid copy it all into the actual free space ctl.
    932	 */
    933	btrfs_init_free_space_ctl(block_group, &tmp_ctl);
    934
    935	/*
    936	 * If this block group has been marked to be cleared for one reason or
    937	 * another then we can't trust the on disk cache, so just return.
    938	 */
    939	spin_lock(&block_group->lock);
    940	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
    941		spin_unlock(&block_group->lock);
    942		return 0;
    943	}
    944	spin_unlock(&block_group->lock);
    945
    946	path = btrfs_alloc_path();
    947	if (!path)
    948		return 0;
    949	path->search_commit_root = 1;
    950	path->skip_locking = 1;
    951
    952	/*
    953	 * We must pass a path with search_commit_root set to btrfs_iget in
    954	 * order to avoid a deadlock when allocating extents for the tree root.
    955	 *
    956	 * When we are COWing an extent buffer from the tree root, when looking
    957	 * for a free extent, at extent-tree.c:find_free_extent(), we can find
     958	 * a block group without its free space cache loaded. When we find one
    959	 * we must load its space cache which requires reading its free space
    960	 * cache's inode item from the root tree. If this inode item is located
    961	 * in the same leaf that we started COWing before, then we end up in
    962	 * deadlock on the extent buffer (trying to read lock it when we
    963	 * previously write locked it).
    964	 *
    965	 * It's safe to read the inode item using the commit root because
    966	 * block groups, once loaded, stay in memory forever (until they are
    967	 * removed) as well as their space caches once loaded. New block groups
    968	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
    969	 * we will never try to read their inode item while the fs is mounted.
    970	 */
    971	inode = lookup_free_space_inode(block_group, path);
    972	if (IS_ERR(inode)) {
    973		btrfs_free_path(path);
    974		return 0;
    975	}
    976
    977	/* We may have converted the inode and made the cache invalid. */
    978	spin_lock(&block_group->lock);
    979	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
    980		spin_unlock(&block_group->lock);
    981		btrfs_free_path(path);
    982		goto out;
    983	}
    984	spin_unlock(&block_group->lock);
    985
    986	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
    987				      path, block_group->start);
    988	btrfs_free_path(path);
    989	if (ret <= 0)
    990		goto out;
    991
    992	matched = (tmp_ctl.free_space == (block_group->length - used -
    993					  block_group->bytes_super));
    994
    995	if (matched) {
    996		ret = copy_free_space_cache(block_group, &tmp_ctl);
    997		/*
     998		 * This function returns 1 on a successful cache load, while
     999		 * copy_free_space_cache() returns 0 on success, so re-set ret here.
   1000		 */
   1001		if (ret == 0)
   1002			ret = 1;
   1003	} else {
   1004		__btrfs_remove_free_space_cache(&tmp_ctl);
   1005		btrfs_warn(fs_info,
   1006			   "block group %llu has wrong amount of free space",
   1007			   block_group->start);
   1008		ret = -1;
   1009	}
   1010out:
   1011	if (ret < 0) {
   1012		/* This cache is bogus, make sure it gets cleared */
   1013		spin_lock(&block_group->lock);
   1014		block_group->disk_cache_state = BTRFS_DC_CLEAR;
   1015		spin_unlock(&block_group->lock);
   1016		ret = 0;
   1017
   1018		btrfs_warn(fs_info,
   1019			   "failed to load free space cache for block group %llu, rebuilding it now",
   1020			   block_group->start);
   1021	}
   1022
   1023	spin_lock(&ctl->tree_lock);
   1024	btrfs_discard_update_discardable(block_group);
   1025	spin_unlock(&ctl->tree_lock);
   1026	iput(inode);
   1027	return ret;
   1028}
   1029
   1030static noinline_for_stack
   1031int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
   1032			      struct btrfs_free_space_ctl *ctl,
   1033			      struct btrfs_block_group *block_group,
   1034			      int *entries, int *bitmaps,
   1035			      struct list_head *bitmap_list)
   1036{
   1037	int ret;
   1038	struct btrfs_free_cluster *cluster = NULL;
   1039	struct btrfs_free_cluster *cluster_locked = NULL;
   1040	struct rb_node *node = rb_first(&ctl->free_space_offset);
   1041	struct btrfs_trim_range *trim_entry;
   1042
   1043	/* Get the cluster for this block_group if it exists */
   1044	if (block_group && !list_empty(&block_group->cluster_list)) {
   1045		cluster = list_entry(block_group->cluster_list.next,
   1046				     struct btrfs_free_cluster,
   1047				     block_group_list);
   1048	}
   1049
   1050	if (!node && cluster) {
   1051		cluster_locked = cluster;
   1052		spin_lock(&cluster_locked->lock);
   1053		node = rb_first(&cluster->root);
   1054		cluster = NULL;
   1055	}
   1056
   1057	/* Write out the extent entries */
   1058	while (node) {
   1059		struct btrfs_free_space *e;
   1060
   1061		e = rb_entry(node, struct btrfs_free_space, offset_index);
   1062		*entries += 1;
   1063
   1064		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
   1065				       e->bitmap);
   1066		if (ret)
   1067			goto fail;
   1068
   1069		if (e->bitmap) {
   1070			list_add_tail(&e->list, bitmap_list);
   1071			*bitmaps += 1;
   1072		}
   1073		node = rb_next(node);
   1074		if (!node && cluster) {
   1075			node = rb_first(&cluster->root);
   1076			cluster_locked = cluster;
   1077			spin_lock(&cluster_locked->lock);
   1078			cluster = NULL;
   1079		}
   1080	}
   1081	if (cluster_locked) {
   1082		spin_unlock(&cluster_locked->lock);
   1083		cluster_locked = NULL;
   1084	}
   1085
   1086	/*
   1087	 * Make sure we don't miss any range that was removed from our rbtree
   1088	 * because trimming is running. Otherwise after a umount+mount (or crash
   1089	 * after committing the transaction) we would leak free space and get
   1090	 * an inconsistent free space cache report from fsck.
   1091	 */
   1092	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
   1093		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
   1094				       trim_entry->bytes, NULL);
   1095		if (ret)
   1096			goto fail;
   1097		*entries += 1;
   1098	}
   1099
   1100	return 0;
   1101fail:
   1102	if (cluster_locked)
   1103		spin_unlock(&cluster_locked->lock);
   1104	return -ENOSPC;
   1105}
   1106
   1107static noinline_for_stack int
   1108update_cache_item(struct btrfs_trans_handle *trans,
   1109		  struct btrfs_root *root,
   1110		  struct inode *inode,
   1111		  struct btrfs_path *path, u64 offset,
   1112		  int entries, int bitmaps)
   1113{
   1114	struct btrfs_key key;
   1115	struct btrfs_free_space_header *header;
   1116	struct extent_buffer *leaf;
   1117	int ret;
   1118
   1119	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
   1120	key.offset = offset;
   1121	key.type = 0;
   1122
   1123	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
   1124	if (ret < 0) {
   1125		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
   1126				 EXTENT_DELALLOC, 0, 0, NULL);
   1127		goto fail;
   1128	}
   1129	leaf = path->nodes[0];
   1130	if (ret > 0) {
   1131		struct btrfs_key found_key;
   1132		ASSERT(path->slots[0]);
   1133		path->slots[0]--;
   1134		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   1135		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
   1136		    found_key.offset != offset) {
   1137			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
   1138					 inode->i_size - 1, EXTENT_DELALLOC, 0,
   1139					 0, NULL);
   1140			btrfs_release_path(path);
   1141			goto fail;
   1142		}
   1143	}
   1144
   1145	BTRFS_I(inode)->generation = trans->transid;
   1146	header = btrfs_item_ptr(leaf, path->slots[0],
   1147				struct btrfs_free_space_header);
   1148	btrfs_set_free_space_entries(leaf, header, entries);
   1149	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
   1150	btrfs_set_free_space_generation(leaf, header, trans->transid);
   1151	btrfs_mark_buffer_dirty(leaf);
   1152	btrfs_release_path(path);
   1153
   1154	return 0;
   1155
   1156fail:
   1157	return -1;
   1158}
   1159
   1160static noinline_for_stack int write_pinned_extent_entries(
   1161			    struct btrfs_trans_handle *trans,
   1162			    struct btrfs_block_group *block_group,
   1163			    struct btrfs_io_ctl *io_ctl,
   1164			    int *entries)
   1165{
   1166	u64 start, extent_start, extent_end, len;
   1167	struct extent_io_tree *unpin = NULL;
   1168	int ret;
   1169
   1170	if (!block_group)
   1171		return 0;
   1172
   1173	/*
   1174	 * We want to add any pinned extents to our free space cache
   1175	 * so we don't leak the space
   1176	 *
   1177	 * We shouldn't have switched the pinned extents yet so this is the
   1178	 * right one
   1179	 */
   1180	unpin = &trans->transaction->pinned_extents;
   1181
   1182	start = block_group->start;
   1183
   1184	while (start < block_group->start + block_group->length) {
   1185		ret = find_first_extent_bit(unpin, start,
   1186					    &extent_start, &extent_end,
   1187					    EXTENT_DIRTY, NULL);
   1188		if (ret)
   1189			return 0;
   1190
   1191		/* This pinned extent is out of our range */
   1192		if (extent_start >= block_group->start + block_group->length)
   1193			return 0;
   1194
   1195		extent_start = max(extent_start, start);
   1196		extent_end = min(block_group->start + block_group->length,
   1197				 extent_end + 1);
   1198		len = extent_end - extent_start;
   1199
   1200		*entries += 1;
   1201		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
   1202		if (ret)
   1203			return -ENOSPC;
   1204
   1205		start = extent_end;
   1206	}
   1207
   1208	return 0;
   1209}
   1210
   1211static noinline_for_stack int
   1212write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
   1213{
   1214	struct btrfs_free_space *entry, *next;
   1215	int ret;
   1216
   1217	/* Write out the bitmaps */
   1218	list_for_each_entry_safe(entry, next, bitmap_list, list) {
   1219		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
   1220		if (ret)
   1221			return -ENOSPC;
   1222		list_del_init(&entry->list);
   1223	}
   1224
   1225	return 0;
   1226}
   1227
   1228static int flush_dirty_cache(struct inode *inode)
   1229{
   1230	int ret;
   1231
   1232	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
   1233	if (ret)
   1234		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
   1235				 EXTENT_DELALLOC, 0, 0, NULL);
   1236
   1237	return ret;
   1238}
   1239
   1240static void noinline_for_stack
   1241cleanup_bitmap_list(struct list_head *bitmap_list)
   1242{
   1243	struct btrfs_free_space *entry, *next;
   1244
   1245	list_for_each_entry_safe(entry, next, bitmap_list, list)
   1246		list_del_init(&entry->list);
   1247}
   1248
   1249static void noinline_for_stack
   1250cleanup_write_cache_enospc(struct inode *inode,
   1251			   struct btrfs_io_ctl *io_ctl,
   1252			   struct extent_state **cached_state)
   1253{
   1254	io_ctl_drop_pages(io_ctl);
   1255	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
   1256			     i_size_read(inode) - 1, cached_state);
   1257}
   1258
   1259static int __btrfs_wait_cache_io(struct btrfs_root *root,
   1260				 struct btrfs_trans_handle *trans,
   1261				 struct btrfs_block_group *block_group,
   1262				 struct btrfs_io_ctl *io_ctl,
   1263				 struct btrfs_path *path, u64 offset)
   1264{
   1265	int ret;
   1266	struct inode *inode = io_ctl->inode;
   1267
   1268	if (!inode)
   1269		return 0;
   1270
   1271	/* Flush the dirty pages in the cache file. */
   1272	ret = flush_dirty_cache(inode);
   1273	if (ret)
   1274		goto out;
   1275
   1276	/* Update the cache item to tell everyone this cache file is valid. */
   1277	ret = update_cache_item(trans, root, inode, path, offset,
   1278				io_ctl->entries, io_ctl->bitmaps);
   1279out:
   1280	if (ret) {
   1281		invalidate_inode_pages2(inode->i_mapping);
   1282		BTRFS_I(inode)->generation = 0;
   1283		if (block_group)
   1284			btrfs_debug(root->fs_info,
   1285	  "failed to write free space cache for block group %llu error %d",
   1286				  block_group->start, ret);
   1287	}
   1288	btrfs_update_inode(trans, root, BTRFS_I(inode));
   1289
   1290	if (block_group) {
   1291		/* the dirty list is protected by the dirty_bgs_lock */
   1292		spin_lock(&trans->transaction->dirty_bgs_lock);
   1293
   1294		/* the disk_cache_state is protected by the block group lock */
   1295		spin_lock(&block_group->lock);
   1296
   1297		/*
   1298		 * only mark this as written if we didn't get put back on
   1299		 * the dirty list while waiting for IO.   Otherwise our
   1300		 * cache state won't be right, and we won't get written again
   1301		 */
   1302		if (!ret && list_empty(&block_group->dirty_list))
   1303			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
   1304		else if (ret)
   1305			block_group->disk_cache_state = BTRFS_DC_ERROR;
   1306
   1307		spin_unlock(&block_group->lock);
   1308		spin_unlock(&trans->transaction->dirty_bgs_lock);
   1309		io_ctl->inode = NULL;
   1310		iput(inode);
   1311	}
   1312
   1313	return ret;
   1314
   1315}
   1316
   1317int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
   1318			struct btrfs_block_group *block_group,
   1319			struct btrfs_path *path)
   1320{
   1321	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
   1322				     block_group, &block_group->io_ctl,
   1323				     path, block_group->start);
   1324}
   1325
   1326/**
   1327 * Write out cached info to an inode
   1328 *
   1329 * @root:        root the inode belongs to
   1330 * @inode:       freespace inode we are writing out
   1331 * @ctl:         free space cache we are going to write out
   1332 * @block_group: block_group for this cache if it belongs to a block_group
   1333 * @io_ctl:      holds context for the io
   1334 * @trans:       the trans handle
   1335 *
   1336 * This function writes out a free space cache struct to disk for quick recovery
   1337 * on mount.  This will return 0 if it was successful in writing the cache out,
   1338 * or an errno if it was not.
   1339 */
   1340static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
   1341				   struct btrfs_free_space_ctl *ctl,
   1342				   struct btrfs_block_group *block_group,
   1343				   struct btrfs_io_ctl *io_ctl,
   1344				   struct btrfs_trans_handle *trans)
   1345{
   1346	struct extent_state *cached_state = NULL;
   1347	LIST_HEAD(bitmap_list);
   1348	int entries = 0;
   1349	int bitmaps = 0;
   1350	int ret;
   1351	int must_iput = 0;
   1352
   1353	if (!i_size_read(inode))
   1354		return -EIO;
   1355
   1356	WARN_ON(io_ctl->pages);
   1357	ret = io_ctl_init(io_ctl, inode, 1);
   1358	if (ret)
   1359		return ret;
   1360
   1361	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
   1362		down_write(&block_group->data_rwsem);
   1363		spin_lock(&block_group->lock);
   1364		if (block_group->delalloc_bytes) {
   1365			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
   1366			spin_unlock(&block_group->lock);
   1367			up_write(&block_group->data_rwsem);
   1368			BTRFS_I(inode)->generation = 0;
   1369			ret = 0;
   1370			must_iput = 1;
   1371			goto out;
   1372		}
   1373		spin_unlock(&block_group->lock);
   1374	}
   1375
   1376	/* Lock all pages first so we can lock the extent safely. */
   1377	ret = io_ctl_prepare_pages(io_ctl, false);
   1378	if (ret)
   1379		goto out_unlock;
   1380
   1381	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
   1382			 &cached_state);
   1383
   1384	io_ctl_set_generation(io_ctl, trans->transid);
   1385
   1386	mutex_lock(&ctl->cache_writeout_mutex);
   1387	/* Write out the extent entries in the free space cache */
   1388	spin_lock(&ctl->tree_lock);
   1389	ret = write_cache_extent_entries(io_ctl, ctl,
   1390					 block_group, &entries, &bitmaps,
   1391					 &bitmap_list);
   1392	if (ret)
   1393		goto out_nospc_locked;
   1394
   1395	/*
    1396	 * Some of the space freed in the current transaction is pinned; it
    1397	 * will be added to the free space cache only after the transaction
    1398	 * is committed, so we must not lose it.
   1399	 *
   1400	 * If this changes while we are working we'll get added back to
   1401	 * the dirty list and redo it.  No locking needed
   1402	 */
   1403	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
   1404	if (ret)
   1405		goto out_nospc_locked;
   1406
   1407	/*
   1408	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
   1409	 * locked while doing it because a concurrent trim can be manipulating
   1410	 * or freeing the bitmap.
   1411	 */
   1412	ret = write_bitmap_entries(io_ctl, &bitmap_list);
   1413	spin_unlock(&ctl->tree_lock);
   1414	mutex_unlock(&ctl->cache_writeout_mutex);
   1415	if (ret)
   1416		goto out_nospc;
   1417
   1418	/* Zero out the rest of the pages just to make sure */
   1419	io_ctl_zero_remaining_pages(io_ctl);
   1420
   1421	/* Everything is written out, now we dirty the pages in the file. */
   1422	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
   1423				io_ctl->num_pages, 0, i_size_read(inode),
   1424				&cached_state, false);
   1425	if (ret)
   1426		goto out_nospc;
   1427
   1428	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
   1429		up_write(&block_group->data_rwsem);
   1430	/*
    1431	 * Release the pages and unlock the extent; we will flush
    1432	 * them out later.
   1433	 */
   1434	io_ctl_drop_pages(io_ctl);
   1435	io_ctl_free(io_ctl);
   1436
   1437	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
   1438			     i_size_read(inode) - 1, &cached_state);
   1439
   1440	/*
    1441	 * At this point the pages are under IO and we're happy. The caller
    1442	 * is responsible for waiting on them and updating the cache and
    1443	 * the inode.
   1444	 */
   1445	io_ctl->entries = entries;
   1446	io_ctl->bitmaps = bitmaps;
   1447
   1448	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
   1449	if (ret)
   1450		goto out;
   1451
   1452	return 0;
   1453
   1454out_nospc_locked:
   1455	cleanup_bitmap_list(&bitmap_list);
   1456	spin_unlock(&ctl->tree_lock);
   1457	mutex_unlock(&ctl->cache_writeout_mutex);
   1458
   1459out_nospc:
   1460	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
   1461
   1462out_unlock:
   1463	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
   1464		up_write(&block_group->data_rwsem);
   1465
   1466out:
   1467	io_ctl->inode = NULL;
   1468	io_ctl_free(io_ctl);
   1469	if (ret) {
   1470		invalidate_inode_pages2(inode->i_mapping);
   1471		BTRFS_I(inode)->generation = 0;
   1472	}
   1473	btrfs_update_inode(trans, root, BTRFS_I(inode));
   1474	if (must_iput)
   1475		iput(inode);
   1476	return ret;
   1477}
   1478
   1479int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
   1480			  struct btrfs_block_group *block_group,
   1481			  struct btrfs_path *path)
   1482{
   1483	struct btrfs_fs_info *fs_info = trans->fs_info;
   1484	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   1485	struct inode *inode;
   1486	int ret = 0;
   1487
   1488	spin_lock(&block_group->lock);
   1489	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
   1490		spin_unlock(&block_group->lock);
   1491		return 0;
   1492	}
   1493	spin_unlock(&block_group->lock);
   1494
   1495	inode = lookup_free_space_inode(block_group, path);
   1496	if (IS_ERR(inode))
   1497		return 0;
   1498
   1499	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
   1500				block_group, &block_group->io_ctl, trans);
   1501	if (ret) {
   1502		btrfs_debug(fs_info,
   1503	  "failed to write free space cache for block group %llu error %d",
   1504			  block_group->start, ret);
   1505		spin_lock(&block_group->lock);
   1506		block_group->disk_cache_state = BTRFS_DC_ERROR;
   1507		spin_unlock(&block_group->lock);
   1508
   1509		block_group->io_ctl.inode = NULL;
   1510		iput(inode);
   1511	}
   1512
   1513	/*
   1514	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
   1515	 * to wait for IO and put the inode
   1516	 */
   1517
   1518	return ret;
   1519}
   1520
   1521static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
   1522					  u64 offset)
   1523{
   1524	ASSERT(offset >= bitmap_start);
   1525	offset -= bitmap_start;
   1526	return (unsigned long)(div_u64(offset, unit));
   1527}
   1528
   1529static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
   1530{
   1531	return (unsigned long)(div_u64(bytes, unit));
   1532}
   1533
   1534static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
   1535				   u64 offset)
   1536{
   1537	u64 bitmap_start;
   1538	u64 bytes_per_bitmap;
   1539
   1540	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
   1541	bitmap_start = offset - ctl->start;
   1542	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
   1543	bitmap_start *= bytes_per_bitmap;
   1544	bitmap_start += ctl->start;
   1545
   1546	return bitmap_start;
   1547}
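
A standalone sketch of the rounding done by offset_to_bitmap() (not from the kernel sources; it assumes ctl->unit == 4096, so one bitmap covers 32768 * 4096 bytes = 128 MiB, and the start/offset values are made up):

#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_PAGE_SIZE	4096ULL
#define BITS_PER_BITMAP		(EXAMPLE_PAGE_SIZE * 8)	/* 32768 bits per bitmap */

int main(void)
{
	uint64_t unit = EXAMPLE_PAGE_SIZE;		/* assumed ctl->unit */
	uint64_t start = 1024ULL << 20;			/* made-up ctl->start: 1 GiB */
	uint64_t offset = start + (200ULL << 20);	/* free space 200 MiB into the group */
	uint64_t bytes_per_bitmap = BITS_PER_BITMAP * unit;	/* 128 MiB */
	uint64_t bitmap_start;

	/* Mirror of offset_to_bitmap(): round down to the bitmap boundary. */
	bitmap_start = (offset - start) / bytes_per_bitmap * bytes_per_bitmap + start;

	printf("each bitmap covers %llu MiB; offset %llu falls in the bitmap at %llu\n",
	       (unsigned long long)(bytes_per_bitmap >> 20),
	       (unsigned long long)offset,
	       (unsigned long long)bitmap_start);
	return 0;
}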
   1548
   1549static int tree_insert_offset(struct rb_root *root, u64 offset,
   1550			      struct rb_node *node, int bitmap)
   1551{
   1552	struct rb_node **p = &root->rb_node;
   1553	struct rb_node *parent = NULL;
   1554	struct btrfs_free_space *info;
   1555
   1556	while (*p) {
   1557		parent = *p;
   1558		info = rb_entry(parent, struct btrfs_free_space, offset_index);
   1559
   1560		if (offset < info->offset) {
   1561			p = &(*p)->rb_left;
   1562		} else if (offset > info->offset) {
   1563			p = &(*p)->rb_right;
   1564		} else {
   1565			/*
   1566			 * we could have a bitmap entry and an extent entry
   1567			 * share the same offset.  If this is the case, we want
   1568			 * the extent entry to always be found first if we do a
   1569			 * linear search through the tree, since we want to have
   1570			 * the quickest allocation time, and allocating from an
   1571			 * extent is faster than allocating from a bitmap.  So
   1572			 * if we're inserting a bitmap and we find an entry at
   1573			 * this offset, we want to go right, or after this entry
   1574			 * logically.  If we are inserting an extent and we've
   1575			 * found a bitmap, we want to go left, or before
   1576			 * logically.
   1577			 */
   1578			if (bitmap) {
   1579				if (info->bitmap) {
   1580					WARN_ON_ONCE(1);
   1581					return -EEXIST;
   1582				}
   1583				p = &(*p)->rb_right;
   1584			} else {
   1585				if (!info->bitmap) {
   1586					WARN_ON_ONCE(1);
   1587					return -EEXIST;
   1588				}
   1589				p = &(*p)->rb_left;
   1590			}
   1591		}
   1592	}
   1593
   1594	rb_link_node(node, parent, p);
   1595	rb_insert_color(node, root);
   1596
   1597	return 0;
   1598}
   1599
   1600/*
   1601 * This is a little subtle.  We *only* have ->max_extent_size set if we actually
   1602 * searched through the bitmap and figured out the largest ->max_extent_size,
   1603 * otherwise it's 0.  In the case that it's 0 we don't want to tell the
   1604 * allocator the wrong thing, we want to use the actual real max_extent_size
   1605 * we've found already if it's larger, or we want to use ->bytes.
   1606 *
    1607 * This matters because find_free_space() will skip entries whose ->bytes is
   1608 * less than the required bytes.  So if we didn't search down this bitmap, we
   1609 * may pick some previous entry that has a smaller ->max_extent_size than we
   1610 * have.  For example, assume we have two entries, one that has
   1611 * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
   1612 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
    1613 * call into find_free_space(), and return with max_extent_size == 4K, because
    1614 * that first bitmap entry had ->max_extent_size set, but the second one did
    1615 * not.  If instead we returned 8K we'd come in searching for 8K, and find the
    1616 * 8K contiguous range.
   1617 *
    1618 * Consider the other case: we have 2 8K chunks in that second entry and still
    1619 * don't have ->max_extent_size set.  We'll return 16K, and the next time the
    1620 * allocator comes in it'll fully search our second bitmap, and this time it'll
    1621 * get an uptodate value of 8K as the maximum chunk size.  Then we'll get the
    1622 * right allocation the next loop through.
   1623 */
   1624static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
   1625{
   1626	if (entry->bitmap && entry->max_extent_size)
   1627		return entry->max_extent_size;
   1628	return entry->bytes;
   1629}
   1630
   1631/*
   1632 * We want the largest entry to be leftmost, so this is inverted from what you'd
   1633 * normally expect.
   1634 */
   1635static bool entry_less(struct rb_node *node, const struct rb_node *parent)
   1636{
   1637	const struct btrfs_free_space *entry, *exist;
   1638
   1639	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
   1640	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
   1641	return get_max_extent_size(exist) < get_max_extent_size(entry);
   1642}
   1643
   1644/*
   1645 * searches the tree for the given offset.
   1646 *
   1647 * fuzzy - If this is set, then we are trying to make an allocation, and we just
   1648 * want a section that has at least bytes size and comes at or after the given
   1649 * offset.
   1650 */
   1651static struct btrfs_free_space *
   1652tree_search_offset(struct btrfs_free_space_ctl *ctl,
   1653		   u64 offset, int bitmap_only, int fuzzy)
   1654{
   1655	struct rb_node *n = ctl->free_space_offset.rb_node;
   1656	struct btrfs_free_space *entry = NULL, *prev = NULL;
   1657
   1658	/* find entry that is closest to the 'offset' */
   1659	while (n) {
   1660		entry = rb_entry(n, struct btrfs_free_space, offset_index);
   1661		prev = entry;
   1662
   1663		if (offset < entry->offset)
   1664			n = n->rb_left;
   1665		else if (offset > entry->offset)
   1666			n = n->rb_right;
   1667		else
   1668			break;
   1669
   1670		entry = NULL;
   1671	}
   1672
   1673	if (bitmap_only) {
   1674		if (!entry)
   1675			return NULL;
   1676		if (entry->bitmap)
   1677			return entry;
   1678
   1679		/*
    1680		 * A bitmap entry and an extent entry may share the same offset;
    1681		 * in that case, the bitmap entry comes after the extent entry.
   1682		 */
   1683		n = rb_next(n);
   1684		if (!n)
   1685			return NULL;
   1686		entry = rb_entry(n, struct btrfs_free_space, offset_index);
   1687		if (entry->offset != offset)
   1688			return NULL;
   1689
   1690		WARN_ON(!entry->bitmap);
   1691		return entry;
   1692	} else if (entry) {
   1693		if (entry->bitmap) {
   1694			/*
   1695			 * if previous extent entry covers the offset,
   1696			 * we should return it instead of the bitmap entry
   1697			 */
   1698			n = rb_prev(&entry->offset_index);
   1699			if (n) {
   1700				prev = rb_entry(n, struct btrfs_free_space,
   1701						offset_index);
   1702				if (!prev->bitmap &&
   1703				    prev->offset + prev->bytes > offset)
   1704					entry = prev;
   1705			}
   1706		}
   1707		return entry;
   1708	}
   1709
   1710	if (!prev)
   1711		return NULL;
   1712
   1713	/* find last entry before the 'offset' */
   1714	entry = prev;
   1715	if (entry->offset > offset) {
   1716		n = rb_prev(&entry->offset_index);
   1717		if (n) {
   1718			entry = rb_entry(n, struct btrfs_free_space,
   1719					offset_index);
   1720			ASSERT(entry->offset <= offset);
   1721		} else {
   1722			if (fuzzy)
   1723				return entry;
   1724			else
   1725				return NULL;
   1726		}
   1727	}
   1728
   1729	if (entry->bitmap) {
   1730		n = rb_prev(&entry->offset_index);
   1731		if (n) {
   1732			prev = rb_entry(n, struct btrfs_free_space,
   1733					offset_index);
   1734			if (!prev->bitmap &&
   1735			    prev->offset + prev->bytes > offset)
   1736				return prev;
   1737		}
   1738		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
   1739			return entry;
   1740	} else if (entry->offset + entry->bytes > offset)
   1741		return entry;
   1742
   1743	if (!fuzzy)
   1744		return NULL;
   1745
   1746	while (1) {
   1747		n = rb_next(&entry->offset_index);
   1748		if (!n)
   1749			return NULL;
   1750		entry = rb_entry(n, struct btrfs_free_space, offset_index);
   1751		if (entry->bitmap) {
   1752			if (entry->offset + BITS_PER_BITMAP *
   1753			    ctl->unit > offset)
   1754				break;
   1755		} else {
   1756			if (entry->offset + entry->bytes > offset)
   1757				break;
   1758		}
   1759	}
   1760	return entry;
   1761}
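
/*
 * Editor's note: an illustrative summary, not part of the original source, of
 * the three ways tree_search_offset() is typically called elsewhere in this
 * file:
 *
 *	// exact lookup: the extent or bitmap entry containing 'offset', if any
 *	entry = tree_search_offset(ctl, offset, 0, 0);
 *
 *	// bitmap-only lookup: the bitmap entry that would cover 'offset'
 *	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0);
 *
 *	// fuzzy lookup for allocation: the covering entry, or failing that
 *	// the next entry after 'offset'
 *	entry = tree_search_offset(ctl, offset, 0, 1);
 */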
   1762
   1763static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
   1764				     struct btrfs_free_space *info,
   1765				     bool update_stat)
   1766{
   1767	rb_erase(&info->offset_index, &ctl->free_space_offset);
   1768	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
   1769	ctl->free_extents--;
   1770
   1771	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
   1772		ctl->discardable_extents[BTRFS_STAT_CURR]--;
   1773		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
   1774	}
   1775
   1776	if (update_stat)
   1777		ctl->free_space -= info->bytes;
   1778}
   1779
   1780static int link_free_space(struct btrfs_free_space_ctl *ctl,
   1781			   struct btrfs_free_space *info)
   1782{
   1783	int ret = 0;
   1784
   1785	ASSERT(info->bytes || info->bitmap);
   1786	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
   1787				 &info->offset_index, (info->bitmap != NULL));
   1788	if (ret)
   1789		return ret;
   1790
   1791	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
   1792
   1793	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
   1794		ctl->discardable_extents[BTRFS_STAT_CURR]++;
   1795		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
   1796	}
   1797
   1798	ctl->free_space += info->bytes;
   1799	ctl->free_extents++;
   1800	return ret;
   1801}
   1802
   1803static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
   1804				struct btrfs_free_space *info)
   1805{
   1806	ASSERT(info->bitmap);
   1807
   1808	/*
   1809	 * If our entry is empty it's because we're on a cluster and we don't
   1810	 * want to re-link it into our ctl bytes index.
   1811	 */
   1812	if (RB_EMPTY_NODE(&info->bytes_index))
   1813		return;
   1814
   1815	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
   1816	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
   1817}
   1818
   1819static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
   1820				     struct btrfs_free_space *info,
   1821				     u64 offset, u64 bytes, bool update_stat)
   1822{
   1823	unsigned long start, count, end;
   1824	int extent_delta = -1;
   1825
   1826	start = offset_to_bit(info->offset, ctl->unit, offset);
   1827	count = bytes_to_bits(bytes, ctl->unit);
   1828	end = start + count;
   1829	ASSERT(end <= BITS_PER_BITMAP);
   1830
   1831	bitmap_clear(info->bitmap, start, count);
   1832
   1833	info->bytes -= bytes;
   1834	if (info->max_extent_size > ctl->unit)
   1835		info->max_extent_size = 0;
   1836
   1837	relink_bitmap_entry(ctl, info);
   1838
   1839	if (start && test_bit(start - 1, info->bitmap))
   1840		extent_delta++;
   1841
   1842	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
   1843		extent_delta++;
   1844
   1845	info->bitmap_extents += extent_delta;
   1846	if (!btrfs_free_space_trimmed(info)) {
   1847		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
   1848		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
   1849	}
   1850
   1851	if (update_stat)
   1852		ctl->free_space -= bytes;
   1853}
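
/*
 * Editor's note: a small worked example, not part of the original source, of
 * the extent_delta bookkeeping above.  Clearing bits out of the middle of a
 * run of set bits turns one bitmap extent into two:
 *
 *	before:  1 1 1 1 1 1	(one extent; extent_delta starts at -1)
 *	clear:       ^ ^
 *	after:   1 1 0 0 1 1	(bit start-1 and bit end are both still set,
 *				 so extent_delta ends up at +1: two extents)
 *
 * bitmap_set_bits() below performs the symmetric accounting when newly set
 * bits join up with neighbouring runs.
 */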
   1854
   1855static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
   1856			    struct btrfs_free_space *info, u64 offset,
   1857			    u64 bytes)
   1858{
   1859	unsigned long start, count, end;
   1860	int extent_delta = 1;
   1861
   1862	start = offset_to_bit(info->offset, ctl->unit, offset);
   1863	count = bytes_to_bits(bytes, ctl->unit);
   1864	end = start + count;
   1865	ASSERT(end <= BITS_PER_BITMAP);
   1866
   1867	bitmap_set(info->bitmap, start, count);
   1868
   1869	/*
   1870	 * We set some bytes, we have no idea what the max extent size is
   1871	 * anymore.
   1872	 */
   1873	info->max_extent_size = 0;
   1874	info->bytes += bytes;
   1875	ctl->free_space += bytes;
   1876
   1877	relink_bitmap_entry(ctl, info);
   1878
   1879	if (start && test_bit(start - 1, info->bitmap))
   1880		extent_delta--;
   1881
   1882	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
   1883		extent_delta--;
   1884
   1885	info->bitmap_extents += extent_delta;
   1886	if (!btrfs_free_space_trimmed(info)) {
   1887		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
   1888		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
   1889	}
   1890}
   1891
   1892/*
    1893 * If we cannot find a suitable extent, we will use bytes to record
   1894 * the size of the max extent.
   1895 */
   1896static int search_bitmap(struct btrfs_free_space_ctl *ctl,
   1897			 struct btrfs_free_space *bitmap_info, u64 *offset,
   1898			 u64 *bytes, bool for_alloc)
   1899{
   1900	unsigned long found_bits = 0;
   1901	unsigned long max_bits = 0;
   1902	unsigned long bits, i;
   1903	unsigned long next_zero;
   1904	unsigned long extent_bits;
   1905
   1906	/*
   1907	 * Skip searching the bitmap if we don't have a contiguous section that
   1908	 * is large enough for this allocation.
   1909	 */
   1910	if (for_alloc &&
   1911	    bitmap_info->max_extent_size &&
   1912	    bitmap_info->max_extent_size < *bytes) {
   1913		*bytes = bitmap_info->max_extent_size;
   1914		return -1;
   1915	}
   1916
   1917	i = offset_to_bit(bitmap_info->offset, ctl->unit,
   1918			  max_t(u64, *offset, bitmap_info->offset));
   1919	bits = bytes_to_bits(*bytes, ctl->unit);
   1920
   1921	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
   1922		if (for_alloc && bits == 1) {
   1923			found_bits = 1;
   1924			break;
   1925		}
   1926		next_zero = find_next_zero_bit(bitmap_info->bitmap,
   1927					       BITS_PER_BITMAP, i);
   1928		extent_bits = next_zero - i;
   1929		if (extent_bits >= bits) {
   1930			found_bits = extent_bits;
   1931			break;
   1932		} else if (extent_bits > max_bits) {
   1933			max_bits = extent_bits;
   1934		}
   1935		i = next_zero;
   1936	}
   1937
   1938	if (found_bits) {
   1939		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
   1940		*bytes = (u64)(found_bits) * ctl->unit;
   1941		return 0;
   1942	}
   1943
   1944	*bytes = (u64)(max_bits) * ctl->unit;
   1945	bitmap_info->max_extent_size = *bytes;
   1946	relink_bitmap_entry(ctl, bitmap_info);
   1947	return -1;
   1948}
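
/*
 * Editor's note: an illustrative sketch, not part of the original source, of
 * the in/out contract of search_bitmap().  'found' and 'want' are
 * hypothetical locals:
 *
 *	u64 found = bitmap_info->offset;	// in: where to start looking
 *	u64 want = SZ_64K;			// in: size we need
 *
 *	if (!search_bitmap(ctl, bitmap_info, &found, &want, true)) {
 *		// success: 'found' is the start of a free run of at least
 *		// the requested size, 'want' is that run's length
 *	} else {
 *		// failure: 'want' holds the largest run seen, which was also
 *		// cached in bitmap_info->max_extent_size above
 *	}
 */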
   1949
   1950/* Cache the size of the max extent in bytes */
   1951static struct btrfs_free_space *
   1952find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
   1953		unsigned long align, u64 *max_extent_size, bool use_bytes_index)
   1954{
   1955	struct btrfs_free_space *entry;
   1956	struct rb_node *node;
   1957	u64 tmp;
   1958	u64 align_off;
   1959	int ret;
   1960
   1961	if (!ctl->free_space_offset.rb_node)
   1962		goto out;
   1963again:
   1964	if (use_bytes_index) {
   1965		node = rb_first_cached(&ctl->free_space_bytes);
   1966	} else {
   1967		entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
   1968					   0, 1);
   1969		if (!entry)
   1970			goto out;
   1971		node = &entry->offset_index;
   1972	}
   1973
   1974	for (; node; node = rb_next(node)) {
   1975		if (use_bytes_index)
   1976			entry = rb_entry(node, struct btrfs_free_space,
   1977					 bytes_index);
   1978		else
   1979			entry = rb_entry(node, struct btrfs_free_space,
   1980					 offset_index);
   1981
   1982		/*
   1983		 * If we are using the bytes index then all subsequent entries
   1984		 * in this tree are going to be < bytes, so simply set the max
   1985		 * extent size and exit the loop.
   1986		 *
   1987		 * If we're using the offset index then we need to keep going
   1988		 * through the rest of the tree.
   1989		 */
   1990		if (entry->bytes < *bytes) {
   1991			*max_extent_size = max(get_max_extent_size(entry),
   1992					       *max_extent_size);
   1993			if (use_bytes_index)
   1994				break;
   1995			continue;
   1996		}
   1997
   1998		/* make sure the space returned is big enough
   1999		 * to match our requested alignment
   2000		 */
   2001		if (*bytes >= align) {
   2002			tmp = entry->offset - ctl->start + align - 1;
   2003			tmp = div64_u64(tmp, align);
   2004			tmp = tmp * align + ctl->start;
   2005			align_off = tmp - entry->offset;
   2006		} else {
   2007			align_off = 0;
   2008			tmp = entry->offset;
   2009		}
   2010
   2011		/*
   2012		 * We don't break here if we're using the bytes index because we
   2013		 * may have another entry that has the correct alignment that is
   2014		 * the right size, so we don't want to miss that possibility.
   2015		 * At worst this adds another loop through the logic, but if we
   2016		 * broke here we could prematurely ENOSPC.
   2017		 */
   2018		if (entry->bytes < *bytes + align_off) {
   2019			*max_extent_size = max(get_max_extent_size(entry),
   2020					       *max_extent_size);
   2021			continue;
   2022		}
   2023
   2024		if (entry->bitmap) {
   2025			struct rb_node *old_next = rb_next(node);
   2026			u64 size = *bytes;
   2027
   2028			ret = search_bitmap(ctl, entry, &tmp, &size, true);
   2029			if (!ret) {
   2030				*offset = tmp;
   2031				*bytes = size;
   2032				return entry;
   2033			} else {
   2034				*max_extent_size =
   2035					max(get_max_extent_size(entry),
   2036					    *max_extent_size);
   2037			}
   2038
   2039			/*
   2040			 * The bitmap may have gotten re-arranged in the space
   2041			 * index here because the max_extent_size may have been
   2042			 * updated.  Start from the beginning again if this
   2043			 * happened.
   2044			 */
   2045			if (use_bytes_index && old_next != rb_next(node))
   2046				goto again;
   2047			continue;
   2048		}
   2049
   2050		*offset = tmp;
   2051		*bytes = entry->bytes - align_off;
   2052		return entry;
   2053	}
   2054out:
   2055	return NULL;
   2056}
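
/*
 * Editor's note: a worked example, not part of the original source, of the
 * alignment rounding in find_free_space() above, assuming ctl->start == 0,
 * align == 64K and an entry at offset 1M + 4K:
 *
 *	tmp = (1M + 4K) + 64K - 1
 *	tmp = div64_u64(tmp, 64K) * 64K		-> 1M + 64K (next boundary)
 *	align_off = tmp - entry->offset		-> 60K
 *
 * The entry only qualifies if entry->bytes >= *bytes + align_off, i.e. the
 * request still fits after skipping the 60K needed to reach alignment.
 */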
   2057
   2058static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
   2059			   struct btrfs_free_space *info, u64 offset)
   2060{
   2061	info->offset = offset_to_bitmap(ctl, offset);
   2062	info->bytes = 0;
   2063	info->bitmap_extents = 0;
   2064	INIT_LIST_HEAD(&info->list);
   2065	link_free_space(ctl, info);
   2066	ctl->total_bitmaps++;
   2067	recalculate_thresholds(ctl);
   2068}
   2069
   2070static void free_bitmap(struct btrfs_free_space_ctl *ctl,
   2071			struct btrfs_free_space *bitmap_info)
   2072{
   2073	/*
   2074	 * Normally when this is called, the bitmap is completely empty. However,
   2075	 * if we are blowing up the free space cache for one reason or another
   2076	 * via __btrfs_remove_free_space_cache(), then it may not be freed and
   2077	 * we may leave stats on the table.
   2078	 */
   2079	if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
   2080		ctl->discardable_extents[BTRFS_STAT_CURR] -=
   2081			bitmap_info->bitmap_extents;
   2082		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
   2083
   2084	}
   2085	unlink_free_space(ctl, bitmap_info, true);
   2086	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
   2087	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
   2088	ctl->total_bitmaps--;
   2089	recalculate_thresholds(ctl);
   2090}
   2091
   2092static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
   2093			      struct btrfs_free_space *bitmap_info,
   2094			      u64 *offset, u64 *bytes)
   2095{
   2096	u64 end;
   2097	u64 search_start, search_bytes;
   2098	int ret;
   2099
   2100again:
   2101	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
   2102
   2103	/*
   2104	 * We need to search for bits in this bitmap.  We could only cover some
   2105	 * of the extent in this bitmap thanks to how we add space, so we need
   2106	 * to search for as much as it as we can and clear that amount, and then
    2107	 * to search for as much of it as we can and clear that amount, and then
   2108	 */
   2109	search_start = *offset;
   2110	search_bytes = ctl->unit;
   2111	search_bytes = min(search_bytes, end - search_start + 1);
   2112	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
   2113			    false);
   2114	if (ret < 0 || search_start != *offset)
   2115		return -EINVAL;
   2116
   2117	/* We may have found more bits than what we need */
   2118	search_bytes = min(search_bytes, *bytes);
   2119
   2120	/* Cannot clear past the end of the bitmap */
   2121	search_bytes = min(search_bytes, end - search_start + 1);
   2122
   2123	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
   2124	*offset += search_bytes;
   2125	*bytes -= search_bytes;
   2126
   2127	if (*bytes) {
   2128		struct rb_node *next = rb_next(&bitmap_info->offset_index);
   2129		if (!bitmap_info->bytes)
   2130			free_bitmap(ctl, bitmap_info);
   2131
   2132		/*
   2133		 * no entry after this bitmap, but we still have bytes to
   2134		 * remove, so something has gone wrong.
   2135		 */
   2136		if (!next)
   2137			return -EINVAL;
   2138
   2139		bitmap_info = rb_entry(next, struct btrfs_free_space,
   2140				       offset_index);
   2141
   2142		/*
   2143		 * if the next entry isn't a bitmap we need to return to let the
   2144		 * extent stuff do its work.
   2145		 */
   2146		if (!bitmap_info->bitmap)
   2147			return -EAGAIN;
   2148
   2149		/*
   2150		 * Ok the next item is a bitmap, but it may not actually hold
   2151		 * the information for the rest of this free space stuff, so
   2152		 * look for it, and if we don't find it return so we can try
   2153		 * everything over again.
   2154		 */
   2155		search_start = *offset;
   2156		search_bytes = ctl->unit;
   2157		ret = search_bitmap(ctl, bitmap_info, &search_start,
   2158				    &search_bytes, false);
   2159		if (ret < 0 || search_start != *offset)
   2160			return -EAGAIN;
   2161
   2162		goto again;
   2163	} else if (!bitmap_info->bytes)
   2164		free_bitmap(ctl, bitmap_info);
   2165
   2166	return 0;
   2167}
   2168
   2169static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
   2170			       struct btrfs_free_space *info, u64 offset,
   2171			       u64 bytes, enum btrfs_trim_state trim_state)
   2172{
   2173	u64 bytes_to_set = 0;
   2174	u64 end;
   2175
   2176	/*
   2177	 * This is a tradeoff to make bitmap trim state minimal.  We mark the
   2178	 * whole bitmap untrimmed if at any point we add untrimmed regions.
   2179	 */
   2180	if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
   2181		if (btrfs_free_space_trimmed(info)) {
   2182			ctl->discardable_extents[BTRFS_STAT_CURR] +=
   2183				info->bitmap_extents;
   2184			ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
   2185		}
   2186		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   2187	}
   2188
   2189	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
   2190
   2191	bytes_to_set = min(end - offset, bytes);
   2192
   2193	bitmap_set_bits(ctl, info, offset, bytes_to_set);
   2194
   2195	return bytes_to_set;
   2196
   2197}
   2198
   2199static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
   2200		      struct btrfs_free_space *info)
   2201{
   2202	struct btrfs_block_group *block_group = ctl->block_group;
   2203	struct btrfs_fs_info *fs_info = block_group->fs_info;
   2204	bool forced = false;
   2205
   2206#ifdef CONFIG_BTRFS_DEBUG
   2207	if (btrfs_should_fragment_free_space(block_group))
   2208		forced = true;
   2209#endif
   2210
   2211	/* This is a way to reclaim large regions from the bitmaps. */
   2212	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
   2213		return false;
   2214
   2215	/*
   2216	 * If we are below the extents threshold then we can add this as an
   2217	 * extent, and don't have to deal with the bitmap
   2218	 */
   2219	if (!forced && ctl->free_extents < ctl->extents_thresh) {
   2220		/*
   2221		 * If this block group has some small extents we don't want to
   2222		 * use up all of our free slots in the cache with them, we want
    2223		 * to reserve them for larger extents, however if we have plenty
    2224		 * of cache left then go ahead and add them, no sense in adding
   2225		 * the overhead of a bitmap if we don't have to.
   2226		 */
   2227		if (info->bytes <= fs_info->sectorsize * 8) {
   2228			if (ctl->free_extents * 3 <= ctl->extents_thresh)
   2229				return false;
   2230		} else {
   2231			return false;
   2232		}
   2233	}
   2234
   2235	/*
   2236	 * The original block groups from mkfs can be really small, like 8
   2237	 * megabytes, so don't bother with a bitmap for those entries.  However
   2238	 * some block groups can be smaller than what a bitmap would cover but
   2239	 * are still large enough that they could overflow the 32k memory limit,
    2240	 * so allow those block groups to still have a bitmap
   2241	 * entry.
   2242	 */
   2243	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
   2244		return false;
   2245
   2246	return true;
   2247}
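
/*
 * Editor's note: illustrative numbers, not part of the original source, for
 * the decision above assuming 4K pages and a 4K sectorsize:
 *
 *	info->bytes >= 1M (FORCE_EXTENT_THRESHOLD)	-> extent entry
 *	below extents_thresh and info->bytes > 32K	-> extent entry
 *	below extents_thresh, info->bytes <= 32K and
 *	    free_extents * 3 <= extents_thresh		-> extent entry
 *	block group smaller than half a bitmap's reach
 *	    (here, smaller than 64M)			-> extent entry
 *	otherwise					-> bitmap
 *
 * In short, small (<= 8 sector) ranges get pushed into bitmaps earlier so
 * they don't exhaust the limited number of extent entry slots.
 */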
   2248
   2249static const struct btrfs_free_space_op free_space_op = {
   2250	.use_bitmap		= use_bitmap,
   2251};
   2252
   2253static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
   2254			      struct btrfs_free_space *info)
   2255{
   2256	struct btrfs_free_space *bitmap_info;
   2257	struct btrfs_block_group *block_group = NULL;
   2258	int added = 0;
   2259	u64 bytes, offset, bytes_added;
   2260	enum btrfs_trim_state trim_state;
   2261	int ret;
   2262
   2263	bytes = info->bytes;
   2264	offset = info->offset;
   2265	trim_state = info->trim_state;
   2266
   2267	if (!ctl->op->use_bitmap(ctl, info))
   2268		return 0;
   2269
   2270	if (ctl->op == &free_space_op)
   2271		block_group = ctl->block_group;
   2272again:
   2273	/*
   2274	 * Since we link bitmaps right into the cluster we need to see if we
   2275	 * have a cluster here, and if so and it has our bitmap we need to add
   2276	 * the free space to that bitmap.
   2277	 */
   2278	if (block_group && !list_empty(&block_group->cluster_list)) {
   2279		struct btrfs_free_cluster *cluster;
   2280		struct rb_node *node;
   2281		struct btrfs_free_space *entry;
   2282
   2283		cluster = list_entry(block_group->cluster_list.next,
   2284				     struct btrfs_free_cluster,
   2285				     block_group_list);
   2286		spin_lock(&cluster->lock);
   2287		node = rb_first(&cluster->root);
   2288		if (!node) {
   2289			spin_unlock(&cluster->lock);
   2290			goto no_cluster_bitmap;
   2291		}
   2292
   2293		entry = rb_entry(node, struct btrfs_free_space, offset_index);
   2294		if (!entry->bitmap) {
   2295			spin_unlock(&cluster->lock);
   2296			goto no_cluster_bitmap;
   2297		}
   2298
   2299		if (entry->offset == offset_to_bitmap(ctl, offset)) {
   2300			bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
   2301							  bytes, trim_state);
   2302			bytes -= bytes_added;
   2303			offset += bytes_added;
   2304		}
   2305		spin_unlock(&cluster->lock);
   2306		if (!bytes) {
   2307			ret = 1;
   2308			goto out;
   2309		}
   2310	}
   2311
   2312no_cluster_bitmap:
   2313	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
   2314					 1, 0);
   2315	if (!bitmap_info) {
   2316		ASSERT(added == 0);
   2317		goto new_bitmap;
   2318	}
   2319
   2320	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
   2321					  trim_state);
   2322	bytes -= bytes_added;
   2323	offset += bytes_added;
   2324	added = 0;
   2325
   2326	if (!bytes) {
   2327		ret = 1;
   2328		goto out;
   2329	} else
   2330		goto again;
   2331
   2332new_bitmap:
   2333	if (info && info->bitmap) {
   2334		add_new_bitmap(ctl, info, offset);
   2335		added = 1;
   2336		info = NULL;
   2337		goto again;
   2338	} else {
   2339		spin_unlock(&ctl->tree_lock);
   2340
   2341		/* no pre-allocated info, allocate a new one */
   2342		if (!info) {
   2343			info = kmem_cache_zalloc(btrfs_free_space_cachep,
   2344						 GFP_NOFS);
   2345			if (!info) {
   2346				spin_lock(&ctl->tree_lock);
   2347				ret = -ENOMEM;
   2348				goto out;
   2349			}
   2350		}
   2351
   2352		/* allocate the bitmap */
   2353		info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
   2354						 GFP_NOFS);
   2355		info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
   2356		spin_lock(&ctl->tree_lock);
   2357		if (!info->bitmap) {
   2358			ret = -ENOMEM;
   2359			goto out;
   2360		}
   2361		goto again;
   2362	}
   2363
   2364out:
   2365	if (info) {
   2366		if (info->bitmap)
   2367			kmem_cache_free(btrfs_free_space_bitmap_cachep,
   2368					info->bitmap);
   2369		kmem_cache_free(btrfs_free_space_cachep, info);
   2370	}
   2371
   2372	return ret;
   2373}
   2374
   2375/*
   2376 * Free space merging rules:
   2377 *  1) Merge trimmed areas together
   2378 *  2) Let untrimmed areas coalesce with trimmed areas
   2379 *  3) Always pull neighboring regions from bitmaps
   2380 *
   2381 * The above rules are for when we merge free space based on btrfs_trim_state.
   2382 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
   2383 * same reason: to promote larger extent regions which makes life easier for
   2384 * find_free_extent().  Rule 2 enables coalescing based on the common path
   2385 * being returning free space from btrfs_finish_extent_commit().  So when free
   2386 * space is trimmed, it will prevent aggregating trimmed new region and
   2387 * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
   2388 * and provide find_free_extent() with the largest extents possible hoping for
   2389 * the reuse path.
   2390 */
   2391static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
   2392			  struct btrfs_free_space *info, bool update_stat)
   2393{
   2394	struct btrfs_free_space *left_info = NULL;
   2395	struct btrfs_free_space *right_info;
   2396	bool merged = false;
   2397	u64 offset = info->offset;
   2398	u64 bytes = info->bytes;
   2399	const bool is_trimmed = btrfs_free_space_trimmed(info);
   2400
   2401	/*
    2402	 * First we want to see if there is free space adjacent to the range we
    2403	 * are adding; if there is, remove that struct and add a new one to
    2404	 * cover the entire range.
   2405	 */
   2406	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
   2407	if (right_info && rb_prev(&right_info->offset_index))
   2408		left_info = rb_entry(rb_prev(&right_info->offset_index),
   2409				     struct btrfs_free_space, offset_index);
   2410	else if (!right_info)
   2411		left_info = tree_search_offset(ctl, offset - 1, 0, 0);
   2412
   2413	/* See try_merge_free_space() comment. */
   2414	if (right_info && !right_info->bitmap &&
   2415	    (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
   2416		unlink_free_space(ctl, right_info, update_stat);
   2417		info->bytes += right_info->bytes;
   2418		kmem_cache_free(btrfs_free_space_cachep, right_info);
   2419		merged = true;
   2420	}
   2421
   2422	/* See try_merge_free_space() comment. */
   2423	if (left_info && !left_info->bitmap &&
   2424	    left_info->offset + left_info->bytes == offset &&
   2425	    (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
   2426		unlink_free_space(ctl, left_info, update_stat);
   2427		info->offset = left_info->offset;
   2428		info->bytes += left_info->bytes;
   2429		kmem_cache_free(btrfs_free_space_cachep, left_info);
   2430		merged = true;
   2431	}
   2432
   2433	return merged;
   2434}
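
/*
 * Editor's note: illustrative outcomes, not part of the original source, of
 * the trim-state check in try_merge_free_space() for a new extent 'info' and
 * an adjacent extent entry:
 *
 *	info untrimmed, neighbour trimmed	-> merge (rule 2); the merged
 *						   entry keeps info's untrimmed
 *						   state
 *	info untrimmed, neighbour untrimmed	-> merge
 *	info trimmed,   neighbour trimmed	-> merge (rule 1)
 *	info trimmed,   neighbour untrimmed	-> no merge, so trimmed space
 *						   is never silently combined
 *						   with untrimmed space
 */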
   2435
   2436static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
   2437				     struct btrfs_free_space *info,
   2438				     bool update_stat)
   2439{
   2440	struct btrfs_free_space *bitmap;
   2441	unsigned long i;
   2442	unsigned long j;
   2443	const u64 end = info->offset + info->bytes;
   2444	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
   2445	u64 bytes;
   2446
   2447	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
   2448	if (!bitmap)
   2449		return false;
   2450
   2451	i = offset_to_bit(bitmap->offset, ctl->unit, end);
   2452	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
   2453	if (j == i)
   2454		return false;
   2455	bytes = (j - i) * ctl->unit;
   2456	info->bytes += bytes;
   2457
   2458	/* See try_merge_free_space() comment. */
   2459	if (!btrfs_free_space_trimmed(bitmap))
   2460		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   2461
   2462	bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
   2463
   2464	if (!bitmap->bytes)
   2465		free_bitmap(ctl, bitmap);
   2466
   2467	return true;
   2468}
   2469
   2470static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
   2471				       struct btrfs_free_space *info,
   2472				       bool update_stat)
   2473{
   2474	struct btrfs_free_space *bitmap;
   2475	u64 bitmap_offset;
   2476	unsigned long i;
   2477	unsigned long j;
   2478	unsigned long prev_j;
   2479	u64 bytes;
   2480
   2481	bitmap_offset = offset_to_bitmap(ctl, info->offset);
   2482	/* If we're on a boundary, try the previous logical bitmap. */
   2483	if (bitmap_offset == info->offset) {
   2484		if (info->offset == 0)
   2485			return false;
   2486		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
   2487	}
   2488
   2489	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
   2490	if (!bitmap)
   2491		return false;
   2492
   2493	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
   2494	j = 0;
   2495	prev_j = (unsigned long)-1;
   2496	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
   2497		if (j > i)
   2498			break;
   2499		prev_j = j;
   2500	}
   2501	if (prev_j == i)
   2502		return false;
   2503
   2504	if (prev_j == (unsigned long)-1)
   2505		bytes = (i + 1) * ctl->unit;
   2506	else
   2507		bytes = (i - prev_j) * ctl->unit;
   2508
   2509	info->offset -= bytes;
   2510	info->bytes += bytes;
   2511
   2512	/* See try_merge_free_space() comment. */
   2513	if (!btrfs_free_space_trimmed(bitmap))
   2514		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   2515
   2516	bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
   2517
   2518	if (!bitmap->bytes)
   2519		free_bitmap(ctl, bitmap);
   2520
   2521	return true;
   2522}
   2523
   2524/*
   2525 * We prefer always to allocate from extent entries, both for clustered and
   2526 * non-clustered allocation requests. So when attempting to add a new extent
   2527 * entry, try to see if there's adjacent free space in bitmap entries, and if
   2528 * there is, migrate that space from the bitmaps to the extent.
   2529 * Like this we get better chances of satisfying space allocation requests
   2530 * because we attempt to satisfy them based on a single cache entry, and never
   2531 * on 2 or more entries - even if the entries represent a contiguous free space
   2532 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
   2533 * ends).
   2534 */
   2535static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
   2536			      struct btrfs_free_space *info,
   2537			      bool update_stat)
   2538{
   2539	/*
   2540	 * Only work with disconnected entries, as we can change their offset,
   2541	 * and must be extent entries.
   2542	 */
   2543	ASSERT(!info->bitmap);
   2544	ASSERT(RB_EMPTY_NODE(&info->offset_index));
   2545
   2546	if (ctl->total_bitmaps > 0) {
   2547		bool stole_end;
   2548		bool stole_front = false;
   2549
   2550		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
   2551		if (ctl->total_bitmaps > 0)
   2552			stole_front = steal_from_bitmap_to_front(ctl, info,
   2553								 update_stat);
   2554
   2555		if (stole_end || stole_front)
   2556			try_merge_free_space(ctl, info, update_stat);
   2557	}
   2558}
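
/*
 * Editor's note: an illustrative summary, not part of the original source, of
 * what the helpers above do for a disconnected extent entry covering
 * [info->offset, info->offset + info->bytes):
 *
 *	steal_from_bitmap_to_end():   set bits starting right at the end of
 *				      the extent are cleared from their bitmap
 *				      and added to info->bytes;
 *	steal_from_bitmap_to_front(): set bits ending right before
 *				      info->offset are cleared and the extent
 *				      grows downwards by that amount.
 *
 * If either direction stole something, try_merge_free_space() runs again
 * because the enlarged extent may now touch another extent entry.
 */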
   2559
   2560int __btrfs_add_free_space(struct btrfs_block_group *block_group,
   2561			   u64 offset, u64 bytes,
   2562			   enum btrfs_trim_state trim_state)
   2563{
   2564	struct btrfs_fs_info *fs_info = block_group->fs_info;
   2565	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   2566	struct btrfs_free_space *info;
   2567	int ret = 0;
   2568	u64 filter_bytes = bytes;
   2569
   2570	ASSERT(!btrfs_is_zoned(fs_info));
   2571
   2572	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
   2573	if (!info)
   2574		return -ENOMEM;
   2575
   2576	info->offset = offset;
   2577	info->bytes = bytes;
   2578	info->trim_state = trim_state;
   2579	RB_CLEAR_NODE(&info->offset_index);
   2580	RB_CLEAR_NODE(&info->bytes_index);
   2581
   2582	spin_lock(&ctl->tree_lock);
   2583
   2584	if (try_merge_free_space(ctl, info, true))
   2585		goto link;
   2586
   2587	/*
   2588	 * There was no extent directly to the left or right of this new
    2589	 * extent, so we know we're going to have to allocate a new extent;
    2590	 * before we do that, see if we need to drop this into a bitmap.
   2591	 */
   2592	ret = insert_into_bitmap(ctl, info);
   2593	if (ret < 0) {
   2594		goto out;
   2595	} else if (ret) {
   2596		ret = 0;
   2597		goto out;
   2598	}
   2599link:
   2600	/*
   2601	 * Only steal free space from adjacent bitmaps if we're sure we're not
   2602	 * going to add the new free space to existing bitmap entries - because
   2603	 * that would mean unnecessary work that would be reverted. Therefore
   2604	 * attempt to steal space from bitmaps if we're adding an extent entry.
   2605	 */
   2606	steal_from_bitmap(ctl, info, true);
   2607
   2608	filter_bytes = max(filter_bytes, info->bytes);
   2609
   2610	ret = link_free_space(ctl, info);
   2611	if (ret)
   2612		kmem_cache_free(btrfs_free_space_cachep, info);
   2613out:
   2614	btrfs_discard_update_discardable(block_group);
   2615	spin_unlock(&ctl->tree_lock);
   2616
   2617	if (ret) {
   2618		btrfs_crit(fs_info, "unable to add free space :%d", ret);
   2619		ASSERT(ret != -EEXIST);
   2620	}
   2621
   2622	if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
   2623		btrfs_discard_check_filter(block_group, filter_bytes);
   2624		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
   2625	}
   2626
   2627	return ret;
   2628}
   2629
   2630static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
   2631					u64 bytenr, u64 size, bool used)
   2632{
   2633	struct btrfs_space_info *sinfo = block_group->space_info;
   2634	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   2635	u64 offset = bytenr - block_group->start;
   2636	u64 to_free, to_unusable;
   2637	int bg_reclaim_threshold = 0;
   2638	bool initial = (size == block_group->length);
   2639	u64 reclaimable_unusable;
   2640
   2641	WARN_ON(!initial && offset + size > block_group->zone_capacity);
   2642
   2643	if (!initial)
   2644		bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
   2645
   2646	spin_lock(&ctl->tree_lock);
   2647	if (!used)
   2648		to_free = size;
   2649	else if (initial)
   2650		to_free = block_group->zone_capacity;
   2651	else if (offset >= block_group->alloc_offset)
   2652		to_free = size;
   2653	else if (offset + size <= block_group->alloc_offset)
   2654		to_free = 0;
   2655	else
   2656		to_free = offset + size - block_group->alloc_offset;
   2657	to_unusable = size - to_free;
   2658
   2659	ctl->free_space += to_free;
   2660	/*
   2661	 * If the block group is read-only, we should account freed space into
   2662	 * bytes_readonly.
   2663	 */
   2664	if (!block_group->ro)
   2665		block_group->zone_unusable += to_unusable;
   2666	spin_unlock(&ctl->tree_lock);
   2667	if (!used) {
   2668		spin_lock(&block_group->lock);
   2669		block_group->alloc_offset -= size;
   2670		spin_unlock(&block_group->lock);
   2671	}
   2672
   2673	reclaimable_unusable = block_group->zone_unusable -
   2674			       (block_group->length - block_group->zone_capacity);
   2675	/* All the region is now unusable. Mark it as unused and reclaim */
   2676	if (block_group->zone_unusable == block_group->length) {
   2677		btrfs_mark_bg_unused(block_group);
   2678	} else if (bg_reclaim_threshold &&
   2679		   reclaimable_unusable >=
   2680		   div_factor_fine(block_group->zone_capacity,
   2681				   bg_reclaim_threshold)) {
   2682		btrfs_mark_bg_to_reclaim(block_group);
   2683	}
   2684
   2685	return 0;
   2686}
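
/*
 * Editor's note: a worked example, not part of the original source, of the
 * to_free/to_unusable split above for a used, non-initial range (offsets are
 * relative to the block group start):
 *
 *	alloc_offset = 96M, freed range [64M, 128M), size = 64M
 *
 *	offset < alloc_offset < offset + size, so
 *	to_free     = offset + size - alloc_offset = 32M
 *	to_unusable = size - to_free               = 32M
 *
 * Only the part at or beyond alloc_offset is returned as free space; the
 * rest is accounted as zone_unusable until the zone can be reset.
 */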
   2687
   2688int btrfs_add_free_space(struct btrfs_block_group *block_group,
   2689			 u64 bytenr, u64 size)
   2690{
   2691	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   2692
   2693	if (btrfs_is_zoned(block_group->fs_info))
   2694		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
   2695						    true);
   2696
   2697	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
   2698		trim_state = BTRFS_TRIM_STATE_TRIMMED;
   2699
   2700	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
   2701}
   2702
   2703int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
   2704				u64 bytenr, u64 size)
   2705{
   2706	if (btrfs_is_zoned(block_group->fs_info))
   2707		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
   2708						    false);
   2709
   2710	return btrfs_add_free_space(block_group, bytenr, size);
   2711}
   2712
   2713/*
   2714 * This is a subtle distinction because when adding free space back in general,
    2715 * we want it to be added as untrimmed for async discard. But in the case where we add
   2716 * it on loading of a block group, we want to consider it trimmed.
   2717 */
   2718int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
   2719				       u64 bytenr, u64 size)
   2720{
   2721	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   2722
   2723	if (btrfs_is_zoned(block_group->fs_info))
   2724		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
   2725						    true);
   2726
   2727	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
   2728	    btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
   2729		trim_state = BTRFS_TRIM_STATE_TRIMMED;
   2730
   2731	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
   2732}
   2733
   2734int btrfs_remove_free_space(struct btrfs_block_group *block_group,
   2735			    u64 offset, u64 bytes)
   2736{
   2737	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   2738	struct btrfs_free_space *info;
   2739	int ret;
   2740	bool re_search = false;
   2741
   2742	if (btrfs_is_zoned(block_group->fs_info)) {
   2743		/*
   2744		 * This can happen with conventional zones when replaying log.
   2745		 * Since the allocation info of tree-log nodes are not recorded
   2746		 * to the extent-tree, calculate_alloc_pointer() failed to
   2747		 * advance the allocation pointer after last allocated tree log
   2748		 * node blocks.
   2749		 *
   2750		 * This function is called from
   2751		 * btrfs_pin_extent_for_log_replay() when replaying the log.
   2752		 * Advance the pointer not to overwrite the tree-log nodes.
   2753		 */
   2754		if (block_group->start + block_group->alloc_offset <
   2755		    offset + bytes) {
   2756			block_group->alloc_offset =
   2757				offset + bytes - block_group->start;
   2758		}
   2759		return 0;
   2760	}
   2761
   2762	spin_lock(&ctl->tree_lock);
   2763
   2764again:
   2765	ret = 0;
   2766	if (!bytes)
   2767		goto out_lock;
   2768
   2769	info = tree_search_offset(ctl, offset, 0, 0);
   2770	if (!info) {
   2771		/*
   2772		 * oops didn't find an extent that matched the space we wanted
   2773		 * to remove, look for a bitmap instead
   2774		 */
   2775		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
   2776					  1, 0);
   2777		if (!info) {
   2778			/*
   2779			 * If we found a partial bit of our free space in a
   2780			 * bitmap but then couldn't find the other part this may
   2781			 * be a problem, so WARN about it.
   2782			 */
   2783			WARN_ON(re_search);
   2784			goto out_lock;
   2785		}
   2786	}
   2787
   2788	re_search = false;
   2789	if (!info->bitmap) {
   2790		unlink_free_space(ctl, info, true);
   2791		if (offset == info->offset) {
   2792			u64 to_free = min(bytes, info->bytes);
   2793
   2794			info->bytes -= to_free;
   2795			info->offset += to_free;
   2796			if (info->bytes) {
   2797				ret = link_free_space(ctl, info);
   2798				WARN_ON(ret);
   2799			} else {
   2800				kmem_cache_free(btrfs_free_space_cachep, info);
   2801			}
   2802
   2803			offset += to_free;
   2804			bytes -= to_free;
   2805			goto again;
   2806		} else {
   2807			u64 old_end = info->bytes + info->offset;
   2808
   2809			info->bytes = offset - info->offset;
   2810			ret = link_free_space(ctl, info);
   2811			WARN_ON(ret);
   2812			if (ret)
   2813				goto out_lock;
   2814
   2815			/* Not enough bytes in this entry to satisfy us */
   2816			if (old_end < offset + bytes) {
   2817				bytes -= old_end - offset;
   2818				offset = old_end;
   2819				goto again;
   2820			} else if (old_end == offset + bytes) {
   2821				/* all done */
   2822				goto out_lock;
   2823			}
   2824			spin_unlock(&ctl->tree_lock);
   2825
   2826			ret = __btrfs_add_free_space(block_group,
   2827						     offset + bytes,
   2828						     old_end - (offset + bytes),
   2829						     info->trim_state);
   2830			WARN_ON(ret);
   2831			goto out;
   2832		}
   2833	}
   2834
   2835	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
   2836	if (ret == -EAGAIN) {
   2837		re_search = true;
   2838		goto again;
   2839	}
   2840out_lock:
   2841	btrfs_discard_update_discardable(block_group);
   2842	spin_unlock(&ctl->tree_lock);
   2843out:
   2844	return ret;
   2845}
   2846
   2847void btrfs_dump_free_space(struct btrfs_block_group *block_group,
   2848			   u64 bytes)
   2849{
   2850	struct btrfs_fs_info *fs_info = block_group->fs_info;
   2851	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   2852	struct btrfs_free_space *info;
   2853	struct rb_node *n;
   2854	int count = 0;
   2855
   2856	/*
   2857	 * Zoned btrfs does not use free space tree and cluster. Just print
   2858	 * out the free space after the allocation offset.
   2859	 */
   2860	if (btrfs_is_zoned(fs_info)) {
   2861		btrfs_info(fs_info, "free space %llu active %d",
   2862			   block_group->zone_capacity - block_group->alloc_offset,
   2863			   block_group->zone_is_active);
   2864		return;
   2865	}
   2866
   2867	spin_lock(&ctl->tree_lock);
   2868	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
   2869		info = rb_entry(n, struct btrfs_free_space, offset_index);
   2870		if (info->bytes >= bytes && !block_group->ro)
   2871			count++;
   2872		btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
   2873			   info->offset, info->bytes,
    2874			   (info->bitmap) ? "yes" : "no");
   2875	}
   2876	spin_unlock(&ctl->tree_lock);
   2877	btrfs_info(fs_info, "block group has cluster?: %s",
    2878		   list_empty(&block_group->cluster_list) ? "no" : "yes");
   2879	btrfs_info(fs_info,
   2880		   "%d blocks of free space at or bigger than bytes is", count);
   2881}
   2882
   2883void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
   2884			       struct btrfs_free_space_ctl *ctl)
   2885{
   2886	struct btrfs_fs_info *fs_info = block_group->fs_info;
   2887
   2888	spin_lock_init(&ctl->tree_lock);
   2889	ctl->unit = fs_info->sectorsize;
   2890	ctl->start = block_group->start;
   2891	ctl->block_group = block_group;
   2892	ctl->op = &free_space_op;
   2893	ctl->free_space_bytes = RB_ROOT_CACHED;
   2894	INIT_LIST_HEAD(&ctl->trimming_ranges);
   2895	mutex_init(&ctl->cache_writeout_mutex);
   2896
   2897	/*
   2898	 * we only want to have 32k of ram per block group for keeping
   2899	 * track of free space, and if we pass 1/2 of that we want to
   2900	 * start converting things over to using bitmaps
   2901	 */
   2902	ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
   2903}
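
/*
 * Editor's note: illustrative arithmetic, not part of the original source,
 * for the threshold above:
 *
 *	extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space)
 *
 * e.g. with a hypothetical 64-byte entry that works out to 256 extent
 * entries.  Once the cache holds more extent entries than this, use_bitmap()
 * starts steering small free ranges into bitmaps, and
 * recalculate_thresholds() re-derives the limit as bitmaps are added or
 * freed.
 */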
   2904
   2905/*
   2906 * for a given cluster, put all of its extents back into the free
   2907 * space cache.  If the block group passed doesn't match the block group
   2908 * pointed to by the cluster, someone else raced in and freed the
   2909 * cluster already.  In that case, we just return without changing anything
   2910 */
   2911static void __btrfs_return_cluster_to_free_space(
   2912			     struct btrfs_block_group *block_group,
   2913			     struct btrfs_free_cluster *cluster)
   2914{
   2915	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   2916	struct btrfs_free_space *entry;
   2917	struct rb_node *node;
   2918
   2919	spin_lock(&cluster->lock);
   2920	if (cluster->block_group != block_group) {
   2921		spin_unlock(&cluster->lock);
   2922		return;
   2923	}
   2924
   2925	cluster->block_group = NULL;
   2926	cluster->window_start = 0;
   2927	list_del_init(&cluster->block_group_list);
   2928
   2929	node = rb_first(&cluster->root);
   2930	while (node) {
   2931		bool bitmap;
   2932
   2933		entry = rb_entry(node, struct btrfs_free_space, offset_index);
   2934		node = rb_next(&entry->offset_index);
   2935		rb_erase(&entry->offset_index, &cluster->root);
   2936		RB_CLEAR_NODE(&entry->offset_index);
   2937
   2938		bitmap = (entry->bitmap != NULL);
   2939		if (!bitmap) {
   2940			/* Merging treats extents as if they were new */
   2941			if (!btrfs_free_space_trimmed(entry)) {
   2942				ctl->discardable_extents[BTRFS_STAT_CURR]--;
   2943				ctl->discardable_bytes[BTRFS_STAT_CURR] -=
   2944					entry->bytes;
   2945			}
   2946
   2947			try_merge_free_space(ctl, entry, false);
   2948			steal_from_bitmap(ctl, entry, false);
   2949
   2950			/* As we insert directly, update these statistics */
   2951			if (!btrfs_free_space_trimmed(entry)) {
   2952				ctl->discardable_extents[BTRFS_STAT_CURR]++;
   2953				ctl->discardable_bytes[BTRFS_STAT_CURR] +=
   2954					entry->bytes;
   2955			}
   2956		}
   2957		tree_insert_offset(&ctl->free_space_offset,
   2958				   entry->offset, &entry->offset_index, bitmap);
   2959		rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
   2960			      entry_less);
   2961	}
   2962	cluster->root = RB_ROOT;
   2963	spin_unlock(&cluster->lock);
   2964	btrfs_put_block_group(block_group);
   2965}
   2966
   2967static void __btrfs_remove_free_space_cache_locked(
   2968				struct btrfs_free_space_ctl *ctl)
   2969{
   2970	struct btrfs_free_space *info;
   2971	struct rb_node *node;
   2972
   2973	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
   2974		info = rb_entry(node, struct btrfs_free_space, offset_index);
   2975		if (!info->bitmap) {
   2976			unlink_free_space(ctl, info, true);
   2977			kmem_cache_free(btrfs_free_space_cachep, info);
   2978		} else {
   2979			free_bitmap(ctl, info);
   2980		}
   2981
   2982		cond_resched_lock(&ctl->tree_lock);
   2983	}
   2984}
   2985
   2986void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
   2987{
   2988	spin_lock(&ctl->tree_lock);
   2989	__btrfs_remove_free_space_cache_locked(ctl);
   2990	if (ctl->block_group)
   2991		btrfs_discard_update_discardable(ctl->block_group);
   2992	spin_unlock(&ctl->tree_lock);
   2993}
   2994
   2995void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
   2996{
   2997	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   2998	struct btrfs_free_cluster *cluster;
   2999	struct list_head *head;
   3000
   3001	spin_lock(&ctl->tree_lock);
   3002	while ((head = block_group->cluster_list.next) !=
   3003	       &block_group->cluster_list) {
   3004		cluster = list_entry(head, struct btrfs_free_cluster,
   3005				     block_group_list);
   3006
   3007		WARN_ON(cluster->block_group != block_group);
   3008		__btrfs_return_cluster_to_free_space(block_group, cluster);
   3009
   3010		cond_resched_lock(&ctl->tree_lock);
   3011	}
   3012	__btrfs_remove_free_space_cache_locked(ctl);
   3013	btrfs_discard_update_discardable(block_group);
   3014	spin_unlock(&ctl->tree_lock);
   3015
   3016}
   3017
   3018/**
   3019 * btrfs_is_free_space_trimmed - see if everything is trimmed
   3020 * @block_group: block_group of interest
   3021 *
   3022 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
   3023 */
   3024bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
   3025{
   3026	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3027	struct btrfs_free_space *info;
   3028	struct rb_node *node;
   3029	bool ret = true;
   3030
   3031	spin_lock(&ctl->tree_lock);
   3032	node = rb_first(&ctl->free_space_offset);
   3033
   3034	while (node) {
   3035		info = rb_entry(node, struct btrfs_free_space, offset_index);
   3036
   3037		if (!btrfs_free_space_trimmed(info)) {
   3038			ret = false;
   3039			break;
   3040		}
   3041
   3042		node = rb_next(node);
   3043	}
   3044
   3045	spin_unlock(&ctl->tree_lock);
   3046	return ret;
   3047}
   3048
   3049u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
   3050			       u64 offset, u64 bytes, u64 empty_size,
   3051			       u64 *max_extent_size)
   3052{
   3053	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3054	struct btrfs_discard_ctl *discard_ctl =
   3055					&block_group->fs_info->discard_ctl;
   3056	struct btrfs_free_space *entry = NULL;
   3057	u64 bytes_search = bytes + empty_size;
   3058	u64 ret = 0;
   3059	u64 align_gap = 0;
   3060	u64 align_gap_len = 0;
   3061	enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   3062	bool use_bytes_index = (offset == block_group->start);
   3063
   3064	ASSERT(!btrfs_is_zoned(block_group->fs_info));
   3065
   3066	spin_lock(&ctl->tree_lock);
   3067	entry = find_free_space(ctl, &offset, &bytes_search,
   3068				block_group->full_stripe_len, max_extent_size,
   3069				use_bytes_index);
   3070	if (!entry)
   3071		goto out;
   3072
   3073	ret = offset;
   3074	if (entry->bitmap) {
   3075		bitmap_clear_bits(ctl, entry, offset, bytes, true);
   3076
   3077		if (!btrfs_free_space_trimmed(entry))
   3078			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
   3079
   3080		if (!entry->bytes)
   3081			free_bitmap(ctl, entry);
   3082	} else {
   3083		unlink_free_space(ctl, entry, true);
   3084		align_gap_len = offset - entry->offset;
   3085		align_gap = entry->offset;
   3086		align_gap_trim_state = entry->trim_state;
   3087
   3088		if (!btrfs_free_space_trimmed(entry))
   3089			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
   3090
   3091		entry->offset = offset + bytes;
   3092		WARN_ON(entry->bytes < bytes + align_gap_len);
   3093
   3094		entry->bytes -= bytes + align_gap_len;
   3095		if (!entry->bytes)
   3096			kmem_cache_free(btrfs_free_space_cachep, entry);
   3097		else
   3098			link_free_space(ctl, entry);
   3099	}
   3100out:
   3101	btrfs_discard_update_discardable(block_group);
   3102	spin_unlock(&ctl->tree_lock);
   3103
   3104	if (align_gap_len)
   3105		__btrfs_add_free_space(block_group, align_gap, align_gap_len,
   3106				       align_gap_trim_state);
   3107	return ret;
   3108}
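
/*
 * Editor's note: an illustrative walk-through, not part of the original
 * source, of the alignment-gap handling above for an extent entry, assuming
 * full_stripe_len == 64K:
 *
 *	entry:     offset = 1M + 4K, bytes = 192K
 *	request:   bytes = 128K, so find_free_space() returns the aligned
 *		   offset 1M + 64K
 *	result:    align_gap = 1M + 4K, align_gap_len = 60K, and the entry is
 *		   shrunk to start at (1M + 64K) + 128K
 *
 * The skipped 60K in front of the aligned offset is handed back through
 * __btrfs_add_free_space() after the tree lock is dropped, keeping its
 * original trim state.
 */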
   3109
   3110/*
   3111 * given a cluster, put all of its extents back into the free space
   3112 * cache.  If a block group is passed, this function will only free
   3113 * a cluster that belongs to the passed block group.
   3114 *
   3115 * Otherwise, it'll get a reference on the block group pointed to by the
   3116 * cluster and remove the cluster from it.
   3117 */
   3118void btrfs_return_cluster_to_free_space(
   3119			       struct btrfs_block_group *block_group,
   3120			       struct btrfs_free_cluster *cluster)
   3121{
   3122	struct btrfs_free_space_ctl *ctl;
   3123
   3124	/* first, get a safe pointer to the block group */
   3125	spin_lock(&cluster->lock);
   3126	if (!block_group) {
   3127		block_group = cluster->block_group;
   3128		if (!block_group) {
   3129			spin_unlock(&cluster->lock);
   3130			return;
   3131		}
   3132	} else if (cluster->block_group != block_group) {
    3133		/* someone else has already freed it, don't redo their work */
   3134		spin_unlock(&cluster->lock);
   3135		return;
   3136	}
   3137	btrfs_get_block_group(block_group);
   3138	spin_unlock(&cluster->lock);
   3139
   3140	ctl = block_group->free_space_ctl;
   3141
   3142	/* now return any extents the cluster had on it */
   3143	spin_lock(&ctl->tree_lock);
   3144	__btrfs_return_cluster_to_free_space(block_group, cluster);
   3145	spin_unlock(&ctl->tree_lock);
   3146
   3147	btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
   3148
   3149	/* finally drop our ref */
   3150	btrfs_put_block_group(block_group);
   3151}
   3152
   3153static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
   3154				   struct btrfs_free_cluster *cluster,
   3155				   struct btrfs_free_space *entry,
   3156				   u64 bytes, u64 min_start,
   3157				   u64 *max_extent_size)
   3158{
   3159	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3160	int err;
   3161	u64 search_start = cluster->window_start;
   3162	u64 search_bytes = bytes;
   3163	u64 ret = 0;
   3164
   3165	search_start = min_start;
   3166	search_bytes = bytes;
   3167
   3168	err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
   3169	if (err) {
   3170		*max_extent_size = max(get_max_extent_size(entry),
   3171				       *max_extent_size);
   3172		return 0;
   3173	}
   3174
   3175	ret = search_start;
   3176	bitmap_clear_bits(ctl, entry, ret, bytes, false);
   3177
   3178	return ret;
   3179}
   3180
   3181/*
   3182 * given a cluster, try to allocate 'bytes' from it, returns 0
   3183 * if it couldn't find anything suitably large, or a logical disk offset
   3184 * if things worked out
   3185 */
   3186u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
   3187			     struct btrfs_free_cluster *cluster, u64 bytes,
   3188			     u64 min_start, u64 *max_extent_size)
   3189{
   3190	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3191	struct btrfs_discard_ctl *discard_ctl =
   3192					&block_group->fs_info->discard_ctl;
   3193	struct btrfs_free_space *entry = NULL;
   3194	struct rb_node *node;
   3195	u64 ret = 0;
   3196
   3197	ASSERT(!btrfs_is_zoned(block_group->fs_info));
   3198
   3199	spin_lock(&cluster->lock);
   3200	if (bytes > cluster->max_size)
   3201		goto out;
   3202
   3203	if (cluster->block_group != block_group)
   3204		goto out;
   3205
   3206	node = rb_first(&cluster->root);
   3207	if (!node)
   3208		goto out;
   3209
   3210	entry = rb_entry(node, struct btrfs_free_space, offset_index);
   3211	while (1) {
   3212		if (entry->bytes < bytes)
   3213			*max_extent_size = max(get_max_extent_size(entry),
   3214					       *max_extent_size);
   3215
   3216		if (entry->bytes < bytes ||
   3217		    (!entry->bitmap && entry->offset < min_start)) {
   3218			node = rb_next(&entry->offset_index);
   3219			if (!node)
   3220				break;
   3221			entry = rb_entry(node, struct btrfs_free_space,
   3222					 offset_index);
   3223			continue;
   3224		}
   3225
   3226		if (entry->bitmap) {
   3227			ret = btrfs_alloc_from_bitmap(block_group,
   3228						      cluster, entry, bytes,
   3229						      cluster->window_start,
   3230						      max_extent_size);
   3231			if (ret == 0) {
   3232				node = rb_next(&entry->offset_index);
   3233				if (!node)
   3234					break;
   3235				entry = rb_entry(node, struct btrfs_free_space,
   3236						 offset_index);
   3237				continue;
   3238			}
   3239			cluster->window_start += bytes;
   3240		} else {
   3241			ret = entry->offset;
   3242
   3243			entry->offset += bytes;
   3244			entry->bytes -= bytes;
   3245		}
   3246
   3247		break;
   3248	}
   3249out:
   3250	spin_unlock(&cluster->lock);
   3251
   3252	if (!ret)
   3253		return 0;
   3254
   3255	spin_lock(&ctl->tree_lock);
   3256
   3257	if (!btrfs_free_space_trimmed(entry))
   3258		atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
   3259
   3260	ctl->free_space -= bytes;
   3261	if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
   3262		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
   3263
   3264	spin_lock(&cluster->lock);
   3265	if (entry->bytes == 0) {
   3266		rb_erase(&entry->offset_index, &cluster->root);
   3267		ctl->free_extents--;
   3268		if (entry->bitmap) {
   3269			kmem_cache_free(btrfs_free_space_bitmap_cachep,
   3270					entry->bitmap);
   3271			ctl->total_bitmaps--;
   3272			recalculate_thresholds(ctl);
   3273		} else if (!btrfs_free_space_trimmed(entry)) {
   3274			ctl->discardable_extents[BTRFS_STAT_CURR]--;
   3275		}
   3276		kmem_cache_free(btrfs_free_space_cachep, entry);
   3277	}
   3278
   3279	spin_unlock(&cluster->lock);
   3280	spin_unlock(&ctl->tree_lock);
   3281
   3282	return ret;
   3283}
   3284
   3285static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
   3286				struct btrfs_free_space *entry,
   3287				struct btrfs_free_cluster *cluster,
   3288				u64 offset, u64 bytes,
   3289				u64 cont1_bytes, u64 min_bytes)
   3290{
   3291	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3292	unsigned long next_zero;
   3293	unsigned long i;
   3294	unsigned long want_bits;
   3295	unsigned long min_bits;
   3296	unsigned long found_bits;
   3297	unsigned long max_bits = 0;
   3298	unsigned long start = 0;
   3299	unsigned long total_found = 0;
   3300	int ret;
   3301
   3302	i = offset_to_bit(entry->offset, ctl->unit,
   3303			  max_t(u64, offset, entry->offset));
   3304	want_bits = bytes_to_bits(bytes, ctl->unit);
   3305	min_bits = bytes_to_bits(min_bytes, ctl->unit);
   3306
   3307	/*
   3308	 * Don't bother looking for a cluster in this bitmap if it's heavily
   3309	 * fragmented.
   3310	 */
   3311	if (entry->max_extent_size &&
   3312	    entry->max_extent_size < cont1_bytes)
   3313		return -ENOSPC;
   3314again:
   3315	found_bits = 0;
   3316	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
   3317		next_zero = find_next_zero_bit(entry->bitmap,
   3318					       BITS_PER_BITMAP, i);
   3319		if (next_zero - i >= min_bits) {
   3320			found_bits = next_zero - i;
   3321			if (found_bits > max_bits)
   3322				max_bits = found_bits;
   3323			break;
   3324		}
   3325		if (next_zero - i > max_bits)
   3326			max_bits = next_zero - i;
   3327		i = next_zero;
   3328	}
   3329
   3330	if (!found_bits) {
   3331		entry->max_extent_size = (u64)max_bits * ctl->unit;
   3332		return -ENOSPC;
   3333	}
   3334
   3335	if (!total_found) {
   3336		start = i;
   3337		cluster->max_size = 0;
   3338	}
   3339
   3340	total_found += found_bits;
   3341
   3342	if (cluster->max_size < found_bits * ctl->unit)
   3343		cluster->max_size = found_bits * ctl->unit;
   3344
   3345	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
   3346		i = next_zero + 1;
   3347		goto again;
   3348	}
   3349
   3350	cluster->window_start = start * ctl->unit + entry->offset;
   3351	rb_erase(&entry->offset_index, &ctl->free_space_offset);
   3352	rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
   3353
   3354	/*
   3355	 * We need to know if we're currently on the normal space index when we
   3356	 * manipulate the bitmap so that we know we need to remove and re-insert
   3357	 * it into the space_index tree.  Clear the bytes_index node here so the
   3358	 * bitmap manipulation helpers know not to mess with the space_index
   3359	 * until this bitmap entry is added back into the normal cache.
   3360	 */
   3361	RB_CLEAR_NODE(&entry->bytes_index);
   3362
   3363	ret = tree_insert_offset(&cluster->root, entry->offset,
   3364				 &entry->offset_index, 1);
   3365	ASSERT(!ret); /* -EEXIST; Logic error */
   3366
   3367	trace_btrfs_setup_cluster(block_group, cluster,
   3368				  total_found * ctl->unit, 1);
   3369	return 0;
   3370}
   3371
   3372/*
   3373 * This searches the block group for just extents to fill the cluster with.
   3374 * Try to find a cluster with at least bytes total bytes, at least one
    3375 * extent of cont1_bytes, and other extents of at least min_bytes.
   3376 */
   3377static noinline int
   3378setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
   3379			struct btrfs_free_cluster *cluster,
   3380			struct list_head *bitmaps, u64 offset, u64 bytes,
   3381			u64 cont1_bytes, u64 min_bytes)
   3382{
   3383	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3384	struct btrfs_free_space *first = NULL;
   3385	struct btrfs_free_space *entry = NULL;
   3386	struct btrfs_free_space *last;
   3387	struct rb_node *node;
   3388	u64 window_free;
   3389	u64 max_extent;
   3390	u64 total_size = 0;
   3391
   3392	entry = tree_search_offset(ctl, offset, 0, 1);
   3393	if (!entry)
   3394		return -ENOSPC;
   3395
   3396	/*
   3397	 * We don't want bitmaps, so just move along until we find a normal
   3398	 * extent entry.
   3399	 */
   3400	while (entry->bitmap || entry->bytes < min_bytes) {
   3401		if (entry->bitmap && list_empty(&entry->list))
   3402			list_add_tail(&entry->list, bitmaps);
   3403		node = rb_next(&entry->offset_index);
   3404		if (!node)
   3405			return -ENOSPC;
   3406		entry = rb_entry(node, struct btrfs_free_space, offset_index);
   3407	}
   3408
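        	/*
        	 * Walk forward from the first usable extent entry, summing every
        	 * extent of at least min_bytes into window_free and remembering
        	 * the largest single extent in max_extent.  Bitmaps seen along
        	 * the way are stashed on @bitmaps for setup_cluster_bitmap().
        	 */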
   3409	window_free = entry->bytes;
   3410	max_extent = entry->bytes;
   3411	first = entry;
   3412	last = entry;
   3413
   3414	for (node = rb_next(&entry->offset_index); node;
   3415	     node = rb_next(&entry->offset_index)) {
   3416		entry = rb_entry(node, struct btrfs_free_space, offset_index);
   3417
   3418		if (entry->bitmap) {
   3419			if (list_empty(&entry->list))
   3420				list_add_tail(&entry->list, bitmaps);
   3421			continue;
   3422		}
   3423
   3424		if (entry->bytes < min_bytes)
   3425			continue;
   3426
   3427		last = entry;
   3428		window_free += entry->bytes;
   3429		if (entry->bytes > max_extent)
   3430			max_extent = entry->bytes;
   3431	}
   3432
   3433	if (window_free < bytes || max_extent < cont1_bytes)
   3434		return -ENOSPC;
   3435
   3436	cluster->window_start = first->offset;
   3437
   3438	node = &first->offset_index;
   3439
   3440	/*
   3441	 * now we've found our entries, pull them out of the free space
   3442	 * cache and put them into the cluster rbtree
   3443	 */
   3444	do {
   3445		int ret;
   3446
   3447		entry = rb_entry(node, struct btrfs_free_space, offset_index);
   3448		node = rb_next(&entry->offset_index);
   3449		if (entry->bitmap || entry->bytes < min_bytes)
   3450			continue;
   3451
   3452		rb_erase(&entry->offset_index, &ctl->free_space_offset);
   3453		rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
   3454		ret = tree_insert_offset(&cluster->root, entry->offset,
   3455					 &entry->offset_index, 0);
   3456		total_size += entry->bytes;
   3457		ASSERT(!ret); /* -EEXIST; Logic error */
   3458	} while (node && entry != last);
   3459
   3460	cluster->max_size = max_extent;
   3461	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
   3462	return 0;
   3463}
   3464
   3465/*
    3466 * This specifically looks for bitmaps that may work in the cluster; we assume
   3467 * that we have already failed to find extents that will work.
   3468 */
   3469static noinline int
   3470setup_cluster_bitmap(struct btrfs_block_group *block_group,
   3471		     struct btrfs_free_cluster *cluster,
   3472		     struct list_head *bitmaps, u64 offset, u64 bytes,
   3473		     u64 cont1_bytes, u64 min_bytes)
   3474{
   3475	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3476	struct btrfs_free_space *entry = NULL;
   3477	int ret = -ENOSPC;
   3478	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
   3479
   3480	if (ctl->total_bitmaps == 0)
   3481		return -ENOSPC;
   3482
   3483	/*
   3484	 * The bitmap that covers offset won't be in the list unless offset
   3485	 * is just its start offset.
   3486	 */
   3487	if (!list_empty(bitmaps))
   3488		entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
   3489
   3490	if (!entry || entry->offset != bitmap_offset) {
   3491		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
   3492		if (entry && list_empty(&entry->list))
   3493			list_add(&entry->list, bitmaps);
   3494	}
   3495
   3496	list_for_each_entry(entry, bitmaps, list) {
   3497		if (entry->bytes < bytes)
   3498			continue;
   3499		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
   3500					   bytes, cont1_bytes, min_bytes);
   3501		if (!ret)
   3502			return 0;
   3503	}
   3504
   3505	/*
   3506	 * The bitmaps list has all the bitmaps that record free space
   3507	 * starting after offset, so no more search is required.
   3508	 */
   3509	return -ENOSPC;
   3510}
   3511
   3512/*
   3513 * here we try to find a cluster of blocks in a block group.  The goal
   3514 * is to find at least bytes+empty_size.
   3515 * We might not find them all in one contiguous area.
   3516 *
   3517 * returns zero and sets up cluster if things worked out, otherwise
    3518 * it returns -ENOSPC
   3519 */
   3520int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
   3521			     struct btrfs_free_cluster *cluster,
   3522			     u64 offset, u64 bytes, u64 empty_size)
   3523{
   3524	struct btrfs_fs_info *fs_info = block_group->fs_info;
   3525	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3526	struct btrfs_free_space *entry, *tmp;
   3527	LIST_HEAD(bitmaps);
   3528	u64 min_bytes;
   3529	u64 cont1_bytes;
   3530	int ret;
   3531
   3532	/*
   3533	 * Choose the minimum extent size we'll require for this
   3534	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
    3535	 * For metadata, allow allocations with smaller extents.  For
   3536	 * data, keep it dense.
   3537	 */
   3538	if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
   3539		cont1_bytes = min_bytes = bytes + empty_size;
   3540	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
   3541		cont1_bytes = bytes;
   3542		min_bytes = fs_info->sectorsize;
   3543	} else {
   3544		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
   3545		min_bytes = fs_info->sectorsize;
   3546	}
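        	/*
        	 * For example, a data cluster request with bytes == SZ_1M and
        	 * empty_size == SZ_1M ends up with cont1_bytes == SZ_1M (the
        	 * larger of 1M and 2M >> 2) and min_bytes of one sectorsize.
        	 */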
   3547
   3548	spin_lock(&ctl->tree_lock);
   3549
   3550	/*
   3551	 * If we know we don't have enough space to make a cluster don't even
   3552	 * bother doing all the work to try and find one.
   3553	 */
   3554	if (ctl->free_space < bytes) {
   3555		spin_unlock(&ctl->tree_lock);
   3556		return -ENOSPC;
   3557	}
   3558
   3559	spin_lock(&cluster->lock);
   3560
   3561	/* someone already found a cluster, hooray */
   3562	if (cluster->block_group) {
   3563		ret = 0;
   3564		goto out;
   3565	}
   3566
   3567	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
   3568				 min_bytes);
   3569
   3570	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
   3571				      bytes + empty_size,
   3572				      cont1_bytes, min_bytes);
   3573	if (ret)
   3574		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
   3575					   offset, bytes + empty_size,
   3576					   cont1_bytes, min_bytes);
   3577
   3578	/* Clear our temporary list */
   3579	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
   3580		list_del_init(&entry->list);
   3581
   3582	if (!ret) {
   3583		btrfs_get_block_group(block_group);
   3584		list_add_tail(&cluster->block_group_list,
   3585			      &block_group->cluster_list);
   3586		cluster->block_group = block_group;
   3587	} else {
   3588		trace_btrfs_failed_cluster_setup(block_group);
   3589	}
   3590out:
   3591	spin_unlock(&cluster->lock);
   3592	spin_unlock(&ctl->tree_lock);
   3593
   3594	return ret;
   3595}
   3596
   3597/*
   3598 * simple code to zero out a cluster
   3599 */
   3600void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
   3601{
   3602	spin_lock_init(&cluster->lock);
   3603	spin_lock_init(&cluster->refill_lock);
   3604	cluster->root = RB_ROOT;
   3605	cluster->max_size = 0;
   3606	cluster->fragmented = false;
   3607	INIT_LIST_HEAD(&cluster->block_group_list);
   3608	cluster->block_group = NULL;
   3609}
   3610
   3611static int do_trimming(struct btrfs_block_group *block_group,
   3612		       u64 *total_trimmed, u64 start, u64 bytes,
   3613		       u64 reserved_start, u64 reserved_bytes,
   3614		       enum btrfs_trim_state reserved_trim_state,
   3615		       struct btrfs_trim_range *trim_entry)
   3616{
   3617	struct btrfs_space_info *space_info = block_group->space_info;
   3618	struct btrfs_fs_info *fs_info = block_group->fs_info;
   3619	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3620	int ret;
   3621	int update = 0;
   3622	const u64 end = start + bytes;
   3623	const u64 reserved_end = reserved_start + reserved_bytes;
   3624	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   3625	u64 trimmed = 0;
   3626
   3627	spin_lock(&space_info->lock);
   3628	spin_lock(&block_group->lock);
   3629	if (!block_group->ro) {
   3630		block_group->reserved += reserved_bytes;
   3631		space_info->bytes_reserved += reserved_bytes;
   3632		update = 1;
   3633	}
   3634	spin_unlock(&block_group->lock);
   3635	spin_unlock(&space_info->lock);
   3636
   3637	ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
   3638	if (!ret) {
   3639		*total_trimmed += trimmed;
   3640		trim_state = BTRFS_TRIM_STATE_TRIMMED;
   3641	}
   3642
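        	/*
        	 * Give back what was reserved but not discarded: the parts of the
        	 * reserved range outside [start, end) keep their original trim
        	 * state, while the range handed to btrfs_discard_extent() is
        	 * re-added with the resulting trim_state.
        	 */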
   3643	mutex_lock(&ctl->cache_writeout_mutex);
   3644	if (reserved_start < start)
   3645		__btrfs_add_free_space(block_group, reserved_start,
   3646				       start - reserved_start,
   3647				       reserved_trim_state);
   3648	if (start + bytes < reserved_start + reserved_bytes)
   3649		__btrfs_add_free_space(block_group, end, reserved_end - end,
   3650				       reserved_trim_state);
   3651	__btrfs_add_free_space(block_group, start, bytes, trim_state);
   3652	list_del(&trim_entry->list);
   3653	mutex_unlock(&ctl->cache_writeout_mutex);
   3654
   3655	if (update) {
   3656		spin_lock(&space_info->lock);
   3657		spin_lock(&block_group->lock);
   3658		if (block_group->ro)
   3659			space_info->bytes_readonly += reserved_bytes;
   3660		block_group->reserved -= reserved_bytes;
   3661		space_info->bytes_reserved -= reserved_bytes;
   3662		spin_unlock(&block_group->lock);
   3663		spin_unlock(&space_info->lock);
   3664	}
   3665
   3666	return ret;
   3667}
   3668
   3669/*
   3670 * If @async is set, then we will trim 1 region and return.
   3671 */
   3672static int trim_no_bitmap(struct btrfs_block_group *block_group,
   3673			  u64 *total_trimmed, u64 start, u64 end, u64 minlen,
   3674			  bool async)
   3675{
   3676	struct btrfs_discard_ctl *discard_ctl =
   3677					&block_group->fs_info->discard_ctl;
   3678	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3679	struct btrfs_free_space *entry;
   3680	struct rb_node *node;
   3681	int ret = 0;
   3682	u64 extent_start;
   3683	u64 extent_bytes;
   3684	enum btrfs_trim_state extent_trim_state;
   3685	u64 bytes;
   3686	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
   3687
   3688	while (start < end) {
   3689		struct btrfs_trim_range trim_entry;
   3690
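        		/*
        		 * Both locks are retaken on every iteration so that trimming
        		 * does not monopolize them.  The range being discarded is
        		 * added to ctl->trimming_ranges below so the free space cache
        		 * writeout still accounts for it while the entry is unlinked.
        		 */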
   3691		mutex_lock(&ctl->cache_writeout_mutex);
   3692		spin_lock(&ctl->tree_lock);
   3693
   3694		if (ctl->free_space < minlen)
   3695			goto out_unlock;
   3696
   3697		entry = tree_search_offset(ctl, start, 0, 1);
   3698		if (!entry)
   3699			goto out_unlock;
   3700
    3701		/* Skip bitmaps and, if async, already trimmed entries */
   3702		while (entry->bitmap ||
   3703		       (async && btrfs_free_space_trimmed(entry))) {
   3704			node = rb_next(&entry->offset_index);
   3705			if (!node)
   3706				goto out_unlock;
   3707			entry = rb_entry(node, struct btrfs_free_space,
   3708					 offset_index);
   3709		}
   3710
   3711		if (entry->offset >= end)
   3712			goto out_unlock;
   3713
   3714		extent_start = entry->offset;
   3715		extent_bytes = entry->bytes;
   3716		extent_trim_state = entry->trim_state;
   3717		if (async) {
   3718			start = entry->offset;
   3719			bytes = entry->bytes;
   3720			if (bytes < minlen) {
   3721				spin_unlock(&ctl->tree_lock);
   3722				mutex_unlock(&ctl->cache_writeout_mutex);
   3723				goto next;
   3724			}
   3725			unlink_free_space(ctl, entry, true);
   3726			/*
   3727			 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
   3728			 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
   3729			 * X when we come back around.  So trim it now.
   3730			 */
   3731			if (max_discard_size &&
   3732			    bytes >= (max_discard_size +
   3733				      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
   3734				bytes = max_discard_size;
   3735				extent_bytes = max_discard_size;
   3736				entry->offset += max_discard_size;
   3737				entry->bytes -= max_discard_size;
   3738				link_free_space(ctl, entry);
   3739			} else {
   3740				kmem_cache_free(btrfs_free_space_cachep, entry);
   3741			}
   3742		} else {
   3743			start = max(start, extent_start);
   3744			bytes = min(extent_start + extent_bytes, end) - start;
   3745			if (bytes < minlen) {
   3746				spin_unlock(&ctl->tree_lock);
   3747				mutex_unlock(&ctl->cache_writeout_mutex);
   3748				goto next;
   3749			}
   3750
   3751			unlink_free_space(ctl, entry, true);
   3752			kmem_cache_free(btrfs_free_space_cachep, entry);
   3753		}
   3754
   3755		spin_unlock(&ctl->tree_lock);
   3756		trim_entry.start = extent_start;
   3757		trim_entry.bytes = extent_bytes;
   3758		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
   3759		mutex_unlock(&ctl->cache_writeout_mutex);
   3760
   3761		ret = do_trimming(block_group, total_trimmed, start, bytes,
   3762				  extent_start, extent_bytes, extent_trim_state,
   3763				  &trim_entry);
   3764		if (ret) {
   3765			block_group->discard_cursor = start + bytes;
   3766			break;
   3767		}
   3768next:
   3769		start += bytes;
   3770		block_group->discard_cursor = start;
   3771		if (async && *total_trimmed)
   3772			break;
   3773
   3774		if (fatal_signal_pending(current)) {
   3775			ret = -ERESTARTSYS;
   3776			break;
   3777		}
   3778
   3779		cond_resched();
   3780	}
   3781
   3782	return ret;
   3783
   3784out_unlock:
   3785	block_group->discard_cursor = btrfs_block_group_end(block_group);
   3786	spin_unlock(&ctl->tree_lock);
   3787	mutex_unlock(&ctl->cache_writeout_mutex);
   3788
   3789	return ret;
   3790}
   3791
   3792/*
   3793 * If we break out of trimming a bitmap prematurely, we should reset the
    3794 * trimming bit.  In a rather contrived case, it's possible to race here so
   3795 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
   3796 *
   3797 * start = start of bitmap
   3798 * end = near end of bitmap
   3799 *
   3800 * Thread 1:			Thread 2:
   3801 * trim_bitmaps(start)
   3802 *				trim_bitmaps(end)
   3803 *				end_trimming_bitmap()
   3804 * reset_trimming_bitmap()
   3805 */
   3806static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
   3807{
   3808	struct btrfs_free_space *entry;
   3809
   3810	spin_lock(&ctl->tree_lock);
   3811	entry = tree_search_offset(ctl, offset, 1, 0);
   3812	if (entry) {
   3813		if (btrfs_free_space_trimmed(entry)) {
   3814			ctl->discardable_extents[BTRFS_STAT_CURR] +=
   3815				entry->bitmap_extents;
   3816			ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
   3817		}
   3818		entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   3819	}
   3820
   3821	spin_unlock(&ctl->tree_lock);
   3822}
   3823
   3824static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
   3825				struct btrfs_free_space *entry)
   3826{
   3827	if (btrfs_free_space_trimming_bitmap(entry)) {
   3828		entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
   3829		ctl->discardable_extents[BTRFS_STAT_CURR] -=
   3830			entry->bitmap_extents;
   3831		ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
   3832	}
   3833}
   3834
   3835/*
   3836 * If @async is set, then we will trim 1 region and return.
   3837 */
   3838static int trim_bitmaps(struct btrfs_block_group *block_group,
   3839			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
   3840			u64 maxlen, bool async)
   3841{
   3842	struct btrfs_discard_ctl *discard_ctl =
   3843					&block_group->fs_info->discard_ctl;
   3844	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3845	struct btrfs_free_space *entry;
   3846	int ret = 0;
   3847	int ret2;
   3848	u64 bytes;
   3849	u64 offset = offset_to_bitmap(ctl, start);
   3850	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
   3851
   3852	while (offset < end) {
   3853		bool next_bitmap = false;
   3854		struct btrfs_trim_range trim_entry;
   3855
   3856		mutex_lock(&ctl->cache_writeout_mutex);
   3857		spin_lock(&ctl->tree_lock);
   3858
   3859		if (ctl->free_space < minlen) {
   3860			block_group->discard_cursor =
   3861				btrfs_block_group_end(block_group);
   3862			spin_unlock(&ctl->tree_lock);
   3863			mutex_unlock(&ctl->cache_writeout_mutex);
   3864			break;
   3865		}
   3866
   3867		entry = tree_search_offset(ctl, offset, 1, 0);
   3868		/*
   3869		 * Bitmaps are marked trimmed lossily now to prevent constant
   3870		 * discarding of the same bitmap (the reason why we are bound
   3871		 * by the filters).  So, retrim the block group bitmaps when we
   3872		 * are preparing to punt to the unused_bgs list.  This uses
   3873		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
    3874		 * which is the only discard index that sets minlen to 0.
   3875		 */
   3876		if (!entry || (async && minlen && start == offset &&
   3877			       btrfs_free_space_trimmed(entry))) {
   3878			spin_unlock(&ctl->tree_lock);
   3879			mutex_unlock(&ctl->cache_writeout_mutex);
   3880			next_bitmap = true;
   3881			goto next;
   3882		}
   3883
   3884		/*
    3885		 * Async discard bitmap trimming begins by setting the start
    3886		 * to key.objectid, and offset_to_bitmap() aligns to the
   3887		 * start of the bitmap.  This lets us know we are fully
   3888		 * scanning the bitmap rather than only some portion of it.
   3889		 */
   3890		if (start == offset)
   3891			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
   3892
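        		/*
        		 * search_bitmap() looks for a free region of at least @bytes
        		 * (here minlen) at or after @start inside this bitmap and, on
        		 * success, updates @start and @bytes to the region it found.
        		 */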
   3893		bytes = minlen;
   3894		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
   3895		if (ret2 || start >= end) {
   3896			/*
   3897			 * We lossily consider a bitmap trimmed if we only skip
   3898			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
   3899			 */
   3900			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
   3901				end_trimming_bitmap(ctl, entry);
   3902			else
   3903				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
   3904			spin_unlock(&ctl->tree_lock);
   3905			mutex_unlock(&ctl->cache_writeout_mutex);
   3906			next_bitmap = true;
   3907			goto next;
   3908		}
   3909
   3910		/*
   3911		 * We already trimmed a region, but are using the locking above
   3912		 * to reset the trim_state.
   3913		 */
   3914		if (async && *total_trimmed) {
   3915			spin_unlock(&ctl->tree_lock);
   3916			mutex_unlock(&ctl->cache_writeout_mutex);
   3917			goto out;
   3918		}
   3919
   3920		bytes = min(bytes, end - start);
   3921		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
   3922			spin_unlock(&ctl->tree_lock);
   3923			mutex_unlock(&ctl->cache_writeout_mutex);
   3924			goto next;
   3925		}
   3926
   3927		/*
   3928		 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
   3929		 * If X < @minlen, we won't trim X when we come back around.
   3930		 * So trim it now.  We differ here from trimming extents as we
   3931		 * don't keep individual state per bit.
   3932		 */
   3933		if (async &&
   3934		    max_discard_size &&
   3935		    bytes > (max_discard_size + minlen))
   3936			bytes = max_discard_size;
   3937
   3938		bitmap_clear_bits(ctl, entry, start, bytes, true);
   3939		if (entry->bytes == 0)
   3940			free_bitmap(ctl, entry);
   3941
   3942		spin_unlock(&ctl->tree_lock);
   3943		trim_entry.start = start;
   3944		trim_entry.bytes = bytes;
   3945		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
   3946		mutex_unlock(&ctl->cache_writeout_mutex);
   3947
   3948		ret = do_trimming(block_group, total_trimmed, start, bytes,
   3949				  start, bytes, 0, &trim_entry);
   3950		if (ret) {
   3951			reset_trimming_bitmap(ctl, offset);
   3952			block_group->discard_cursor =
   3953				btrfs_block_group_end(block_group);
   3954			break;
   3955		}
   3956next:
   3957		if (next_bitmap) {
   3958			offset += BITS_PER_BITMAP * ctl->unit;
   3959			start = offset;
   3960		} else {
   3961			start += bytes;
   3962		}
   3963		block_group->discard_cursor = start;
   3964
   3965		if (fatal_signal_pending(current)) {
   3966			if (start != offset)
   3967				reset_trimming_bitmap(ctl, offset);
   3968			ret = -ERESTARTSYS;
   3969			break;
   3970		}
   3971
   3972		cond_resched();
   3973	}
   3974
   3975	if (offset >= end)
   3976		block_group->discard_cursor = end;
   3977
   3978out:
   3979	return ret;
   3980}
   3981
   3982int btrfs_trim_block_group(struct btrfs_block_group *block_group,
   3983			   u64 *trimmed, u64 start, u64 end, u64 minlen)
   3984{
   3985	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
   3986	int ret;
   3987	u64 rem = 0;
   3988
   3989	ASSERT(!btrfs_is_zoned(block_group->fs_info));
   3990
   3991	*trimmed = 0;
   3992
   3993	spin_lock(&block_group->lock);
   3994	if (block_group->removed) {
   3995		spin_unlock(&block_group->lock);
   3996		return 0;
   3997	}
   3998	btrfs_freeze_block_group(block_group);
   3999	spin_unlock(&block_group->lock);
   4000
   4001	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
   4002	if (ret)
   4003		goto out;
   4004
   4005	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
   4006	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
   4007	/* If we ended in the middle of a bitmap, reset the trimming flag */
   4008	if (rem)
   4009		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
   4010out:
   4011	btrfs_unfreeze_block_group(block_group);
   4012	return ret;
   4013}
   4014
   4015int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
   4016				   u64 *trimmed, u64 start, u64 end, u64 minlen,
   4017				   bool async)
   4018{
   4019	int ret;
   4020
   4021	*trimmed = 0;
   4022
   4023	spin_lock(&block_group->lock);
   4024	if (block_group->removed) {
   4025		spin_unlock(&block_group->lock);
   4026		return 0;
   4027	}
   4028	btrfs_freeze_block_group(block_group);
   4029	spin_unlock(&block_group->lock);
   4030
   4031	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
   4032	btrfs_unfreeze_block_group(block_group);
   4033
   4034	return ret;
   4035}
   4036
   4037int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
   4038				   u64 *trimmed, u64 start, u64 end, u64 minlen,
   4039				   u64 maxlen, bool async)
   4040{
   4041	int ret;
   4042
   4043	*trimmed = 0;
   4044
   4045	spin_lock(&block_group->lock);
   4046	if (block_group->removed) {
   4047		spin_unlock(&block_group->lock);
   4048		return 0;
   4049	}
   4050	btrfs_freeze_block_group(block_group);
   4051	spin_unlock(&block_group->lock);
   4052
   4053	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
   4054			   async);
   4055
   4056	btrfs_unfreeze_block_group(block_group);
   4057
   4058	return ret;
   4059}
   4060
   4061bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
   4062{
   4063	return btrfs_super_cache_generation(fs_info->super_copy);
   4064}
   4065
   4066static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
   4067				       struct btrfs_trans_handle *trans)
   4068{
   4069	struct btrfs_block_group *block_group;
   4070	struct rb_node *node;
   4071	int ret = 0;
   4072
   4073	btrfs_info(fs_info, "cleaning free space cache v1");
   4074
   4075	node = rb_first_cached(&fs_info->block_group_cache_tree);
   4076	while (node) {
   4077		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
   4078		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
   4079		if (ret)
   4080			goto out;
   4081		node = rb_next(node);
   4082	}
   4083out:
   4084	return ret;
   4085}
   4086
   4087int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
   4088{
   4089	struct btrfs_trans_handle *trans;
   4090	int ret;
   4091
   4092	/*
   4093	 * update_super_roots will appropriately set or unset
   4094	 * super_copy->cache_generation based on SPACE_CACHE and
   4095	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
   4096	 * transaction commit whether we are enabling space cache v1 and don't
   4097	 * have any other work to do, or are disabling it and removing free
   4098	 * space inodes.
   4099	 */
   4100	trans = btrfs_start_transaction(fs_info->tree_root, 0);
   4101	if (IS_ERR(trans))
   4102		return PTR_ERR(trans);
   4103
   4104	if (!active) {
   4105		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
   4106		ret = cleanup_free_space_cache_v1(fs_info, trans);
   4107		if (ret) {
   4108			btrfs_abort_transaction(trans, ret);
   4109			btrfs_end_transaction(trans);
   4110			goto out;
   4111		}
   4112	}
   4113
   4114	ret = btrfs_commit_transaction(trans);
   4115out:
   4116	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
   4117
   4118	return ret;
   4119}
   4120
   4121#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
   4122/*
    4123 * Use this if you need to make a bitmap or extent entry specifically; it
    4124 * doesn't do any of the merging that add_free_space does.  This acts a lot like
    4125 * how the free space cache loading works, so you can get really weird
   4126 * configurations.
   4127 */
   4128int test_add_free_space_entry(struct btrfs_block_group *cache,
   4129			      u64 offset, u64 bytes, bool bitmap)
   4130{
   4131	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
   4132	struct btrfs_free_space *info = NULL, *bitmap_info;
   4133	void *map = NULL;
   4134	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
   4135	u64 bytes_added;
   4136	int ret;
   4137
   4138again:
   4139	if (!info) {
   4140		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
   4141		if (!info)
   4142			return -ENOMEM;
   4143	}
   4144
   4145	if (!bitmap) {
   4146		spin_lock(&ctl->tree_lock);
   4147		info->offset = offset;
   4148		info->bytes = bytes;
   4149		info->max_extent_size = 0;
   4150		ret = link_free_space(ctl, info);
   4151		spin_unlock(&ctl->tree_lock);
   4152		if (ret)
   4153			kmem_cache_free(btrfs_free_space_cachep, info);
   4154		return ret;
   4155	}
   4156
   4157	if (!map) {
   4158		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
   4159		if (!map) {
   4160			kmem_cache_free(btrfs_free_space_cachep, info);
   4161			return -ENOMEM;
   4162		}
   4163	}
   4164
   4165	spin_lock(&ctl->tree_lock);
   4166	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
   4167					 1, 0);
   4168	if (!bitmap_info) {
   4169		info->bitmap = map;
   4170		map = NULL;
   4171		add_new_bitmap(ctl, info, offset);
   4172		bitmap_info = info;
   4173		info = NULL;
   4174	}
   4175
   4176	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
   4177					  trim_state);
   4178
   4179	bytes -= bytes_added;
   4180	offset += bytes_added;
   4181	spin_unlock(&ctl->tree_lock);
   4182
   4183	if (bytes)
   4184		goto again;
   4185
   4186	if (info)
   4187		kmem_cache_free(btrfs_free_space_cachep, info);
   4188	if (map)
   4189		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
   4190	return 0;
   4191}
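
        /*
         * A hypothetical usage sketch (in the spirit of the btrfs free space
         * self-tests): add an extent entry and a bitmap entry, then probe for
         * overlapping free space:
         *
         *	ret = test_add_free_space_entry(cache, 0, SZ_4M, 0);
         *	ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
         *	if (test_check_exists(cache, SZ_4M, SZ_4K))
         *		... free space present at SZ_4M ...
         */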
   4192
   4193/*
   4194 * Checks to see if the given range is in the free space cache.  This is really
   4195 * just used to check the absence of space, so if there is free space in the
   4196 * range at all we will return 1.
   4197 */
   4198int test_check_exists(struct btrfs_block_group *cache,
   4199		      u64 offset, u64 bytes)
   4200{
   4201	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
   4202	struct btrfs_free_space *info;
   4203	int ret = 0;
   4204
   4205	spin_lock(&ctl->tree_lock);
   4206	info = tree_search_offset(ctl, offset, 0, 0);
   4207	if (!info) {
   4208		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
   4209					  1, 0);
   4210		if (!info)
   4211			goto out;
   4212	}
   4213
   4214have_info:
   4215	if (info->bitmap) {
   4216		u64 bit_off, bit_bytes;
   4217		struct rb_node *n;
   4218		struct btrfs_free_space *tmp;
   4219
   4220		bit_off = offset;
   4221		bit_bytes = ctl->unit;
   4222		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
   4223		if (!ret) {
   4224			if (bit_off == offset) {
   4225				ret = 1;
   4226				goto out;
   4227			} else if (bit_off > offset &&
   4228				   offset + bytes > bit_off) {
   4229				ret = 1;
   4230				goto out;
   4231			}
   4232		}
   4233
   4234		n = rb_prev(&info->offset_index);
   4235		while (n) {
   4236			tmp = rb_entry(n, struct btrfs_free_space,
   4237				       offset_index);
   4238			if (tmp->offset + tmp->bytes < offset)
   4239				break;
   4240			if (offset + bytes < tmp->offset) {
   4241				n = rb_prev(&tmp->offset_index);
   4242				continue;
   4243			}
   4244			info = tmp;
   4245			goto have_info;
   4246		}
   4247
   4248		n = rb_next(&info->offset_index);
   4249		while (n) {
   4250			tmp = rb_entry(n, struct btrfs_free_space,
   4251				       offset_index);
   4252			if (offset + bytes < tmp->offset)
   4253				break;
   4254			if (tmp->offset + tmp->bytes < offset) {
   4255				n = rb_next(&tmp->offset_index);
   4256				continue;
   4257			}
   4258			info = tmp;
   4259			goto have_info;
   4260		}
   4261
   4262		ret = 0;
   4263		goto out;
   4264	}
   4265
   4266	if (info->offset == offset) {
   4267		ret = 1;
   4268		goto out;
   4269	}
   4270
   4271	if (offset > info->offset && offset < info->offset + info->bytes)
   4272		ret = 1;
   4273out:
   4274	spin_unlock(&ctl->tree_lock);
   4275	return ret;
   4276}
   4277#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */