cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

delalloc-space.c (17361B)


      1// SPDX-License-Identifier: GPL-2.0
      2
      3#include "ctree.h"
      4#include "delalloc-space.h"
      5#include "block-rsv.h"
      6#include "btrfs_inode.h"
      7#include "space-info.h"
      8#include "transaction.h"
      9#include "qgroup.h"
     10#include "block-group.h"
     11
     12/*
     13 * HOW DOES THIS WORK
     14 *
     15 * There are two stages to data reservations, one for data and one for metadata
     16 * to handle the new extents and checksums generated by writing data.
     17 *
     18 *
     19 * DATA RESERVATION
     20 *   The general flow of the data reservation is as follows
     21 *
     22 *   -> Reserve
     23 *     We call into btrfs_reserve_data_bytes() for the user request bytes that
     24 *     they wish to write.  We make this reservation and add it to
     25 *     space_info->bytes_may_use.  We set EXTENT_DELALLOC on the inode io_tree
     26 *     for the range and carry on if this is buffered, or follow up trying to
     27 *     make a real allocation if we are pre-allocating or doing O_DIRECT.
     28 *
     29 *   -> Use
     30 *     At writepages()/prealloc/O_DIRECT time we will call into
     31 *     btrfs_reserve_extent() for some part or all of this range of bytes.  We
     32 *     will make the allocation and subtract space_info->bytes_may_use by the
     33 *     original requested length and increase the space_info->bytes_reserved by
     34 *     the allocated length.  This distinction is important because compression
     35 *     may allocate a smaller on disk extent than we previously reserved.
     36 *
     37 *   -> Allocation
     38 *     finish_ordered_io() will insert the new file extent item for this range,
     39 *     and then add a delayed ref update for the extent tree.  Once that delayed
     40 *     ref is written the extent size is subtracted from
     41 *     space_info->bytes_reserved and added to space_info->bytes_used.
     42 *
     43 *   Error handling
     44 *
     45 *   -> By the reservation maker
     46 *     This is the simplest case, we haven't completed our operation and we know
     47 *     how much we reserved, we can simply call
     48 *     btrfs_free_reserved_data_space*() and it will be removed from
     49 *     space_info->bytes_may_use.
     50 *
     51 *   -> After the reservation has been made, but before cow_file_range()
     52 *     This is specifically for the delalloc case.  You must clear
     53 *     EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
     54 *     be subtracted from space_info->bytes_may_use.
     55 *
     56 * METADATA RESERVATION
     57 *   The general metadata reservation lifetimes are discussed elsewhere, this
     58 *   will just focus on how it is used for delalloc space.
     59 *
     60 *   We keep track of two things on a per-inode basis
     61 *
     62 *   ->outstanding_extents
     63 *     This is the number of file extent items we'll need to handle all of the
     64 *     outstanding DELALLOC space we have in this inode.  We limit the maximum
     65 *     size of an extent, so a large contiguous dirty area may require more than
     66 *     one outstanding_extent, which is why count_max_extents() is used to
     67 *     determine how many outstanding_extents get added.
     68 *
     69 *   ->csum_bytes
     70 *     This is essentially how many dirty bytes we have for this inode, so we
     71 *     can calculate the number of checksum items we would have to add in order
     72 *     to checksum our outstanding data.
     73 *
     74 *   We keep a per-inode block_rsv in order to make it easier to keep track of
     75 *   our reservation.  We use btrfs_calculate_inode_block_rsv_size() to
     76 *   calculate the current theoretical maximum reservation we would need for the
     77 *   metadata for this inode.  We call this and then adjust our reservation as
     78 *   necessary, either by attempting to reserve more space, or freeing up excess
     79 *   space.
     80 *
     81 * OUTSTANDING_EXTENTS HANDLING
     82 *
     83 *  ->outstanding_extents is used for keeping track of how many extents we will
     84 *  need to use for this inode, and it will fluctuate depending on where you are
     85 *  in the life cycle of the dirty data.  Consider the following normal case for
     86 *  a completely clean inode, with a num_bytes < our maximum allowed extent size
     87 *
     88 *  -> reserve
     89 *    ->outstanding_extents += 1 (current value is 1)
     90 *
     91 *  -> set_delalloc
     92 *    ->outstanding_extents += 1 (current value is 2)
     93 *
     94 *  -> btrfs_delalloc_release_extents()
     95 *    ->outstanding_extents -= 1 (current value is 1)
     96 *
     97 *    We must call this once we are done, as we hold our reservation for the
     98 *    duration of our operation, and then assume set_delalloc will update the
     99 *    counter appropriately.
    100 *
    101 *  -> add ordered extent
    102 *    ->outstanding_extents += 1 (current value is 2)
    103 *
    104 *  -> btrfs_clear_delalloc_extent
    105 *    ->outstanding_extents -= 1 (current value is 1)
    106 *
    107 *  -> finish_ordered_io/btrfs_remove_ordered_extent
    108 *    ->outstanding_extents -= 1 (current value is 0)
    109 *
    110 *  Each stage is responsible for their own accounting of the extent, thus
    111 *  making error handling and cleanup easier.
    112 */
    113
    114int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
    115{
    116	struct btrfs_root *root = inode->root;
    117	struct btrfs_fs_info *fs_info = root->fs_info;
    118	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
    119
    120	/* Make sure bytes are sectorsize aligned */
    121	bytes = ALIGN(bytes, fs_info->sectorsize);
    122
    123	if (btrfs_is_free_space_inode(inode))
    124		flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
    125
    126	return btrfs_reserve_data_bytes(fs_info, bytes, flush);
    127}
    128
    129int btrfs_check_data_free_space(struct btrfs_inode *inode,
    130			struct extent_changeset **reserved, u64 start, u64 len)
    131{
    132	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    133	int ret;
    134
    135	/* align the range */
    136	len = round_up(start + len, fs_info->sectorsize) -
    137	      round_down(start, fs_info->sectorsize);
    138	start = round_down(start, fs_info->sectorsize);
    139
    140	ret = btrfs_alloc_data_chunk_ondemand(inode, len);
    141	if (ret < 0)
    142		return ret;
    143
    144	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
    145	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
    146	if (ret < 0) {
    147		btrfs_free_reserved_data_space_noquota(fs_info, len);
    148		extent_changeset_free(*reserved);
    149		*reserved = NULL;
    150	} else {
    151		ret = 0;
    152	}
    153	return ret;
    154}
    155
    156/*
    157 * Called if we need to clear a data reservation for this inode
    158 * Normally in a error case.
    159 *
    160 * This one will *NOT* use accurate qgroup reserved space API, just for case
    161 * which we can't sleep and is sure it won't affect qgroup reserved space.
    162 * Like clear_bit_hook().
    163 */
    164void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
    165					    u64 len)
    166{
    167	struct btrfs_space_info *data_sinfo;
    168
    169	ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
    170
    171	data_sinfo = fs_info->data_sinfo;
    172	btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
    173}
    174
    175/*
    176 * Called if we need to clear a data reservation for this inode
    177 * Normally in a error case.
    178 *
    179 * This one will handle the per-inode data rsv map for accurate reserved
    180 * space framework.
    181 */
    182void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
    183			struct extent_changeset *reserved, u64 start, u64 len)
    184{
    185	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    186
    187	/* Make sure the range is aligned to sectorsize */
    188	len = round_up(start + len, fs_info->sectorsize) -
    189	      round_down(start, fs_info->sectorsize);
    190	start = round_down(start, fs_info->sectorsize);
    191
    192	btrfs_free_reserved_data_space_noquota(fs_info, len);
    193	btrfs_qgroup_free_data(inode, reserved, start, len);
    194}
    195
    196/**
    197 * Release any excessive reservation
    198 *
    199 * @inode:       the inode we need to release from
    200 * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
    201 *               meta reservation needs to know if we are freeing qgroup
    202 *               reservation or just converting it into per-trans.  Normally
    203 *               @qgroup_free is true for error handling, and false for normal
    204 *               release.
    205 *
    206 * This is the same as btrfs_block_rsv_release, except that it handles the
    207 * tracepoint for the reservation.
    208 */
    209static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
    210{
    211	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    212	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
    213	u64 released = 0;
    214	u64 qgroup_to_release = 0;
    215
    216	/*
    217	 * Since we statically set the block_rsv->size we just want to say we
    218	 * are releasing 0 bytes, and then we'll just get the reservation over
    219	 * the size free'd.
    220	 */
    221	released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
    222					   &qgroup_to_release);
    223	if (released > 0)
    224		trace_btrfs_space_reservation(fs_info, "delalloc",
    225					      btrfs_ino(inode), released, 0);
    226	if (qgroup_free)
    227		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
    228	else
    229		btrfs_qgroup_convert_reserved_meta(inode->root,
    230						   qgroup_to_release);
    231}
    232
    233static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
    234						 struct btrfs_inode *inode)
    235{
    236	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
    237	u64 reserve_size = 0;
    238	u64 qgroup_rsv_size = 0;
    239	u64 csum_leaves;
    240	unsigned outstanding_extents;
    241
    242	lockdep_assert_held(&inode->lock);
    243	outstanding_extents = inode->outstanding_extents;
    244
    245	/*
    246	 * Insert size for the number of outstanding extents, 1 normal size for
    247	 * updating the inode.
    248	 */
    249	if (outstanding_extents) {
    250		reserve_size = btrfs_calc_insert_metadata_size(fs_info,
    251						outstanding_extents);
    252		reserve_size += btrfs_calc_metadata_size(fs_info, 1);
    253	}
    254	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
    255						 inode->csum_bytes);
    256	reserve_size += btrfs_calc_insert_metadata_size(fs_info,
    257							csum_leaves);
    258	/*
    259	 * For qgroup rsv, the calculation is very simple:
    260	 * account one nodesize for each outstanding extent
    261	 *
    262	 * This is overestimating in most cases.
    263	 */
    264	qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
    265
    266	spin_lock(&block_rsv->lock);
    267	block_rsv->size = reserve_size;
    268	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
    269	spin_unlock(&block_rsv->lock);
    270}
    271
    272static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
    273				    u64 num_bytes, u64 disk_num_bytes,
    274				    u64 *meta_reserve, u64 *qgroup_reserve)
    275{
    276	u64 nr_extents = count_max_extents(num_bytes);
    277	u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
    278	u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
    279
    280	*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
    281						nr_extents + csum_leaves);
    282
    283	/*
    284	 * finish_ordered_io has to update the inode, so add the space required
    285	 * for an inode update.
    286	 */
    287	*meta_reserve += inode_update;
    288	*qgroup_reserve = nr_extents * fs_info->nodesize;
    289}
    290
    291int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
    292				    u64 disk_num_bytes, bool noflush)
    293{
    294	struct btrfs_root *root = inode->root;
    295	struct btrfs_fs_info *fs_info = root->fs_info;
    296	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
    297	u64 meta_reserve, qgroup_reserve;
    298	unsigned nr_extents;
    299	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
    300	int ret = 0;
    301
    302	/*
    303	 * If we are a free space inode we need to not flush since we will be in
    304	 * the middle of a transaction commit.  We also don't need the delalloc
    305	 * mutex since we won't race with anybody.  We need this mostly to make
    306	 * lockdep shut its filthy mouth.
    307	 *
    308	 * If we have a transaction open (can happen if we call truncate_block
    309	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
    310	 */
    311	if (noflush || btrfs_is_free_space_inode(inode)) {
    312		flush = BTRFS_RESERVE_NO_FLUSH;
    313	} else {
    314		if (current->journal_info)
    315			flush = BTRFS_RESERVE_FLUSH_LIMIT;
    316
    317		if (btrfs_transaction_in_commit(fs_info))
    318			schedule_timeout(1);
    319	}
    320
    321	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
    322	disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
    323
    324	/*
    325	 * We always want to do it this way, every other way is wrong and ends
    326	 * in tears.  Pre-reserving the amount we are going to add will always
    327	 * be the right way, because otherwise if we have enough parallelism we
    328	 * could end up with thousands of inodes all holding little bits of
    329	 * reservations they were able to make previously and the only way to
    330	 * reclaim that space is to ENOSPC out the operations and clear
    331	 * everything out and try again, which is bad.  This way we just
    332	 * over-reserve slightly, and clean up the mess when we are done.
    333	 */
    334	calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
    335				&meta_reserve, &qgroup_reserve);
    336	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
    337						 noflush);
    338	if (ret)
    339		return ret;
    340	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
    341	if (ret) {
    342		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
    343		return ret;
    344	}
    345
    346	/*
    347	 * Now we need to update our outstanding extents and csum bytes _first_
    348	 * and then add the reservation to the block_rsv.  This keeps us from
    349	 * racing with an ordered completion or some such that would think it
    350	 * needs to free the reservation we just made.
    351	 */
    352	spin_lock(&inode->lock);
    353	nr_extents = count_max_extents(num_bytes);
    354	btrfs_mod_outstanding_extents(inode, nr_extents);
    355	inode->csum_bytes += disk_num_bytes;
    356	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
    357	spin_unlock(&inode->lock);
    358
    359	/* Now we can safely add our space to our block rsv */
    360	btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
    361	trace_btrfs_space_reservation(root->fs_info, "delalloc",
    362				      btrfs_ino(inode), meta_reserve, 1);
    363
    364	spin_lock(&block_rsv->lock);
    365	block_rsv->qgroup_rsv_reserved += qgroup_reserve;
    366	spin_unlock(&block_rsv->lock);
    367
    368	return 0;
    369}
    370
    371/**
    372 * Release a metadata reservation for an inode
    373 *
    374 * @inode: the inode to release the reservation for.
    375 * @num_bytes: the number of bytes we are releasing.
    376 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
    377 *
    378 * This will release the metadata reservation for an inode.  This can be called
    379 * once we complete IO for a given set of bytes to release their metadata
    380 * reservations, or on error for the same reason.
    381 */
    382void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
    383				     bool qgroup_free)
    384{
    385	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    386
    387	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
    388	spin_lock(&inode->lock);
    389	inode->csum_bytes -= num_bytes;
    390	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
    391	spin_unlock(&inode->lock);
    392
    393	if (btrfs_is_testing(fs_info))
    394		return;
    395
    396	btrfs_inode_rsv_release(inode, qgroup_free);
    397}
    398
    399/**
    400 * btrfs_delalloc_release_extents - release our outstanding_extents
    401 * @inode: the inode to balance the reservation for.
    402 * @num_bytes: the number of bytes we originally reserved with
    403 *
    404 * When we reserve space we increase outstanding_extents for the extents we may
    405 * add.  Once we've set the range as delalloc or created our ordered extents we
    406 * have outstanding_extents to track the real usage, so we use this to free our
    407 * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
    408 * with btrfs_delalloc_reserve_metadata.
    409 */
    410void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
    411{
    412	struct btrfs_fs_info *fs_info = inode->root->fs_info;
    413	unsigned num_extents;
    414
    415	spin_lock(&inode->lock);
    416	num_extents = count_max_extents(num_bytes);
    417	btrfs_mod_outstanding_extents(inode, -num_extents);
    418	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
    419	spin_unlock(&inode->lock);
    420
    421	if (btrfs_is_testing(fs_info))
    422		return;
    423
    424	btrfs_inode_rsv_release(inode, true);
    425}
    426
    427/**
    428 * btrfs_delalloc_reserve_space - reserve data and metadata space for
    429 * delalloc
    430 * @inode: inode we're writing to
    431 * @start: start range we are writing to
    432 * @len: how long the range we are writing to
    433 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
    434 * 	      current reservation.
    435 *
    436 * This will do the following things
    437 *
    438 * - reserve space in data space info for num bytes
    439 *   and reserve precious corresponding qgroup space
    440 *   (Done in check_data_free_space)
    441 *
    442 * - reserve space for metadata space, based on the number of outstanding
    443 *   extents and how much csums will be needed
    444 *   also reserve metadata space in a per root over-reserve method.
    445 * - add to the inodes->delalloc_bytes
    446 * - add it to the fs_info's delalloc inodes list.
    447 *   (Above 3 all done in delalloc_reserve_metadata)
    448 *
    449 * Return 0 for success
    450 * Return <0 for error(-ENOSPC or -EQUOT)
    451 */
    452int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
    453			struct extent_changeset **reserved, u64 start, u64 len)
    454{
    455	int ret;
    456
    457	ret = btrfs_check_data_free_space(inode, reserved, start, len);
    458	if (ret < 0)
    459		return ret;
    460	ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
    461	if (ret < 0) {
    462		btrfs_free_reserved_data_space(inode, *reserved, start, len);
    463		extent_changeset_free(*reserved);
    464		*reserved = NULL;
    465	}
    466	return ret;
    467}
    468
    469/**
    470 * Release data and metadata space for delalloc
    471 *
    472 * @inode:       inode we're releasing space for
    473 * @reserved:    list of changed/reserved ranges
    474 * @start:       start position of the space already reserved
    475 * @len:         length of the space already reserved
    476 * @qgroup_free: should qgroup reserved-space also be freed
    477 *
    478 * This function will release the metadata space that was not used and will
    479 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
    480 * list if there are no delalloc bytes left.
    481 * Also it will handle the qgroup reserved space.
    482 */
    483void btrfs_delalloc_release_space(struct btrfs_inode *inode,
    484				  struct extent_changeset *reserved,
    485				  u64 start, u64 len, bool qgroup_free)
    486{
    487	btrfs_delalloc_release_metadata(inode, len, qgroup_free);
    488	btrfs_free_reserved_data_space(inode, reserved, start, len);
    489}