cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

xfs_ag_resv.c (12134B)


// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"

/*
 * Per-AG Block Reservations
 *
 * For some kinds of allocation group metadata structures, it is advantageous
 * to reserve a small number of blocks in each AG so that future expansions of
 * that data structure do not encounter ENOSPC; an allocation error in the
 * middle of a btree split would cause the filesystem to go offline.
 *
 * Prior to the introduction of reflink, this wasn't an issue because the free
 * space btrees maintain a reserve of space (the AGFL) to handle any expansion
 * that may be necessary; and allocations of other metadata (inodes, BMBT,
 * dir/attr) aren't restricted to a single AG.  However, with reflink it is
 * possible to allocate all the space in an AG, have subsequent reflink/CoW
 * activity expand the refcount btree, and discover that there's no space left
 * to handle that expansion.  Since we can calculate the maximum size of the
 * refcount btree, we can reserve space for it and avoid ENOSPC.
 *
 * Handling per-AG reservations consists of four changes to the allocator's
 * behavior:  First, because these reservations are always needed, we decrease
 * the ag_max_usable counter to reflect the size of the AG after the reserved
 * blocks are taken.  Second, the reservations must be reflected in the
 * fdblocks count to maintain proper accounting.  Third, each AG must maintain
 * its own reserved block counter so that we can calculate the amount of space
 * that must remain free to maintain the reservations.  Fourth, the "remaining
 * reserved blocks" count must be used when calculating the length of the
 * longest free extent in an AG and to clamp maxlen in the per-AG allocation
 * functions.  In other words, we maintain a virtual allocation via in-core
 * accounting tricks so that we don't have to clean up after a crash. :)
 *
 * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
 * values via struct xfs_alloc_arg or directly to the xfs_free_extent
 * function.  It might seem a little funny to maintain a reservoir of blocks
 * to feed another reservoir, but the AGFL only holds enough blocks to get
 * through the next transaction.  The per-AG reservation is to ensure (we
 * hope) that each AG never runs out of blocks.  Each data structure wanting
 * to use the reservation system should update ask/used in xfs_ag_resv_init.
 */
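
/*
 * Illustrative sketch: a metadata btree caller might route its allocations
 * and frees through one of these reservations simply by tagging the request
 * with the reservation type.  The setup below is abbreviated and
 * hypothetical (target fsbno, lengths, oinfo and error handling are
 * omitted); only the reservation-type plumbing is the point.
 *
 *	struct xfs_alloc_arg	args = {
 *		.tp	= tp,
 *		.mp	= mp,
 *		.minlen	= 1,
 *		.maxlen	= 1,
 *		.prod	= 1,
 *		.resv	= XFS_AG_RESV_METADATA,
 *	};
 *	int			error;
 *
 *	error = xfs_alloc_vextent(&args);
 *	...
 *	error = xfs_free_extent(tp, args.fsbno, args.len, &oinfo,
 *			XFS_AG_RESV_METADATA);
 */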

/*
 * Are we critically low on blocks?  For now we'll define that as the number
 * of blocks we can get our hands on being less than 10% of what we reserved
 * or less than some arbitrary number (maximum btree height).
 */
bool
xfs_ag_resv_critical(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	xfs_extlen_t			avail;
	xfs_extlen_t			orig;

	switch (type) {
	case XFS_AG_RESV_METADATA:
		avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
		orig = pag->pag_meta_resv.ar_asked;
		break;
	case XFS_AG_RESV_RMAPBT:
		avail = pag->pagf_freeblks + pag->pagf_flcount -
			pag->pag_meta_resv.ar_reserved;
		orig = pag->pag_rmapbt_resv.ar_asked;
		break;
	default:
		ASSERT(0);
		return false;
	}

	trace_xfs_ag_resv_critical(pag, type, avail);

	/* Critically low if less than 10% or max btree height remains. */
	return XFS_TEST_ERROR(avail < orig / 10 ||
			      avail < pag->pag_mount->m_agbtree_maxlevels,
			pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
}
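
/*
 * Worked example with made-up numbers: if the metadata reservation asked
 * for 1000 blocks (ar_asked = 1000), the AGF shows 130 free blocks, the
 * rmapbt reservation still holds 50 of them, and m_agbtree_maxlevels is 9,
 * then avail = 130 - 50 = 80.  Since 80 < 1000 / 10, the AG is reported as
 * critically low even though avail is well above the btree-height floor.
 */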

/*
 * How many blocks are reserved but not used, and therefore must not be
 * allocated away?
 */
xfs_extlen_t
xfs_ag_resv_needed(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	xfs_extlen_t			len;

	len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
	switch (type) {
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		len -= xfs_perag_resv(pag, type)->ar_reserved;
		break;
	case XFS_AG_RESV_NONE:
		/* empty */
		break;
	default:
		ASSERT(0);
	}

	trace_xfs_ag_resv_needed(pag, type, len);

	return len;
}

/* Clean out a reservation */
static int
__xfs_ag_resv_free(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	struct xfs_ag_resv		*resv;
	xfs_extlen_t			oldresv;
	int				error;

	trace_xfs_ag_resv_free(pag, type, 0);

	resv = xfs_perag_resv(pag, type);
	if (pag->pag_agno == 0)
		pag->pag_mount->m_ag_max_usable += resv->ar_asked;
	/*
	 * RMAPBT blocks come from the AGFL and AGFL blocks are always
	 * considered "free", so whatever was reserved at mount time must be
	 * given back at umount.
	 */
	if (type == XFS_AG_RESV_RMAPBT)
		oldresv = resv->ar_orig_reserved;
	else
		oldresv = resv->ar_reserved;
	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
	resv->ar_reserved = 0;
	resv->ar_asked = 0;
	resv->ar_orig_reserved = 0;

	if (error)
		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
				error, _RET_IP_);
	return error;
}

/* Free a per-AG reservation. */
int
xfs_ag_resv_free(
	struct xfs_perag		*pag)
{
	int				error;
	int				err2;

	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
	if (err2 && !error)
		error = err2;
	return error;
}

static int
__xfs_ag_resv_init(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	xfs_extlen_t			ask,
	xfs_extlen_t			used)
{
	struct xfs_mount		*mp = pag->pag_mount;
	struct xfs_ag_resv		*resv;
	int				error;
	xfs_extlen_t			hidden_space;

	if (used > ask)
		ask = used;

	switch (type) {
	case XFS_AG_RESV_RMAPBT:
		/*
		 * Space taken by the rmapbt is not subtracted from fdblocks
		 * because the rmapbt lives in the free space.  Here we must
		 * subtract the entire reservation from fdblocks so that we
		 * always have blocks available for rmapbt expansion.
		 */
		hidden_space = ask;
		break;
	case XFS_AG_RESV_METADATA:
		/*
		 * Space taken by all other metadata btrees is accounted
		 * on-disk as used space.  We therefore only hide the space
		 * that is reserved but not used by the trees.
		 */
		hidden_space = ask - used;
		break;
	default:
		ASSERT(0);
		return -EINVAL;
	}

	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
		error = -ENOSPC;
	else
		error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
	if (error) {
		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
				error, _RET_IP_);
		xfs_warn(mp,
"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
				pag->pag_agno);
		return error;
	}

	/*
	 * Reduce the maximum per-AG allocation length by however much we're
	 * trying to reserve for an AG.  Since this is a filesystem-wide
	 * counter, we only make the adjustment for AG 0.  This assumes that
	 * there aren't any AGs hungrier for per-AG reservation than AG 0.
	 */
	if (pag->pag_agno == 0)
		mp->m_ag_max_usable -= ask;

	resv = xfs_perag_resv(pag, type);
	resv->ar_asked = ask;
	resv->ar_orig_reserved = hidden_space;
	resv->ar_reserved = ask - used;

	trace_xfs_ag_resv_init(pag, type, ask);
	return 0;
}
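
/*
 * Worked example with made-up numbers: a metadata reservation that asks
 * for 100 blocks of which 40 are already in use ends up with
 * hidden_space = 100 - 40 = 60, so 60 blocks are removed from fdblocks,
 * ar_orig_reserved = 60, and ar_reserved = 60.  An rmapbt reservation
 * with ask = 50 and used = 10 hides the full 50 blocks (because rmapbt
 * blocks live in the free space), records ar_orig_reserved = 50, and
 * starts with ar_reserved = 40.
 */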

/* Create a per-AG block reservation. */
int
xfs_ag_resv_init(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp)
{
	struct xfs_mount		*mp = pag->pag_mount;
	xfs_extlen_t			ask;
	xfs_extlen_t			used;
	int				error = 0, error2;
	bool				has_resv = false;

	/* Create the metadata reservation. */
	if (pag->pag_meta_resv.ar_asked == 0) {
		ask = used = 0;

		error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
		if (error)
			goto out;

		error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
		if (error)
			goto out;

		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
				ask, used);
		if (error) {
			/*
			 * Because we didn't have per-AG reservations when the
			 * finobt feature was added we might not be able to
			 * reserve all needed blocks.  Warn and fall back to the
			 * old and potentially buggy code in that case, but
			 * ensure we do have the reservation for the refcountbt.
			 */
			ask = used = 0;

			mp->m_finobt_nores = true;

			error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
					&used);
			if (error)
				goto out;

			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
					ask, used);
			if (error)
				goto out;
		}
		if (ask)
			has_resv = true;
	}

	/* Create the RMAPBT metadata reservation */
	if (pag->pag_rmapbt_resv.ar_asked == 0) {
		ask = used = 0;

		error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
		if (error)
			goto out;

		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
		if (error)
			goto out;
		if (ask)
			has_resv = true;
	}

out:
	/*
	 * Initialize the pagf if we have at least one active reservation on the
	 * AG. This may have occurred already via reservation calculation, but
	 * fall back to an explicit init to ensure the in-core allocbt usage
	 * counters are initialized as soon as possible. This is important
	 * because filesystems with large perag reservations are susceptible to
	 * free space reservation problems that the allocbt counter is used to
	 * address.
	 */
	if (has_resv) {
		error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
		if (error2)
			return error2;

		/*
		 * If there isn't enough space in the AG to satisfy the
		 * reservation, let the caller know that there wasn't enough
		 * space.  Callers are responsible for deciding what to do
		 * next, since (in theory) we can stumble along with
		 * insufficient reservation if data blocks are being freed to
		 * replenish the AG's free space.
		 */
		if (!error &&
		    xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
		    xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
		    pag->pagf_freeblks + pag->pagf_flcount)
			error = -ENOSPC;
	}

	return error;
}
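
/*
 * Sketch of a mount-time caller, loosely modeled on the in-tree
 * xfs_fs_reserve_ag_blocks() and hypothetical in its details: walk every
 * AG, set up its reservations, and remember the first error while still
 * giving the remaining AGs a chance to reserve.
 *
 *	struct xfs_perag	*pag;
 *	xfs_agnumber_t		agno;
 *	int			error = 0, err2;
 *
 *	for_each_perag(mp, agno, pag) {
 *		err2 = xfs_ag_resv_init(pag, NULL);
 *		if (err2 && !error)
 *			error = err2;
 *	}
 */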

/* Allocate a block from the reservation. */
void
xfs_ag_resv_alloc_extent(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	struct xfs_alloc_arg		*args)
{
	struct xfs_ag_resv		*resv;
	xfs_extlen_t			len;
	uint				field;

	trace_xfs_ag_resv_alloc_extent(pag, type, args->len);

	switch (type) {
	case XFS_AG_RESV_AGFL:
		return;
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		resv = xfs_perag_resv(pag, type);
		break;
	default:
		ASSERT(0);
		fallthrough;
	case XFS_AG_RESV_NONE:
		field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
				       XFS_TRANS_SB_FDBLOCKS;
		xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
		return;
	}

	len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
	resv->ar_reserved -= len;
	if (type == XFS_AG_RESV_RMAPBT)
		return;
	/* Allocations of reserved blocks only need on-disk sb updates... */
	xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
	/* ...but non-reserved blocks need in-core and on-disk updates. */
	if (args->len > len)
		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
				-((int64_t)args->len - len));
}
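
/*
 * Worked example with made-up numbers: a metadata allocation of
 * args->len = 8 blocks while ar_reserved = 5 takes the first 5 blocks out
 * of the reservation (an on-disk XFS_TRANS_SB_RES_FDBLOCKS delta of -5)
 * and accounts the remaining 3 blocks as ordinary free-space consumption
 * via an XFS_TRANS_SB_FDBLOCKS delta of -3, which also adjusts the
 * in-core counter.
 */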

/* Free a block to the reservation. */
void
xfs_ag_resv_free_extent(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	struct xfs_trans		*tp,
	xfs_extlen_t			len)
{
	xfs_extlen_t			leftover;
	struct xfs_ag_resv		*resv;

	trace_xfs_ag_resv_free_extent(pag, type, len);

	switch (type) {
	case XFS_AG_RESV_AGFL:
		return;
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		resv = xfs_perag_resv(pag, type);
		break;
	default:
		ASSERT(0);
		fallthrough;
	case XFS_AG_RESV_NONE:
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
		return;
	}

	leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
	resv->ar_reserved += leftover;
	if (type == XFS_AG_RESV_RMAPBT)
		return;
	/* Freeing into the reserved pool only requires on-disk update... */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
	/* ...but freeing beyond that requires in-core and on-disk update. */
	if (len > leftover)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
}
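
/*
 * Worked example with made-up numbers: freeing len = 8 blocks back to a
 * metadata reservation with ar_asked = 100 and ar_reserved = 97 can only
 * absorb leftover = 3 blocks into the reservation.  The transaction logs
 * an on-disk XFS_TRANS_SB_RES_FDBLOCKS delta of +8 and, because 8 > 3, an
 * additional XFS_TRANS_SB_FDBLOCKS delta of +5 for the blocks that spill
 * past the reservation and return to ordinary free space.
 */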