cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

fscounters.c (10769B)


// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2019 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"

/*
 * FS Summary Counters
 * ===================
 *
 * The basics of filesystem summary counter checking are that we iterate the
 * AGs counting the number of free blocks, free space btree blocks, per-AG
 * reservations, inodes, delayed allocation reservations, and free inodes.
 * Then we compare what we computed against the in-core counters.
 *
 * However, the reality is that summary counters are a tricky beast to check.
 * While we /could/ freeze the filesystem and scramble around the AGs counting
 * the free blocks, in practice we prefer not to do that for a scan because
 * freezing is costly.  To get around this, we added a per-cpu counter of the
 * delalloc reservations so that we can rotor around the AGs relatively
 * quickly, and we allow the counts to be slightly off because we're not taking
 * any locks while we do this.
 *
 * So the first thing we do is warm up the buffer cache in the setup routine by
 * walking all the AGs to make sure the incore per-AG structure has been
 * initialized.  The expected value calculation then iterates the incore per-AG
 * structures as quickly as it can.  We snapshot the percpu counters before and
 * after this operation and use the difference in counter values to guess at
 * our tolerance for mismatch between expected and actual counter values.
 */
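
/*
 * A rough sketch of the flow implemented below (a summary of the code in
 * this file, not a separate algorithm):
 *
 *	xchk_setup_fscounters()
 *		xchk_fscount_warmup()	<- init every incore perag struct
 *	xchk_fscounters()
 *		snapshot m_icount/m_ifree/m_fdblocks via percpu_counter_sum()
 *		xchk_fscount_aggregate_agcounts()  <- compute expected values
 *		xchk_fscount_within_range()	   <- compare, re-summing each
 *						      counter to bound the drift
 */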

/*
 * Since the expected value computation is lockless but only browses incore
 * values, the percpu counters should be fairly close to each other.  However,
 * we'll allow ourselves to be off by at least this (arbitrary) amount.
 */
#define XCHK_FSCOUNT_MIN_VARIANCE	(512)
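
/*
 * Example with hypothetical numbers: if a counter summed to 10000 before the
 * aggregation and to 10600 afterwards, the drift of 600 meets this threshold,
 * so an expected value outside [10000, 10600] marks the scrub incomplete
 * rather than corrupt; see xchk_fscount_within_range() below.
 */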

/*
 * Make sure the per-AG structure has been initialized from the on-disk header
 * contents and trust that the incore counters match the ondisk counters.  (The
 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
 * summary counters after checking all AG headers).  Do this from the setup
 * function so that the inner AG aggregation loop runs as quickly as possible.
 *
 * This function runs during the setup phase /before/ we start checking any
 * metadata.
 */
STATIC int
xchk_fscount_warmup(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*agi_bp = NULL;
	struct xfs_buf		*agf_bp = NULL;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno;
	int			error = 0;

	for_each_perag(mp, agno, pag) {
		if (xchk_should_terminate(sc, &error))
			break;
		if (pag->pagi_init && pag->pagf_init)
			continue;

		/* Lock both AG headers. */
		error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
		if (error)
			break;
		error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
		if (error)
			break;

		/*
		 * These are supposed to be initialized by the header read
		 * function.
		 */
		if (!pag->pagi_init || !pag->pagf_init) {
			error = -EFSCORRUPTED;
			break;
		}

		xfs_buf_relse(agf_bp);
		agf_bp = NULL;
		xfs_buf_relse(agi_bp);
		agi_bp = NULL;
	}

	if (agf_bp)
		xfs_buf_relse(agf_bp);
	if (agi_bp)
		xfs_buf_relse(agi_bp);
	if (pag)
		xfs_perag_put(pag);
	return error;
}

int
xchk_setup_fscounters(
	struct xfs_scrub	*sc)
{
	struct xchk_fscounters	*fsc;
	int			error;

	sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
	if (!sc->buf)
		return -ENOMEM;
	fsc = sc->buf;

	xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);

	/* We must get the incore counters set up before we can proceed. */
	error = xchk_fscount_warmup(sc);
	if (error)
		return error;

	/*
	 * Pause background reclaim while we're scrubbing to reduce the
	 * likelihood of background perturbations to the counters throwing off
	 * our calculations.
	 */
	xchk_stop_reaping(sc);

	return xchk_trans_alloc(sc, 0);
}

/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
static int
xchk_fscount_btreeblks(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc,
	xfs_agnumber_t		agno)
{
	xfs_extlen_t		blocks;
	int			error;

	error = xchk_ag_init_existing(sc, agno, &sc->sa);
	if (error)
		goto out_free;

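	/*
	 * xfs_btree_count_blocks() includes each btree's root block in its
	 * count; the "blocks - 1" below excludes the always-allocated root so
	 * that the sum matches the agf_btreeblks convention (which never
	 * counts the roots) used on lazysbcount filesystems.
	 */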
	error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
	if (error)
		goto out_free;
	fsc->fdblocks += blocks - 1;

	error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
	if (error)
		goto out_free;
	fsc->fdblocks += blocks - 1;

out_free:
	xchk_ag_free(sc, &sc->sa);
	return error;
}

/*
 * Calculate what the global in-core counters ought to be from the incore
 * per-AG structure.  Callers can compare this to the actual in-core counters
 * to estimate by how much both in-core and on-disk counters need to be
 * adjusted.
 */
STATIC int
xchk_fscount_aggregate_agcounts(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag;
	uint64_t		delayed;
	xfs_agnumber_t		agno;
	int			tries = 8;
	int			error = 0;

retry:
	fsc->icount = 0;
	fsc->ifree = 0;
	fsc->fdblocks = 0;

	for_each_perag(mp, agno, pag) {
		if (xchk_should_terminate(sc, &error))
			break;

		/* This somehow got unset since the warmup? */
		if (!pag->pagi_init || !pag->pagf_init) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Count all the inodes */
		fsc->icount += pag->pagi_count;
		fsc->ifree += pag->pagi_freecount;

		/* Add up the free/freelist/bnobt/cntbt blocks */
		fsc->fdblocks += pag->pagf_freeblks;
		fsc->fdblocks += pag->pagf_flcount;
		if (xfs_has_lazysbcount(sc->mp)) {
			fsc->fdblocks += pag->pagf_btreeblks;
		} else {
			error = xchk_fscount_btreeblks(sc, fsc, agno);
			if (error)
				break;
		}

		/*
		 * Per-AG reservations are taken out of the incore counters,
		 * so they must be left out of the free blocks computation.
		 */
		fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
		fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
	}
	if (pag)
		xfs_perag_put(pag);
	if (error)
		return error;

	/*
	 * The global incore space reservation is taken from the incore
	 * counters, so leave that out of the computation.
	 */
	fsc->fdblocks -= mp->m_resblks_avail;

	/*
	 * Delayed allocation reservations are taken out of the incore counters
	 * but not recorded on disk, so leave them and their indlen blocks out
	 * of the computation.
	 */
	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
	fsc->fdblocks -= delayed;

	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
			delayed);

	/* Bail out if the values we compute are totally nonsense. */
	if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
	    fsc->fdblocks > mp->m_sb.sb_dblocks ||
	    fsc->ifree > fsc->icount_max)
		return -EFSCORRUPTED;

	/*
	 * If ifree > icount then we probably had some perturbation in the
	 * counters while we were calculating things.  We'll try a few times
	 * to maintain ifree <= icount before giving up.
	 */
	if (fsc->ifree > fsc->icount) {
		if (tries--)
			goto retry;
		xchk_set_incomplete(sc);
		return 0;
	}

	return 0;
}

/*
 * Is the @counter reasonably close to the @expected value?
 *
 * We neither locked nor froze anything in the filesystem while aggregating the
 * per-AG data to compute the @expected value, which means that the counter
 * could have changed.  We know the @old_value of the summation of the counter
 * before the aggregation, and we re-sum the counter now.  If the expected
 * value falls between the two summations, we're ok.
 *
 * Otherwise, we /might/ have a problem.  If the change in the summations is
 * more than we want to tolerate, the filesystem is probably busy and we should
 * just send back INCOMPLETE and see if userspace will try again.
 */
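/*
 * Worked example (illustrative numbers): if fdblocks summed to 1000 before
 * the aggregation (@old_value) and re-sums to 1010 now, any @expected value
 * in [1000, 1010] passes.  An @expected of 990 with a drift of only 10
 * (< XCHK_FSCOUNT_MIN_VARIANCE) is reported as a mismatch; had the drift
 * been 600, the scrub would instead be marked incomplete.
 */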
static inline bool
xchk_fscount_within_range(
	struct xfs_scrub	*sc,
	const int64_t		old_value,
	struct percpu_counter	*counter,
	uint64_t		expected)
{
	int64_t			min_value, max_value;
	int64_t			curr_value = percpu_counter_sum(counter);

	trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
			old_value);

	/* Negative values are always wrong. */
	if (curr_value < 0)
		return false;

	/* Exact matches are always ok. */
	if (curr_value == expected)
		return true;

	min_value = min(old_value, curr_value);
	max_value = max(old_value, curr_value);

	/* Within the before-and-after range is ok. */
	if (expected >= min_value && expected <= max_value)
		return true;

	/*
	 * If the difference between the two summations is too large, the fs
	 * might just be busy and so we'll mark the scrub incomplete.  Return
	 * true here so that we don't mark the counter corrupt.
	 *
	 * XXX: In the future when userspace can grant scrub permission to
	 * quiesce the filesystem to solve the outsized variance problem, this
	 * check should be moved up and the return code changed to signal to
	 * userspace that we need quiesce permission.
	 */
	if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
		xchk_set_incomplete(sc);
		return true;
	}

	return false;
}

/* Check the superblock counters. */
int
xchk_fscounters(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xchk_fscounters	*fsc = sc->buf;
	int64_t			icount, ifree, fdblocks;
	int			error;

	/* Snapshot the percpu counters. */
	icount = percpu_counter_sum(&mp->m_icount);
	ifree = percpu_counter_sum(&mp->m_ifree);
	fdblocks = percpu_counter_sum(&mp->m_fdblocks);

	/* No negative values, please! */
	if (icount < 0 || ifree < 0 || fdblocks < 0)
		xchk_set_corrupt(sc);

	/* See if icount is obviously wrong. */
	if (icount < fsc->icount_min || icount > fsc->icount_max)
		xchk_set_corrupt(sc);

	/* See if fdblocks is obviously wrong. */
	if (fdblocks > mp->m_sb.sb_dblocks)
		xchk_set_corrupt(sc);

	/*
	 * If ifree exceeds icount by more than the minimum variance then
	 * something's probably wrong with the counters.
	 */
	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
		xchk_set_corrupt(sc);

	/* Walk the incore AG headers to calculate the expected counters. */
	error = xchk_fscount_aggregate_agcounts(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;
	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
		return 0;

	/* Compare the in-core counters with whatever we counted. */
	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
		xchk_set_corrupt(sc);

	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
		xchk_set_corrupt(sc);

	if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
			fsc->fdblocks))
		xchk_set_corrupt(sc);

	return 0;
}