cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

btree.c (19590B)


      1// SPDX-License-Identifier: GPL-2.0+
      2/*
      3 * Copyright (C) 2017 Oracle.  All Rights Reserved.
      4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
      5 */
      6#include "xfs.h"
      7#include "xfs_fs.h"
      8#include "xfs_shared.h"
      9#include "xfs_format.h"
     10#include "xfs_trans_resv.h"
     11#include "xfs_mount.h"
     12#include "xfs_inode.h"
     13#include "xfs_btree.h"
     14#include "scrub/scrub.h"
     15#include "scrub/common.h"
     16#include "scrub/btree.h"
     17#include "scrub/trace.h"
     18
     19/* btree scrubbing */
     20
     21/*
     22 * Check for btree operation errors.  See the section about handling
     23 * operational errors in common.c.
     24 */
     25static bool
     26__xchk_btree_process_error(
     27	struct xfs_scrub	*sc,
     28	struct xfs_btree_cur	*cur,
     29	int			level,
     30	int			*error,
     31	__u32			errflag,
     32	void			*ret_ip)
     33{
     34	if (*error == 0)
     35		return true;
     36
     37	switch (*error) {
     38	case -EDEADLOCK:
     39		/* Used to restart an op with deadlock avoidance. */
     40		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
     41		break;
     42	case -EFSBADCRC:
     43	case -EFSCORRUPTED:
     44		/* Note the badness but don't abort. */
     45		sc->sm->sm_flags |= errflag;
     46		*error = 0;
     47		fallthrough;
     48	default:
     49		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
     50			trace_xchk_ifork_btree_op_error(sc, cur, level,
     51					*error, ret_ip);
     52		else
     53			trace_xchk_btree_op_error(sc, cur, level,
     54					*error, ret_ip);
     55		break;
     56	}
     57	return false;
     58}
     59
     60bool
     61xchk_btree_process_error(
     62	struct xfs_scrub	*sc,
     63	struct xfs_btree_cur	*cur,
     64	int			level,
     65	int			*error)
     66{
     67	return __xchk_btree_process_error(sc, cur, level, error,
     68			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
     69}
     70
     71bool
     72xchk_btree_xref_process_error(
     73	struct xfs_scrub	*sc,
     74	struct xfs_btree_cur	*cur,
     75	int			level,
     76	int			*error)
     77{
     78	return __xchk_btree_process_error(sc, cur, level, error,
     79			XFS_SCRUB_OFLAG_XFAIL, __return_address);
     80}
     81
     82/* Record btree block corruption. */
     83static void
     84__xchk_btree_set_corrupt(
     85	struct xfs_scrub	*sc,
     86	struct xfs_btree_cur	*cur,
     87	int			level,
     88	__u32			errflag,
     89	void			*ret_ip)
     90{
     91	sc->sm->sm_flags |= errflag;
     92
     93	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
     94		trace_xchk_ifork_btree_error(sc, cur, level,
     95				ret_ip);
     96	else
     97		trace_xchk_btree_error(sc, cur, level,
     98				ret_ip);
     99}
    100
    101void
    102xchk_btree_set_corrupt(
    103	struct xfs_scrub	*sc,
    104	struct xfs_btree_cur	*cur,
    105	int			level)
    106{
    107	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
    108			__return_address);
    109}
    110
    111void
    112xchk_btree_xref_set_corrupt(
    113	struct xfs_scrub	*sc,
    114	struct xfs_btree_cur	*cur,
    115	int			level)
    116{
    117	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
    118			__return_address);
    119}
    120
    121/*
    122 * Make sure this record is in order and doesn't stray outside of the parent
    123 * keys.
    124 */
    125STATIC void
    126xchk_btree_rec(
    127	struct xchk_btree	*bs)
    128{
    129	struct xfs_btree_cur	*cur = bs->cur;
    130	union xfs_btree_rec	*rec;
    131	union xfs_btree_key	key;
    132	union xfs_btree_key	hkey;
    133	union xfs_btree_key	*keyp;
    134	struct xfs_btree_block	*block;
    135	struct xfs_btree_block	*keyblock;
    136	struct xfs_buf		*bp;
    137
    138	block = xfs_btree_get_block(cur, 0, &bp);
    139	rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
    140
    141	trace_xchk_btree_rec(bs->sc, cur, 0);
    142
    143	/* If this isn't the first record, are they in order? */
    144	if (cur->bc_levels[0].ptr > 1 &&
    145	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
    146		xchk_btree_set_corrupt(bs->sc, cur, 0);
    147	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
    148
    149	if (cur->bc_nlevels == 1)
    150		return;
    151
    152	/* Is this at least as large as the parent low key? */
    153	cur->bc_ops->init_key_from_rec(&key, rec);
    154	keyblock = xfs_btree_get_block(cur, 1, &bp);
    155	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
    156	if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
    157		xchk_btree_set_corrupt(bs->sc, cur, 1);
    158
    159	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
    160		return;
    161
    162	/* Is this no larger than the parent high key? */
    163	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
    164	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
    165	if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
    166		xchk_btree_set_corrupt(bs->sc, cur, 1);
    167}
    168
    169/*
    170 * Make sure this key is in order and doesn't stray outside of the parent
    171 * keys.
    172 */
    173STATIC void
    174xchk_btree_key(
    175	struct xchk_btree	*bs,
    176	int			level)
    177{
    178	struct xfs_btree_cur	*cur = bs->cur;
    179	union xfs_btree_key	*key;
    180	union xfs_btree_key	*keyp;
    181	struct xfs_btree_block	*block;
    182	struct xfs_btree_block	*keyblock;
    183	struct xfs_buf		*bp;
    184
    185	block = xfs_btree_get_block(cur, level, &bp);
    186	key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
    187
    188	trace_xchk_btree_key(bs->sc, cur, level);
    189
    190	/* If this isn't the first key, are they in order? */
    191	if (cur->bc_levels[level].ptr > 1 &&
    192	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
    193		xchk_btree_set_corrupt(bs->sc, cur, level);
    194	memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
    195
    196	if (level + 1 >= cur->bc_nlevels)
    197		return;
    198
    199	/* Is this at least as large as the parent low key? */
    200	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
    201	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
    202	if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
    203		xchk_btree_set_corrupt(bs->sc, cur, level);
    204
    205	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
    206		return;
    207
    208	/* Is this no larger than the parent high key? */
    209	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
    210	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
    211			keyblock);
    212	if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
    213		xchk_btree_set_corrupt(bs->sc, cur, level);
    214}
    215
    216/*
    217 * Check a btree pointer.  Returns true if it's ok to use this pointer.
    218 * Callers do not need to set the corrupt flag.
    219 */
    220static bool
    221xchk_btree_ptr_ok(
    222	struct xchk_btree	*bs,
    223	int			level,
    224	union xfs_btree_ptr	*ptr)
    225{
    226	bool			res;
    227
    228	/* A btree rooted in an inode has no block pointer to the root. */
    229	if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
    230	    level == bs->cur->bc_nlevels)
    231		return true;
    232
    233	/* Otherwise, check the pointers. */
    234	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
    235		res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
    236	else
    237		res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
    238	if (!res)
    239		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
    240
    241	return res;
    242}
    243
     244/* Check that a btree block's sibling matches what we expect. */
    245STATIC int
    246xchk_btree_block_check_sibling(
    247	struct xchk_btree	*bs,
    248	int			level,
    249	int			direction,
    250	union xfs_btree_ptr	*sibling)
    251{
    252	struct xfs_btree_cur	*cur = bs->cur;
    253	struct xfs_btree_block	*pblock;
    254	struct xfs_buf		*pbp;
    255	struct xfs_btree_cur	*ncur = NULL;
    256	union xfs_btree_ptr	*pp;
    257	int			success;
    258	int			error;
    259
    260	error = xfs_btree_dup_cursor(cur, &ncur);
    261	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
    262	    !ncur)
    263		return error;
    264
    265	/*
    266	 * If the pointer is null, we shouldn't be able to move the upper
    267	 * level pointer anywhere.
    268	 */
    269	if (xfs_btree_ptr_is_null(cur, sibling)) {
    270		if (direction > 0)
    271			error = xfs_btree_increment(ncur, level + 1, &success);
    272		else
    273			error = xfs_btree_decrement(ncur, level + 1, &success);
    274		if (error == 0 && success)
    275			xchk_btree_set_corrupt(bs->sc, cur, level);
    276		error = 0;
    277		goto out;
    278	}
    279
    280	/* Increment upper level pointer. */
    281	if (direction > 0)
    282		error = xfs_btree_increment(ncur, level + 1, &success);
    283	else
    284		error = xfs_btree_decrement(ncur, level + 1, &success);
    285	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
    286		goto out;
    287	if (!success) {
    288		xchk_btree_set_corrupt(bs->sc, cur, level + 1);
    289		goto out;
    290	}
    291
    292	/* Compare upper level pointer to sibling pointer. */
    293	pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
    294	pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
    295	if (!xchk_btree_ptr_ok(bs, level + 1, pp))
    296		goto out;
    297	if (pbp)
    298		xchk_buffer_recheck(bs->sc, pbp);
    299
    300	if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
    301		xchk_btree_set_corrupt(bs->sc, cur, level);
    302out:
    303	xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
    304	return error;
    305}
    306
    307/* Check the siblings of a btree block. */
    308STATIC int
    309xchk_btree_block_check_siblings(
    310	struct xchk_btree	*bs,
    311	struct xfs_btree_block	*block)
    312{
    313	struct xfs_btree_cur	*cur = bs->cur;
    314	union xfs_btree_ptr	leftsib;
    315	union xfs_btree_ptr	rightsib;
    316	int			level;
    317	int			error = 0;
    318
    319	xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
    320	xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
    321	level = xfs_btree_get_level(block);
    322
    323	/* Root block should never have siblings. */
    324	if (level == cur->bc_nlevels - 1) {
    325		if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
    326		    !xfs_btree_ptr_is_null(cur, &rightsib))
    327			xchk_btree_set_corrupt(bs->sc, cur, level);
    328		goto out;
    329	}
    330
    331	/*
     332	 * Do the left and right sibling pointers match the adjacent
     333	 * parent level pointers?
     334	 * (The helper function absorbs error codes for us.)
    335	 */
    336	error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
    337	if (error)
    338		return error;
    339	error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
    340	if (error)
    341		return error;
    342out:
    343	return error;
    344}
    345
    346struct check_owner {
    347	struct list_head	list;
    348	xfs_daddr_t		daddr;
    349	int			level;
    350};
    351
    352/*
    353 * Make sure this btree block isn't in the free list and that there's
    354 * an rmap record for it.
    355 */
    356STATIC int
    357xchk_btree_check_block_owner(
    358	struct xchk_btree	*bs,
    359	int			level,
    360	xfs_daddr_t		daddr)
    361{
    362	xfs_agnumber_t		agno;
    363	xfs_agblock_t		agbno;
    364	xfs_btnum_t		btnum;
    365	bool			init_sa;
    366	int			error = 0;
    367
    368	if (!bs->cur)
    369		return 0;
    370
    371	btnum = bs->cur->bc_btnum;
    372	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
    373	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
    374
    375	init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
    376	if (init_sa) {
    377		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
    378		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
    379				level, &error))
    380			goto out_free;
    381	}
    382
    383	xchk_xref_is_used_space(bs->sc, agbno, 1);
    384	/*
    385	 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
    386	 * have to nullify it (to shut down further block owner checks) if
    387	 * self-xref encounters problems.
    388	 */
    389	if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
    390		bs->cur = NULL;
    391
    392	xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
    393	if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
    394		bs->cur = NULL;
    395
    396out_free:
    397	if (init_sa)
    398		xchk_ag_free(bs->sc, &bs->sc->sa);
    399
    400	return error;
    401}
    402
    403/* Check the owner of a btree block. */
    404STATIC int
    405xchk_btree_check_owner(
    406	struct xchk_btree	*bs,
    407	int			level,
    408	struct xfs_buf		*bp)
    409{
    410	struct xfs_btree_cur	*cur = bs->cur;
    411	struct check_owner	*co;
    412
    413	/*
    414	 * In theory, xfs_btree_get_block should only give us a null buffer
    415	 * pointer for the root of a root-in-inode btree type, but we need
    416	 * to check defensively here in case the cursor state is also screwed
    417	 * up.
    418	 */
    419	if (bp == NULL) {
    420		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
    421			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
    422		return 0;
    423	}
    424
    425	/*
    426	 * We want to cross-reference each btree block with the bnobt
    427	 * and the rmapbt.  We cannot cross-reference the bnobt or
    428	 * rmapbt while scanning the bnobt or rmapbt, respectively,
    429	 * because we cannot alter the cursor and we'd prefer not to
    430	 * duplicate cursors.  Therefore, save the buffer daddr for
    431	 * later scanning.
    432	 */
    433	if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
    434		co = kmem_alloc(sizeof(struct check_owner),
    435				KM_MAYFAIL);
    436		if (!co)
    437			return -ENOMEM;
    438		co->level = level;
    439		co->daddr = xfs_buf_daddr(bp);
    440		list_add_tail(&co->list, &bs->to_check);
    441		return 0;
    442	}
    443
    444	return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
    445}
    446
    447/* Decide if we want to check minrecs of a btree block in the inode root. */
    448static inline bool
    449xchk_btree_check_iroot_minrecs(
    450	struct xchk_btree	*bs)
    451{
    452	/*
    453	 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
    454	 * would miscalculate the space required for the data fork bmbt root
    455	 * when adding an attr fork, and promote the iroot contents to an
    456	 * external block unnecessarily.  This went unnoticed for many years
    457	 * until scrub found filesystems in this state.  Inode rooted btrees are
    458	 * not supposed to have immediate child blocks that are small enough
    459	 * that the contents could fit in the inode root, but we can't fail
    460	 * existing filesystems, so instead we disable the check for data fork
    461	 * bmap btrees when there's an attr fork.
    462	 */
    463	if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
    464	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
    465	    XFS_IFORK_Q(bs->sc->ip))
    466		return false;
    467
    468	return true;
    469}
    470
    471/*
    472 * Check that this btree block has at least minrecs records or is one of the
    473 * special blocks that don't require that.
    474 */
    475STATIC void
    476xchk_btree_check_minrecs(
    477	struct xchk_btree	*bs,
    478	int			level,
    479	struct xfs_btree_block	*block)
    480{
    481	struct xfs_btree_cur	*cur = bs->cur;
    482	unsigned int		root_level = cur->bc_nlevels - 1;
    483	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);
    484
    485	/* More records than minrecs means the block is ok. */
    486	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
    487		return;
    488
    489	/*
    490	 * For btrees rooted in the inode, it's possible that the root block
    491	 * contents spilled into a regular ondisk block because there wasn't
    492	 * enough space in the inode root.  The number of records in that
    493	 * child block might be less than the standard minrecs, but that's ok
    494	 * provided that there's only one direct child of the root.
    495	 */
    496	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
    497	    level == cur->bc_nlevels - 2) {
    498		struct xfs_btree_block	*root_block;
    499		struct xfs_buf		*root_bp;
    500		int			root_maxrecs;
    501
    502		root_block = xfs_btree_get_block(cur, root_level, &root_bp);
    503		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
    504		if (xchk_btree_check_iroot_minrecs(bs) &&
    505		    (be16_to_cpu(root_block->bb_numrecs) != 1 ||
    506		     numrecs <= root_maxrecs))
    507			xchk_btree_set_corrupt(bs->sc, cur, level);
    508		return;
    509	}
    510
    511	/*
    512	 * Otherwise, only the root level is allowed to have fewer than minrecs
    513	 * records or keyptrs.
    514	 */
    515	if (level < root_level)
    516		xchk_btree_set_corrupt(bs->sc, cur, level);
    517}
    518
    519/*
    520 * Grab and scrub a btree block given a btree pointer.  Returns block
    521 * and buffer pointers (if applicable) if they're ok to use.
    522 */
    523STATIC int
    524xchk_btree_get_block(
    525	struct xchk_btree	*bs,
    526	int			level,
    527	union xfs_btree_ptr	*pp,
    528	struct xfs_btree_block	**pblock,
    529	struct xfs_buf		**pbp)
    530{
    531	xfs_failaddr_t		failed_at;
    532	int			error;
    533
    534	*pblock = NULL;
    535	*pbp = NULL;
    536
    537	error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
    538	if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
    539	    !*pblock)
    540		return error;
    541
    542	xfs_btree_get_block(bs->cur, level, pbp);
    543	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
    544		failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
    545				level, *pbp);
    546	else
    547		failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
    548				 level, *pbp);
    549	if (failed_at) {
    550		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
    551		return 0;
    552	}
    553	if (*pbp)
    554		xchk_buffer_recheck(bs->sc, *pbp);
    555
    556	xchk_btree_check_minrecs(bs, level, *pblock);
    557
    558	/*
    559	 * Check the block's owner; this function absorbs error codes
    560	 * for us.
    561	 */
    562	error = xchk_btree_check_owner(bs, level, *pbp);
    563	if (error)
    564		return error;
    565
    566	/*
    567	 * Check the block's siblings; this function absorbs error codes
    568	 * for us.
    569	 */
    570	return xchk_btree_block_check_siblings(bs, *pblock);
    571}
    572
    573/*
    574 * Check that the low and high keys of this block match the keys stored
    575 * in the parent block.
    576 */
    577STATIC void
    578xchk_btree_block_keys(
    579	struct xchk_btree	*bs,
    580	int			level,
    581	struct xfs_btree_block	*block)
    582{
    583	union xfs_btree_key	block_keys;
    584	struct xfs_btree_cur	*cur = bs->cur;
    585	union xfs_btree_key	*high_bk;
    586	union xfs_btree_key	*parent_keys;
    587	union xfs_btree_key	*high_pk;
    588	struct xfs_btree_block	*parent_block;
    589	struct xfs_buf		*bp;
    590
    591	if (level >= cur->bc_nlevels - 1)
    592		return;
    593
    594	/* Calculate the keys for this block. */
    595	xfs_btree_get_keys(cur, block, &block_keys);
    596
    597	/* Obtain the parent's copy of the keys for this block. */
    598	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
    599	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
    600			parent_block);
    601
    602	if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
    603		xchk_btree_set_corrupt(bs->sc, cur, 1);
    604
    605	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
    606		return;
    607
    608	/* Get high keys */
    609	high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
    610	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
    611			parent_block);
    612
    613	if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
    614		xchk_btree_set_corrupt(bs->sc, cur, 1);
    615}
    616
    617/*
    618 * Visit all nodes and leaves of a btree.  Check that all pointers and
    619 * records are in order, that the keys reflect the records, and use a callback
    620 * so that the caller can verify individual records.
    621 */
    622int
    623xchk_btree(
    624	struct xfs_scrub		*sc,
    625	struct xfs_btree_cur		*cur,
    626	xchk_btree_rec_fn		scrub_fn,
    627	const struct xfs_owner_info	*oinfo,
    628	void				*private)
    629{
    630	union xfs_btree_ptr		ptr;
    631	struct xchk_btree		*bs;
    632	union xfs_btree_ptr		*pp;
    633	union xfs_btree_rec		*recp;
    634	struct xfs_btree_block		*block;
    635	struct xfs_buf			*bp;
    636	struct check_owner		*co;
    637	struct check_owner		*n;
    638	size_t				cur_sz;
    639	int				level;
    640	int				error = 0;
    641
    642	/*
    643	 * Allocate the btree scrub context from the heap, because this
    644	 * structure can get rather large.  Don't let a caller feed us a
    645	 * totally absurd size.
    646	 */
    647	cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
    648	if (cur_sz > PAGE_SIZE) {
    649		xchk_btree_set_corrupt(sc, cur, 0);
    650		return 0;
    651	}
    652	bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
    653	if (!bs)
    654		return -ENOMEM;
    655	bs->cur = cur;
    656	bs->scrub_rec = scrub_fn;
    657	bs->oinfo = oinfo;
    658	bs->private = private;
    659	bs->sc = sc;
    660
    661	/* Initialize scrub state */
    662	INIT_LIST_HEAD(&bs->to_check);
    663
    664	/*
    665	 * Load the root of the btree.  The helper function absorbs
    666	 * error codes for us.
    667	 */
    668	level = cur->bc_nlevels - 1;
    669	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
    670	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
    671		goto out;
    672	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
    673	if (error || !block)
    674		goto out;
    675
    676	cur->bc_levels[level].ptr = 1;
    677
    678	while (level < cur->bc_nlevels) {
    679		block = xfs_btree_get_block(cur, level, &bp);
    680
    681		if (level == 0) {
    682			/* End of leaf, pop back towards the root. */
    683			if (cur->bc_levels[level].ptr >
    684			    be16_to_cpu(block->bb_numrecs)) {
    685				xchk_btree_block_keys(bs, level, block);
    686				if (level < cur->bc_nlevels - 1)
    687					cur->bc_levels[level + 1].ptr++;
    688				level++;
    689				continue;
    690			}
    691
    692			/* Records in order for scrub? */
    693			xchk_btree_rec(bs);
    694
    695			/* Call out to the record checker. */
    696			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
    697					block);
    698			error = bs->scrub_rec(bs, recp);
    699			if (error)
    700				break;
    701			if (xchk_should_terminate(sc, &error) ||
    702			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
    703				break;
    704
    705			cur->bc_levels[level].ptr++;
    706			continue;
    707		}
    708
    709		/* End of node, pop back towards the root. */
    710		if (cur->bc_levels[level].ptr >
    711					be16_to_cpu(block->bb_numrecs)) {
    712			xchk_btree_block_keys(bs, level, block);
    713			if (level < cur->bc_nlevels - 1)
    714				cur->bc_levels[level + 1].ptr++;
    715			level++;
    716			continue;
    717		}
    718
    719		/* Keys in order for scrub? */
    720		xchk_btree_key(bs, level);
    721
    722		/* Drill another level deeper. */
    723		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
    724		if (!xchk_btree_ptr_ok(bs, level, pp)) {
    725			cur->bc_levels[level].ptr++;
    726			continue;
    727		}
    728		level--;
    729		error = xchk_btree_get_block(bs, level, pp, &block, &bp);
    730		if (error || !block)
    731			goto out;
    732
    733		cur->bc_levels[level].ptr = 1;
    734	}
    735
    736out:
    737	/* Process deferred owner checks on btree blocks. */
    738	list_for_each_entry_safe(co, n, &bs->to_check, list) {
    739		if (!error && bs->cur)
    740			error = xchk_btree_check_block_owner(bs, co->level,
    741					co->daddr);
    742		list_del(&co->list);
    743		kmem_free(co);
    744	}
    745	kmem_free(bs);
    746
    747	return error;
    748}
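
For context, xchk_btree() above is the entry point that the individual btree scrubbers drive: each passes a cursor, owner information describing who should own the btree blocks, and a record callback that validates every leaf record. The sketch below is illustrative only and is not part of btree.c; the names xchk_example_rec/xchk_example and the choice of &XFS_RMAP_OINFO_AG are assumptions modeled loosely on the allocation-btree scrubber, and the exact callback prototype is the xchk_btree_rec_fn typedef in scrub/btree.h.

/* Hypothetical record callback: inspect one leaf record, flag problems. */
STATIC int
xchk_example_rec(
	struct xchk_btree	*bs,
	const union xfs_btree_rec *rec)
{
	/*
	 * Decode *rec and cross-check it against other metadata; if it
	 * violates an invariant, mark level 0 of this btree corrupt.
	 */
	if (0 /* record violates some invariant */)
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
	return 0;
}

/* Hypothetical scrubber entry point: walk the whole btree. */
STATIC int
xchk_example(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur)
{
	return xchk_btree(sc, cur, xchk_example_rec, &XFS_RMAP_OINFO_AG, NULL);
}

Note that xchk_btree() deliberately returns 0 when it merely finds corruption; problems are reported through sc->sm->sm_flags, and a nonzero return is reserved for operational errors such as allocation failures, errors returned by the record callback, or errors that the *_process_error helpers do not absorb.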