cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

xfs_mount.c (37982B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
      4 * All Rights Reserved.
      5 */
      6#include "xfs.h"
      7#include "xfs_fs.h"
      8#include "xfs_shared.h"
      9#include "xfs_format.h"
     10#include "xfs_log_format.h"
     11#include "xfs_trans_resv.h"
     12#include "xfs_bit.h"
     13#include "xfs_sb.h"
     14#include "xfs_mount.h"
     15#include "xfs_inode.h"
     16#include "xfs_dir2.h"
     17#include "xfs_ialloc.h"
     18#include "xfs_alloc.h"
     19#include "xfs_rtalloc.h"
     20#include "xfs_bmap.h"
     21#include "xfs_trans.h"
     22#include "xfs_trans_priv.h"
     23#include "xfs_log.h"
     24#include "xfs_log_priv.h"
     25#include "xfs_error.h"
     26#include "xfs_quota.h"
     27#include "xfs_fsops.h"
     28#include "xfs_icache.h"
     29#include "xfs_sysfs.h"
     30#include "xfs_rmap_btree.h"
     31#include "xfs_refcount_btree.h"
     32#include "xfs_reflink.h"
     33#include "xfs_extent_busy.h"
     34#include "xfs_health.h"
     35#include "xfs_trace.h"
     36#include "xfs_ag.h"
     37
     38static DEFINE_MUTEX(xfs_uuid_table_mutex);
     39static int xfs_uuid_table_size;
     40static uuid_t *xfs_uuid_table;
     41
     42void
     43xfs_uuid_table_free(void)
     44{
     45	if (xfs_uuid_table_size == 0)
     46		return;
     47	kmem_free(xfs_uuid_table);
     48	xfs_uuid_table = NULL;
     49	xfs_uuid_table_size = 0;
     50}
     51
     52/*
     53 * See if the UUID is unique among mounted XFS filesystems.
     54 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
     55 */
     56STATIC int
     57xfs_uuid_mount(
     58	struct xfs_mount	*mp)
     59{
     60	uuid_t			*uuid = &mp->m_sb.sb_uuid;
     61	int			hole, i;
     62
     63	/* Publish UUID in struct super_block */
     64	uuid_copy(&mp->m_super->s_uuid, uuid);
     65
     66	if (xfs_has_nouuid(mp))
     67		return 0;
     68
     69	if (uuid_is_null(uuid)) {
     70		xfs_warn(mp, "Filesystem has null UUID - can't mount");
     71		return -EINVAL;
     72	}
     73
     74	mutex_lock(&xfs_uuid_table_mutex);
     75	for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
     76		if (uuid_is_null(&xfs_uuid_table[i])) {
     77			hole = i;
     78			continue;
     79		}
     80		if (uuid_equal(uuid, &xfs_uuid_table[i]))
     81			goto out_duplicate;
     82	}
     83
     84	if (hole < 0) {
     85		xfs_uuid_table = krealloc(xfs_uuid_table,
     86			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
     87			GFP_KERNEL | __GFP_NOFAIL);
     88		hole = xfs_uuid_table_size++;
     89	}
     90	xfs_uuid_table[hole] = *uuid;
     91	mutex_unlock(&xfs_uuid_table_mutex);
     92
     93	return 0;
     94
     95 out_duplicate:
     96	mutex_unlock(&xfs_uuid_table_mutex);
     97	xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
     98	return -EINVAL;
     99}
    100
    101STATIC void
    102xfs_uuid_unmount(
    103	struct xfs_mount	*mp)
    104{
    105	uuid_t			*uuid = &mp->m_sb.sb_uuid;
    106	int			i;
    107
    108	if (xfs_has_nouuid(mp))
    109		return;
    110
    111	mutex_lock(&xfs_uuid_table_mutex);
    112	for (i = 0; i < xfs_uuid_table_size; i++) {
    113		if (uuid_is_null(&xfs_uuid_table[i]))
    114			continue;
    115		if (!uuid_equal(uuid, &xfs_uuid_table[i]))
    116			continue;
    117		memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
    118		break;
    119	}
    120	ASSERT(i < xfs_uuid_table_size);
    121	mutex_unlock(&xfs_uuid_table_mutex);
    122}
    123
    124/*
    125 * Check size of device based on the (data/realtime) block count.
    126 * Note: this check is used by the growfs code as well as mount.
    127 */
    128int
    129xfs_sb_validate_fsb_count(
    130	xfs_sb_t	*sbp,
    131	uint64_t	nblocks)
    132{
    133	ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
    134	ASSERT(sbp->sb_blocklog >= BBSHIFT);
    135
    136	/* Limited by ULONG_MAX of page cache index */
    137	if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
    138		return -EFBIG;
    139	return 0;
    140}
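
        /*
         * For illustration, assuming 4096-byte pages and 4096-byte filesystem
         * blocks: PAGE_SHIFT - sb_blocklog is then 0 and the check above
         * reduces to nblocks > ULONG_MAX.  On a 32-bit kernel that caps the
         * device at roughly 2^32 blocks (about 16 TiB), which is the
         * "don't mount terabyte filesystems" size check referred to in the
         * xfs_mountfs() description further down.
         */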
    141
    142/*
    143 * xfs_readsb
    144 *
    145 * Does the initial read of the superblock.
    146 */
    147int
    148xfs_readsb(
    149	struct xfs_mount *mp,
    150	int		flags)
    151{
    152	unsigned int	sector_size;
    153	struct xfs_buf	*bp;
    154	struct xfs_sb	*sbp = &mp->m_sb;
    155	int		error;
    156	int		loud = !(flags & XFS_MFSI_QUIET);
    157	const struct xfs_buf_ops *buf_ops;
    158
    159	ASSERT(mp->m_sb_bp == NULL);
    160	ASSERT(mp->m_ddev_targp != NULL);
    161
    162	/*
    163	 * For the initial read, we must guess at the sector
    164	 * size based on the block device.  It's enough to
    165	 * get the sb_sectsize out of the superblock and
    166	 * then reread with the proper length.
    167	 * We don't verify it yet, because it may not be complete.
    168	 */
    169	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
    170	buf_ops = NULL;
    171
    172	/*
    173	 * Allocate a (locked) buffer to hold the superblock. This will be kept
    174	 * around at all times to optimize access to the superblock. Therefore,
    175	 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
    176	 * elevated.
    177	 */
    178reread:
    179	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
    180				      BTOBB(sector_size), XBF_NO_IOACCT, &bp,
    181				      buf_ops);
    182	if (error) {
    183		if (loud)
    184			xfs_warn(mp, "SB validate failed with error %d.", error);
    185		/* bad CRC means corrupted metadata */
    186		if (error == -EFSBADCRC)
    187			error = -EFSCORRUPTED;
    188		return error;
    189	}
    190
    191	/*
    192	 * Initialize the mount structure from the superblock.
    193	 */
    194	xfs_sb_from_disk(sbp, bp->b_addr);
    195
    196	/*
    197	 * If we haven't validated the superblock, do so now before we try
    198	 * to check the sector size and reread the superblock appropriately.
    199	 */
    200	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
    201		if (loud)
    202			xfs_warn(mp, "Invalid superblock magic number");
    203		error = -EINVAL;
    204		goto release_buf;
    205	}
    206
    207	/*
    208	 * We must be able to do sector-sized and sector-aligned IO.
    209	 */
    210	if (sector_size > sbp->sb_sectsize) {
    211		if (loud)
    212			xfs_warn(mp, "device supports %u byte sectors (not %u)",
    213				sector_size, sbp->sb_sectsize);
    214		error = -ENOSYS;
    215		goto release_buf;
    216	}
    217
    218	if (buf_ops == NULL) {
    219		/*
    220		 * Re-read the superblock so the buffer is correctly sized,
    221		 * and properly verified.
    222		 */
    223		xfs_buf_relse(bp);
    224		sector_size = sbp->sb_sectsize;
    225		buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
    226		goto reread;
    227	}
    228
    229	mp->m_features |= xfs_sb_version_to_features(sbp);
    230	xfs_reinit_percpu_counters(mp);
    231
    232	/* no need to be quiet anymore, so reset the buf ops */
    233	bp->b_ops = &xfs_sb_buf_ops;
    234
    235	mp->m_sb_bp = bp;
    236	xfs_buf_unlock(bp);
    237	return 0;
    238
    239release_buf:
    240	xfs_buf_relse(bp);
    241	return error;
    242}
    243
    244/*
    245 * If the sunit/swidth change would move the precomputed root inode value, we
    246 * must reject the ondisk change because repair will stumble over that.
    247 * However, we allow the mount to proceed because we never rejected this
    248 * combination before.  Returns true to update the sb, false otherwise.
    249 */
    250static inline int
    251xfs_check_new_dalign(
    252	struct xfs_mount	*mp,
    253	int			new_dalign,
    254	bool			*update_sb)
    255{
    256	struct xfs_sb		*sbp = &mp->m_sb;
    257	xfs_ino_t		calc_ino;
    258
    259	calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
    260	trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
    261
    262	if (sbp->sb_rootino == calc_ino) {
    263		*update_sb = true;
    264		return 0;
    265	}
    266
    267	xfs_warn(mp,
    268"Cannot change stripe alignment; would require moving root inode.");
    269
    270	/*
    271	 * XXX: Next time we add a new incompat feature, this should start
    272	 * returning -EINVAL to fail the mount.  Until then, spit out a warning
    273	 * that we're ignoring the administrator's instructions.
    274	 */
    275	xfs_warn(mp, "Skipping superblock stripe alignment update.");
    276	*update_sb = false;
    277	return 0;
    278}
    279
    280/*
    281 * If we were provided with new sunit/swidth values as mount options, make sure
    282 * that they pass basic alignment and superblock feature checks, and convert
    283 * them into the same units (FSB) that everything else expects.  This step
    284 * /must/ be done before computing the inode geometry.
    285 */
    286STATIC int
    287xfs_validate_new_dalign(
    288	struct xfs_mount	*mp)
    289{
    290	if (mp->m_dalign == 0)
    291		return 0;
    292
    293	/*
     294	 * If stripe unit and stripe width are not multiples
     295	 * of the fs blocksize, the mount fails.
    296	 */
    297	if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
    298	    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
    299		xfs_warn(mp,
    300	"alignment check failed: sunit/swidth vs. blocksize(%d)",
    301			mp->m_sb.sb_blocksize);
    302		return -EINVAL;
    303	} else {
    304		/*
    305		 * Convert the stripe unit and width to FSBs.
    306		 */
    307		mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
    308		if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
    309			xfs_warn(mp,
    310		"alignment check failed: sunit/swidth vs. agsize(%d)",
    311				 mp->m_sb.sb_agblocks);
    312			return -EINVAL;
    313		} else if (mp->m_dalign) {
    314			mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
    315		} else {
    316			xfs_warn(mp,
    317		"alignment check failed: sunit(%d) less than bsize(%d)",
    318				 mp->m_dalign, mp->m_sb.sb_blocksize);
    319			return -EINVAL;
    320		}
    321	}
    322
    323	if (!xfs_has_dalign(mp)) {
    324		xfs_warn(mp,
    325"cannot change alignment: superblock does not support data alignment");
    326		return -EINVAL;
    327	}
    328
    329	return 0;
    330}
    331
    332/* Update alignment values based on mount options and sb values. */
    333STATIC int
    334xfs_update_alignment(
    335	struct xfs_mount	*mp)
    336{
    337	struct xfs_sb		*sbp = &mp->m_sb;
    338
    339	if (mp->m_dalign) {
    340		bool		update_sb;
    341		int		error;
    342
    343		if (sbp->sb_unit == mp->m_dalign &&
    344		    sbp->sb_width == mp->m_swidth)
    345			return 0;
    346
    347		error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
    348		if (error || !update_sb)
    349			return error;
    350
    351		sbp->sb_unit = mp->m_dalign;
    352		sbp->sb_width = mp->m_swidth;
    353		mp->m_update_sb = true;
    354	} else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) {
    355		mp->m_dalign = sbp->sb_unit;
    356		mp->m_swidth = sbp->sb_width;
    357	}
    358
    359	return 0;
    360}
    361
    362/*
    363 * precalculate the low space thresholds for dynamic speculative preallocation.
    364 */
    365void
    366xfs_set_low_space_thresholds(
    367	struct xfs_mount	*mp)
    368{
    369	uint64_t		dblocks = mp->m_sb.sb_dblocks;
    370	uint64_t		rtexts = mp->m_sb.sb_rextents;
    371	int			i;
    372
    373	do_div(dblocks, 100);
    374	do_div(rtexts, 100);
    375
    376	for (i = 0; i < XFS_LOWSP_MAX; i++) {
    377		mp->m_low_space[i] = dblocks * (i + 1);
    378		mp->m_low_rtexts[i] = rtexts * (i + 1);
    379	}
    380}
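
        /*
         * For illustration: the loop above stores one threshold per
         * percentage point, so a filesystem with sb_dblocks = 1,000,000 gets
         * m_low_space[] entries of 10,000, 20,000, ... up to XFS_LOWSP_MAX
         * percent of the data blocks (likewise m_low_rtexts[] for realtime
         * extents).  These thresholds are later used to scale back
         * speculative preallocation as free space runs low.
         */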
    381
    382/*
    383 * Check that the data (and log if separate) is an ok size.
    384 */
    385STATIC int
    386xfs_check_sizes(
    387	struct xfs_mount *mp)
    388{
    389	struct xfs_buf	*bp;
    390	xfs_daddr_t	d;
    391	int		error;
    392
    393	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
    394	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
    395		xfs_warn(mp, "filesystem size mismatch detected");
    396		return -EFBIG;
    397	}
    398	error = xfs_buf_read_uncached(mp->m_ddev_targp,
    399					d - XFS_FSS_TO_BB(mp, 1),
    400					XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
    401	if (error) {
    402		xfs_warn(mp, "last sector read failed");
    403		return error;
    404	}
    405	xfs_buf_relse(bp);
    406
    407	if (mp->m_logdev_targp == mp->m_ddev_targp)
    408		return 0;
    409
    410	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
    411	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
    412		xfs_warn(mp, "log size mismatch detected");
    413		return -EFBIG;
    414	}
    415	error = xfs_buf_read_uncached(mp->m_logdev_targp,
    416					d - XFS_FSB_TO_BB(mp, 1),
    417					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
    418	if (error) {
    419		xfs_warn(mp, "log device read failed");
    420		return error;
    421	}
    422	xfs_buf_relse(bp);
    423	return 0;
    424}
    425
    426/*
    427 * Clear the quotaflags in memory and in the superblock.
    428 */
    429int
    430xfs_mount_reset_sbqflags(
    431	struct xfs_mount	*mp)
    432{
    433	mp->m_qflags = 0;
    434
    435	/* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
    436	if (mp->m_sb.sb_qflags == 0)
    437		return 0;
    438	spin_lock(&mp->m_sb_lock);
    439	mp->m_sb.sb_qflags = 0;
    440	spin_unlock(&mp->m_sb_lock);
    441
    442	if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
    443		return 0;
    444
    445	return xfs_sync_sb(mp, false);
    446}
    447
    448uint64_t
    449xfs_default_resblks(xfs_mount_t *mp)
    450{
    451	uint64_t resblks;
    452
    453	/*
    454	 * We default to 5% or 8192 fsbs of space reserved, whichever is
    455	 * smaller.  This is intended to cover concurrent allocation
    456	 * transactions when we initially hit enospc. These each require a 4
    457	 * block reservation. Hence by default we cover roughly 2000 concurrent
    458	 * allocation reservations.
    459	 */
    460	resblks = mp->m_sb.sb_dblocks;
    461	do_div(resblks, 20);
    462	resblks = min_t(uint64_t, resblks, 8192);
    463	return resblks;
    464}
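
        /*
         * For illustration: do_div(resblks, 20) is the "5%" from the comment
         * above.  A 100,000-block filesystem therefore reserves 5,000 blocks,
         * while anything larger than about 163,840 blocks hits the 8192-block
         * cap; at 4 blocks per allocation transaction that covers the ~2000
         * concurrent reservations mentioned above.
         */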
    465
    466/* Ensure the summary counts are correct. */
    467STATIC int
    468xfs_check_summary_counts(
    469	struct xfs_mount	*mp)
    470{
    471	int			error = 0;
    472
    473	/*
    474	 * The AG0 superblock verifier rejects in-progress filesystems,
    475	 * so we should never see the flag set this far into mounting.
    476	 */
    477	if (mp->m_sb.sb_inprogress) {
    478		xfs_err(mp, "sb_inprogress set after log recovery??");
    479		WARN_ON(1);
    480		return -EFSCORRUPTED;
    481	}
    482
    483	/*
    484	 * Now the log is mounted, we know if it was an unclean shutdown or
     485	 * not. If it was, then the first phase of recovery has completed and we
    486	 * have consistent AG blocks on disk. We have not recovered EFIs yet,
    487	 * but they are recovered transactionally in the second recovery phase
    488	 * later.
    489	 *
    490	 * If the log was clean when we mounted, we can check the summary
    491	 * counters.  If any of them are obviously incorrect, we can recompute
    492	 * them from the AGF headers in the next step.
    493	 */
    494	if (xfs_is_clean(mp) &&
    495	    (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
    496	     !xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
    497	     mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
    498		xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
    499
    500	/*
    501	 * We can safely re-initialise incore superblock counters from the
    502	 * per-ag data. These may not be correct if the filesystem was not
    503	 * cleanly unmounted, so we waited for recovery to finish before doing
    504	 * this.
    505	 *
    506	 * If the filesystem was cleanly unmounted or the previous check did
    507	 * not flag anything weird, then we can trust the values in the
    508	 * superblock to be correct and we don't need to do anything here.
    509	 * Otherwise, recalculate the summary counters.
    510	 */
    511	if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
    512	    xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
    513		error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
    514		if (error)
    515			return error;
    516	}
    517
    518	/*
    519	 * Older kernels misused sb_frextents to reflect both incore
    520	 * reservations made by running transactions and the actual count of
    521	 * free rt extents in the ondisk metadata.  Transactions committed
    522	 * during runtime can therefore contain a superblock update that
    523	 * undercounts the number of free rt extents tracked in the rt bitmap.
    524	 * A clean unmount record will have the correct frextents value since
    525	 * there can be no other transactions running at that point.
    526	 *
    527	 * If we're mounting the rt volume after recovering the log, recompute
    528	 * frextents from the rtbitmap file to fix the inconsistency.
    529	 */
    530	if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
    531		error = xfs_rtalloc_reinit_frextents(mp);
    532		if (error)
    533			return error;
    534	}
    535
    536	return 0;
    537}
    538
    539/*
    540 * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
    541 * internal inode structures can be sitting in the CIL and AIL at this point,
    542 * so we need to unpin them, write them back and/or reclaim them before unmount
    543 * can proceed.  In other words, callers are required to have inactivated all
    544 * inodes.
    545 *
    546 * An inode cluster that has been freed can have its buffer still pinned in
    547 * memory because the transaction is still sitting in a iclog. The stale inodes
    548 * on that buffer will be pinned to the buffer until the transaction hits the
    549 * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
    550 * may never see the pinned buffer, so nothing will push out the iclog and
    551 * unpin the buffer.
    552 *
    553 * Hence we need to force the log to unpin everything first. However, log
    554 * forces don't wait for the discards they issue to complete, so we have to
    555 * explicitly wait for them to complete here as well.
    556 *
    557 * Then we can tell the world we are unmounting so that error handling knows
    558 * that the filesystem is going away and we should error out anything that we
    559 * have been retrying in the background.  This will prevent never-ending
    560 * retries in AIL pushing from hanging the unmount.
    561 *
    562 * Finally, we can push the AIL to clean all the remaining dirty objects, then
    563 * reclaim the remaining inodes that are still in memory at this point in time.
    564 */
    565static void
    566xfs_unmount_flush_inodes(
    567	struct xfs_mount	*mp)
    568{
    569	xfs_log_force(mp, XFS_LOG_SYNC);
    570	xfs_extent_busy_wait_all(mp);
    571	flush_workqueue(xfs_discard_wq);
    572
    573	set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate);
    574
    575	xfs_ail_push_all_sync(mp->m_ail);
    576	xfs_inodegc_stop(mp);
    577	cancel_delayed_work_sync(&mp->m_reclaim_work);
    578	xfs_reclaim_inodes(mp);
    579	xfs_health_unmount(mp);
    580}
    581
    582static void
    583xfs_mount_setup_inode_geom(
    584	struct xfs_mount	*mp)
    585{
    586	struct xfs_ino_geometry *igeo = M_IGEO(mp);
    587
    588	igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp);
    589	ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp));
    590
    591	xfs_ialloc_setup_geometry(mp);
    592}
    593
    594/* Compute maximum possible height for per-AG btree types for this fs. */
    595static inline void
    596xfs_agbtree_compute_maxlevels(
    597	struct xfs_mount	*mp)
    598{
    599	unsigned int		levels;
    600
    601	levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
    602	levels = max(levels, mp->m_rmap_maxlevels);
    603	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
    604}
    605
    606/*
    607 * This function does the following on an initial mount of a file system:
    608 *	- reads the superblock from disk and init the mount struct
    609 *	- if we're a 32-bit kernel, do a size check on the superblock
    610 *		so we don't mount terabyte filesystems
    611 *	- init mount struct realtime fields
    612 *	- allocate inode hash table for fs
    613 *	- init directory manager
    614 *	- perform recovery and init the log manager
    615 */
    616int
    617xfs_mountfs(
    618	struct xfs_mount	*mp)
    619{
    620	struct xfs_sb		*sbp = &(mp->m_sb);
    621	struct xfs_inode	*rip;
    622	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
    623	uint64_t		resblks;
    624	uint			quotamount = 0;
    625	uint			quotaflags = 0;
    626	int			error = 0;
    627
    628	xfs_sb_mount_common(mp, sbp);
    629
    630	/*
     631	 * Check for mismatched features2 values.  Older kernels read & wrote
    632	 * into the wrong sb offset for sb_features2 on some platforms due to
    633	 * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
    634	 * which made older superblock reading/writing routines swap it as a
    635	 * 64-bit value.
    636	 *
    637	 * For backwards compatibility, we make both slots equal.
    638	 *
    639	 * If we detect a mismatched field, we OR the set bits into the existing
    640	 * features2 field in case it has already been modified; we don't want
    641	 * to lose any features.  We then update the bad location with the ORed
    642	 * value so that older kernels will see any features2 flags. The
    643	 * superblock writeback code ensures the new sb_features2 is copied to
    644	 * sb_bad_features2 before it is logged or written to disk.
    645	 */
    646	if (xfs_sb_has_mismatched_features2(sbp)) {
    647		xfs_warn(mp, "correcting sb_features alignment problem");
    648		sbp->sb_features2 |= sbp->sb_bad_features2;
    649		mp->m_update_sb = true;
    650	}
    651
    652
    653	/* always use v2 inodes by default now */
    654	if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
    655		mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
    656		mp->m_features |= XFS_FEAT_NLINK;
    657		mp->m_update_sb = true;
    658	}
    659
    660	/*
    661	 * If we were given new sunit/swidth options, do some basic validation
    662	 * checks and convert the incore dalign and swidth values to the
    663	 * same units (FSB) that everything else uses.  This /must/ happen
    664	 * before computing the inode geometry.
    665	 */
    666	error = xfs_validate_new_dalign(mp);
    667	if (error)
    668		goto out;
    669
    670	xfs_alloc_compute_maxlevels(mp);
    671	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
    672	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
    673	xfs_mount_setup_inode_geom(mp);
    674	xfs_rmapbt_compute_maxlevels(mp);
    675	xfs_refcountbt_compute_maxlevels(mp);
    676
    677	xfs_agbtree_compute_maxlevels(mp);
    678
    679	/*
    680	 * Check if sb_agblocks is aligned at stripe boundary.  If sb_agblocks
     681	 * is NOT aligned, turn off m_dalign since allocator alignment is within
    682	 * an ag, therefore ag has to be aligned at stripe boundary.  Note that
    683	 * we must compute the free space and rmap btree geometry before doing
    684	 * this.
    685	 */
    686	error = xfs_update_alignment(mp);
    687	if (error)
    688		goto out;
    689
    690	/* enable fail_at_unmount as default */
    691	mp->m_fail_unmount = true;
    692
    693	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
    694			       NULL, mp->m_super->s_id);
    695	if (error)
    696		goto out;
    697
    698	error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
    699			       &mp->m_kobj, "stats");
    700	if (error)
    701		goto out_remove_sysfs;
    702
    703	error = xfs_error_sysfs_init(mp);
    704	if (error)
    705		goto out_del_stats;
    706
    707	error = xfs_errortag_init(mp);
    708	if (error)
    709		goto out_remove_error_sysfs;
    710
    711	error = xfs_uuid_mount(mp);
    712	if (error)
    713		goto out_remove_errortag;
    714
    715	/*
    716	 * Update the preferred write size based on the information from the
    717	 * on-disk superblock.
    718	 */
    719	mp->m_allocsize_log =
    720		max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
    721	mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
    722
    723	/* set the low space thresholds for dynamic preallocation */
    724	xfs_set_low_space_thresholds(mp);
    725
    726	/*
    727	 * If enabled, sparse inode chunk alignment is expected to match the
    728	 * cluster size. Full inode chunk alignment must match the chunk size,
    729	 * but that is checked on sb read verification...
    730	 */
    731	if (xfs_has_sparseinodes(mp) &&
    732	    mp->m_sb.sb_spino_align !=
    733			XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
    734		xfs_warn(mp,
    735	"Sparse inode block alignment (%u) must match cluster size (%llu).",
    736			 mp->m_sb.sb_spino_align,
    737			 XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
    738		error = -EINVAL;
    739		goto out_remove_uuid;
    740	}
    741
    742	/*
    743	 * Check that the data (and log if separate) is an ok size.
    744	 */
    745	error = xfs_check_sizes(mp);
    746	if (error)
    747		goto out_remove_uuid;
    748
    749	/*
    750	 * Initialize realtime fields in the mount structure
    751	 */
    752	error = xfs_rtmount_init(mp);
    753	if (error) {
    754		xfs_warn(mp, "RT mount failed");
    755		goto out_remove_uuid;
    756	}
    757
    758	/*
    759	 *  Copies the low order bits of the timestamp and the randomly
    760	 *  set "sequence" number out of a UUID.
    761	 */
    762	mp->m_fixedfsid[0] =
    763		(get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) |
    764		 get_unaligned_be16(&sbp->sb_uuid.b[4]);
    765	mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]);
    766
    767	error = xfs_da_mount(mp);
    768	if (error) {
    769		xfs_warn(mp, "Failed dir/attr init: %d", error);
    770		goto out_remove_uuid;
    771	}
    772
    773	/*
    774	 * Initialize the precomputed transaction reservations values.
    775	 */
    776	xfs_trans_init(mp);
    777
    778	/*
    779	 * Allocate and initialize the per-ag data.
    780	 */
    781	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
    782	if (error) {
    783		xfs_warn(mp, "Failed per-ag init: %d", error);
    784		goto out_free_dir;
    785	}
    786
    787	if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
    788		xfs_warn(mp, "no log defined");
    789		error = -EFSCORRUPTED;
    790		goto out_free_perag;
    791	}
    792
    793	error = xfs_inodegc_register_shrinker(mp);
    794	if (error)
    795		goto out_fail_wait;
    796
    797	/*
    798	 * Log's mount-time initialization. The first part of recovery can place
    799	 * some items on the AIL, to be handled when recovery is finished or
    800	 * cancelled.
    801	 */
    802	error = xfs_log_mount(mp, mp->m_logdev_targp,
    803			      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
    804			      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
    805	if (error) {
    806		xfs_warn(mp, "log mount failed");
    807		goto out_inodegc_shrinker;
    808	}
    809
    810	/* Enable background inode inactivation workers. */
    811	xfs_inodegc_start(mp);
    812	xfs_blockgc_start(mp);
    813
    814	/*
    815	 * Now that we've recovered any pending superblock feature bit
    816	 * additions, we can finish setting up the attr2 behaviour for the
    817	 * mount. The noattr2 option overrides the superblock flag, so only
    818	 * check the superblock feature flag if the mount option is not set.
    819	 */
    820	if (xfs_has_noattr2(mp)) {
    821		mp->m_features &= ~XFS_FEAT_ATTR2;
    822	} else if (!xfs_has_attr2(mp) &&
    823		   (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
    824		mp->m_features |= XFS_FEAT_ATTR2;
    825	}
    826
    827	/*
    828	 * Get and sanity-check the root inode.
    829	 * Save the pointer to it in the mount structure.
    830	 */
    831	error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED,
    832			 XFS_ILOCK_EXCL, &rip);
    833	if (error) {
    834		xfs_warn(mp,
    835			"Failed to read root inode 0x%llx, error %d",
    836			sbp->sb_rootino, -error);
    837		goto out_log_dealloc;
    838	}
    839
    840	ASSERT(rip != NULL);
    841
    842	if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
    843		xfs_warn(mp, "corrupted root inode %llu: not a directory",
    844			(unsigned long long)rip->i_ino);
    845		xfs_iunlock(rip, XFS_ILOCK_EXCL);
    846		error = -EFSCORRUPTED;
    847		goto out_rele_rip;
    848	}
    849	mp->m_rootip = rip;	/* save it */
    850
    851	xfs_iunlock(rip, XFS_ILOCK_EXCL);
    852
    853	/*
    854	 * Initialize realtime inode pointers in the mount structure
    855	 */
    856	error = xfs_rtmount_inodes(mp);
    857	if (error) {
    858		/*
    859		 * Free up the root inode.
    860		 */
    861		xfs_warn(mp, "failed to read RT inodes");
    862		goto out_rele_rip;
    863	}
    864
    865	/* Make sure the summary counts are ok. */
    866	error = xfs_check_summary_counts(mp);
    867	if (error)
    868		goto out_rtunmount;
    869
    870	/*
    871	 * If this is a read-only mount defer the superblock updates until
    872	 * the next remount into writeable mode.  Otherwise we would never
    873	 * perform the update e.g. for the root filesystem.
    874	 */
    875	if (mp->m_update_sb && !xfs_is_readonly(mp)) {
    876		error = xfs_sync_sb(mp, false);
    877		if (error) {
    878			xfs_warn(mp, "failed to write sb changes");
    879			goto out_rtunmount;
    880		}
    881	}
    882
    883	/*
    884	 * Initialise the XFS quota management subsystem for this mount
    885	 */
    886	if (XFS_IS_QUOTA_ON(mp)) {
    887		error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
    888		if (error)
    889			goto out_rtunmount;
    890	} else {
    891		/*
    892		 * If a file system had quotas running earlier, but decided to
    893		 * mount without -o uquota/pquota/gquota options, revoke the
    894		 * quotachecked license.
    895		 */
    896		if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
    897			xfs_notice(mp, "resetting quota flags");
    898			error = xfs_mount_reset_sbqflags(mp);
    899			if (error)
    900				goto out_rtunmount;
    901		}
    902	}
    903
    904	/*
    905	 * Finish recovering the file system.  This part needed to be delayed
    906	 * until after the root and real-time bitmap inodes were consistently
    907	 * read in.  Temporarily create per-AG space reservations for metadata
    908	 * btree shape changes because space freeing transactions (for inode
    909	 * inactivation) require the per-AG reservation in lieu of reserving
    910	 * blocks.
    911	 */
    912	error = xfs_fs_reserve_ag_blocks(mp);
    913	if (error && error == -ENOSPC)
    914		xfs_warn(mp,
    915	"ENOSPC reserving per-AG metadata pool, log recovery may fail.");
    916	error = xfs_log_mount_finish(mp);
    917	xfs_fs_unreserve_ag_blocks(mp);
    918	if (error) {
    919		xfs_warn(mp, "log mount finish failed");
    920		goto out_rtunmount;
    921	}
    922
    923	/*
    924	 * Now the log is fully replayed, we can transition to full read-only
    925	 * mode for read-only mounts. This will sync all the metadata and clean
    926	 * the log so that the recovery we just performed does not have to be
    927	 * replayed again on the next mount.
    928	 *
    929	 * We use the same quiesce mechanism as the rw->ro remount, as they are
    930	 * semantically identical operations.
    931	 */
    932	if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
    933		xfs_log_clean(mp);
    934
    935	/*
    936	 * Complete the quota initialisation, post-log-replay component.
    937	 */
    938	if (quotamount) {
    939		ASSERT(mp->m_qflags == 0);
    940		mp->m_qflags = quotaflags;
    941
    942		xfs_qm_mount_quotas(mp);
    943	}
    944
    945	/*
    946	 * Now we are mounted, reserve a small amount of unused space for
    947	 * privileged transactions. This is needed so that transaction
    948	 * space required for critical operations can dip into this pool
    949	 * when at ENOSPC. This is needed for operations like create with
    950	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
    951	 * are not allowed to use this reserved space.
    952	 *
    953	 * This may drive us straight to ENOSPC on mount, but that implies
    954	 * we were already there on the last unmount. Warn if this occurs.
    955	 */
    956	if (!xfs_is_readonly(mp)) {
    957		resblks = xfs_default_resblks(mp);
    958		error = xfs_reserve_blocks(mp, &resblks, NULL);
    959		if (error)
    960			xfs_warn(mp,
    961	"Unable to allocate reserve blocks. Continuing without reserve pool.");
    962
    963		/* Reserve AG blocks for future btree expansion. */
    964		error = xfs_fs_reserve_ag_blocks(mp);
    965		if (error && error != -ENOSPC)
    966			goto out_agresv;
    967	}
    968
    969	return 0;
    970
    971 out_agresv:
    972	xfs_fs_unreserve_ag_blocks(mp);
    973	xfs_qm_unmount_quotas(mp);
    974 out_rtunmount:
    975	xfs_rtunmount_inodes(mp);
    976 out_rele_rip:
    977	xfs_irele(rip);
    978	/* Clean out dquots that might be in memory after quotacheck. */
    979	xfs_qm_unmount(mp);
    980
    981	/*
    982	 * Inactivate all inodes that might still be in memory after a log
    983	 * intent recovery failure so that reclaim can free them.  Metadata
    984	 * inodes and the root directory shouldn't need inactivation, but the
    985	 * mount failed for some reason, so pull down all the state and flee.
    986	 */
    987	xfs_inodegc_flush(mp);
    988
    989	/*
    990	 * Flush all inode reclamation work and flush the log.
    991	 * We have to do this /after/ rtunmount and qm_unmount because those
    992	 * two will have scheduled delayed reclaim for the rt/quota inodes.
    993	 *
    994	 * This is slightly different from the unmountfs call sequence
    995	 * because we could be tearing down a partially set up mount.  In
    996	 * particular, if log_mount_finish fails we bail out without calling
    997	 * qm_unmount_quotas and therefore rely on qm_unmount to release the
    998	 * quota inodes.
    999	 */
   1000	xfs_unmount_flush_inodes(mp);
   1001 out_log_dealloc:
   1002	xfs_log_mount_cancel(mp);
   1003 out_inodegc_shrinker:
   1004	unregister_shrinker(&mp->m_inodegc_shrinker);
   1005 out_fail_wait:
   1006	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
   1007		xfs_buftarg_drain(mp->m_logdev_targp);
   1008	xfs_buftarg_drain(mp->m_ddev_targp);
   1009 out_free_perag:
   1010	xfs_free_perag(mp);
   1011 out_free_dir:
   1012	xfs_da_unmount(mp);
   1013 out_remove_uuid:
   1014	xfs_uuid_unmount(mp);
   1015 out_remove_errortag:
   1016	xfs_errortag_del(mp);
   1017 out_remove_error_sysfs:
   1018	xfs_error_sysfs_del(mp);
   1019 out_del_stats:
   1020	xfs_sysfs_del(&mp->m_stats.xs_kobj);
   1021 out_remove_sysfs:
   1022	xfs_sysfs_del(&mp->m_kobj);
   1023 out:
   1024	return error;
   1025}
   1026
   1027/*
    1028 * This flushes out the inodes, dquots and the superblock, unmounts the
   1029 * log and makes sure that incore structures are freed.
   1030 */
   1031void
   1032xfs_unmountfs(
   1033	struct xfs_mount	*mp)
   1034{
   1035	uint64_t		resblks;
   1036	int			error;
   1037
   1038	/*
   1039	 * Perform all on-disk metadata updates required to inactivate inodes
   1040	 * that the VFS evicted earlier in the unmount process.  Freeing inodes
   1041	 * and discarding CoW fork preallocations can cause shape changes to
   1042	 * the free inode and refcount btrees, respectively, so we must finish
   1043	 * this before we discard the metadata space reservations.  Metadata
   1044	 * inodes and the root directory do not require inactivation.
   1045	 */
   1046	xfs_inodegc_flush(mp);
   1047
   1048	xfs_blockgc_stop(mp);
   1049	xfs_fs_unreserve_ag_blocks(mp);
   1050	xfs_qm_unmount_quotas(mp);
   1051	xfs_rtunmount_inodes(mp);
   1052	xfs_irele(mp->m_rootip);
   1053
   1054	xfs_unmount_flush_inodes(mp);
   1055
   1056	xfs_qm_unmount(mp);
   1057
   1058	/*
   1059	 * Unreserve any blocks we have so that when we unmount we don't account
   1060	 * the reserved free space as used. This is really only necessary for
   1061	 * lazy superblock counting because it trusts the incore superblock
   1062	 * counters to be absolutely correct on clean unmount.
   1063	 *
   1064	 * We don't bother correcting this elsewhere for lazy superblock
   1065	 * counting because on mount of an unclean filesystem we reconstruct the
   1066	 * correct counter value and this is irrelevant.
   1067	 *
   1068	 * For non-lazy counter filesystems, this doesn't matter at all because
    1069	 * we only ever apply deltas to the superblock and hence the incore
   1070	 * value does not matter....
   1071	 */
   1072	resblks = 0;
   1073	error = xfs_reserve_blocks(mp, &resblks, NULL);
   1074	if (error)
   1075		xfs_warn(mp, "Unable to free reserved block pool. "
   1076				"Freespace may not be correct on next mount.");
   1077
   1078	xfs_log_unmount(mp);
   1079	xfs_da_unmount(mp);
   1080	xfs_uuid_unmount(mp);
   1081
   1082#if defined(DEBUG)
   1083	xfs_errortag_clearall(mp);
   1084#endif
   1085	unregister_shrinker(&mp->m_inodegc_shrinker);
   1086	xfs_free_perag(mp);
   1087
   1088	xfs_errortag_del(mp);
   1089	xfs_error_sysfs_del(mp);
   1090	xfs_sysfs_del(&mp->m_stats.xs_kobj);
   1091	xfs_sysfs_del(&mp->m_kobj);
   1092}
   1093
   1094/*
   1095 * Determine whether modifications can proceed. The caller specifies the minimum
   1096 * freeze level for which modifications should not be allowed. This allows
   1097 * certain operations to proceed while the freeze sequence is in progress, if
   1098 * necessary.
   1099 */
   1100bool
   1101xfs_fs_writable(
   1102	struct xfs_mount	*mp,
   1103	int			level)
   1104{
   1105	ASSERT(level > SB_UNFROZEN);
   1106	if ((mp->m_super->s_writers.frozen >= level) ||
   1107	    xfs_is_shutdown(mp) || xfs_is_readonly(mp))
   1108		return false;
   1109
   1110	return true;
   1111}
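
        /*
         * For illustration: callers pass the shallowest freeze level at which
         * their modification must be blocked.  xfs_mount_reset_sbqflags()
         * above passes SB_FREEZE_WRITE and so backs off as soon as user
         * writes are frozen, whereas a caller passing SB_FREEZE_FS (the
         * deepest VFS freeze level) would still be allowed to run until the
         * final stage of the freeze sequence.
         */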
   1112
   1113/* Adjust m_fdblocks or m_frextents. */
   1114int
   1115xfs_mod_freecounter(
   1116	struct xfs_mount	*mp,
   1117	struct percpu_counter	*counter,
   1118	int64_t			delta,
   1119	bool			rsvd)
   1120{
   1121	int64_t			lcounter;
   1122	long long		res_used;
   1123	uint64_t		set_aside = 0;
   1124	s32			batch;
   1125	bool			has_resv_pool;
   1126
   1127	ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
   1128	has_resv_pool = (counter == &mp->m_fdblocks);
   1129	if (rsvd)
   1130		ASSERT(has_resv_pool);
   1131
   1132	if (delta > 0) {
   1133		/*
   1134		 * If the reserve pool is depleted, put blocks back into it
   1135		 * first. Most of the time the pool is full.
   1136		 */
   1137		if (likely(!has_resv_pool ||
   1138			   mp->m_resblks == mp->m_resblks_avail)) {
   1139			percpu_counter_add(counter, delta);
   1140			return 0;
   1141		}
   1142
   1143		spin_lock(&mp->m_sb_lock);
   1144		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
   1145
   1146		if (res_used > delta) {
   1147			mp->m_resblks_avail += delta;
   1148		} else {
   1149			delta -= res_used;
   1150			mp->m_resblks_avail = mp->m_resblks;
   1151			percpu_counter_add(counter, delta);
   1152		}
   1153		spin_unlock(&mp->m_sb_lock);
   1154		return 0;
   1155	}
   1156
   1157	/*
   1158	 * Taking blocks away, need to be more accurate the closer we
   1159	 * are to zero.
   1160	 *
   1161	 * If the counter has a value of less than 2 * max batch size,
   1162	 * then make everything serialise as we are real close to
   1163	 * ENOSPC.
   1164	 */
   1165	if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
   1166				     XFS_FDBLOCKS_BATCH) < 0)
   1167		batch = 1;
   1168	else
   1169		batch = XFS_FDBLOCKS_BATCH;
   1170
   1171	/*
   1172	 * Set aside allocbt blocks because these blocks are tracked as free
   1173	 * space but not available for allocation. Technically this means that a
   1174	 * single reservation cannot consume all remaining free space, but the
   1175	 * ratio of allocbt blocks to usable free blocks should be rather small.
   1176	 * The tradeoff without this is that filesystems that maintain high
   1177	 * perag block reservations can over reserve physical block availability
   1178	 * and fail physical allocation, which leads to much more serious
   1179	 * problems (i.e. transaction abort, pagecache discards, etc.) than
   1180	 * slightly premature -ENOSPC.
   1181	 */
   1182	if (has_resv_pool)
   1183		set_aside = xfs_fdblocks_unavailable(mp);
   1184	percpu_counter_add_batch(counter, delta, batch);
   1185	if (__percpu_counter_compare(counter, set_aside,
   1186				     XFS_FDBLOCKS_BATCH) >= 0) {
   1187		/* we had space! */
   1188		return 0;
   1189	}
   1190
   1191	/*
   1192	 * lock up the sb for dipping into reserves before releasing the space
   1193	 * that took us to ENOSPC.
   1194	 */
   1195	spin_lock(&mp->m_sb_lock);
   1196	percpu_counter_add(counter, -delta);
   1197	if (!has_resv_pool || !rsvd)
   1198		goto fdblocks_enospc;
   1199
   1200	lcounter = (long long)mp->m_resblks_avail + delta;
   1201	if (lcounter >= 0) {
   1202		mp->m_resblks_avail = lcounter;
   1203		spin_unlock(&mp->m_sb_lock);
   1204		return 0;
   1205	}
   1206	xfs_warn_once(mp,
   1207"Reserve blocks depleted! Consider increasing reserve pool size.");
   1208
   1209fdblocks_enospc:
   1210	spin_unlock(&mp->m_sb_lock);
   1211	return -ENOSPC;
   1212}
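
        /*
         * For illustration of the reserve pool handling above: if m_resblks
         * is 8192 and m_resblks_avail is 8000 when 100 blocks are freed
         * (delta = +100), res_used is 192 > delta, so all 100 blocks refill
         * the pool and the percpu counter is left alone.  On the allocation
         * side (delta < 0), a subtraction that would drag the counter below
         * xfs_fdblocks_unavailable() is backed out again, and only callers
         * with rsvd set may then dip into m_resblks_avail instead of getting
         * -ENOSPC.
         */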
   1213
   1214/*
   1215 * Used to free the superblock along various error paths.
   1216 */
   1217void
   1218xfs_freesb(
   1219	struct xfs_mount	*mp)
   1220{
   1221	struct xfs_buf		*bp = mp->m_sb_bp;
   1222
   1223	xfs_buf_lock(bp);
   1224	mp->m_sb_bp = NULL;
   1225	xfs_buf_relse(bp);
   1226}
   1227
   1228/*
   1229 * If the underlying (data/log/rt) device is readonly, there are some
   1230 * operations that cannot proceed.
   1231 */
   1232int
   1233xfs_dev_is_read_only(
   1234	struct xfs_mount	*mp,
   1235	char			*message)
   1236{
   1237	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
   1238	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
   1239	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
   1240		xfs_notice(mp, "%s required on read-only device.", message);
   1241		xfs_notice(mp, "write access unavailable, cannot proceed.");
   1242		return -EROFS;
   1243	}
   1244	return 0;
   1245}
   1246
   1247/* Force the summary counters to be recalculated at next mount. */
   1248void
   1249xfs_force_summary_recalc(
   1250	struct xfs_mount	*mp)
   1251{
   1252	if (!xfs_has_lazysbcount(mp))
   1253		return;
   1254
   1255	xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
   1256}
   1257
   1258/*
   1259 * Enable a log incompat feature flag in the primary superblock.  The caller
   1260 * cannot have any other transactions in progress.
   1261 */
   1262int
   1263xfs_add_incompat_log_feature(
   1264	struct xfs_mount	*mp,
   1265	uint32_t		feature)
   1266{
   1267	struct xfs_dsb		*dsb;
   1268	int			error;
   1269
   1270	ASSERT(hweight32(feature) == 1);
   1271	ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
   1272
   1273	/*
   1274	 * Force the log to disk and kick the background AIL thread to reduce
   1275	 * the chances that the bwrite will stall waiting for the AIL to unpin
   1276	 * the primary superblock buffer.  This isn't a data integrity
   1277	 * operation, so we don't need a synchronous push.
   1278	 */
   1279	error = xfs_log_force(mp, XFS_LOG_SYNC);
   1280	if (error)
   1281		return error;
   1282	xfs_ail_push_all(mp->m_ail);
   1283
   1284	/*
   1285	 * Lock the primary superblock buffer to serialize all callers that
   1286	 * are trying to set feature bits.
   1287	 */
   1288	xfs_buf_lock(mp->m_sb_bp);
   1289	xfs_buf_hold(mp->m_sb_bp);
   1290
   1291	if (xfs_is_shutdown(mp)) {
   1292		error = -EIO;
   1293		goto rele;
   1294	}
   1295
   1296	if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature))
   1297		goto rele;
   1298
   1299	/*
   1300	 * Write the primary superblock to disk immediately, because we need
   1301	 * the log_incompat bit to be set in the primary super now to protect
   1302	 * the log items that we're going to commit later.
   1303	 */
   1304	dsb = mp->m_sb_bp->b_addr;
   1305	xfs_sb_to_disk(dsb, &mp->m_sb);
   1306	dsb->sb_features_log_incompat |= cpu_to_be32(feature);
   1307	error = xfs_bwrite(mp->m_sb_bp);
   1308	if (error)
   1309		goto shutdown;
   1310
   1311	/*
   1312	 * Add the feature bits to the incore superblock before we unlock the
   1313	 * buffer.
   1314	 */
   1315	xfs_sb_add_incompat_log_features(&mp->m_sb, feature);
   1316	xfs_buf_relse(mp->m_sb_bp);
   1317
   1318	/* Log the superblock to disk. */
   1319	return xfs_sync_sb(mp, false);
   1320shutdown:
   1321	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
   1322rele:
   1323	xfs_buf_relse(mp->m_sb_bp);
   1324	return error;
   1325}
   1326
   1327/*
   1328 * Clear all the log incompat flags from the superblock.
   1329 *
   1330 * The caller cannot be in a transaction, must ensure that the log does not
   1331 * contain any log items protected by any log incompat bit, and must ensure
   1332 * that there are no other threads that depend on the state of the log incompat
   1333 * feature flags in the primary super.
   1334 *
   1335 * Returns true if the superblock is dirty.
   1336 */
   1337bool
   1338xfs_clear_incompat_log_features(
   1339	struct xfs_mount	*mp)
   1340{
   1341	bool			ret = false;
   1342
   1343	if (!xfs_has_crc(mp) ||
   1344	    !xfs_sb_has_incompat_log_feature(&mp->m_sb,
   1345				XFS_SB_FEAT_INCOMPAT_LOG_ALL) ||
   1346	    xfs_is_shutdown(mp))
   1347		return false;
   1348
   1349	/*
   1350	 * Update the incore superblock.  We synchronize on the primary super
   1351	 * buffer lock to be consistent with the add function, though at least
   1352	 * in theory this shouldn't be necessary.
   1353	 */
   1354	xfs_buf_lock(mp->m_sb_bp);
   1355	xfs_buf_hold(mp->m_sb_bp);
   1356
   1357	if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
   1358				XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
   1359		xfs_sb_remove_incompat_log_features(&mp->m_sb);
   1360		ret = true;
   1361	}
   1362
   1363	xfs_buf_relse(mp->m_sb_bp);
   1364	return ret;
   1365}
   1366
   1367/*
   1368 * Update the in-core delayed block counter.
   1369 *
   1370 * We prefer to update the counter without having to take a spinlock for every
   1371 * counter update (i.e. batching).  Each change to delayed allocation
    1372 * reservations can easily exceed the default percpu counter
   1373 * batching, so we use a larger batch factor here.
   1374 *
   1375 * Note that we don't currently have any callers requiring fast summation
   1376 * (e.g. percpu_counter_read) so we can use a big batch value here.
   1377 */
   1378#define XFS_DELALLOC_BATCH	(4096)
   1379void
   1380xfs_mod_delalloc(
   1381	struct xfs_mount	*mp,
   1382	int64_t			delta)
   1383{
   1384	percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
   1385			XFS_DELALLOC_BATCH);
   1386}
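
        /*
         * For illustration: with XFS_DELALLOC_BATCH set to 4096,
         * percpu_counter_add_batch() lets each CPU accumulate up to 4096
         * blocks worth of delalloc changes locally before folding them into
         * the shared m_delalloc_blks count, trading a little summation
         * accuracy (which, per the comment above, no caller needs) for far
         * fewer contended updates on this hot path.
         */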