cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

super.c (76669B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2007 Oracle.  All rights reserved.
      4 */
      5
      6#include <linux/blkdev.h>
      7#include <linux/module.h>
      8#include <linux/fs.h>
      9#include <linux/pagemap.h>
     10#include <linux/highmem.h>
     11#include <linux/time.h>
     12#include <linux/init.h>
     13#include <linux/seq_file.h>
     14#include <linux/string.h>
     15#include <linux/backing-dev.h>
     16#include <linux/mount.h>
     17#include <linux/writeback.h>
     18#include <linux/statfs.h>
     19#include <linux/compat.h>
     20#include <linux/parser.h>
     21#include <linux/ctype.h>
     22#include <linux/namei.h>
     23#include <linux/miscdevice.h>
     24#include <linux/magic.h>
     25#include <linux/slab.h>
     26#include <linux/ratelimit.h>
     27#include <linux/crc32c.h>
     28#include <linux/btrfs.h>
     29#include "delayed-inode.h"
     30#include "ctree.h"
     31#include "disk-io.h"
     32#include "transaction.h"
     33#include "btrfs_inode.h"
     34#include "print-tree.h"
     35#include "props.h"
     36#include "xattr.h"
     37#include "volumes.h"
     38#include "export.h"
     39#include "compression.h"
     40#include "rcu-string.h"
     41#include "dev-replace.h"
     42#include "free-space-cache.h"
     43#include "backref.h"
     44#include "space-info.h"
     45#include "sysfs.h"
     46#include "zoned.h"
     47#include "tests/btrfs-tests.h"
     48#include "block-group.h"
     49#include "discard.h"
     50#include "qgroup.h"
     51#define CREATE_TRACE_POINTS
     52#include <trace/events/btrfs.h>
     53
     54static const struct super_operations btrfs_super_ops;
     55
     56/*
     57 * Types for mounting the default subvolume and a subvolume explicitly
     58 * requested by subvol=/path. That way the callchain is straightforward and we
     59 * don't have to play tricks with the mount options and recursive calls to
     60 * btrfs_mount.
     61 *
     62 * The new btrfs_root_fs_type also serves as a tag for the bdev_holder.
     63 */
     64static struct file_system_type btrfs_fs_type;
     65static struct file_system_type btrfs_root_fs_type;
     66
     67static int btrfs_remount(struct super_block *sb, int *flags, char *data);
     68
     69#ifdef CONFIG_PRINTK
     70
     71#define STATE_STRING_PREFACE	": state "
     72#define STATE_STRING_BUF_LEN	(sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
     73
     74/*
     75 * Characters to print to indicate error conditions or uncommon filesystem sate.
     76 * RO is not an error.
     77 */
     78static const char fs_state_chars[] = {
     79	[BTRFS_FS_STATE_ERROR]			= 'E',
     80	[BTRFS_FS_STATE_REMOUNTING]		= 'M',
     81	[BTRFS_FS_STATE_RO]			= 0,
     82	[BTRFS_FS_STATE_TRANS_ABORTED]		= 'A',
     83	[BTRFS_FS_STATE_DEV_REPLACING]		= 'R',
     84	[BTRFS_FS_STATE_DUMMY_FS_INFO]		= 0,
     85	[BTRFS_FS_STATE_NO_CSUMS]		= 'C',
     86	[BTRFS_FS_STATE_LOG_CLEANUP_ERROR]	= 'L',
     87};
     88
     89static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
     90{
     91	unsigned int bit;
     92	bool states_printed = false;
     93	unsigned long fs_state = READ_ONCE(info->fs_state);
     94	char *curr = buf;
     95
     96	memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
     97	curr += sizeof(STATE_STRING_PREFACE) - 1;
     98
     99	for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
    100		WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
    101		if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
    102			*curr++ = fs_state_chars[bit];
    103			states_printed = true;
    104		}
    105	}
    106
    107	/* If no states were printed, reset the buffer */
    108	if (!states_printed)
    109		curr = buf;
    110
    111	*curr++ = 0;
    112}
    113#endif
    114
    115/*
    116 * Generally the error codes correspond to their respective errors, but there
    117 * are a few special cases.
    118 *
    119 * EUCLEAN: Any sort of corruption that we encounter.  The tree-checker for
    120 *          instance will return EUCLEAN if any of the blocks are corrupted in
    121 *          a way that is problematic.  We want to reserve EUCLEAN for these
    122 *          sort of corruptions.
    123 *
    124 * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
    125 *        need to use EROFS for this case.  We will have no idea of the
    126 *        original failure, that will have been reported at the time we tripped
    127 *        over the error.  Each subsequent error that doesn't have any context
    128 *        of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
    129 */
    130const char * __attribute_const__ btrfs_decode_error(int errno)
    131{
    132	char *errstr = "unknown";
    133
    134	switch (errno) {
    135	case -ENOENT:		/* -2 */
    136		errstr = "No such entry";
    137		break;
    138	case -EIO:		/* -5 */
    139		errstr = "IO failure";
    140		break;
    141	case -ENOMEM:		/* -12*/
    142		errstr = "Out of memory";
    143		break;
    144	case -EEXIST:		/* -17 */
    145		errstr = "Object already exists";
    146		break;
    147	case -ENOSPC:		/* -28 */
    148		errstr = "No space left";
    149		break;
    150	case -EROFS:		/* -30 */
    151		errstr = "Readonly filesystem";
    152		break;
    153	case -EOPNOTSUPP:	/* -95 */
    154		errstr = "Operation not supported";
    155		break;
    156	case -EUCLEAN:		/* -117 */
    157		errstr = "Filesystem corrupted";
    158		break;
    159	case -EDQUOT:		/* -122 */
    160		errstr = "Quota exceeded";
    161		break;
    162	}
    163
    164	return errstr;
    165}
    166
    167/*
    168 * __btrfs_handle_fs_error decodes expected errors from the caller and
    169 * invokes the appropriate error response.
    170 */
    171__cold
    172void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
    173		       unsigned int line, int errno, const char *fmt, ...)
    174{
    175	struct super_block *sb = fs_info->sb;
    176#ifdef CONFIG_PRINTK
    177	char statestr[STATE_STRING_BUF_LEN];
    178	const char *errstr;
    179#endif
    180
    181	/*
    182	 * Special case: if the error is EROFS, and we're already
    183	 * under SB_RDONLY, then it is safe here.
    184	 */
    185	if (errno == -EROFS && sb_rdonly(sb))
    186  		return;
    187
    188#ifdef CONFIG_PRINTK
    189	errstr = btrfs_decode_error(errno);
    190	btrfs_state_to_string(fs_info, statestr);
    191	if (fmt) {
    192		struct va_format vaf;
    193		va_list args;
    194
    195		va_start(args, fmt);
    196		vaf.fmt = fmt;
    197		vaf.va = &args;
    198
    199		pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
    200			sb->s_id, statestr, function, line, errno, errstr, &vaf);
    201		va_end(args);
    202	} else {
    203		pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
    204			sb->s_id, statestr, function, line, errno, errstr);
    205	}
    206#endif
    207
    208	/*
    209	 * Today we only save the error info to memory.  Long term we'll
    210	 * also send it down to the disk
    211	 */
    212	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
    213
    214	/* Don't go through full error handling during mount */
    215	if (!(sb->s_flags & SB_BORN))
    216		return;
    217
    218	if (sb_rdonly(sb))
    219		return;
    220
    221	btrfs_discard_stop(fs_info);
    222
    223	/* btrfs handle error by forcing the filesystem readonly */
    224	btrfs_set_sb_rdonly(sb);
    225	btrfs_info(fs_info, "forced readonly");
    226	/*
    227	 * Note that a running device replace operation is not canceled here
    228	 * although there is no way to update the progress. It would add the
    229	 * risk of a deadlock, therefore the canceling is omitted. The only
    230	 * penalty is that some I/O remains active until the procedure
    231	 * completes. The next time when the filesystem is mounted writable
    232	 * again, the device replace operation continues.
    233	 */
    234}
    235
    236#ifdef CONFIG_PRINTK
    237static const char * const logtypes[] = {
    238	"emergency",
    239	"alert",
    240	"critical",
    241	"error",
    242	"warning",
    243	"notice",
    244	"info",
    245	"debug",
    246};
    247
    248
    249/*
    250 * Use one ratelimit state per log level so that a flood of less important
    251 * messages doesn't cause more important ones to be dropped.
    252 */
    253static struct ratelimit_state printk_limits[] = {
    254	RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
    255	RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
    256	RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
    257	RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
    258	RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
    259	RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
    260	RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
    261	RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
    262};
    263
    264void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
    265{
    266	char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
    267	struct va_format vaf;
    268	va_list args;
    269	int kern_level;
    270	const char *type = logtypes[4];
    271	struct ratelimit_state *ratelimit = &printk_limits[4];
    272
    273	va_start(args, fmt);
    274
    275	while ((kern_level = printk_get_level(fmt)) != 0) {
    276		size_t size = printk_skip_level(fmt) - fmt;
    277
    278		if (kern_level >= '0' && kern_level <= '7') {
    279			memcpy(lvl, fmt,  size);
    280			lvl[size] = '\0';
    281			type = logtypes[kern_level - '0'];
    282			ratelimit = &printk_limits[kern_level - '0'];
    283		}
    284		fmt += size;
    285	}
    286
    287	vaf.fmt = fmt;
    288	vaf.va = &args;
    289
    290	if (__ratelimit(ratelimit)) {
    291		if (fs_info) {
    292			char statestr[STATE_STRING_BUF_LEN];
    293
    294			btrfs_state_to_string(fs_info, statestr);
    295			_printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
    296				fs_info->sb->s_id, statestr, &vaf);
    297		} else {
    298			_printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
    299		}
    300	}
    301
    302	va_end(args);
    303}
    304#endif
    305
    306#if BITS_PER_LONG == 32
    307void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
    308{
    309	if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
    310		btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
    311		btrfs_warn(fs_info,
    312"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
    313			   BTRFS_32BIT_MAX_FILE_SIZE >> 40);
    314		btrfs_warn(fs_info,
    315			   "please consider upgrading to 64bit kernel/hardware");
    316	}
    317}
    318
    319void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
    320{
    321	if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
    322		btrfs_err(fs_info, "reached 32bit limit for logical addresses");
    323		btrfs_err(fs_info,
    324"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
    325			  BTRFS_32BIT_MAX_FILE_SIZE >> 40);
    326		btrfs_err(fs_info,
    327			   "please consider upgrading to 64bit kernel/hardware");
    328	}
    329}
    330#endif
    331
    332/*
    333 * We only mark the transaction aborted and then set the file system read-only.
    334 * This will prevent new transactions from starting or trying to join this
    335 * one.
    336 *
    337 * This means that error recovery at the call site is limited to freeing
    338 * any local memory allocations and passing the error code up without
    339 * further cleanup. The transaction should complete as it normally would
    340 * in the call path but will return -EIO.
    341 *
    342 * We'll complete the cleanup in btrfs_end_transaction and
    343 * btrfs_commit_transaction.
    344 */
    345__cold
    346void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
    347			       const char *function,
    348			       unsigned int line, int errno)
    349{
    350	struct btrfs_fs_info *fs_info = trans->fs_info;
    351
    352	WRITE_ONCE(trans->aborted, errno);
    353	WRITE_ONCE(trans->transaction->aborted, errno);
    354	/* Wake up anybody who may be waiting on this transaction */
    355	wake_up(&fs_info->transaction_wait);
    356	wake_up(&fs_info->transaction_blocked_wait);
    357	__btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
    358}
    359/*
    360 * __btrfs_panic decodes unexpected, fatal errors from the caller,
    361 * issues an alert, and either panics or BUGs, depending on mount options.
    362 */
    363__cold
    364void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
    365		   unsigned int line, int errno, const char *fmt, ...)
    366{
    367	char *s_id = "<unknown>";
    368	const char *errstr;
    369	struct va_format vaf = { .fmt = fmt };
    370	va_list args;
    371
    372	if (fs_info)
    373		s_id = fs_info->sb->s_id;
    374
    375	va_start(args, fmt);
    376	vaf.va = &args;
    377
    378	errstr = btrfs_decode_error(errno);
    379	if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
    380		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
    381			s_id, function, line, &vaf, errno, errstr);
    382
    383	btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
    384		   function, line, &vaf, errno, errstr);
    385	va_end(args);
    386	/* Caller calls BUG() */
    387}
    388
    389static void btrfs_put_super(struct super_block *sb)
    390{
    391	close_ctree(btrfs_sb(sb));
    392}
    393
    394enum {
    395	Opt_acl, Opt_noacl,
    396	Opt_clear_cache,
    397	Opt_commit_interval,
    398	Opt_compress,
    399	Opt_compress_force,
    400	Opt_compress_force_type,
    401	Opt_compress_type,
    402	Opt_degraded,
    403	Opt_device,
    404	Opt_fatal_errors,
    405	Opt_flushoncommit, Opt_noflushoncommit,
    406	Opt_max_inline,
    407	Opt_barrier, Opt_nobarrier,
    408	Opt_datacow, Opt_nodatacow,
    409	Opt_datasum, Opt_nodatasum,
    410	Opt_defrag, Opt_nodefrag,
    411	Opt_discard, Opt_nodiscard,
    412	Opt_discard_mode,
    413	Opt_norecovery,
    414	Opt_ratio,
    415	Opt_rescan_uuid_tree,
    416	Opt_skip_balance,
    417	Opt_space_cache, Opt_no_space_cache,
    418	Opt_space_cache_version,
    419	Opt_ssd, Opt_nossd,
    420	Opt_ssd_spread, Opt_nossd_spread,
    421	Opt_subvol,
    422	Opt_subvol_empty,
    423	Opt_subvolid,
    424	Opt_thread_pool,
    425	Opt_treelog, Opt_notreelog,
    426	Opt_user_subvol_rm_allowed,
    427
    428	/* Rescue options */
    429	Opt_rescue,
    430	Opt_usebackuproot,
    431	Opt_nologreplay,
    432	Opt_ignorebadroots,
    433	Opt_ignoredatacsums,
    434	Opt_rescue_all,
    435
    436	/* Deprecated options */
    437	Opt_recovery,
    438	Opt_inode_cache, Opt_noinode_cache,
    439
    440	/* Debugging options */
    441	Opt_check_integrity,
    442	Opt_check_integrity_including_extent_data,
    443	Opt_check_integrity_print_mask,
    444	Opt_enospc_debug, Opt_noenospc_debug,
    445#ifdef CONFIG_BTRFS_DEBUG
    446	Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
    447#endif
    448#ifdef CONFIG_BTRFS_FS_REF_VERIFY
    449	Opt_ref_verify,
    450#endif
    451	Opt_err,
    452};
    453
    454static const match_table_t tokens = {
    455	{Opt_acl, "acl"},
    456	{Opt_noacl, "noacl"},
    457	{Opt_clear_cache, "clear_cache"},
    458	{Opt_commit_interval, "commit=%u"},
    459	{Opt_compress, "compress"},
    460	{Opt_compress_type, "compress=%s"},
    461	{Opt_compress_force, "compress-force"},
    462	{Opt_compress_force_type, "compress-force=%s"},
    463	{Opt_degraded, "degraded"},
    464	{Opt_device, "device=%s"},
    465	{Opt_fatal_errors, "fatal_errors=%s"},
    466	{Opt_flushoncommit, "flushoncommit"},
    467	{Opt_noflushoncommit, "noflushoncommit"},
    468	{Opt_inode_cache, "inode_cache"},
    469	{Opt_noinode_cache, "noinode_cache"},
    470	{Opt_max_inline, "max_inline=%s"},
    471	{Opt_barrier, "barrier"},
    472	{Opt_nobarrier, "nobarrier"},
    473	{Opt_datacow, "datacow"},
    474	{Opt_nodatacow, "nodatacow"},
    475	{Opt_datasum, "datasum"},
    476	{Opt_nodatasum, "nodatasum"},
    477	{Opt_defrag, "autodefrag"},
    478	{Opt_nodefrag, "noautodefrag"},
    479	{Opt_discard, "discard"},
    480	{Opt_discard_mode, "discard=%s"},
    481	{Opt_nodiscard, "nodiscard"},
    482	{Opt_norecovery, "norecovery"},
    483	{Opt_ratio, "metadata_ratio=%u"},
    484	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
    485	{Opt_skip_balance, "skip_balance"},
    486	{Opt_space_cache, "space_cache"},
    487	{Opt_no_space_cache, "nospace_cache"},
    488	{Opt_space_cache_version, "space_cache=%s"},
    489	{Opt_ssd, "ssd"},
    490	{Opt_nossd, "nossd"},
    491	{Opt_ssd_spread, "ssd_spread"},
    492	{Opt_nossd_spread, "nossd_spread"},
    493	{Opt_subvol, "subvol=%s"},
    494	{Opt_subvol_empty, "subvol="},
    495	{Opt_subvolid, "subvolid=%s"},
    496	{Opt_thread_pool, "thread_pool=%u"},
    497	{Opt_treelog, "treelog"},
    498	{Opt_notreelog, "notreelog"},
    499	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
    500
    501	/* Rescue options */
    502	{Opt_rescue, "rescue=%s"},
    503	/* Deprecated, with alias rescue=nologreplay */
    504	{Opt_nologreplay, "nologreplay"},
    505	/* Deprecated, with alias rescue=usebackuproot */
    506	{Opt_usebackuproot, "usebackuproot"},
    507
    508	/* Deprecated options */
    509	{Opt_recovery, "recovery"},
    510
    511	/* Debugging options */
    512	{Opt_check_integrity, "check_int"},
    513	{Opt_check_integrity_including_extent_data, "check_int_data"},
    514	{Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
    515	{Opt_enospc_debug, "enospc_debug"},
    516	{Opt_noenospc_debug, "noenospc_debug"},
    517#ifdef CONFIG_BTRFS_DEBUG
    518	{Opt_fragment_data, "fragment=data"},
    519	{Opt_fragment_metadata, "fragment=metadata"},
    520	{Opt_fragment_all, "fragment=all"},
    521#endif
    522#ifdef CONFIG_BTRFS_FS_REF_VERIFY
    523	{Opt_ref_verify, "ref_verify"},
    524#endif
    525	{Opt_err, NULL},
    526};
    527
    528static const match_table_t rescue_tokens = {
    529	{Opt_usebackuproot, "usebackuproot"},
    530	{Opt_nologreplay, "nologreplay"},
    531	{Opt_ignorebadroots, "ignorebadroots"},
    532	{Opt_ignorebadroots, "ibadroots"},
    533	{Opt_ignoredatacsums, "ignoredatacsums"},
    534	{Opt_ignoredatacsums, "idatacsums"},
    535	{Opt_rescue_all, "all"},
    536	{Opt_err, NULL},
    537};
    538
    539static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
    540			    const char *opt_name)
    541{
    542	if (fs_info->mount_opt & opt) {
    543		btrfs_err(fs_info, "%s must be used with ro mount option",
    544			  opt_name);
    545		return true;
    546	}
    547	return false;
    548}
    549
    550static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
    551{
    552	char *opts;
    553	char *orig;
    554	char *p;
    555	substring_t args[MAX_OPT_ARGS];
    556	int ret = 0;
    557
    558	opts = kstrdup(options, GFP_KERNEL);
    559	if (!opts)
    560		return -ENOMEM;
    561	orig = opts;
    562
    563	while ((p = strsep(&opts, ":")) != NULL) {
    564		int token;
    565
    566		if (!*p)
    567			continue;
    568		token = match_token(p, rescue_tokens, args);
    569		switch (token){
    570		case Opt_usebackuproot:
    571			btrfs_info(info,
    572				   "trying to use backup root at mount time");
    573			btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
    574			break;
    575		case Opt_nologreplay:
    576			btrfs_set_and_info(info, NOLOGREPLAY,
    577					   "disabling log replay at mount time");
    578			break;
    579		case Opt_ignorebadroots:
    580			btrfs_set_and_info(info, IGNOREBADROOTS,
    581					   "ignoring bad roots");
    582			break;
    583		case Opt_ignoredatacsums:
    584			btrfs_set_and_info(info, IGNOREDATACSUMS,
    585					   "ignoring data csums");
    586			break;
    587		case Opt_rescue_all:
    588			btrfs_info(info, "enabling all of the rescue options");
    589			btrfs_set_and_info(info, IGNOREDATACSUMS,
    590					   "ignoring data csums");
    591			btrfs_set_and_info(info, IGNOREBADROOTS,
    592					   "ignoring bad roots");
    593			btrfs_set_and_info(info, NOLOGREPLAY,
    594					   "disabling log replay at mount time");
    595			break;
    596		case Opt_err:
    597			btrfs_info(info, "unrecognized rescue option '%s'", p);
    598			ret = -EINVAL;
    599			goto out;
    600		default:
    601			break;
    602		}
    603
    604	}
    605out:
    606	kfree(orig);
    607	return ret;
    608}
    609
    610/*
    611 * Regular mount options parser.  Everything that is needed only when
    612 * reading in a new superblock is parsed here.
    613 * XXX JDM: This needs to be cleaned up for remount.
    614 */
    615int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
    616			unsigned long new_flags)
    617{
    618	substring_t args[MAX_OPT_ARGS];
    619	char *p, *num;
    620	int intarg;
    621	int ret = 0;
    622	char *compress_type;
    623	bool compress_force = false;
    624	enum btrfs_compression_type saved_compress_type;
    625	int saved_compress_level;
    626	bool saved_compress_force;
    627	int no_compress = 0;
    628
    629	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
    630		btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
    631	else if (btrfs_free_space_cache_v1_active(info)) {
    632		if (btrfs_is_zoned(info)) {
    633			btrfs_info(info,
    634			"zoned: clearing existing space cache");
    635			btrfs_set_super_cache_generation(info->super_copy, 0);
    636		} else {
    637			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
    638		}
    639	}
    640
    641	/*
    642	 * Even the options are empty, we still need to do extra check
    643	 * against new flags
    644	 */
    645	if (!options)
    646		goto check;
    647
    648	while ((p = strsep(&options, ",")) != NULL) {
    649		int token;
    650		if (!*p)
    651			continue;
    652
    653		token = match_token(p, tokens, args);
    654		switch (token) {
    655		case Opt_degraded:
    656			btrfs_info(info, "allowing degraded mounts");
    657			btrfs_set_opt(info->mount_opt, DEGRADED);
    658			break;
    659		case Opt_subvol:
    660		case Opt_subvol_empty:
    661		case Opt_subvolid:
    662		case Opt_device:
    663			/*
    664			 * These are parsed by btrfs_parse_subvol_options or
    665			 * btrfs_parse_device_options and can be ignored here.
    666			 */
    667			break;
    668		case Opt_nodatasum:
    669			btrfs_set_and_info(info, NODATASUM,
    670					   "setting nodatasum");
    671			break;
    672		case Opt_datasum:
    673			if (btrfs_test_opt(info, NODATASUM)) {
    674				if (btrfs_test_opt(info, NODATACOW))
    675					btrfs_info(info,
    676						   "setting datasum, datacow enabled");
    677				else
    678					btrfs_info(info, "setting datasum");
    679			}
    680			btrfs_clear_opt(info->mount_opt, NODATACOW);
    681			btrfs_clear_opt(info->mount_opt, NODATASUM);
    682			break;
    683		case Opt_nodatacow:
    684			if (!btrfs_test_opt(info, NODATACOW)) {
    685				if (!btrfs_test_opt(info, COMPRESS) ||
    686				    !btrfs_test_opt(info, FORCE_COMPRESS)) {
    687					btrfs_info(info,
    688						   "setting nodatacow, compression disabled");
    689				} else {
    690					btrfs_info(info, "setting nodatacow");
    691				}
    692			}
    693			btrfs_clear_opt(info->mount_opt, COMPRESS);
    694			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
    695			btrfs_set_opt(info->mount_opt, NODATACOW);
    696			btrfs_set_opt(info->mount_opt, NODATASUM);
    697			break;
    698		case Opt_datacow:
    699			btrfs_clear_and_info(info, NODATACOW,
    700					     "setting datacow");
    701			break;
    702		case Opt_compress_force:
    703		case Opt_compress_force_type:
    704			compress_force = true;
    705			fallthrough;
    706		case Opt_compress:
    707		case Opt_compress_type:
    708			saved_compress_type = btrfs_test_opt(info,
    709							     COMPRESS) ?
    710				info->compress_type : BTRFS_COMPRESS_NONE;
    711			saved_compress_force =
    712				btrfs_test_opt(info, FORCE_COMPRESS);
    713			saved_compress_level = info->compress_level;
    714			if (token == Opt_compress ||
    715			    token == Opt_compress_force ||
    716			    strncmp(args[0].from, "zlib", 4) == 0) {
    717				compress_type = "zlib";
    718
    719				info->compress_type = BTRFS_COMPRESS_ZLIB;
    720				info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
    721				/*
    722				 * args[0] contains uninitialized data since
    723				 * for these tokens we don't expect any
    724				 * parameter.
    725				 */
    726				if (token != Opt_compress &&
    727				    token != Opt_compress_force)
    728					info->compress_level =
    729					  btrfs_compress_str2level(
    730							BTRFS_COMPRESS_ZLIB,
    731							args[0].from + 4);
    732				btrfs_set_opt(info->mount_opt, COMPRESS);
    733				btrfs_clear_opt(info->mount_opt, NODATACOW);
    734				btrfs_clear_opt(info->mount_opt, NODATASUM);
    735				no_compress = 0;
    736			} else if (strncmp(args[0].from, "lzo", 3) == 0) {
    737				compress_type = "lzo";
    738				info->compress_type = BTRFS_COMPRESS_LZO;
    739				info->compress_level = 0;
    740				btrfs_set_opt(info->mount_opt, COMPRESS);
    741				btrfs_clear_opt(info->mount_opt, NODATACOW);
    742				btrfs_clear_opt(info->mount_opt, NODATASUM);
    743				btrfs_set_fs_incompat(info, COMPRESS_LZO);
    744				no_compress = 0;
    745			} else if (strncmp(args[0].from, "zstd", 4) == 0) {
    746				compress_type = "zstd";
    747				info->compress_type = BTRFS_COMPRESS_ZSTD;
    748				info->compress_level =
    749					btrfs_compress_str2level(
    750							 BTRFS_COMPRESS_ZSTD,
    751							 args[0].from + 4);
    752				btrfs_set_opt(info->mount_opt, COMPRESS);
    753				btrfs_clear_opt(info->mount_opt, NODATACOW);
    754				btrfs_clear_opt(info->mount_opt, NODATASUM);
    755				btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
    756				no_compress = 0;
    757			} else if (strncmp(args[0].from, "no", 2) == 0) {
    758				compress_type = "no";
    759				info->compress_level = 0;
    760				info->compress_type = 0;
    761				btrfs_clear_opt(info->mount_opt, COMPRESS);
    762				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
    763				compress_force = false;
    764				no_compress++;
    765			} else {
    766				btrfs_err(info, "unrecognized compression value %s",
    767					  args[0].from);
    768				ret = -EINVAL;
    769				goto out;
    770			}
    771
    772			if (compress_force) {
    773				btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
    774			} else {
    775				/*
    776				 * If we remount from compress-force=xxx to
    777				 * compress=xxx, we need clear FORCE_COMPRESS
    778				 * flag, otherwise, there is no way for users
    779				 * to disable forcible compression separately.
    780				 */
    781				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
    782			}
    783			if (no_compress == 1) {
    784				btrfs_info(info, "use no compression");
    785			} else if ((info->compress_type != saved_compress_type) ||
    786				   (compress_force != saved_compress_force) ||
    787				   (info->compress_level != saved_compress_level)) {
    788				btrfs_info(info, "%s %s compression, level %d",
    789					   (compress_force) ? "force" : "use",
    790					   compress_type, info->compress_level);
    791			}
    792			compress_force = false;
    793			break;
    794		case Opt_ssd:
    795			btrfs_set_and_info(info, SSD,
    796					   "enabling ssd optimizations");
    797			btrfs_clear_opt(info->mount_opt, NOSSD);
    798			break;
    799		case Opt_ssd_spread:
    800			btrfs_set_and_info(info, SSD,
    801					   "enabling ssd optimizations");
    802			btrfs_set_and_info(info, SSD_SPREAD,
    803					   "using spread ssd allocation scheme");
    804			btrfs_clear_opt(info->mount_opt, NOSSD);
    805			break;
    806		case Opt_nossd:
    807			btrfs_set_opt(info->mount_opt, NOSSD);
    808			btrfs_clear_and_info(info, SSD,
    809					     "not using ssd optimizations");
    810			fallthrough;
    811		case Opt_nossd_spread:
    812			btrfs_clear_and_info(info, SSD_SPREAD,
    813					     "not using spread ssd allocation scheme");
    814			break;
    815		case Opt_barrier:
    816			btrfs_clear_and_info(info, NOBARRIER,
    817					     "turning on barriers");
    818			break;
    819		case Opt_nobarrier:
    820			btrfs_set_and_info(info, NOBARRIER,
    821					   "turning off barriers");
    822			break;
    823		case Opt_thread_pool:
    824			ret = match_int(&args[0], &intarg);
    825			if (ret) {
    826				btrfs_err(info, "unrecognized thread_pool value %s",
    827					  args[0].from);
    828				goto out;
    829			} else if (intarg == 0) {
    830				btrfs_err(info, "invalid value 0 for thread_pool");
    831				ret = -EINVAL;
    832				goto out;
    833			}
    834			info->thread_pool_size = intarg;
    835			break;
    836		case Opt_max_inline:
    837			num = match_strdup(&args[0]);
    838			if (num) {
    839				info->max_inline = memparse(num, NULL);
    840				kfree(num);
    841
    842				if (info->max_inline) {
    843					info->max_inline = min_t(u64,
    844						info->max_inline,
    845						info->sectorsize);
    846				}
    847				btrfs_info(info, "max_inline at %llu",
    848					   info->max_inline);
    849			} else {
    850				ret = -ENOMEM;
    851				goto out;
    852			}
    853			break;
    854		case Opt_acl:
    855#ifdef CONFIG_BTRFS_FS_POSIX_ACL
    856			info->sb->s_flags |= SB_POSIXACL;
    857			break;
    858#else
    859			btrfs_err(info, "support for ACL not compiled in!");
    860			ret = -EINVAL;
    861			goto out;
    862#endif
    863		case Opt_noacl:
    864			info->sb->s_flags &= ~SB_POSIXACL;
    865			break;
    866		case Opt_notreelog:
    867			btrfs_set_and_info(info, NOTREELOG,
    868					   "disabling tree log");
    869			break;
    870		case Opt_treelog:
    871			btrfs_clear_and_info(info, NOTREELOG,
    872					     "enabling tree log");
    873			break;
    874		case Opt_norecovery:
    875		case Opt_nologreplay:
    876			btrfs_warn(info,
    877		"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
    878			btrfs_set_and_info(info, NOLOGREPLAY,
    879					   "disabling log replay at mount time");
    880			break;
    881		case Opt_flushoncommit:
    882			btrfs_set_and_info(info, FLUSHONCOMMIT,
    883					   "turning on flush-on-commit");
    884			break;
    885		case Opt_noflushoncommit:
    886			btrfs_clear_and_info(info, FLUSHONCOMMIT,
    887					     "turning off flush-on-commit");
    888			break;
    889		case Opt_ratio:
    890			ret = match_int(&args[0], &intarg);
    891			if (ret) {
    892				btrfs_err(info, "unrecognized metadata_ratio value %s",
    893					  args[0].from);
    894				goto out;
    895			}
    896			info->metadata_ratio = intarg;
    897			btrfs_info(info, "metadata ratio %u",
    898				   info->metadata_ratio);
    899			break;
    900		case Opt_discard:
    901		case Opt_discard_mode:
    902			if (token == Opt_discard ||
    903			    strcmp(args[0].from, "sync") == 0) {
    904				btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
    905				btrfs_set_and_info(info, DISCARD_SYNC,
    906						   "turning on sync discard");
    907			} else if (strcmp(args[0].from, "async") == 0) {
    908				btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
    909				btrfs_set_and_info(info, DISCARD_ASYNC,
    910						   "turning on async discard");
    911			} else {
    912				btrfs_err(info, "unrecognized discard mode value %s",
    913					  args[0].from);
    914				ret = -EINVAL;
    915				goto out;
    916			}
    917			break;
    918		case Opt_nodiscard:
    919			btrfs_clear_and_info(info, DISCARD_SYNC,
    920					     "turning off discard");
    921			btrfs_clear_and_info(info, DISCARD_ASYNC,
    922					     "turning off async discard");
    923			break;
    924		case Opt_space_cache:
    925		case Opt_space_cache_version:
    926			/*
    927			 * We already set FREE_SPACE_TREE above because we have
    928			 * compat_ro(FREE_SPACE_TREE) set, and we aren't going
    929			 * to allow v1 to be set for extent tree v2, simply
    930			 * ignore this setting if we're extent tree v2.
    931			 */
    932			if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
    933				break;
    934			if (token == Opt_space_cache ||
    935			    strcmp(args[0].from, "v1") == 0) {
    936				btrfs_clear_opt(info->mount_opt,
    937						FREE_SPACE_TREE);
    938				btrfs_set_and_info(info, SPACE_CACHE,
    939					   "enabling disk space caching");
    940			} else if (strcmp(args[0].from, "v2") == 0) {
    941				btrfs_clear_opt(info->mount_opt,
    942						SPACE_CACHE);
    943				btrfs_set_and_info(info, FREE_SPACE_TREE,
    944						   "enabling free space tree");
    945			} else {
    946				btrfs_err(info, "unrecognized space_cache value %s",
    947					  args[0].from);
    948				ret = -EINVAL;
    949				goto out;
    950			}
    951			break;
    952		case Opt_rescan_uuid_tree:
    953			btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
    954			break;
    955		case Opt_no_space_cache:
    956			/*
    957			 * We cannot operate without the free space tree with
    958			 * extent tree v2, ignore this option.
    959			 */
    960			if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
    961				break;
    962			if (btrfs_test_opt(info, SPACE_CACHE)) {
    963				btrfs_clear_and_info(info, SPACE_CACHE,
    964					     "disabling disk space caching");
    965			}
    966			if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
    967				btrfs_clear_and_info(info, FREE_SPACE_TREE,
    968					     "disabling free space tree");
    969			}
    970			break;
    971		case Opt_inode_cache:
    972		case Opt_noinode_cache:
    973			btrfs_warn(info,
    974	"the 'inode_cache' option is deprecated and has no effect since 5.11");
    975			break;
    976		case Opt_clear_cache:
    977			/*
    978			 * We cannot clear the free space tree with extent tree
    979			 * v2, ignore this option.
    980			 */
    981			if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
    982				break;
    983			btrfs_set_and_info(info, CLEAR_CACHE,
    984					   "force clearing of disk cache");
    985			break;
    986		case Opt_user_subvol_rm_allowed:
    987			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
    988			break;
    989		case Opt_enospc_debug:
    990			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
    991			break;
    992		case Opt_noenospc_debug:
    993			btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
    994			break;
    995		case Opt_defrag:
    996			btrfs_set_and_info(info, AUTO_DEFRAG,
    997					   "enabling auto defrag");
    998			break;
    999		case Opt_nodefrag:
   1000			btrfs_clear_and_info(info, AUTO_DEFRAG,
   1001					     "disabling auto defrag");
   1002			break;
   1003		case Opt_recovery:
   1004		case Opt_usebackuproot:
   1005			btrfs_warn(info,
   1006			"'%s' is deprecated, use 'rescue=usebackuproot' instead",
   1007				   token == Opt_recovery ? "recovery" :
   1008				   "usebackuproot");
   1009			btrfs_info(info,
   1010				   "trying to use backup root at mount time");
   1011			btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
   1012			break;
   1013		case Opt_skip_balance:
   1014			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
   1015			break;
   1016#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
   1017		case Opt_check_integrity_including_extent_data:
   1018			btrfs_info(info,
   1019				   "enabling check integrity including extent data");
   1020			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
   1021			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
   1022			break;
   1023		case Opt_check_integrity:
   1024			btrfs_info(info, "enabling check integrity");
   1025			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
   1026			break;
   1027		case Opt_check_integrity_print_mask:
   1028			ret = match_int(&args[0], &intarg);
   1029			if (ret) {
   1030				btrfs_err(info,
   1031				"unrecognized check_integrity_print_mask value %s",
   1032					args[0].from);
   1033				goto out;
   1034			}
   1035			info->check_integrity_print_mask = intarg;
   1036			btrfs_info(info, "check_integrity_print_mask 0x%x",
   1037				   info->check_integrity_print_mask);
   1038			break;
   1039#else
   1040		case Opt_check_integrity_including_extent_data:
   1041		case Opt_check_integrity:
   1042		case Opt_check_integrity_print_mask:
   1043			btrfs_err(info,
   1044				  "support for check_integrity* not compiled in!");
   1045			ret = -EINVAL;
   1046			goto out;
   1047#endif
   1048		case Opt_fatal_errors:
   1049			if (strcmp(args[0].from, "panic") == 0) {
   1050				btrfs_set_opt(info->mount_opt,
   1051					      PANIC_ON_FATAL_ERROR);
   1052			} else if (strcmp(args[0].from, "bug") == 0) {
   1053				btrfs_clear_opt(info->mount_opt,
   1054					      PANIC_ON_FATAL_ERROR);
   1055			} else {
   1056				btrfs_err(info, "unrecognized fatal_errors value %s",
   1057					  args[0].from);
   1058				ret = -EINVAL;
   1059				goto out;
   1060			}
   1061			break;
   1062		case Opt_commit_interval:
   1063			intarg = 0;
   1064			ret = match_int(&args[0], &intarg);
   1065			if (ret) {
   1066				btrfs_err(info, "unrecognized commit_interval value %s",
   1067					  args[0].from);
   1068				ret = -EINVAL;
   1069				goto out;
   1070			}
   1071			if (intarg == 0) {
   1072				btrfs_info(info,
   1073					   "using default commit interval %us",
   1074					   BTRFS_DEFAULT_COMMIT_INTERVAL);
   1075				intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
   1076			} else if (intarg > 300) {
   1077				btrfs_warn(info, "excessive commit interval %d",
   1078					   intarg);
   1079			}
   1080			info->commit_interval = intarg;
   1081			break;
   1082		case Opt_rescue:
   1083			ret = parse_rescue_options(info, args[0].from);
   1084			if (ret < 0) {
   1085				btrfs_err(info, "unrecognized rescue value %s",
   1086					  args[0].from);
   1087				goto out;
   1088			}
   1089			break;
   1090#ifdef CONFIG_BTRFS_DEBUG
   1091		case Opt_fragment_all:
   1092			btrfs_info(info, "fragmenting all space");
   1093			btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
   1094			btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
   1095			break;
   1096		case Opt_fragment_metadata:
   1097			btrfs_info(info, "fragmenting metadata");
   1098			btrfs_set_opt(info->mount_opt,
   1099				      FRAGMENT_METADATA);
   1100			break;
   1101		case Opt_fragment_data:
   1102			btrfs_info(info, "fragmenting data");
   1103			btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
   1104			break;
   1105#endif
   1106#ifdef CONFIG_BTRFS_FS_REF_VERIFY
   1107		case Opt_ref_verify:
   1108			btrfs_info(info, "doing ref verification");
   1109			btrfs_set_opt(info->mount_opt, REF_VERIFY);
   1110			break;
   1111#endif
   1112		case Opt_err:
   1113			btrfs_err(info, "unrecognized mount option '%s'", p);
   1114			ret = -EINVAL;
   1115			goto out;
   1116		default:
   1117			break;
   1118		}
   1119	}
   1120check:
   1121	/* We're read-only, don't have to check. */
   1122	if (new_flags & SB_RDONLY)
   1123		goto out;
   1124
   1125	if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
   1126	    check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
   1127	    check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
   1128		ret = -EINVAL;
   1129out:
   1130	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
   1131	    !btrfs_test_opt(info, FREE_SPACE_TREE) &&
   1132	    !btrfs_test_opt(info, CLEAR_CACHE)) {
   1133		btrfs_err(info, "cannot disable free space tree");
   1134		ret = -EINVAL;
   1135
   1136	}
   1137	if (!ret)
   1138		ret = btrfs_check_mountopts_zoned(info);
   1139	if (!ret && btrfs_test_opt(info, SPACE_CACHE))
   1140		btrfs_info(info, "disk space caching is enabled");
   1141	if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
   1142		btrfs_info(info, "using free space tree");
   1143	return ret;
   1144}
   1145
   1146/*
   1147 * Parse mount options that are required early in the mount process.
   1148 *
   1149 * All other options will be parsed on much later in the mount process and
   1150 * only when we need to allocate a new super block.
   1151 */
   1152static int btrfs_parse_device_options(const char *options, fmode_t flags,
   1153				      void *holder)
   1154{
   1155	substring_t args[MAX_OPT_ARGS];
   1156	char *device_name, *opts, *orig, *p;
   1157	struct btrfs_device *device = NULL;
   1158	int error = 0;
   1159
   1160	lockdep_assert_held(&uuid_mutex);
   1161
   1162	if (!options)
   1163		return 0;
   1164
   1165	/*
   1166	 * strsep changes the string, duplicate it because btrfs_parse_options
   1167	 * gets called later
   1168	 */
   1169	opts = kstrdup(options, GFP_KERNEL);
   1170	if (!opts)
   1171		return -ENOMEM;
   1172	orig = opts;
   1173
   1174	while ((p = strsep(&opts, ",")) != NULL) {
   1175		int token;
   1176
   1177		if (!*p)
   1178			continue;
   1179
   1180		token = match_token(p, tokens, args);
   1181		if (token == Opt_device) {
   1182			device_name = match_strdup(&args[0]);
   1183			if (!device_name) {
   1184				error = -ENOMEM;
   1185				goto out;
   1186			}
   1187			device = btrfs_scan_one_device(device_name, flags,
   1188					holder);
   1189			kfree(device_name);
   1190			if (IS_ERR(device)) {
   1191				error = PTR_ERR(device);
   1192				goto out;
   1193			}
   1194		}
   1195	}
   1196
   1197out:
   1198	kfree(orig);
   1199	return error;
   1200}
   1201
   1202/*
   1203 * Parse mount options that are related to subvolume id
   1204 *
   1205 * The value is later passed to mount_subvol()
   1206 */
   1207static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
   1208		u64 *subvol_objectid)
   1209{
   1210	substring_t args[MAX_OPT_ARGS];
   1211	char *opts, *orig, *p;
   1212	int error = 0;
   1213	u64 subvolid;
   1214
   1215	if (!options)
   1216		return 0;
   1217
   1218	/*
   1219	 * strsep changes the string, duplicate it because
   1220	 * btrfs_parse_device_options gets called later
   1221	 */
   1222	opts = kstrdup(options, GFP_KERNEL);
   1223	if (!opts)
   1224		return -ENOMEM;
   1225	orig = opts;
   1226
   1227	while ((p = strsep(&opts, ",")) != NULL) {
   1228		int token;
   1229		if (!*p)
   1230			continue;
   1231
   1232		token = match_token(p, tokens, args);
   1233		switch (token) {
   1234		case Opt_subvol:
   1235			kfree(*subvol_name);
   1236			*subvol_name = match_strdup(&args[0]);
   1237			if (!*subvol_name) {
   1238				error = -ENOMEM;
   1239				goto out;
   1240			}
   1241			break;
   1242		case Opt_subvolid:
   1243			error = match_u64(&args[0], &subvolid);
   1244			if (error)
   1245				goto out;
   1246
   1247			/* we want the original fs_tree */
   1248			if (subvolid == 0)
   1249				subvolid = BTRFS_FS_TREE_OBJECTID;
   1250
   1251			*subvol_objectid = subvolid;
   1252			break;
   1253		default:
   1254			break;
   1255		}
   1256	}
   1257
   1258out:
   1259	kfree(orig);
   1260	return error;
   1261}
   1262
   1263char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
   1264					  u64 subvol_objectid)
   1265{
   1266	struct btrfs_root *root = fs_info->tree_root;
   1267	struct btrfs_root *fs_root = NULL;
   1268	struct btrfs_root_ref *root_ref;
   1269	struct btrfs_inode_ref *inode_ref;
   1270	struct btrfs_key key;
   1271	struct btrfs_path *path = NULL;
   1272	char *name = NULL, *ptr;
   1273	u64 dirid;
   1274	int len;
   1275	int ret;
   1276
   1277	path = btrfs_alloc_path();
   1278	if (!path) {
   1279		ret = -ENOMEM;
   1280		goto err;
   1281	}
   1282
   1283	name = kmalloc(PATH_MAX, GFP_KERNEL);
   1284	if (!name) {
   1285		ret = -ENOMEM;
   1286		goto err;
   1287	}
   1288	ptr = name + PATH_MAX - 1;
   1289	ptr[0] = '\0';
   1290
   1291	/*
   1292	 * Walk up the subvolume trees in the tree of tree roots by root
   1293	 * backrefs until we hit the top-level subvolume.
   1294	 */
   1295	while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
   1296		key.objectid = subvol_objectid;
   1297		key.type = BTRFS_ROOT_BACKREF_KEY;
   1298		key.offset = (u64)-1;
   1299
   1300		ret = btrfs_search_backwards(root, &key, path);
   1301		if (ret < 0) {
   1302			goto err;
   1303		} else if (ret > 0) {
   1304			ret = -ENOENT;
   1305			goto err;
   1306		}
   1307
   1308		subvol_objectid = key.offset;
   1309
   1310		root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
   1311					  struct btrfs_root_ref);
   1312		len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
   1313		ptr -= len + 1;
   1314		if (ptr < name) {
   1315			ret = -ENAMETOOLONG;
   1316			goto err;
   1317		}
   1318		read_extent_buffer(path->nodes[0], ptr + 1,
   1319				   (unsigned long)(root_ref + 1), len);
   1320		ptr[0] = '/';
   1321		dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
   1322		btrfs_release_path(path);
   1323
   1324		fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true);
   1325		if (IS_ERR(fs_root)) {
   1326			ret = PTR_ERR(fs_root);
   1327			fs_root = NULL;
   1328			goto err;
   1329		}
   1330
   1331		/*
   1332		 * Walk up the filesystem tree by inode refs until we hit the
   1333		 * root directory.
   1334		 */
   1335		while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
   1336			key.objectid = dirid;
   1337			key.type = BTRFS_INODE_REF_KEY;
   1338			key.offset = (u64)-1;
   1339
   1340			ret = btrfs_search_backwards(fs_root, &key, path);
   1341			if (ret < 0) {
   1342				goto err;
   1343			} else if (ret > 0) {
   1344				ret = -ENOENT;
   1345				goto err;
   1346			}
   1347
   1348			dirid = key.offset;
   1349
   1350			inode_ref = btrfs_item_ptr(path->nodes[0],
   1351						   path->slots[0],
   1352						   struct btrfs_inode_ref);
   1353			len = btrfs_inode_ref_name_len(path->nodes[0],
   1354						       inode_ref);
   1355			ptr -= len + 1;
   1356			if (ptr < name) {
   1357				ret = -ENAMETOOLONG;
   1358				goto err;
   1359			}
   1360			read_extent_buffer(path->nodes[0], ptr + 1,
   1361					   (unsigned long)(inode_ref + 1), len);
   1362			ptr[0] = '/';
   1363			btrfs_release_path(path);
   1364		}
   1365		btrfs_put_root(fs_root);
   1366		fs_root = NULL;
   1367	}
   1368
   1369	btrfs_free_path(path);
   1370	if (ptr == name + PATH_MAX - 1) {
   1371		name[0] = '/';
   1372		name[1] = '\0';
   1373	} else {
   1374		memmove(name, ptr, name + PATH_MAX - ptr);
   1375	}
   1376	return name;
   1377
   1378err:
   1379	btrfs_put_root(fs_root);
   1380	btrfs_free_path(path);
   1381	kfree(name);
   1382	return ERR_PTR(ret);
   1383}
   1384
   1385static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
   1386{
   1387	struct btrfs_root *root = fs_info->tree_root;
   1388	struct btrfs_dir_item *di;
   1389	struct btrfs_path *path;
   1390	struct btrfs_key location;
   1391	u64 dir_id;
   1392
   1393	path = btrfs_alloc_path();
   1394	if (!path)
   1395		return -ENOMEM;
   1396
   1397	/*
   1398	 * Find the "default" dir item which points to the root item that we
   1399	 * will mount by default if we haven't been given a specific subvolume
   1400	 * to mount.
   1401	 */
   1402	dir_id = btrfs_super_root_dir(fs_info->super_copy);
   1403	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
   1404	if (IS_ERR(di)) {
   1405		btrfs_free_path(path);
   1406		return PTR_ERR(di);
   1407	}
   1408	if (!di) {
   1409		/*
   1410		 * Ok the default dir item isn't there.  This is weird since
   1411		 * it's always been there, but don't freak out, just try and
   1412		 * mount the top-level subvolume.
   1413		 */
   1414		btrfs_free_path(path);
   1415		*objectid = BTRFS_FS_TREE_OBJECTID;
   1416		return 0;
   1417	}
   1418
   1419	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
   1420	btrfs_free_path(path);
   1421	*objectid = location.objectid;
   1422	return 0;
   1423}
   1424
   1425static int btrfs_fill_super(struct super_block *sb,
   1426			    struct btrfs_fs_devices *fs_devices,
   1427			    void *data)
   1428{
   1429	struct inode *inode;
   1430	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
   1431	int err;
   1432
   1433	sb->s_maxbytes = MAX_LFS_FILESIZE;
   1434	sb->s_magic = BTRFS_SUPER_MAGIC;
   1435	sb->s_op = &btrfs_super_ops;
   1436	sb->s_d_op = &btrfs_dentry_operations;
   1437	sb->s_export_op = &btrfs_export_ops;
   1438#ifdef CONFIG_FS_VERITY
   1439	sb->s_vop = &btrfs_verityops;
   1440#endif
   1441	sb->s_xattr = btrfs_xattr_handlers;
   1442	sb->s_time_gran = 1;
   1443#ifdef CONFIG_BTRFS_FS_POSIX_ACL
   1444	sb->s_flags |= SB_POSIXACL;
   1445#endif
   1446	sb->s_flags |= SB_I_VERSION;
   1447	sb->s_iflags |= SB_I_CGROUPWB;
   1448
   1449	err = super_setup_bdi(sb);
   1450	if (err) {
   1451		btrfs_err(fs_info, "super_setup_bdi failed");
   1452		return err;
   1453	}
   1454
   1455	err = open_ctree(sb, fs_devices, (char *)data);
   1456	if (err) {
   1457		btrfs_err(fs_info, "open_ctree failed");
   1458		return err;
   1459	}
   1460
   1461	inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
   1462	if (IS_ERR(inode)) {
   1463		err = PTR_ERR(inode);
   1464		goto fail_close;
   1465	}
   1466
   1467	sb->s_root = d_make_root(inode);
   1468	if (!sb->s_root) {
   1469		err = -ENOMEM;
   1470		goto fail_close;
   1471	}
   1472
   1473	sb->s_flags |= SB_ACTIVE;
   1474	return 0;
   1475
   1476fail_close:
   1477	close_ctree(fs_info);
   1478	return err;
   1479}
   1480
   1481int btrfs_sync_fs(struct super_block *sb, int wait)
   1482{
   1483	struct btrfs_trans_handle *trans;
   1484	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
   1485	struct btrfs_root *root = fs_info->tree_root;
   1486
   1487	trace_btrfs_sync_fs(fs_info, wait);
   1488
   1489	if (!wait) {
   1490		filemap_flush(fs_info->btree_inode->i_mapping);
   1491		return 0;
   1492	}
   1493
   1494	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
   1495
   1496	trans = btrfs_attach_transaction_barrier(root);
   1497	if (IS_ERR(trans)) {
   1498		/* no transaction, don't bother */
   1499		if (PTR_ERR(trans) == -ENOENT) {
   1500			/*
   1501			 * Exit unless we have some pending changes
   1502			 * that need to go through commit
   1503			 */
   1504			if (fs_info->pending_changes == 0)
   1505				return 0;
   1506			/*
   1507			 * A non-blocking test if the fs is frozen. We must not
   1508			 * start a new transaction here otherwise a deadlock
   1509			 * happens. The pending operations are delayed to the
   1510			 * next commit after thawing.
   1511			 */
   1512			if (sb_start_write_trylock(sb))
   1513				sb_end_write(sb);
   1514			else
   1515				return 0;
   1516			trans = btrfs_start_transaction(root, 0);
   1517		}
   1518		if (IS_ERR(trans))
   1519			return PTR_ERR(trans);
   1520	}
   1521	return btrfs_commit_transaction(trans);
   1522}
   1523
   1524static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed)
   1525{
   1526	seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s);
   1527	*printed = true;
   1528}
   1529
   1530static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
   1531{
   1532	struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
   1533	const char *compress_type;
   1534	const char *subvol_name;
   1535	bool printed = false;
   1536
   1537	if (btrfs_test_opt(info, DEGRADED))
   1538		seq_puts(seq, ",degraded");
   1539	if (btrfs_test_opt(info, NODATASUM))
   1540		seq_puts(seq, ",nodatasum");
   1541	if (btrfs_test_opt(info, NODATACOW))
   1542		seq_puts(seq, ",nodatacow");
   1543	if (btrfs_test_opt(info, NOBARRIER))
   1544		seq_puts(seq, ",nobarrier");
   1545	if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
   1546		seq_printf(seq, ",max_inline=%llu", info->max_inline);
   1547	if (info->thread_pool_size !=  min_t(unsigned long,
   1548					     num_online_cpus() + 2, 8))
   1549		seq_printf(seq, ",thread_pool=%u", info->thread_pool_size);
   1550	if (btrfs_test_opt(info, COMPRESS)) {
   1551		compress_type = btrfs_compress_type2str(info->compress_type);
   1552		if (btrfs_test_opt(info, FORCE_COMPRESS))
   1553			seq_printf(seq, ",compress-force=%s", compress_type);
   1554		else
   1555			seq_printf(seq, ",compress=%s", compress_type);
   1556		if (info->compress_level)
   1557			seq_printf(seq, ":%d", info->compress_level);
   1558	}
   1559	if (btrfs_test_opt(info, NOSSD))
   1560		seq_puts(seq, ",nossd");
   1561	if (btrfs_test_opt(info, SSD_SPREAD))
   1562		seq_puts(seq, ",ssd_spread");
   1563	else if (btrfs_test_opt(info, SSD))
   1564		seq_puts(seq, ",ssd");
   1565	if (btrfs_test_opt(info, NOTREELOG))
   1566		seq_puts(seq, ",notreelog");
   1567	if (btrfs_test_opt(info, NOLOGREPLAY))
   1568		print_rescue_option(seq, "nologreplay", &printed);
   1569	if (btrfs_test_opt(info, USEBACKUPROOT))
   1570		print_rescue_option(seq, "usebackuproot", &printed);
   1571	if (btrfs_test_opt(info, IGNOREBADROOTS))
   1572		print_rescue_option(seq, "ignorebadroots", &printed);
   1573	if (btrfs_test_opt(info, IGNOREDATACSUMS))
   1574		print_rescue_option(seq, "ignoredatacsums", &printed);
   1575	if (btrfs_test_opt(info, FLUSHONCOMMIT))
   1576		seq_puts(seq, ",flushoncommit");
   1577	if (btrfs_test_opt(info, DISCARD_SYNC))
   1578		seq_puts(seq, ",discard");
   1579	if (btrfs_test_opt(info, DISCARD_ASYNC))
   1580		seq_puts(seq, ",discard=async");
   1581	if (!(info->sb->s_flags & SB_POSIXACL))
   1582		seq_puts(seq, ",noacl");
   1583	if (btrfs_free_space_cache_v1_active(info))
   1584		seq_puts(seq, ",space_cache");
   1585	else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
   1586		seq_puts(seq, ",space_cache=v2");
   1587	else
   1588		seq_puts(seq, ",nospace_cache");
   1589	if (btrfs_test_opt(info, RESCAN_UUID_TREE))
   1590		seq_puts(seq, ",rescan_uuid_tree");
   1591	if (btrfs_test_opt(info, CLEAR_CACHE))
   1592		seq_puts(seq, ",clear_cache");
   1593	if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
   1594		seq_puts(seq, ",user_subvol_rm_allowed");
   1595	if (btrfs_test_opt(info, ENOSPC_DEBUG))
   1596		seq_puts(seq, ",enospc_debug");
   1597	if (btrfs_test_opt(info, AUTO_DEFRAG))
   1598		seq_puts(seq, ",autodefrag");
   1599	if (btrfs_test_opt(info, SKIP_BALANCE))
   1600		seq_puts(seq, ",skip_balance");
   1601#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
   1602	if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
   1603		seq_puts(seq, ",check_int_data");
   1604	else if (btrfs_test_opt(info, CHECK_INTEGRITY))
   1605		seq_puts(seq, ",check_int");
   1606	if (info->check_integrity_print_mask)
   1607		seq_printf(seq, ",check_int_print_mask=%d",
   1608				info->check_integrity_print_mask);
   1609#endif
   1610	if (info->metadata_ratio)
   1611		seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
   1612	if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
   1613		seq_puts(seq, ",fatal_errors=panic");
   1614	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
   1615		seq_printf(seq, ",commit=%u", info->commit_interval);
   1616#ifdef CONFIG_BTRFS_DEBUG
   1617	if (btrfs_test_opt(info, FRAGMENT_DATA))
   1618		seq_puts(seq, ",fragment=data");
   1619	if (btrfs_test_opt(info, FRAGMENT_METADATA))
   1620		seq_puts(seq, ",fragment=metadata");
   1621#endif
   1622	if (btrfs_test_opt(info, REF_VERIFY))
   1623		seq_puts(seq, ",ref_verify");
   1624	seq_printf(seq, ",subvolid=%llu",
   1625		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
   1626	subvol_name = btrfs_get_subvol_name_from_objectid(info,
   1627			BTRFS_I(d_inode(dentry))->root->root_key.objectid);
   1628	if (!IS_ERR(subvol_name)) {
   1629		seq_puts(seq, ",subvol=");
   1630		seq_escape(seq, subvol_name, " \t\n\\");
   1631		kfree(subvol_name);
   1632	}
   1633	return 0;
   1634}
   1635
   1636static int btrfs_test_super(struct super_block *s, void *data)
   1637{
   1638	struct btrfs_fs_info *p = data;
   1639	struct btrfs_fs_info *fs_info = btrfs_sb(s);
   1640
   1641	return fs_info->fs_devices == p->fs_devices;
   1642}
   1643
   1644static int btrfs_set_super(struct super_block *s, void *data)
   1645{
   1646	int err = set_anon_super(s, data);
   1647	if (!err)
   1648		s->s_fs_info = data;
   1649	return err;
   1650}
   1651
   1652/*
   1653 * subvolumes are identified by ino 256
   1654 */
   1655static inline int is_subvolume_inode(struct inode *inode)
   1656{
   1657	if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
   1658		return 1;
   1659	return 0;
   1660}
   1661
   1662static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
   1663				   struct vfsmount *mnt)
   1664{
   1665	struct dentry *root;
   1666	int ret;
   1667
   1668	if (!subvol_name) {
   1669		if (!subvol_objectid) {
   1670			ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
   1671							  &subvol_objectid);
   1672			if (ret) {
   1673				root = ERR_PTR(ret);
   1674				goto out;
   1675			}
   1676		}
   1677		subvol_name = btrfs_get_subvol_name_from_objectid(
   1678					btrfs_sb(mnt->mnt_sb), subvol_objectid);
   1679		if (IS_ERR(subvol_name)) {
   1680			root = ERR_CAST(subvol_name);
   1681			subvol_name = NULL;
   1682			goto out;
   1683		}
   1684
   1685	}
   1686
   1687	root = mount_subtree(mnt, subvol_name);
   1688	/* mount_subtree() drops our reference on the vfsmount. */
   1689	mnt = NULL;
   1690
   1691	if (!IS_ERR(root)) {
   1692		struct super_block *s = root->d_sb;
   1693		struct btrfs_fs_info *fs_info = btrfs_sb(s);
   1694		struct inode *root_inode = d_inode(root);
   1695		u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
   1696
   1697		ret = 0;
   1698		if (!is_subvolume_inode(root_inode)) {
   1699			btrfs_err(fs_info, "'%s' is not a valid subvolume",
   1700			       subvol_name);
   1701			ret = -EINVAL;
   1702		}
   1703		if (subvol_objectid && root_objectid != subvol_objectid) {
   1704			/*
   1705			 * This will also catch a race condition where a
   1706			 * subvolume which was passed by ID is renamed and
   1707			 * another subvolume is renamed over the old location.
   1708			 */
   1709			btrfs_err(fs_info,
   1710				  "subvol '%s' does not match subvolid %llu",
   1711				  subvol_name, subvol_objectid);
   1712			ret = -EINVAL;
   1713		}
   1714		if (ret) {
   1715			dput(root);
   1716			root = ERR_PTR(ret);
   1717			deactivate_locked_super(s);
   1718		}
   1719	}
   1720
   1721out:
   1722	mntput(mnt);
   1723	kfree(subvol_name);
   1724	return root;
   1725}
   1726
   1727/*
   1728 * Find a superblock for the given device / mount point.
   1729 *
   1730 * Note: This is based on mount_bdev from fs/super.c with a few additions
   1731 *       for multiple device setup.  Make sure to keep it in sync.
   1732 */
   1733static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
   1734		int flags, const char *device_name, void *data)
   1735{
   1736	struct block_device *bdev = NULL;
   1737	struct super_block *s;
   1738	struct btrfs_device *device = NULL;
   1739	struct btrfs_fs_devices *fs_devices = NULL;
   1740	struct btrfs_fs_info *fs_info = NULL;
   1741	void *new_sec_opts = NULL;
   1742	fmode_t mode = FMODE_READ;
   1743	int error = 0;
   1744
   1745	if (!(flags & SB_RDONLY))
   1746		mode |= FMODE_WRITE;
   1747
   1748	if (data) {
   1749		error = security_sb_eat_lsm_opts(data, &new_sec_opts);
   1750		if (error)
   1751			return ERR_PTR(error);
   1752	}
   1753
   1754	/*
   1755	 * Setup a dummy root and fs_info for test/set super.  This is because
   1756	 * we don't actually fill this stuff out until open_ctree, but we need
   1757	 * then open_ctree will properly initialize the file system specific
   1758	 * settings later.  btrfs_init_fs_info initializes the static elements
   1759	 * of the fs_info (locks and such) to make cleanup easier if we find a
   1760	 * superblock with our given fs_devices later on at sget() time.
   1761	 */
   1762	fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
   1763	if (!fs_info) {
   1764		error = -ENOMEM;
   1765		goto error_sec_opts;
   1766	}
   1767	btrfs_init_fs_info(fs_info);
   1768
   1769	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
   1770	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
   1771	if (!fs_info->super_copy || !fs_info->super_for_commit) {
   1772		error = -ENOMEM;
   1773		goto error_fs_info;
   1774	}
   1775
   1776	mutex_lock(&uuid_mutex);
   1777	error = btrfs_parse_device_options(data, mode, fs_type);
   1778	if (error) {
   1779		mutex_unlock(&uuid_mutex);
   1780		goto error_fs_info;
   1781	}
   1782
   1783	device = btrfs_scan_one_device(device_name, mode, fs_type);
   1784	if (IS_ERR(device)) {
   1785		mutex_unlock(&uuid_mutex);
   1786		error = PTR_ERR(device);
   1787		goto error_fs_info;
   1788	}
   1789
   1790	fs_devices = device->fs_devices;
   1791	fs_info->fs_devices = fs_devices;
   1792
   1793	error = btrfs_open_devices(fs_devices, mode, fs_type);
   1794	mutex_unlock(&uuid_mutex);
   1795	if (error)
   1796		goto error_fs_info;
   1797
   1798	if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
   1799		error = -EACCES;
   1800		goto error_close_devices;
   1801	}
   1802
   1803	bdev = fs_devices->latest_dev->bdev;
   1804	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
   1805		 fs_info);
   1806	if (IS_ERR(s)) {
   1807		error = PTR_ERR(s);
   1808		goto error_close_devices;
   1809	}
   1810
   1811	if (s->s_root) {
   1812		btrfs_close_devices(fs_devices);
   1813		btrfs_free_fs_info(fs_info);
   1814		if ((flags ^ s->s_flags) & SB_RDONLY)
   1815			error = -EBUSY;
   1816	} else {
   1817		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
   1818		btrfs_sb(s)->bdev_holder = fs_type;
   1819		if (!strstr(crc32c_impl(), "generic"))
   1820			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
   1821		error = btrfs_fill_super(s, fs_devices, data);
   1822	}
   1823	if (!error)
   1824		error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
   1825	security_free_mnt_opts(&new_sec_opts);
   1826	if (error) {
   1827		deactivate_locked_super(s);
   1828		return ERR_PTR(error);
   1829	}
   1830
   1831	return dget(s->s_root);
   1832
   1833error_close_devices:
   1834	btrfs_close_devices(fs_devices);
   1835error_fs_info:
   1836	btrfs_free_fs_info(fs_info);
   1837error_sec_opts:
   1838	security_free_mnt_opts(&new_sec_opts);
   1839	return ERR_PTR(error);
   1840}
   1841
   1842/*
   1843 * Mount function which is called by VFS layer.
   1844 *
   1845 * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
   1846 * which needs vfsmount* of device's root (/).  This means device's root has to
   1847 * be mounted internally in any case.
   1848 *
   1849 * Operation flow:
   1850 *   1. Parse subvol id related options for later use in mount_subvol().
   1851 *
   1852 *   2. Mount device's root (/) by calling vfs_kern_mount().
   1853 *
   1854 *      NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
   1855 *      first place. In order to avoid calling btrfs_mount() again, we use
   1856 *      different file_system_type which is not registered to VFS by
   1857 *      register_filesystem() (btrfs_root_fs_type). As a result,
   1858 *      btrfs_mount_root() is called. The return value will be used by
   1859 *      mount_subtree() in mount_subvol().
   1860 *
   1861 *   3. Call mount_subvol() to get the dentry of subvolume. Since there is
   1862 *      "btrfs subvolume set-default", mount_subvol() is called always.
   1863 */
   1864static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
   1865		const char *device_name, void *data)
   1866{
   1867	struct vfsmount *mnt_root;
   1868	struct dentry *root;
   1869	char *subvol_name = NULL;
   1870	u64 subvol_objectid = 0;
   1871	int error = 0;
   1872
   1873	error = btrfs_parse_subvol_options(data, &subvol_name,
   1874					&subvol_objectid);
   1875	if (error) {
   1876		kfree(subvol_name);
   1877		return ERR_PTR(error);
   1878	}
   1879
   1880	/* mount device's root (/) */
   1881	mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
   1882	if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
   1883		if (flags & SB_RDONLY) {
   1884			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
   1885				flags & ~SB_RDONLY, device_name, data);
   1886		} else {
   1887			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
   1888				flags | SB_RDONLY, device_name, data);
   1889			if (IS_ERR(mnt_root)) {
   1890				root = ERR_CAST(mnt_root);
   1891				kfree(subvol_name);
   1892				goto out;
   1893			}
   1894
   1895			down_write(&mnt_root->mnt_sb->s_umount);
   1896			error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
   1897			up_write(&mnt_root->mnt_sb->s_umount);
   1898			if (error < 0) {
   1899				root = ERR_PTR(error);
   1900				mntput(mnt_root);
   1901				kfree(subvol_name);
   1902				goto out;
   1903			}
   1904		}
   1905	}
   1906	if (IS_ERR(mnt_root)) {
   1907		root = ERR_CAST(mnt_root);
   1908		kfree(subvol_name);
   1909		goto out;
   1910	}
   1911
   1912	/* mount_subvol() will free subvol_name and mnt_root */
   1913	root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
   1914
   1915out:
   1916	return root;
   1917}
   1918
   1919static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
   1920				     u32 new_pool_size, u32 old_pool_size)
   1921{
   1922	if (new_pool_size == old_pool_size)
   1923		return;
   1924
   1925	fs_info->thread_pool_size = new_pool_size;
   1926
   1927	btrfs_info(fs_info, "resize thread pool %d -> %d",
   1928	       old_pool_size, new_pool_size);
   1929
   1930	btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
   1931	btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
   1932	btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
   1933	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
   1934	btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
   1935	btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
   1936	btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
   1937				new_pool_size);
   1938	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
   1939	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
   1940	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
   1941}
   1942
   1943static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
   1944				       unsigned long old_opts, int flags)
   1945{
   1946	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
   1947	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
   1948	     (flags & SB_RDONLY))) {
   1949		/* wait for any defraggers to finish */
   1950		wait_event(fs_info->transaction_wait,
   1951			   (atomic_read(&fs_info->defrag_running) == 0));
   1952		if (flags & SB_RDONLY)
   1953			sync_filesystem(fs_info->sb);
   1954	}
   1955}
   1956
   1957static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
   1958					 unsigned long old_opts)
   1959{
   1960	const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
   1961
   1962	/*
   1963	 * We need to cleanup all defragable inodes if the autodefragment is
   1964	 * close or the filesystem is read only.
   1965	 */
   1966	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
   1967	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) {
   1968		btrfs_cleanup_defrag_inodes(fs_info);
   1969	}
   1970
   1971	/* If we toggled discard async */
   1972	if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
   1973	    btrfs_test_opt(fs_info, DISCARD_ASYNC))
   1974		btrfs_discard_resume(fs_info);
   1975	else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
   1976		 !btrfs_test_opt(fs_info, DISCARD_ASYNC))
   1977		btrfs_discard_cleanup(fs_info);
   1978
   1979	/* If we toggled space cache */
   1980	if (cache_opt != btrfs_free_space_cache_v1_active(fs_info))
   1981		btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
   1982}
   1983
/*
 * Remount handler (installed as ->remount_fs in btrfs_super_ops and also
 * called directly from btrfs_mount()).
 *
 * The current flags, mount options and tunables are saved first so they can
 * be put back at the "restore" label if anything fails.  The new options in
 * @data are then parsed and, if the read-only state changes, the RW->RO or
 * RO->RW transition is carried out.
 *
 * @sb:    superblock being remounted
 * @flags: in/out SB_* mount flags; SB_I_VERSION is OR-ed in on success
 * @data:  mount option string; may be NULL (btrfs_mount() passes NULL)
 *
 * Returns 0 on success, otherwise the error that triggered the restore path
 * (-EINVAL, -EACCES or whatever a helper returned).
 */
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	/* Snapshot of everything the restore path below puts back */
	unsigned old_flags = sb->s_flags;
	unsigned long old_opts = fs_info->mount_opt;
	unsigned long old_compress_type = fs_info->compress_type;
	u64 old_max_inline = fs_info->max_inline;
	u32 old_thread_pool_size = fs_info->thread_pool_size;
	u32 old_metadata_ratio = fs_info->metadata_ratio;
	int ret;

	sync_filesystem(sb);
	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);

	/* Let the security module consume and validate its own options */
	if (data) {
		void *new_sec_opts = NULL;

		ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
		if (!ret)
			ret = security_sb_remount(sb, new_sec_opts);
		security_free_mnt_opts(&new_sec_opts);
		if (ret)
			goto restore;
	}

	ret = btrfs_parse_options(fs_info, data, *flags);
	if (ret)
		goto restore;

	/* V1 cache is not supported for subpage mount. */
	if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
		btrfs_warn(fs_info,
	"v1 space cache is not supported for page size %lu with sectorsize %u",
			   PAGE_SIZE, fs_info->sectorsize);
		ret = -EINVAL;
		goto restore;
	}
	btrfs_remount_begin(fs_info, old_opts, *flags);
	btrfs_resize_thread_pool(fs_info,
		fs_info->thread_pool_size, old_thread_pool_size);

	/*
	 * The free space tree option disagrees with the on-disk compat_ro bit
	 * and this is not a ro->rw remount: warn and force the options back to
	 * what is actually in use.
	 */
	if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
	    (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
	    (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
		btrfs_warn(fs_info,
		"remount supports changing free space tree only from ro to rw");
		/* Make sure free space cache options match the state on disk */
		if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
			btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
			btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
		}
		if (btrfs_free_space_cache_v1_active(fs_info)) {
			btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
			btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
		}
	}

	/* Read-only state is unchanged, only the options were updated */
	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
		goto out;

	if (*flags & SB_RDONLY) {
		/*
		 * this also happens on 'umount -rf' or on shutdown, when
		 * the filesystem is busy.
		 */
		cancel_work_sync(&fs_info->async_reclaim_work);
		cancel_work_sync(&fs_info->async_data_reclaim_work);

		btrfs_discard_cleanup(fs_info);

		/* wait for the uuid_scan task to finish */
		down(&fs_info->uuid_tree_rescan_sem);
		/* avoid complaints from lockdep et al. */
		up(&fs_info->uuid_tree_rescan_sem);

		btrfs_set_sb_rdonly(sb);

		/*
		 * Setting SB_RDONLY will put the cleaner thread to
		 * sleep at the next loop if it's already active.
		 * If it's already asleep, we'll leave unused block
		 * groups on disk until we're mounted read-write again
		 * unless we clean them up here.
		 */
		btrfs_delete_unused_bgs(fs_info);

		/*
		 * The cleaner task could be already running before we set the
		 * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
		 * We must make sure that after we finish the remount, i.e. after
		 * we call btrfs_commit_super(), the cleaner can no longer start
		 * a transaction - either because it was dropping a dead root,
		 * running delayed iputs or deleting an unused block group (the
		 * cleaner picked a block group from the list of unused block
		 * groups before we were able to in the previous call to
		 * btrfs_delete_unused_bgs()).
		 */
		wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
			    TASK_UNINTERRUPTIBLE);

		/*
		 * We've set the superblock to RO mode, so we might have made
		 * the cleaner task sleep without running all pending delayed
		 * iputs. Go through all the delayed iputs here, so that if an
		 * unmount happens without remounting RW we don't end up at
		 * finishing close_ctree() with a non-empty list of delayed
		 * iputs.
		 */
		btrfs_run_delayed_iputs(fs_info);

		btrfs_dev_replace_suspend_for_unmount(fs_info);
		btrfs_scrub_cancel(fs_info);
		btrfs_pause_balance(fs_info);

		/*
		 * Pause the qgroup rescan worker if it is running. We don't want
		 * it to be still running after we are in RO mode, as after that,
		 * by the time we unmount, it might have left a transaction open,
		 * so we would leak the transaction and/or crash.
		 */
		btrfs_qgroup_wait_for_completion(fs_info, false);

		ret = btrfs_commit_super(fs_info);
		if (ret)
			goto restore;
	} else {
		if (BTRFS_FS_ERROR(fs_info)) {
			btrfs_err(fs_info,
				"Remounting read-write after error is not allowed");
			ret = -EINVAL;
			goto restore;
		}
		if (fs_info->fs_devices->rw_devices == 0) {
			ret = -EACCES;
			goto restore;
		}

		if (!btrfs_check_rw_degradable(fs_info, NULL)) {
			btrfs_warn(fs_info,
		"too many missing devices, writable remount is not allowed");
			ret = -EACCES;
			goto restore;
		}

		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
			btrfs_warn(fs_info,
		"mount required to replay tree-log, cannot remount read-write");
			ret = -EINVAL;
			goto restore;
		}

		/*
		 * NOTE: when remounting with a change that does writes, don't
		 * put it anywhere above this point, as we are not sure to be
		 * safe to write until we pass the above checks.
		 */
		ret = btrfs_start_pre_rw_mount(fs_info);
		if (ret)
			goto restore;

		btrfs_clear_sb_rdonly(sb);

		set_bit(BTRFS_FS_OPEN, &fs_info->flags);
	}
out:
	/*
	 * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
	 * since the absence of the flag means it can be toggled off by remount.
	 */
	*flags |= SB_I_VERSION;

	wake_up_process(fs_info->transaction_kthread);
	btrfs_remount_cleanup(fs_info, old_opts);
	btrfs_clear_oneshot_options(fs_info);
	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);

	return 0;

restore:
	/* We've hit an error - don't reset SB_RDONLY */
	if (sb_rdonly(sb))
		old_flags |= SB_RDONLY;
	if (!(old_flags & SB_RDONLY))
		clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
	/* Put back every setting that was saved on entry */
	sb->s_flags = old_flags;
	fs_info->mount_opt = old_opts;
	fs_info->compress_type = old_compress_type;
	fs_info->max_inline = old_max_inline;
	btrfs_resize_thread_pool(fs_info,
		old_thread_pool_size, fs_info->thread_pool_size);
	fs_info->metadata_ratio = old_metadata_ratio;
	btrfs_remount_cleanup(fs_info, old_opts);
	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);

	return ret;
}
   2180
   2181/* Used to sort the devices by max_avail(descending sort) */
   2182static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
   2183{
   2184	const struct btrfs_device_info *dev_info1 = a;
   2185	const struct btrfs_device_info *dev_info2 = b;
   2186
   2187	if (dev_info1->max_avail > dev_info2->max_avail)
   2188		return -1;
   2189	else if (dev_info1->max_avail < dev_info2->max_avail)
   2190		return 1;
   2191	return 0;
   2192}
   2193
   2194/*
   2195 * sort the devices by max_avail, in which max free extent size of each device
   2196 * is stored.(Descending Sort)
   2197 */
   2198static inline void btrfs_descending_sort_devices(
   2199					struct btrfs_device_info *devices,
   2200					size_t nr_devices)
   2201{
   2202	sort(devices, nr_devices, sizeof(struct btrfs_device_info),
   2203	     btrfs_cmp_device_free_bytes, NULL);
   2204}
   2205
/*
 * Helper to estimate how much unallocated device space could still be used
 * to store file data.
 *
 * Collects the usable free space of every eligible device, sorts the devices
 * by that amount (descending) and then walks them from the smallest to the
 * largest, accumulating the space that stripes of the data profile could
 * cover.
 *
 * @fs_info:    filesystem to inspect
 * @free_bytes: output, estimated number of bytes available
 *
 * Returns 0 on success (with *free_bytes set) or -ENOMEM if the temporary
 * device array cannot be allocated.
 */
static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
					      u64 *free_bytes)
{
	struct btrfs_device_info *devices_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 type;
	u64 avail_space;
	u64 min_stripe_size;
	int num_stripes = 1;
	int i = 0, nr_devices;
	const struct btrfs_raid_attr *rattr;

	/*
	 * We aren't under the device list lock, so this is racy-ish, but good
	 * enough for our purposes.
	 */
	nr_devices = fs_info->fs_devices->open_devices;
	if (!nr_devices) {
		/* Re-read once after a barrier before giving up */
		smp_mb();
		nr_devices = fs_info->fs_devices->open_devices;
		ASSERT(nr_devices);
		if (!nr_devices) {
			*free_bytes = 0;
			return 0;
		}
	}

	devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
			       GFP_KERNEL);
	if (!devices_info)
		return -ENOMEM;

	/* calc min stripe number for data space allocation */
	type = btrfs_data_alloc_profile(fs_info);
	rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];

	if (type & BTRFS_BLOCK_GROUP_RAID0)
		num_stripes = nr_devices;
	else if (type & BTRFS_BLOCK_GROUP_RAID1)
		num_stripes = 2;
	else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
		num_stripes = 3;
	else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
		num_stripes = 4;
	else if (type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = 4;

	/* Adjust for more than 1 stripe per device */
	min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;

	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		/* Skip devices not in the fs metadata, without a bdev, or replace targets */
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
						&device->dev_state) ||
		    !device->bdev ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		/* The array was sized from the earlier open_devices snapshot */
		if (i >= nr_devices)
			break;

		avail_space = device->total_bytes - device->bytes_used;

		/* align with stripe_len */
		avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);

		/*
		 * In order to avoid overwriting the superblock on the drive,
		 * btrfs starts at an offset of at least 1MB when doing chunk
		 * allocation.
		 *
		 * This ensures we have at least min_stripe_size free space
		 * after excluding 1MB.
		 */
		if (avail_space <= SZ_1M + min_stripe_size)
			continue;

		avail_space -= SZ_1M;

		devices_info[i].dev = device;
		devices_info[i].max_avail = avail_space;

		i++;
	}
	rcu_read_unlock();

	/* From here on only the entries that were actually filled in count */
	nr_devices = i;

	btrfs_descending_sort_devices(devices_info, nr_devices);

	/*
	 * Walk from the device with the least space towards the one with the
	 * most.  Each round takes the current device's remaining space across
	 * num_stripes devices and subtracts it from those devices.
	 */
	i = nr_devices - 1;
	avail_space = 0;
	while (nr_devices >= rattr->devs_min) {
		num_stripes = min(num_stripes, nr_devices);

		if (devices_info[i].max_avail >= min_stripe_size) {
			int j;
			u64 alloc_size;

			avail_space += devices_info[i].max_avail * num_stripes;
			alloc_size = devices_info[i].max_avail;
			for (j = i + 1 - num_stripes; j <= i; j++)
				devices_info[j].max_avail -= alloc_size;
		}
		i--;
		nr_devices--;
	}

	kfree(devices_info);
	*free_bytes = avail_space;
	return 0;
}
   2323
   2324/*
   2325 * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
   2326 *
   2327 * If there's a redundant raid level at DATA block groups, use the respective
   2328 * multiplier to scale the sizes.
   2329 *
   2330 * Unused device space usage is based on simulating the chunk allocator
   2331 * algorithm that respects the device sizes and order of allocations.  This is
   2332 * a close approximation of the actual use but there are other factors that may
   2333 * change the result (like a new metadata chunk).
   2334 *
   2335 * If metadata is exhausted, f_bavail will be 0.
   2336 */
   2337static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
   2338{
   2339	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
   2340	struct btrfs_super_block *disk_super = fs_info->super_copy;
   2341	struct btrfs_space_info *found;
   2342	u64 total_used = 0;
   2343	u64 total_free_data = 0;
   2344	u64 total_free_meta = 0;
   2345	u32 bits = fs_info->sectorsize_bits;
   2346	__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
   2347	unsigned factor = 1;
   2348	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
   2349	int ret;
   2350	u64 thresh = 0;
   2351	int mixed = 0;
   2352
   2353	list_for_each_entry(found, &fs_info->space_info, list) {
   2354		if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
   2355			int i;
   2356
   2357			total_free_data += found->disk_total - found->disk_used;
   2358			total_free_data -=
   2359				btrfs_account_ro_block_groups_free_space(found);
   2360
   2361			for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
   2362				if (!list_empty(&found->block_groups[i]))
   2363					factor = btrfs_bg_type_to_factor(
   2364						btrfs_raid_array[i].bg_flag);
   2365			}
   2366		}
   2367
   2368		/*
   2369		 * Metadata in mixed block goup profiles are accounted in data
   2370		 */
   2371		if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
   2372			if (found->flags & BTRFS_BLOCK_GROUP_DATA)
   2373				mixed = 1;
   2374			else
   2375				total_free_meta += found->disk_total -
   2376					found->disk_used;
   2377		}
   2378
   2379		total_used += found->disk_used;
   2380	}
   2381
   2382	buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
   2383	buf->f_blocks >>= bits;
   2384	buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
   2385
   2386	/* Account global block reserve as used, it's in logical size already */
   2387	spin_lock(&block_rsv->lock);
   2388	/* Mixed block groups accounting is not byte-accurate, avoid overflow */
   2389	if (buf->f_bfree >= block_rsv->size >> bits)
   2390		buf->f_bfree -= block_rsv->size >> bits;
   2391	else
   2392		buf->f_bfree = 0;
   2393	spin_unlock(&block_rsv->lock);
   2394
   2395	buf->f_bavail = div_u64(total_free_data, factor);
   2396	ret = btrfs_calc_avail_data_space(fs_info, &total_free_data);
   2397	if (ret)
   2398		return ret;
   2399	buf->f_bavail += div_u64(total_free_data, factor);
   2400	buf->f_bavail = buf->f_bavail >> bits;
   2401
   2402	/*
   2403	 * We calculate the remaining metadata space minus global reserve. If
   2404	 * this is (supposedly) smaller than zero, there's no space. But this
   2405	 * does not hold in practice, the exhausted state happens where's still
   2406	 * some positive delta. So we apply some guesswork and compare the
   2407	 * delta to a 4M threshold.  (Practically observed delta was ~2M.)
   2408	 *
   2409	 * We probably cannot calculate the exact threshold value because this
   2410	 * depends on the internal reservations requested by various
   2411	 * operations, so some operations that consume a few metadata will
   2412	 * succeed even if the Avail is zero. But this is better than the other
   2413	 * way around.
   2414	 */
   2415	thresh = SZ_4M;
   2416
   2417	/*
   2418	 * We only want to claim there's no available space if we can no longer
   2419	 * allocate chunks for our metadata profile and our global reserve will
   2420	 * not fit in the free metadata space.  If we aren't ->full then we
   2421	 * still can allocate chunks and thus are fine using the currently
   2422	 * calculated f_bavail.
   2423	 */
   2424	if (!mixed && block_rsv->space_info->full &&
   2425	    total_free_meta - thresh < block_rsv->size)
   2426		buf->f_bavail = 0;
   2427
   2428	buf->f_type = BTRFS_SUPER_MAGIC;
   2429	buf->f_bsize = dentry->d_sb->s_blocksize;
   2430	buf->f_namelen = BTRFS_NAME_LEN;
   2431
   2432	/* We treat it as constant endianness (it doesn't matter _which_)
   2433	   because we want the fsid to come out the same whether mounted
   2434	   on a big-endian or little-endian host */
   2435	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
   2436	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
   2437	/* Mask in the root object ID too, to disambiguate subvols */
   2438	buf->f_fsid.val[0] ^=
   2439		BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
   2440	buf->f_fsid.val[1] ^=
   2441		BTRFS_I(d_inode(dentry))->root->root_key.objectid;
   2442
   2443	return 0;
   2444}
   2445
   2446static void btrfs_kill_super(struct super_block *sb)
   2447{
   2448	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
   2449	kill_anon_super(sb);
   2450	btrfs_free_fs_info(fs_info);
   2451}
   2452
   2453static struct file_system_type btrfs_fs_type = {
   2454	.owner		= THIS_MODULE,
   2455	.name		= "btrfs",
   2456	.mount		= btrfs_mount,
   2457	.kill_sb	= btrfs_kill_super,
   2458	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
   2459};
   2460
   2461static struct file_system_type btrfs_root_fs_type = {
   2462	.owner		= THIS_MODULE,
   2463	.name		= "btrfs",
   2464	.mount		= btrfs_mount_root,
   2465	.kill_sb	= btrfs_kill_super,
   2466	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
   2467};
   2468
   2469MODULE_ALIAS_FS("btrfs");
   2470
/*
 * ->open handler of the control device (see btrfs_ctl_fops).  Only resets
 * the file's private_data; always returns 0.
 */
static int btrfs_control_open(struct inode *inode, struct file *file)
{
	/*
	 * The control file's private_data is used to hold the
	 * transaction when it is started and is used to keep
	 * track of whether a transaction is already in progress.
	 *
	 * NOTE(review): no user of private_data is visible in this part of
	 * the file - confirm that this description is still current.
	 */
	file->private_data = NULL;
	return 0;
}
   2481
   2482/*
   2483 * Used by /dev/btrfs-control for devices ioctls.
   2484 */
   2485static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
   2486				unsigned long arg)
   2487{
   2488	struct btrfs_ioctl_vol_args *vol;
   2489	struct btrfs_device *device = NULL;
   2490	dev_t devt = 0;
   2491	int ret = -ENOTTY;
   2492
   2493	if (!capable(CAP_SYS_ADMIN))
   2494		return -EPERM;
   2495
   2496	vol = memdup_user((void __user *)arg, sizeof(*vol));
   2497	if (IS_ERR(vol))
   2498		return PTR_ERR(vol);
   2499	vol->name[BTRFS_PATH_NAME_MAX] = '\0';
   2500
   2501	switch (cmd) {
   2502	case BTRFS_IOC_SCAN_DEV:
   2503		mutex_lock(&uuid_mutex);
   2504		device = btrfs_scan_one_device(vol->name, FMODE_READ,
   2505					       &btrfs_root_fs_type);
   2506		ret = PTR_ERR_OR_ZERO(device);
   2507		mutex_unlock(&uuid_mutex);
   2508		break;
   2509	case BTRFS_IOC_FORGET_DEV:
   2510		if (vol->name[0] != 0) {
   2511			ret = lookup_bdev(vol->name, &devt);
   2512			if (ret)
   2513				break;
   2514		}
   2515		ret = btrfs_forget_devices(devt);
   2516		break;
   2517	case BTRFS_IOC_DEVICES_READY:
   2518		mutex_lock(&uuid_mutex);
   2519		device = btrfs_scan_one_device(vol->name, FMODE_READ,
   2520					       &btrfs_root_fs_type);
   2521		if (IS_ERR(device)) {
   2522			mutex_unlock(&uuid_mutex);
   2523			ret = PTR_ERR(device);
   2524			break;
   2525		}
   2526		ret = !(device->fs_devices->num_devices ==
   2527			device->fs_devices->total_devices);
   2528		mutex_unlock(&uuid_mutex);
   2529		break;
   2530	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
   2531		ret = btrfs_ioctl_get_supported_features((void __user*)arg);
   2532		break;
   2533	}
   2534
   2535	kfree(vol);
   2536	return ret;
   2537}
   2538
   2539static int btrfs_freeze(struct super_block *sb)
   2540{
   2541	struct btrfs_trans_handle *trans;
   2542	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
   2543	struct btrfs_root *root = fs_info->tree_root;
   2544
   2545	set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
   2546	/*
   2547	 * We don't need a barrier here, we'll wait for any transaction that
   2548	 * could be in progress on other threads (and do delayed iputs that
   2549	 * we want to avoid on a frozen filesystem), or do the commit
   2550	 * ourselves.
   2551	 */
   2552	trans = btrfs_attach_transaction_barrier(root);
   2553	if (IS_ERR(trans)) {
   2554		/* no transaction, don't bother */
   2555		if (PTR_ERR(trans) == -ENOENT)
   2556			return 0;
   2557		return PTR_ERR(trans);
   2558	}
   2559	return btrfs_commit_transaction(trans);
   2560}
   2561
   2562static int btrfs_unfreeze(struct super_block *sb)
   2563{
   2564	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
   2565
   2566	clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
   2567	return 0;
   2568}
   2569
   2570static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
   2571{
   2572	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
   2573
   2574	/*
   2575	 * There should be always a valid pointer in latest_dev, it may be stale
   2576	 * for a short moment in case it's being deleted but still valid until
   2577	 * the end of RCU grace period.
   2578	 */
   2579	rcu_read_lock();
   2580	seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
   2581	rcu_read_unlock();
   2582
   2583	return 0;
   2584}
   2585
   2586static const struct super_operations btrfs_super_ops = {
   2587	.drop_inode	= btrfs_drop_inode,
   2588	.evict_inode	= btrfs_evict_inode,
   2589	.put_super	= btrfs_put_super,
   2590	.sync_fs	= btrfs_sync_fs,
   2591	.show_options	= btrfs_show_options,
   2592	.show_devname	= btrfs_show_devname,
   2593	.alloc_inode	= btrfs_alloc_inode,
   2594	.destroy_inode	= btrfs_destroy_inode,
   2595	.free_inode	= btrfs_free_inode,
   2596	.statfs		= btrfs_statfs,
   2597	.remount_fs	= btrfs_remount,
   2598	.freeze_fs	= btrfs_freeze,
   2599	.unfreeze_fs	= btrfs_unfreeze,
   2600};
   2601
   2602static const struct file_operations btrfs_ctl_fops = {
   2603	.open = btrfs_control_open,
   2604	.unlocked_ioctl	 = btrfs_control_ioctl,
   2605	.compat_ioctl = compat_ptr_ioctl,
   2606	.owner	 = THIS_MODULE,
   2607	.llseek = noop_llseek,
   2608};
   2609
   2610static struct miscdevice btrfs_misc = {
   2611	.minor		= BTRFS_MINOR,
   2612	.name		= "btrfs-control",
   2613	.fops		= &btrfs_ctl_fops
   2614};
   2615
   2616MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
   2617MODULE_ALIAS("devname:btrfs-control");
   2618
/*
 * Register the btrfs-control misc device (btrfs_misc).
 * Returns the result of misc_register().
 */
static int __init btrfs_interface_init(void)
{
	return misc_register(&btrfs_misc);
}
   2623
/* Deregister the btrfs-control misc device (btrfs_misc). */
static __cold void btrfs_interface_exit(void)
{
	misc_deregister(&btrfs_misc);
}
   2628
/*
 * Log a single "Btrfs loaded" line that names the crc32c implementation and
 * lists the compile-time features selected by the config options below.
 */
static void __init btrfs_print_mod_info(void)
{
	/*
	 * Built by compile-time string literal concatenation: each enabled
	 * option appends its ", name=value" fragment.  zoned and fsverity
	 * always appear, with either "yes" or "no".
	 */
	static const char options[] = ""
#ifdef CONFIG_BTRFS_DEBUG
			", debug=on"
#endif
#ifdef CONFIG_BTRFS_ASSERT
			", assert=on"
#endif
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
			", integrity-checker=on"
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
			", ref-verify=on"
#endif
#ifdef CONFIG_BLK_DEV_ZONED
			", zoned=yes"
#else
			", zoned=no"
#endif
#ifdef CONFIG_FS_VERITY
			", fsverity=yes"
#else
			", fsverity=no"
#endif
			;
	pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
}
   2657
/*
 * Module initialization.
 *
 * Brings up the btrfs subsystems one after another, registers the control
 * device, runs the sanity tests and finally registers the filesystem type.
 * If a step fails, the goto labels at the bottom undo the steps that already
 * succeeded in reverse order and the error is returned.
 *
 * Returns 0 on success or the error of the first failing step.
 */
static int __init init_btrfs_fs(void)
{
	int err;

	btrfs_props_init();

	err = btrfs_init_sysfs();
	if (err)
		return err;

	btrfs_init_compress();

	err = btrfs_init_cachep();
	if (err)
		goto free_compress;

	err = extent_io_init();
	if (err)
		goto free_cachep;

	err = extent_state_cache_init();
	if (err)
		goto free_extent_io;

	err = extent_map_init();
	if (err)
		goto free_extent_state_cache;

	err = ordered_data_init();
	if (err)
		goto free_extent_map;

	err = btrfs_delayed_inode_init();
	if (err)
		goto free_ordered_data;

	err = btrfs_auto_defrag_init();
	if (err)
		goto free_delayed_inode;

	err = btrfs_delayed_ref_init();
	if (err)
		goto free_auto_defrag;

	err = btrfs_prelim_ref_init();
	if (err)
		goto free_delayed_ref;

	err = btrfs_end_io_wq_init();
	if (err)
		goto free_prelim_ref;

	/* Registers the btrfs-control misc device */
	err = btrfs_interface_init();
	if (err)
		goto free_end_io_wq;

	btrfs_print_mod_info();

	err = btrfs_run_sanity_tests();
	if (err)
		goto unregister_ioctl;

	/* Last step: make the "btrfs" filesystem type mountable */
	err = register_filesystem(&btrfs_fs_type);
	if (err)
		goto unregister_ioctl;

	return 0;

	/* Error unwinding: each label falls through into the ones below it */
unregister_ioctl:
	btrfs_interface_exit();
free_end_io_wq:
	btrfs_end_io_wq_exit();
free_prelim_ref:
	btrfs_prelim_ref_exit();
free_delayed_ref:
	btrfs_delayed_ref_exit();
free_auto_defrag:
	btrfs_auto_defrag_exit();
free_delayed_inode:
	btrfs_delayed_inode_exit();
free_ordered_data:
	ordered_data_exit();
free_extent_map:
	extent_map_exit();
free_extent_state_cache:
	extent_state_cache_exit();
free_extent_io:
	extent_io_exit();
free_cachep:
	btrfs_destroy_cachep();
free_compress:
	btrfs_exit_compress();
	btrfs_exit_sysfs();

	return err;
}
   2754
/*
 * Module exit: tear down everything init_btrfs_fs() set up and release the
 * scanned device list via btrfs_cleanup_fs_uuids().
 *
 * NOTE(review): the calls below are not in the exact reverse order of
 * init_btrfs_fs() (e.g. the caches are destroyed before the control device
 * and the filesystem type are unregistered) - confirm this ordering is
 * intentional.
 */
static void __exit exit_btrfs_fs(void)
{
	btrfs_destroy_cachep();
	btrfs_delayed_ref_exit();
	btrfs_auto_defrag_exit();
	btrfs_delayed_inode_exit();
	btrfs_prelim_ref_exit();
	ordered_data_exit();
	extent_map_exit();
	extent_state_cache_exit();
	extent_io_exit();
	btrfs_interface_exit();
	btrfs_end_io_wq_exit();
	unregister_filesystem(&btrfs_fs_type);
	btrfs_exit_sysfs();
	btrfs_cleanup_fs_uuids();
	btrfs_exit_compress();
}
   2773
   2774late_initcall(init_btrfs_fs);
   2775module_exit(exit_btrfs_fs)
   2776
   2777MODULE_LICENSE("GPL");
   2778MODULE_SOFTDEP("pre: crc32c");
   2779MODULE_SOFTDEP("pre: xxhash64");
   2780MODULE_SOFTDEP("pre: sha256");
   2781MODULE_SOFTDEP("pre: blake2b-256");