volumes.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
volumes.c (227764B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2007 Oracle.  All rights reserved.
      4 */
      5
      6#include <linux/sched.h>
      7#include <linux/sched/mm.h>
      8#include <linux/bio.h>
      9#include <linux/slab.h>
     10#include <linux/blkdev.h>
     11#include <linux/ratelimit.h>
     12#include <linux/kthread.h>
     13#include <linux/raid/pq.h>
     14#include <linux/semaphore.h>
     15#include <linux/uuid.h>
     16#include <linux/list_sort.h>
     17#include <linux/namei.h>
     18#include "misc.h"
     19#include "ctree.h"
     20#include "extent_map.h"
     21#include "disk-io.h"
     22#include "transaction.h"
     23#include "print-tree.h"
     24#include "volumes.h"
     25#include "raid56.h"
     26#include "async-thread.h"
     27#include "check-integrity.h"
     28#include "rcu-string.h"
     29#include "dev-replace.h"
     30#include "sysfs.h"
     31#include "tree-checker.h"
     32#include "space-info.h"
     33#include "block-group.h"
     34#include "discard.h"
     35#include "zoned.h"
     36
     37#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
     38					 BTRFS_BLOCK_GROUP_RAID10 | \
     39					 BTRFS_BLOCK_GROUP_RAID56_MASK)
     40
     41const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
     42	[BTRFS_RAID_RAID10] = {
     43		.sub_stripes	= 2,
     44		.dev_stripes	= 1,
     45		.devs_max	= 0,	/* 0 == as many as possible */
     46		.devs_min	= 2,
     47		.tolerated_failures = 1,
     48		.devs_increment	= 2,
     49		.ncopies	= 2,
     50		.nparity        = 0,
     51		.raid_name	= "raid10",
     52		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
     53		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
     54	},
     55	[BTRFS_RAID_RAID1] = {
     56		.sub_stripes	= 1,
     57		.dev_stripes	= 1,
     58		.devs_max	= 2,
     59		.devs_min	= 2,
     60		.tolerated_failures = 1,
     61		.devs_increment	= 2,
     62		.ncopies	= 2,
     63		.nparity        = 0,
     64		.raid_name	= "raid1",
     65		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
     66		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
     67	},
     68	[BTRFS_RAID_RAID1C3] = {
     69		.sub_stripes	= 1,
     70		.dev_stripes	= 1,
     71		.devs_max	= 3,
     72		.devs_min	= 3,
     73		.tolerated_failures = 2,
     74		.devs_increment	= 3,
     75		.ncopies	= 3,
     76		.nparity        = 0,
     77		.raid_name	= "raid1c3",
     78		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
     79		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
     80	},
     81	[BTRFS_RAID_RAID1C4] = {
     82		.sub_stripes	= 1,
     83		.dev_stripes	= 1,
     84		.devs_max	= 4,
     85		.devs_min	= 4,
     86		.tolerated_failures = 3,
     87		.devs_increment	= 4,
     88		.ncopies	= 4,
     89		.nparity        = 0,
     90		.raid_name	= "raid1c4",
     91		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
     92		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
     93	},
     94	[BTRFS_RAID_DUP] = {
     95		.sub_stripes	= 1,
     96		.dev_stripes	= 2,
     97		.devs_max	= 1,
     98		.devs_min	= 1,
     99		.tolerated_failures = 0,
    100		.devs_increment	= 1,
    101		.ncopies	= 2,
    102		.nparity        = 0,
    103		.raid_name	= "dup",
    104		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
    105		.mindev_error	= 0,
    106	},
    107	[BTRFS_RAID_RAID0] = {
    108		.sub_stripes	= 1,
    109		.dev_stripes	= 1,
    110		.devs_max	= 0,
    111		.devs_min	= 1,
    112		.tolerated_failures = 0,
    113		.devs_increment	= 1,
    114		.ncopies	= 1,
    115		.nparity        = 0,
    116		.raid_name	= "raid0",
    117		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
    118		.mindev_error	= 0,
    119	},
    120	[BTRFS_RAID_SINGLE] = {
    121		.sub_stripes	= 1,
    122		.dev_stripes	= 1,
    123		.devs_max	= 1,
    124		.devs_min	= 1,
    125		.tolerated_failures = 0,
    126		.devs_increment	= 1,
    127		.ncopies	= 1,
    128		.nparity        = 0,
    129		.raid_name	= "single",
    130		.bg_flag	= 0,
    131		.mindev_error	= 0,
    132	},
    133	[BTRFS_RAID_RAID5] = {
    134		.sub_stripes	= 1,
    135		.dev_stripes	= 1,
    136		.devs_max	= 0,
    137		.devs_min	= 2,
    138		.tolerated_failures = 1,
    139		.devs_increment	= 1,
    140		.ncopies	= 1,
    141		.nparity        = 1,
    142		.raid_name	= "raid5",
    143		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
    144		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
    145	},
    146	[BTRFS_RAID_RAID6] = {
    147		.sub_stripes	= 1,
    148		.dev_stripes	= 1,
    149		.devs_max	= 0,
    150		.devs_min	= 3,
    151		.tolerated_failures = 2,
    152		.devs_increment	= 1,
    153		.ncopies	= 1,
    154		.nparity        = 2,
    155		.raid_name	= "raid6",
    156		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
    157		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
    158	},
    159};
    160
    161/*
    162 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
    163 * can be used as index to access btrfs_raid_array[].
    164 */
    165enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
    166{
    167	const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
    168
    169	if (!profile)
    170		return BTRFS_RAID_SINGLE;
    171
    172	return BTRFS_BG_FLAG_TO_INDEX(profile);
    173}
    174
    175const char *btrfs_bg_type_to_raid_name(u64 flags)
    176{
    177	const int index = btrfs_bg_flags_to_raid_index(flags);
    178
    179	if (index >= BTRFS_NR_RAID_TYPES)
    180		return NULL;
    181
    182	return btrfs_raid_array[index].raid_name;
    183}
    184
    185/*
    186 * Fill @buf with textual description of @bg_flags, no more than @size_buf
    187 * bytes including terminating null byte.
    188 */
    189void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
    190{
    191	int i;
    192	int ret;
    193	char *bp = buf;
    194	u64 flags = bg_flags;
    195	u32 size_bp = size_buf;
    196
    197	if (!flags) {
    198		strcpy(bp, "NONE");
    199		return;
    200	}
    201
    202#define DESCRIBE_FLAG(flag, desc)						\
    203	do {								\
    204		if (flags & (flag)) {					\
    205			ret = snprintf(bp, size_bp, "%s|", (desc));	\
    206			if (ret < 0 || ret >= size_bp)			\
    207				goto out_overflow;			\
    208			size_bp -= ret;					\
    209			bp += ret;					\
    210			flags &= ~(flag);				\
    211		}							\
    212	} while (0)
    213
    214	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
    215	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
    216	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
    217
    218	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
    219	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
    220		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
    221			      btrfs_raid_array[i].raid_name);
    222#undef DESCRIBE_FLAG
    223
    224	if (flags) {
    225		ret = snprintf(bp, size_bp, "0x%llx|", flags);
    226		size_bp -= ret;
    227	}
    228
    229	if (size_bp < size_buf)
    230		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
    231
    232	/*
    233	 * The text is trimmed, it's up to the caller to provide sufficiently
    234	 * large buffer
    235	 */
    236out_overflow:;
    237}
    238
    239static int init_first_rw_device(struct btrfs_trans_handle *trans);
    240static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
    241static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
    242static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
    243static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
    244			     enum btrfs_map_op op,
    245			     u64 logical, u64 *length,
    246			     struct btrfs_io_context **bioc_ret,
    247			     int mirror_num, int need_raid_map);
    248
    249/*
    250 * Device locking
    251 * ==============
    252 *
    253 * There are several mutexes that protect manipulation of devices and low-level
    254 * structures like chunks but not block groups, extents or files
    255 *
    256 * uuid_mutex (global lock)
    257 * ------------------------
    258 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
    259 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
    260 * device) or requested by the device= mount option
    261 *
    262 * the mutex can be very coarse and can cover long-running operations
    263 *
    264 * protects: updates to fs_devices counters like missing devices, rw devices,
    265 * seeding, structure cloning, opening/closing devices at mount/umount time
    266 *
    267 * global::fs_devs - add, remove, updates to the global list
    268 *
    269 * does not protect: manipulation of the fs_devices::devices list in general
    270 * but in mount context it could be used to exclude list modifications by eg.
    271 * scan ioctl
    272 *
    273 * btrfs_device::name - renames (write side), read is RCU
    274 *
    275 * fs_devices::device_list_mutex (per-fs, with RCU)
    276 * ------------------------------------------------
    277 * protects updates to fs_devices::devices, ie. adding and deleting
    278 *
    279 * simple list traversal with read-only actions can be done with RCU protection
    280 *
    281 * may be used to exclude some operations from running concurrently without any
    282 * modifications to the list (see write_all_supers)
    283 *
    284 * Is not required at mount and close times, because our device list is
    285 * protected by the uuid_mutex at that point.
    286 *
    287 * balance_mutex
    288 * -------------
    289 * protects balance structures (status, state) and context accessed from
    290 * several places (internally, ioctl)
    291 *
    292 * chunk_mutex
    293 * -----------
    294 * protects chunks, adding or removing during allocation, trim or when a new
    295 * device is added/removed. Additionally it also protects post_commit_list of
    296 * individual devices, since they can be added to the transaction's
    297 * post_commit_list only with chunk_mutex held.
    298 *
    299 * cleaner_mutex
    300 * -------------
    301 * a big lock that is held by the cleaner thread and prevents running subvolume
    302 * cleaning together with relocation or delayed iputs
    303 *
    304 *
    305 * Lock nesting
    306 * ============
    307 *
    308 * uuid_mutex
    309 *   device_list_mutex
    310 *     chunk_mutex
    311 *   balance_mutex
    312 *
    313 *
    314 * Exclusive operations
    315 * ====================
    316 *
    317 * Maintains the exclusivity of the following operations that apply to the
    318 * whole filesystem and cannot run in parallel.
    319 *
    320 * - Balance (*)
    321 * - Device add
    322 * - Device remove
    323 * - Device replace (*)
    324 * - Resize
    325 *
    326 * The device operations (as above) can be in one of the following states:
    327 *
    328 * - Running state
    329 * - Paused state
    330 * - Completed state
    331 *
    332 * Only device operations marked with (*) can go into the Paused state for the
    333 * following reasons:
    334 *
    335 * - ioctl (only Balance can be Paused through ioctl)
    336 * - filesystem remounted as read-only
    337 * - filesystem unmounted and mounted as read-only
    338 * - system power-cycle and filesystem mounted as read-only
    339 * - filesystem or device errors leading to forced read-only
    340 *
    341 * The status of exclusive operation is set and cleared atomically.
    342 * During the course of Paused state, fs_info::exclusive_operation remains set.
    343 * A device operation in Paused or Running state can be canceled or resumed
    344 * either by ioctl (Balance only) or when remounted as read-write.
    345 * The exclusive status is cleared when the device operation is canceled or
    346 * completed.
    347 */
    348
    349DEFINE_MUTEX(uuid_mutex);
    350static LIST_HEAD(fs_uuids);
    351struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
    352{
    353	return &fs_uuids;
    354}
    355
    356/*
    357 * alloc_fs_devices - allocate struct btrfs_fs_devices
    358 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
    359 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
    360 *
    361 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
    362 * The returned struct is not linked onto any lists and can be destroyed with
    363 * kfree() right away.
    364 */
    365static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
    366						 const u8 *metadata_fsid)
    367{
    368	struct btrfs_fs_devices *fs_devs;
    369
    370	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
    371	if (!fs_devs)
    372		return ERR_PTR(-ENOMEM);
    373
    374	mutex_init(&fs_devs->device_list_mutex);
    375
    376	INIT_LIST_HEAD(&fs_devs->devices);
    377	INIT_LIST_HEAD(&fs_devs->alloc_list);
    378	INIT_LIST_HEAD(&fs_devs->fs_list);
    379	INIT_LIST_HEAD(&fs_devs->seed_list);
    380	if (fsid)
    381		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
    382
    383	if (metadata_fsid)
    384		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
    385	else if (fsid)
    386		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
    387
    388	return fs_devs;
    389}
    390
    391void btrfs_free_device(struct btrfs_device *device)
    392{
    393	WARN_ON(!list_empty(&device->post_commit_list));
    394	rcu_string_free(device->name);
    395	extent_io_tree_release(&device->alloc_state);
    396	btrfs_destroy_dev_zone_info(device);
    397	kfree(device);
    398}
    399
    400static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
    401{
    402	struct btrfs_device *device;
    403	WARN_ON(fs_devices->opened);
    404	while (!list_empty(&fs_devices->devices)) {
    405		device = list_entry(fs_devices->devices.next,
    406				    struct btrfs_device, dev_list);
    407		list_del(&device->dev_list);
    408		btrfs_free_device(device);
    409	}
    410	kfree(fs_devices);
    411}
    412
    413void __exit btrfs_cleanup_fs_uuids(void)
    414{
    415	struct btrfs_fs_devices *fs_devices;
    416
    417	while (!list_empty(&fs_uuids)) {
    418		fs_devices = list_entry(fs_uuids.next,
    419					struct btrfs_fs_devices, fs_list);
    420		list_del(&fs_devices->fs_list);
    421		free_fs_devices(fs_devices);
    422	}
    423}
    424
    425static noinline struct btrfs_fs_devices *find_fsid(
    426		const u8 *fsid, const u8 *metadata_fsid)
    427{
    428	struct btrfs_fs_devices *fs_devices;
    429
    430	ASSERT(fsid);
    431
    432	/* Handle non-split brain cases */
    433	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
    434		if (metadata_fsid) {
    435			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
    436			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
    437				      BTRFS_FSID_SIZE) == 0)
    438				return fs_devices;
    439		} else {
    440			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
    441				return fs_devices;
    442		}
    443	}
    444	return NULL;
    445}
    446
    447static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
    448				struct btrfs_super_block *disk_super)
    449{
    450
    451	struct btrfs_fs_devices *fs_devices;
    452
    453	/*
    454	 * Handle scanned device having completed its fsid change but
    455	 * belonging to a fs_devices that was created by first scanning
    456	 * a device which didn't have its fsid/metadata_uuid changed
    457	 * at all and the CHANGING_FSID_V2 flag set.
    458	 */
    459	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
    460		if (fs_devices->fsid_change &&
    461		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
    462			   BTRFS_FSID_SIZE) == 0 &&
    463		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
    464			   BTRFS_FSID_SIZE) == 0) {
    465			return fs_devices;
    466		}
    467	}
    468	/*
    469	 * Handle scanned device having completed its fsid change but
    470	 * belonging to a fs_devices that was created by a device that
    471	 * has an outdated pair of fsid/metadata_uuid and
    472	 * CHANGING_FSID_V2 flag set.
    473	 */
    474	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
    475		if (fs_devices->fsid_change &&
    476		    memcmp(fs_devices->metadata_uuid,
    477			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
    478		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
    479			   BTRFS_FSID_SIZE) == 0) {
    480			return fs_devices;
    481		}
    482	}
    483
    484	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
    485}
    486
    487
    488static int
    489btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
    490		      int flush, struct block_device **bdev,
    491		      struct btrfs_super_block **disk_super)
    492{
    493	int ret;
    494
    495	*bdev = blkdev_get_by_path(device_path, flags, holder);
    496
    497	if (IS_ERR(*bdev)) {
    498		ret = PTR_ERR(*bdev);
    499		goto error;
    500	}
    501
    502	if (flush)
    503		sync_blockdev(*bdev);
    504	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
    505	if (ret) {
    506		blkdev_put(*bdev, flags);
    507		goto error;
    508	}
    509	invalidate_bdev(*bdev);
    510	*disk_super = btrfs_read_dev_super(*bdev);
    511	if (IS_ERR(*disk_super)) {
    512		ret = PTR_ERR(*disk_super);
    513		blkdev_put(*bdev, flags);
    514		goto error;
    515	}
    516
    517	return 0;
    518
    519error:
    520	*bdev = NULL;
    521	return ret;
    522}
    523
    524/**
    525 *  Search and remove all stale devices (which are not mounted).
    526 *  When both inputs are NULL, it will search and release all stale devices.
    527 *
    528 *  @devt:	Optional. When provided will it release all unmounted devices
    529 *		matching this devt only.
    530 *  @skip_device:  Optional. Will skip this device when searching for the stale
    531 *		devices.
    532 *
    533 *  Return:	0 for success or if @devt is 0.
    534 *		-EBUSY if @devt is a mounted device.
    535 *		-ENOENT if @devt does not match any device in the list.
    536 */
    537static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
    538{
    539	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
    540	struct btrfs_device *device, *tmp_device;
    541	int ret = 0;
    542
    543	lockdep_assert_held(&uuid_mutex);
    544
    545	if (devt)
    546		ret = -ENOENT;
    547
    548	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
    549
    550		mutex_lock(&fs_devices->device_list_mutex);
    551		list_for_each_entry_safe(device, tmp_device,
    552					 &fs_devices->devices, dev_list) {
    553			if (skip_device && skip_device == device)
    554				continue;
    555			if (devt && devt != device->devt)
    556				continue;
    557			if (fs_devices->opened) {
    558				/* for an already deleted device return 0 */
    559				if (devt && ret != 0)
    560					ret = -EBUSY;
    561				break;
    562			}
    563
    564			/* delete the stale device */
    565			fs_devices->num_devices--;
    566			list_del(&device->dev_list);
    567			btrfs_free_device(device);
    568
    569			ret = 0;
    570		}
    571		mutex_unlock(&fs_devices->device_list_mutex);
    572
    573		if (fs_devices->num_devices == 0) {
    574			btrfs_sysfs_remove_fsid(fs_devices);
    575			list_del(&fs_devices->fs_list);
    576			free_fs_devices(fs_devices);
    577		}
    578	}
    579
    580	return ret;
    581}
    582
    583/*
    584 * This is only used on mount, and we are protected from competing things
    585 * messing with our fs_devices by the uuid_mutex, thus we do not need the
    586 * fs_devices->device_list_mutex here.
    587 */
    588static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
    589			struct btrfs_device *device, fmode_t flags,
    590			void *holder)
    591{
    592	struct block_device *bdev;
    593	struct btrfs_super_block *disk_super;
    594	u64 devid;
    595	int ret;
    596
    597	if (device->bdev)
    598		return -EINVAL;
    599	if (!device->name)
    600		return -EINVAL;
    601
    602	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
    603				    &bdev, &disk_super);
    604	if (ret)
    605		return ret;
    606
    607	devid = btrfs_stack_device_id(&disk_super->dev_item);
    608	if (devid != device->devid)
    609		goto error_free_page;
    610
    611	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
    612		goto error_free_page;
    613
    614	device->generation = btrfs_super_generation(disk_super);
    615
    616	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
    617		if (btrfs_super_incompat_flags(disk_super) &
    618		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
    619			pr_err(
    620		"BTRFS: Invalid seeding and uuid-changed device detected\n");
    621			goto error_free_page;
    622		}
    623
    624		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
    625		fs_devices->seeding = true;
    626	} else {
    627		if (bdev_read_only(bdev))
    628			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
    629		else
    630			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
    631	}
    632
    633	if (!bdev_nonrot(bdev))
    634		fs_devices->rotating = true;
    635
    636	device->bdev = bdev;
    637	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
    638	device->mode = flags;
    639
    640	fs_devices->open_devices++;
    641	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
    642	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
    643		fs_devices->rw_devices++;
    644		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
    645	}
    646	btrfs_release_disk_super(disk_super);
    647
    648	return 0;
    649
    650error_free_page:
    651	btrfs_release_disk_super(disk_super);
    652	blkdev_put(bdev, flags);
    653
    654	return -EINVAL;
    655}
    656
    657/*
    658 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
    659 * being created with a disk that has already completed its fsid change. Such
    660 * disk can belong to an fs which has its FSID changed or to one which doesn't.
    661 * Handle both cases here.
    662 */
    663static struct btrfs_fs_devices *find_fsid_inprogress(
    664					struct btrfs_super_block *disk_super)
    665{
    666	struct btrfs_fs_devices *fs_devices;
    667
    668	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
    669		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
    670			   BTRFS_FSID_SIZE) != 0 &&
    671		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
    672			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
    673			return fs_devices;
    674		}
    675	}
    676
    677	return find_fsid(disk_super->fsid, NULL);
    678}
    679
    680
    681static struct btrfs_fs_devices *find_fsid_changed(
    682					struct btrfs_super_block *disk_super)
    683{
    684	struct btrfs_fs_devices *fs_devices;
    685
    686	/*
    687	 * Handles the case where scanned device is part of an fs that had
    688	 * multiple successful changes of FSID but currently device didn't
    689	 * observe it. Meaning our fsid will be different than theirs. We need
    690	 * to handle two subcases :
    691	 *  1 - The fs still continues to have different METADATA/FSID uuids.
    692	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
    693	 *  are equal).
    694	 */
    695	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
    696		/* Changed UUIDs */
    697		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
    698			   BTRFS_FSID_SIZE) != 0 &&
    699		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
    700			   BTRFS_FSID_SIZE) == 0 &&
    701		    memcmp(fs_devices->fsid, disk_super->fsid,
    702			   BTRFS_FSID_SIZE) != 0)
    703			return fs_devices;
    704
    705		/* Unchanged UUIDs */
    706		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
    707			   BTRFS_FSID_SIZE) == 0 &&
    708		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
    709			   BTRFS_FSID_SIZE) == 0)
    710			return fs_devices;
    711	}
    712
    713	return NULL;
    714}
    715
    716static struct btrfs_fs_devices *find_fsid_reverted_metadata(
    717				struct btrfs_super_block *disk_super)
    718{
    719	struct btrfs_fs_devices *fs_devices;
    720
    721	/*
    722	 * Handle the case where the scanned device is part of an fs whose last
    723	 * metadata UUID change reverted it to the original FSID. At the same
    724	 * time * fs_devices was first created by another constitutent device
    725	 * which didn't fully observe the operation. This results in an
    726	 * btrfs_fs_devices created with metadata/fsid different AND
    727	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
    728	 * fs_devices equal to the FSID of the disk.
    729	 */
    730	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
    731		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
    732			   BTRFS_FSID_SIZE) != 0 &&
    733		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
    734			   BTRFS_FSID_SIZE) == 0 &&
    735		    fs_devices->fsid_change)
    736			return fs_devices;
    737	}
    738
    739	return NULL;
    740}
    741/*
    742 * Add new device to list of registered devices
    743 *
    744 * Returns:
    745 * device pointer which was just added or updated when successful
    746 * error pointer when failed
    747 */
    748static noinline struct btrfs_device *device_list_add(const char *path,
    749			   struct btrfs_super_block *disk_super,
    750			   bool *new_device_added)
    751{
    752	struct btrfs_device *device;
    753	struct btrfs_fs_devices *fs_devices = NULL;
    754	struct rcu_string *name;
    755	u64 found_transid = btrfs_super_generation(disk_super);
    756	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
    757	dev_t path_devt;
    758	int error;
    759	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
    760		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
    761	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
    762					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
    763
    764	error = lookup_bdev(path, &path_devt);
    765	if (error)
    766		return ERR_PTR(error);
    767
    768	if (fsid_change_in_progress) {
    769		if (!has_metadata_uuid)
    770			fs_devices = find_fsid_inprogress(disk_super);
    771		else
    772			fs_devices = find_fsid_changed(disk_super);
    773	} else if (has_metadata_uuid) {
    774		fs_devices = find_fsid_with_metadata_uuid(disk_super);
    775	} else {
    776		fs_devices = find_fsid_reverted_metadata(disk_super);
    777		if (!fs_devices)
    778			fs_devices = find_fsid(disk_super->fsid, NULL);
    779	}
    780
    781
    782	if (!fs_devices) {
    783		if (has_metadata_uuid)
    784			fs_devices = alloc_fs_devices(disk_super->fsid,
    785						      disk_super->metadata_uuid);
    786		else
    787			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
    788
    789		if (IS_ERR(fs_devices))
    790			return ERR_CAST(fs_devices);
    791
    792		fs_devices->fsid_change = fsid_change_in_progress;
    793
    794		mutex_lock(&fs_devices->device_list_mutex);
    795		list_add(&fs_devices->fs_list, &fs_uuids);
    796
    797		device = NULL;
    798	} else {
    799		struct btrfs_dev_lookup_args args = {
    800			.devid = devid,
    801			.uuid = disk_super->dev_item.uuid,
    802		};
    803
    804		mutex_lock(&fs_devices->device_list_mutex);
    805		device = btrfs_find_device(fs_devices, &args);
    806
    807		/*
    808		 * If this disk has been pulled into an fs devices created by
    809		 * a device which had the CHANGING_FSID_V2 flag then replace the
    810		 * metadata_uuid/fsid values of the fs_devices.
    811		 */
    812		if (fs_devices->fsid_change &&
    813		    found_transid > fs_devices->latest_generation) {
    814			memcpy(fs_devices->fsid, disk_super->fsid,
    815					BTRFS_FSID_SIZE);
    816
    817			if (has_metadata_uuid)
    818				memcpy(fs_devices->metadata_uuid,
    819				       disk_super->metadata_uuid,
    820				       BTRFS_FSID_SIZE);
    821			else
    822				memcpy(fs_devices->metadata_uuid,
    823				       disk_super->fsid, BTRFS_FSID_SIZE);
    824
    825			fs_devices->fsid_change = false;
    826		}
    827	}
    828
    829	if (!device) {
    830		if (fs_devices->opened) {
    831			mutex_unlock(&fs_devices->device_list_mutex);
    832			return ERR_PTR(-EBUSY);
    833		}
    834
    835		device = btrfs_alloc_device(NULL, &devid,
    836					    disk_super->dev_item.uuid);
    837		if (IS_ERR(device)) {
    838			mutex_unlock(&fs_devices->device_list_mutex);
    839			/* we can safely leave the fs_devices entry around */
    840			return device;
    841		}
    842
    843		name = rcu_string_strdup(path, GFP_NOFS);
    844		if (!name) {
    845			btrfs_free_device(device);
    846			mutex_unlock(&fs_devices->device_list_mutex);
    847			return ERR_PTR(-ENOMEM);
    848		}
    849		rcu_assign_pointer(device->name, name);
    850		device->devt = path_devt;
    851
    852		list_add_rcu(&device->dev_list, &fs_devices->devices);
    853		fs_devices->num_devices++;
    854
    855		device->fs_devices = fs_devices;
    856		*new_device_added = true;
    857
    858		if (disk_super->label[0])
    859			pr_info(
    860	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
    861				disk_super->label, devid, found_transid, path,
    862				current->comm, task_pid_nr(current));
    863		else
    864			pr_info(
    865	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
    866				disk_super->fsid, devid, found_transid, path,
    867				current->comm, task_pid_nr(current));
    868
    869	} else if (!device->name || strcmp(device->name->str, path)) {
    870		/*
    871		 * When FS is already mounted.
    872		 * 1. If you are here and if the device->name is NULL that
    873		 *    means this device was missing at time of FS mount.
    874		 * 2. If you are here and if the device->name is different
    875		 *    from 'path' that means either
    876		 *      a. The same device disappeared and reappeared with
    877		 *         different name. or
    878		 *      b. The missing-disk-which-was-replaced, has
    879		 *         reappeared now.
    880		 *
    881		 * We must allow 1 and 2a above. But 2b would be a spurious
    882		 * and unintentional.
    883		 *
    884		 * Further in case of 1 and 2a above, the disk at 'path'
    885		 * would have missed some transaction when it was away and
    886		 * in case of 2a the stale bdev has to be updated as well.
    887		 * 2b must not be allowed at all time.
    888		 */
    889
    890		/*
    891		 * For now, we do allow update to btrfs_fs_device through the
    892		 * btrfs dev scan cli after FS has been mounted.  We're still
    893		 * tracking a problem where systems fail mount by subvolume id
    894		 * when we reject replacement on a mounted FS.
    895		 */
    896		if (!fs_devices->opened && found_transid < device->generation) {
    897			/*
    898			 * That is if the FS is _not_ mounted and if you
    899			 * are here, that means there is more than one
    900			 * disk with same uuid and devid.We keep the one
    901			 * with larger generation number or the last-in if
    902			 * generation are equal.
    903			 */
    904			mutex_unlock(&fs_devices->device_list_mutex);
    905			return ERR_PTR(-EEXIST);
    906		}
    907
    908		/*
    909		 * We are going to replace the device path for a given devid,
    910		 * make sure it's the same device if the device is mounted
    911		 *
    912		 * NOTE: the device->fs_info may not be reliable here so pass
    913		 * in a NULL to message helpers instead. This avoids a possible
    914		 * use-after-free when the fs_info and fs_info->sb are already
    915		 * torn down.
    916		 */
    917		if (device->bdev) {
    918			if (device->devt != path_devt) {
    919				mutex_unlock(&fs_devices->device_list_mutex);
    920				btrfs_warn_in_rcu(NULL,
    921	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
    922						  path, devid, found_transid,
    923						  current->comm,
    924						  task_pid_nr(current));
    925				return ERR_PTR(-EEXIST);
    926			}
    927			btrfs_info_in_rcu(NULL,
    928	"devid %llu device path %s changed to %s scanned by %s (%d)",
    929					  devid, rcu_str_deref(device->name),
    930					  path, current->comm,
    931					  task_pid_nr(current));
    932		}
    933
    934		name = rcu_string_strdup(path, GFP_NOFS);
    935		if (!name) {
    936			mutex_unlock(&fs_devices->device_list_mutex);
    937			return ERR_PTR(-ENOMEM);
    938		}
    939		rcu_string_free(device->name);
    940		rcu_assign_pointer(device->name, name);
    941		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
    942			fs_devices->missing_devices--;
    943			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
    944		}
    945		device->devt = path_devt;
    946	}
    947
    948	/*
    949	 * Unmount does not free the btrfs_device struct but would zero
    950	 * generation along with most of the other members. So just update
    951	 * it back. We need it to pick the disk with largest generation
    952	 * (as above).
    953	 */
    954	if (!fs_devices->opened) {
    955		device->generation = found_transid;
    956		fs_devices->latest_generation = max_t(u64, found_transid,
    957						fs_devices->latest_generation);
    958	}
    959
    960	fs_devices->total_devices = btrfs_super_num_devices(disk_super);
    961
    962	mutex_unlock(&fs_devices->device_list_mutex);
    963	return device;
    964}
    965
    966static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
    967{
    968	struct btrfs_fs_devices *fs_devices;
    969	struct btrfs_device *device;
    970	struct btrfs_device *orig_dev;
    971	int ret = 0;
    972
    973	lockdep_assert_held(&uuid_mutex);
    974
    975	fs_devices = alloc_fs_devices(orig->fsid, NULL);
    976	if (IS_ERR(fs_devices))
    977		return fs_devices;
    978
    979	fs_devices->total_devices = orig->total_devices;
    980
    981	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
    982		struct rcu_string *name;
    983
    984		device = btrfs_alloc_device(NULL, &orig_dev->devid,
    985					    orig_dev->uuid);
    986		if (IS_ERR(device)) {
    987			ret = PTR_ERR(device);
    988			goto error;
    989		}
    990
    991		/*
    992		 * This is ok to do without rcu read locked because we hold the
    993		 * uuid mutex so nothing we touch in here is going to disappear.
    994		 */
    995		if (orig_dev->name) {
    996			name = rcu_string_strdup(orig_dev->name->str,
    997					GFP_KERNEL);
    998			if (!name) {
    999				btrfs_free_device(device);
   1000				ret = -ENOMEM;
   1001				goto error;
   1002			}
   1003			rcu_assign_pointer(device->name, name);
   1004		}
   1005
   1006		list_add(&device->dev_list, &fs_devices->devices);
   1007		device->fs_devices = fs_devices;
   1008		fs_devices->num_devices++;
   1009	}
   1010	return fs_devices;
   1011error:
   1012	free_fs_devices(fs_devices);
   1013	return ERR_PTR(ret);
   1014}
   1015
   1016static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
   1017				      struct btrfs_device **latest_dev)
   1018{
   1019	struct btrfs_device *device, *next;
   1020
   1021	/* This is the initialized path, it is safe to release the devices. */
   1022	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
   1023		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
   1024			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
   1025				      &device->dev_state) &&
   1026			    !test_bit(BTRFS_DEV_STATE_MISSING,
   1027				      &device->dev_state) &&
   1028			    (!*latest_dev ||
   1029			     device->generation > (*latest_dev)->generation)) {
   1030				*latest_dev = device;
   1031			}
   1032			continue;
   1033		}
   1034
   1035		/*
   1036		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
   1037		 * in btrfs_init_dev_replace() so just continue.
   1038		 */
   1039		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
   1040			continue;
   1041
   1042		if (device->bdev) {
   1043			blkdev_put(device->bdev, device->mode);
   1044			device->bdev = NULL;
   1045			fs_devices->open_devices--;
   1046		}
   1047		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
   1048			list_del_init(&device->dev_alloc_list);
   1049			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
   1050			fs_devices->rw_devices--;
   1051		}
   1052		list_del_init(&device->dev_list);
   1053		fs_devices->num_devices--;
   1054		btrfs_free_device(device);
   1055	}
   1056
   1057}
   1058
   1059/*
   1060 * After we have read the system tree and know devids belonging to this
   1061 * filesystem, remove the device which does not belong there.
   1062 */
   1063void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
   1064{
   1065	struct btrfs_device *latest_dev = NULL;
   1066	struct btrfs_fs_devices *seed_dev;
   1067
   1068	mutex_lock(&uuid_mutex);
   1069	__btrfs_free_extra_devids(fs_devices, &latest_dev);
   1070
   1071	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
   1072		__btrfs_free_extra_devids(seed_dev, &latest_dev);
   1073
   1074	fs_devices->latest_dev = latest_dev;
   1075
   1076	mutex_unlock(&uuid_mutex);
   1077}
   1078
   1079static void btrfs_close_bdev(struct btrfs_device *device)
   1080{
   1081	if (!device->bdev)
   1082		return;
   1083
   1084	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
   1085		sync_blockdev(device->bdev);
   1086		invalidate_bdev(device->bdev);
   1087	}
   1088
   1089	blkdev_put(device->bdev, device->mode);
   1090}
   1091
   1092static void btrfs_close_one_device(struct btrfs_device *device)
   1093{
   1094	struct btrfs_fs_devices *fs_devices = device->fs_devices;
   1095
   1096	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
   1097	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
   1098		list_del_init(&device->dev_alloc_list);
   1099		fs_devices->rw_devices--;
   1100	}
   1101
   1102	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
   1103		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
   1104
   1105	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
   1106		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
   1107		fs_devices->missing_devices--;
   1108	}
   1109
   1110	btrfs_close_bdev(device);
   1111	if (device->bdev) {
   1112		fs_devices->open_devices--;
   1113		device->bdev = NULL;
   1114	}
   1115	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
   1116	btrfs_destroy_dev_zone_info(device);
   1117
   1118	device->fs_info = NULL;
   1119	atomic_set(&device->dev_stats_ccnt, 0);
   1120	extent_io_tree_release(&device->alloc_state);
   1121
   1122	/*
   1123	 * Reset the flush error record. We might have a transient flush error
   1124	 * in this mount, and if so we aborted the current transaction and set
   1125	 * the fs to an error state, guaranteeing no super blocks can be further
   1126	 * committed. However that error might be transient and if we unmount the
   1127	 * filesystem and mount it again, we should allow the mount to succeed
   1128	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
   1129	 * filesystem again we still get flush errors, then we will again abort
   1130	 * any transaction and set the error state, guaranteeing no commits of
   1131	 * unsafe super blocks.
   1132	 */
   1133	device->last_flush_error = 0;
   1134
   1135	/* Verify the device is back in a pristine state  */
   1136	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
   1137	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
   1138	ASSERT(list_empty(&device->dev_alloc_list));
   1139	ASSERT(list_empty(&device->post_commit_list));
   1140}
   1141
   1142static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
   1143{
   1144	struct btrfs_device *device, *tmp;
   1145
   1146	lockdep_assert_held(&uuid_mutex);
   1147
   1148	if (--fs_devices->opened > 0)
   1149		return;
   1150
   1151	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
   1152		btrfs_close_one_device(device);
   1153
   1154	WARN_ON(fs_devices->open_devices);
   1155	WARN_ON(fs_devices->rw_devices);
   1156	fs_devices->opened = 0;
   1157	fs_devices->seeding = false;
   1158	fs_devices->fs_info = NULL;
   1159}
   1160
   1161void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
   1162{
   1163	LIST_HEAD(list);
   1164	struct btrfs_fs_devices *tmp;
   1165
   1166	mutex_lock(&uuid_mutex);
   1167	close_fs_devices(fs_devices);
   1168	if (!fs_devices->opened)
   1169		list_splice_init(&fs_devices->seed_list, &list);
   1170
   1171	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
   1172		close_fs_devices(fs_devices);
   1173		list_del(&fs_devices->seed_list);
   1174		free_fs_devices(fs_devices);
   1175	}
   1176	mutex_unlock(&uuid_mutex);
   1177}
   1178
   1179static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
   1180				fmode_t flags, void *holder)
   1181{
   1182	struct btrfs_device *device;
   1183	struct btrfs_device *latest_dev = NULL;
   1184	struct btrfs_device *tmp_device;
   1185
   1186	flags |= FMODE_EXCL;
   1187
   1188	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
   1189				 dev_list) {
   1190		int ret;
   1191
   1192		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
   1193		if (ret == 0 &&
   1194		    (!latest_dev || device->generation > latest_dev->generation)) {
   1195			latest_dev = device;
   1196		} else if (ret == -ENODATA) {
   1197			fs_devices->num_devices--;
   1198			list_del(&device->dev_list);
   1199			btrfs_free_device(device);
   1200		}
   1201	}
   1202	if (fs_devices->open_devices == 0)
   1203		return -EINVAL;
   1204
   1205	fs_devices->opened = 1;
   1206	fs_devices->latest_dev = latest_dev;
   1207	fs_devices->total_rw_bytes = 0;
   1208	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
   1209	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
   1210
   1211	return 0;
   1212}
   1213
   1214static int devid_cmp(void *priv, const struct list_head *a,
   1215		     const struct list_head *b)
   1216{
   1217	const struct btrfs_device *dev1, *dev2;
   1218
   1219	dev1 = list_entry(a, struct btrfs_device, dev_list);
   1220	dev2 = list_entry(b, struct btrfs_device, dev_list);
   1221
   1222	if (dev1->devid < dev2->devid)
   1223		return -1;
   1224	else if (dev1->devid > dev2->devid)
   1225		return 1;
   1226	return 0;
   1227}
   1228
   1229int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
   1230		       fmode_t flags, void *holder)
   1231{
   1232	int ret;
   1233
   1234	lockdep_assert_held(&uuid_mutex);
   1235	/*
   1236	 * The device_list_mutex cannot be taken here in case opening the
   1237	 * underlying device takes further locks like open_mutex.
   1238	 *
   1239	 * We also don't need the lock here as this is called during mount and
   1240	 * exclusion is provided by uuid_mutex
   1241	 */
   1242
   1243	if (fs_devices->opened) {
   1244		fs_devices->opened++;
   1245		ret = 0;
   1246	} else {
   1247		list_sort(NULL, &fs_devices->devices, devid_cmp);
   1248		ret = open_fs_devices(fs_devices, flags, holder);
   1249	}
   1250
   1251	return ret;
   1252}
   1253
   1254void btrfs_release_disk_super(struct btrfs_super_block *super)
   1255{
   1256	struct page *page = virt_to_page(super);
   1257
   1258	put_page(page);
   1259}
   1260
   1261static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
   1262						       u64 bytenr, u64 bytenr_orig)
   1263{
   1264	struct btrfs_super_block *disk_super;
   1265	struct page *page;
   1266	void *p;
   1267	pgoff_t index;
   1268
   1269	/* make sure our super fits in the device */
   1270	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
   1271		return ERR_PTR(-EINVAL);
   1272
   1273	/* make sure our super fits in the page */
   1274	if (sizeof(*disk_super) > PAGE_SIZE)
   1275		return ERR_PTR(-EINVAL);
   1276
   1277	/* make sure our super doesn't straddle pages on disk */
   1278	index = bytenr >> PAGE_SHIFT;
   1279	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
   1280		return ERR_PTR(-EINVAL);
   1281
   1282	/* pull in the page with our super */
   1283	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
   1284
   1285	if (IS_ERR(page))
   1286		return ERR_CAST(page);
   1287
   1288	p = page_address(page);
   1289
   1290	/* align our pointer to the offset of the super block */
   1291	disk_super = p + offset_in_page(bytenr);
   1292
   1293	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
   1294	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
   1295		btrfs_release_disk_super(p);
   1296		return ERR_PTR(-EINVAL);
   1297	}
   1298
   1299	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
   1300		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
   1301
   1302	return disk_super;
   1303}
   1304
   1305int btrfs_forget_devices(dev_t devt)
   1306{
   1307	int ret;
   1308
   1309	mutex_lock(&uuid_mutex);
   1310	ret = btrfs_free_stale_devices(devt, NULL);
   1311	mutex_unlock(&uuid_mutex);
   1312
   1313	return ret;
   1314}
   1315
   1316/*
   1317 * Look for a btrfs signature on a device. This may be called out of the mount path
   1318 * and we are not allowed to call set_blocksize during the scan. The superblock
   1319 * is read via pagecache
   1320 */
   1321struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
   1322					   void *holder)
   1323{
   1324	struct btrfs_super_block *disk_super;
   1325	bool new_device_added = false;
   1326	struct btrfs_device *device = NULL;
   1327	struct block_device *bdev;
   1328	u64 bytenr, bytenr_orig;
   1329	int ret;
   1330
   1331	lockdep_assert_held(&uuid_mutex);
   1332
   1333	/*
   1334	 * we would like to check all the supers, but that would make
   1335	 * a btrfs mount succeed after a mkfs from a different FS.
   1336	 * So, we need to add a special mount option to scan for
   1337	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
   1338	 */
   1339	flags |= FMODE_EXCL;
   1340
   1341	bdev = blkdev_get_by_path(path, flags, holder);
   1342	if (IS_ERR(bdev))
   1343		return ERR_CAST(bdev);
   1344
   1345	bytenr_orig = btrfs_sb_offset(0);
   1346	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
   1347	if (ret) {
   1348		device = ERR_PTR(ret);
   1349		goto error_bdev_put;
   1350	}
   1351
   1352	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
   1353	if (IS_ERR(disk_super)) {
   1354		device = ERR_CAST(disk_super);
   1355		goto error_bdev_put;
   1356	}
   1357
   1358	device = device_list_add(path, disk_super, &new_device_added);
   1359	if (!IS_ERR(device) && new_device_added)
   1360		btrfs_free_stale_devices(device->devt, device);
   1361
   1362	btrfs_release_disk_super(disk_super);
   1363
   1364error_bdev_put:
   1365	blkdev_put(bdev, flags);
   1366
   1367	return device;
   1368}
   1369
   1370/*
   1371 * Try to find a chunk that intersects [start, start + len] range and when one
   1372 * such is found, record the end of it in *start
   1373 */
   1374static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
   1375				    u64 len)
   1376{
   1377	u64 physical_start, physical_end;
   1378
   1379	lockdep_assert_held(&device->fs_info->chunk_mutex);
   1380
   1381	if (!find_first_extent_bit(&device->alloc_state, *start,
   1382				   &physical_start, &physical_end,
   1383				   CHUNK_ALLOCATED, NULL)) {
   1384
   1385		if (in_range(physical_start, *start, len) ||
   1386		    in_range(*start, physical_start,
   1387			     physical_end - physical_start)) {
   1388			*start = physical_end + 1;
   1389			return true;
   1390		}
   1391	}
   1392	return false;
   1393}
   1394
   1395static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
   1396{
   1397	switch (device->fs_devices->chunk_alloc_policy) {
   1398	case BTRFS_CHUNK_ALLOC_REGULAR:
   1399		/*
   1400		 * We don't want to overwrite the superblock on the drive nor
   1401		 * any area used by the boot loader (grub for example), so we
   1402		 * make sure to start at an offset of at least 1MB.
   1403		 */
   1404		return max_t(u64, start, SZ_1M);
   1405	case BTRFS_CHUNK_ALLOC_ZONED:
   1406		/*
   1407		 * We don't care about the starting region like regular
   1408		 * allocator, because we anyway use/reserve the first two zones
   1409		 * for superblock logging.
   1410		 */
   1411		return ALIGN(start, device->zone_info->zone_size);
   1412	default:
   1413		BUG();
   1414	}
   1415}
   1416
   1417static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
   1418					u64 *hole_start, u64 *hole_size,
   1419					u64 num_bytes)
   1420{
   1421	u64 zone_size = device->zone_info->zone_size;
   1422	u64 pos;
   1423	int ret;
   1424	bool changed = false;
   1425
   1426	ASSERT(IS_ALIGNED(*hole_start, zone_size));
   1427
   1428	while (*hole_size > 0) {
   1429		pos = btrfs_find_allocatable_zones(device, *hole_start,
   1430						   *hole_start + *hole_size,
   1431						   num_bytes);
   1432		if (pos != *hole_start) {
   1433			*hole_size = *hole_start + *hole_size - pos;
   1434			*hole_start = pos;
   1435			changed = true;
   1436			if (*hole_size < num_bytes)
   1437				break;
   1438		}
   1439
   1440		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
   1441
   1442		/* Range is ensured to be empty */
   1443		if (!ret)
   1444			return changed;
   1445
   1446		/* Given hole range was invalid (outside of device) */
   1447		if (ret == -ERANGE) {
   1448			*hole_start += *hole_size;
   1449			*hole_size = 0;
   1450			return true;
   1451		}
   1452
   1453		*hole_start += zone_size;
   1454		*hole_size -= zone_size;
   1455		changed = true;
   1456	}
   1457
   1458	return changed;
   1459}
   1460
   1461/**
   1462 * dev_extent_hole_check - check if specified hole is suitable for allocation
   1463 * @device:	the device which we have the hole
   1464 * @hole_start: starting position of the hole
   1465 * @hole_size:	the size of the hole
   1466 * @num_bytes:	the size of the free space that we need
   1467 *
   1468 * This function may modify @hole_start and @hole_size to reflect the suitable
   1469 * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
   1470 */
   1471static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
   1472				  u64 *hole_size, u64 num_bytes)
   1473{
   1474	bool changed = false;
   1475	u64 hole_end = *hole_start + *hole_size;
   1476
   1477	for (;;) {
   1478		/*
   1479		 * Check before we set max_hole_start, otherwise we could end up
   1480		 * sending back this offset anyway.
   1481		 */
   1482		if (contains_pending_extent(device, hole_start, *hole_size)) {
   1483			if (hole_end >= *hole_start)
   1484				*hole_size = hole_end - *hole_start;
   1485			else
   1486				*hole_size = 0;
   1487			changed = true;
   1488		}
   1489
   1490		switch (device->fs_devices->chunk_alloc_policy) {
   1491		case BTRFS_CHUNK_ALLOC_REGULAR:
   1492			/* No extra check */
   1493			break;
   1494		case BTRFS_CHUNK_ALLOC_ZONED:
   1495			if (dev_extent_hole_check_zoned(device, hole_start,
   1496							hole_size, num_bytes)) {
   1497				changed = true;
   1498				/*
   1499				 * The changed hole can contain pending extent.
   1500				 * Loop again to check that.
   1501				 */
   1502				continue;
   1503			}
   1504			break;
   1505		default:
   1506			BUG();
   1507		}
   1508
   1509		break;
   1510	}
   1511
   1512	return changed;
   1513}
   1514
   1515/*
   1516 * find_free_dev_extent_start - find free space in the specified device
   1517 * @device:	  the device which we search the free space in
   1518 * @num_bytes:	  the size of the free space that we need
   1519 * @search_start: the position from which to begin the search
   1520 * @start:	  store the start of the free space.
   1521 * @len:	  the size of the free space. that we find, or the size
   1522 *		  of the max free space if we don't find suitable free space
   1523 *
   1524 * this uses a pretty simple search, the expectation is that it is
   1525 * called very infrequently and that a given device has a small number
   1526 * of extents
   1527 *
   1528 * @start is used to store the start of the free space if we find. But if we
   1529 * don't find suitable free space, it will be used to store the start position
   1530 * of the max free space.
   1531 *
   1532 * @len is used to store the size of the free space that we find.
   1533 * But if we don't find suitable free space, it is used to store the size of
   1534 * the max free space.
   1535 *
   1536 * NOTE: This function will search *commit* root of device tree, and does extra
   1537 * check to ensure dev extents are not double allocated.
   1538 * This makes the function safe to allocate dev extents but may not report
   1539 * correct usable device space, as device extent freed in current transaction
   1540 * is not reported as available.
   1541 */
   1542static int find_free_dev_extent_start(struct btrfs_device *device,
   1543				u64 num_bytes, u64 search_start, u64 *start,
   1544				u64 *len)
   1545{
   1546	struct btrfs_fs_info *fs_info = device->fs_info;
   1547	struct btrfs_root *root = fs_info->dev_root;
   1548	struct btrfs_key key;
   1549	struct btrfs_dev_extent *dev_extent;
   1550	struct btrfs_path *path;
   1551	u64 hole_size;
   1552	u64 max_hole_start;
   1553	u64 max_hole_size;
   1554	u64 extent_end;
   1555	u64 search_end = device->total_bytes;
   1556	int ret;
   1557	int slot;
   1558	struct extent_buffer *l;
   1559
   1560	search_start = dev_extent_search_start(device, search_start);
   1561
   1562	WARN_ON(device->zone_info &&
   1563		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
   1564
   1565	path = btrfs_alloc_path();
   1566	if (!path)
   1567		return -ENOMEM;
   1568
   1569	max_hole_start = search_start;
   1570	max_hole_size = 0;
   1571
   1572again:
   1573	if (search_start >= search_end ||
   1574		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
   1575		ret = -ENOSPC;
   1576		goto out;
   1577	}
   1578
   1579	path->reada = READA_FORWARD;
   1580	path->search_commit_root = 1;
   1581	path->skip_locking = 1;
   1582
   1583	key.objectid = device->devid;
   1584	key.offset = search_start;
   1585	key.type = BTRFS_DEV_EXTENT_KEY;
   1586
   1587	ret = btrfs_search_backwards(root, &key, path);
   1588	if (ret < 0)
   1589		goto out;
   1590
   1591	while (1) {
   1592		l = path->nodes[0];
   1593		slot = path->slots[0];
   1594		if (slot >= btrfs_header_nritems(l)) {
   1595			ret = btrfs_next_leaf(root, path);
   1596			if (ret == 0)
   1597				continue;
   1598			if (ret < 0)
   1599				goto out;
   1600
   1601			break;
   1602		}
   1603		btrfs_item_key_to_cpu(l, &key, slot);
   1604
   1605		if (key.objectid < device->devid)
   1606			goto next;
   1607
   1608		if (key.objectid > device->devid)
   1609			break;
   1610
   1611		if (key.type != BTRFS_DEV_EXTENT_KEY)
   1612			goto next;
   1613
   1614		if (key.offset > search_start) {
   1615			hole_size = key.offset - search_start;
   1616			dev_extent_hole_check(device, &search_start, &hole_size,
   1617					      num_bytes);
   1618
   1619			if (hole_size > max_hole_size) {
   1620				max_hole_start = search_start;
   1621				max_hole_size = hole_size;
   1622			}
   1623
   1624			/*
   1625			 * If this free space is greater than which we need,
   1626			 * it must be the max free space that we have found
   1627			 * until now, so max_hole_start must point to the start
   1628			 * of this free space and the length of this free space
   1629			 * is stored in max_hole_size. Thus, we return
   1630			 * max_hole_start and max_hole_size and go back to the
   1631			 * caller.
   1632			 */
   1633			if (hole_size >= num_bytes) {
   1634				ret = 0;
   1635				goto out;
   1636			}
   1637		}
   1638
   1639		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
   1640		extent_end = key.offset + btrfs_dev_extent_length(l,
   1641								  dev_extent);
   1642		if (extent_end > search_start)
   1643			search_start = extent_end;
   1644next:
   1645		path->slots[0]++;
   1646		cond_resched();
   1647	}
   1648
   1649	/*
   1650	 * At this point, search_start should be the end of
   1651	 * allocated dev extents, and when shrinking the device,
   1652	 * search_end may be smaller than search_start.
   1653	 */
   1654	if (search_end > search_start) {
   1655		hole_size = search_end - search_start;
   1656		if (dev_extent_hole_check(device, &search_start, &hole_size,
   1657					  num_bytes)) {
   1658			btrfs_release_path(path);
   1659			goto again;
   1660		}
   1661
   1662		if (hole_size > max_hole_size) {
   1663			max_hole_start = search_start;
   1664			max_hole_size = hole_size;
   1665		}
   1666	}
   1667
   1668	/* See above. */
   1669	if (max_hole_size < num_bytes)
   1670		ret = -ENOSPC;
   1671	else
   1672		ret = 0;
   1673
   1674out:
   1675	btrfs_free_path(path);
   1676	*start = max_hole_start;
   1677	if (len)
   1678		*len = max_hole_size;
   1679	return ret;
   1680}
   1681
   1682int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
   1683			 u64 *start, u64 *len)
   1684{
   1685	/* FIXME use last free of some kind */
   1686	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
   1687}
   1688
   1689static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
   1690			  struct btrfs_device *device,
   1691			  u64 start, u64 *dev_extent_len)
   1692{
   1693	struct btrfs_fs_info *fs_info = device->fs_info;
   1694	struct btrfs_root *root = fs_info->dev_root;
   1695	int ret;
   1696	struct btrfs_path *path;
   1697	struct btrfs_key key;
   1698	struct btrfs_key found_key;
   1699	struct extent_buffer *leaf = NULL;
   1700	struct btrfs_dev_extent *extent = NULL;
   1701
   1702	path = btrfs_alloc_path();
   1703	if (!path)
   1704		return -ENOMEM;
   1705
   1706	key.objectid = device->devid;
   1707	key.offset = start;
   1708	key.type = BTRFS_DEV_EXTENT_KEY;
   1709again:
   1710	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
   1711	if (ret > 0) {
   1712		ret = btrfs_previous_item(root, path, key.objectid,
   1713					  BTRFS_DEV_EXTENT_KEY);
   1714		if (ret)
   1715			goto out;
   1716		leaf = path->nodes[0];
   1717		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   1718		extent = btrfs_item_ptr(leaf, path->slots[0],
   1719					struct btrfs_dev_extent);
   1720		BUG_ON(found_key.offset > start || found_key.offset +
   1721		       btrfs_dev_extent_length(leaf, extent) < start);
   1722		key = found_key;
   1723		btrfs_release_path(path);
   1724		goto again;
   1725	} else if (ret == 0) {
   1726		leaf = path->nodes[0];
   1727		extent = btrfs_item_ptr(leaf, path->slots[0],
   1728					struct btrfs_dev_extent);
   1729	} else {
   1730		goto out;
   1731	}
   1732
   1733	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
   1734
   1735	ret = btrfs_del_item(trans, root, path);
   1736	if (ret == 0)
   1737		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
   1738out:
   1739	btrfs_free_path(path);
   1740	return ret;
   1741}
   1742
   1743static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
   1744{
   1745	struct extent_map_tree *em_tree;
   1746	struct extent_map *em;
   1747	struct rb_node *n;
   1748	u64 ret = 0;
   1749
   1750	em_tree = &fs_info->mapping_tree;
   1751	read_lock(&em_tree->lock);
   1752	n = rb_last(&em_tree->map.rb_root);
   1753	if (n) {
   1754		em = rb_entry(n, struct extent_map, rb_node);
   1755		ret = em->start + em->len;
   1756	}
   1757	read_unlock(&em_tree->lock);
   1758
   1759	return ret;
   1760}
   1761
   1762static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
   1763				    u64 *devid_ret)
   1764{
   1765	int ret;
   1766	struct btrfs_key key;
   1767	struct btrfs_key found_key;
   1768	struct btrfs_path *path;
   1769
   1770	path = btrfs_alloc_path();
   1771	if (!path)
   1772		return -ENOMEM;
   1773
   1774	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
   1775	key.type = BTRFS_DEV_ITEM_KEY;
   1776	key.offset = (u64)-1;
   1777
   1778	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
   1779	if (ret < 0)
   1780		goto error;
   1781
   1782	if (ret == 0) {
   1783		/* Corruption */
   1784		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
   1785		ret = -EUCLEAN;
   1786		goto error;
   1787	}
   1788
   1789	ret = btrfs_previous_item(fs_info->chunk_root, path,
   1790				  BTRFS_DEV_ITEMS_OBJECTID,
   1791				  BTRFS_DEV_ITEM_KEY);
   1792	if (ret) {
   1793		*devid_ret = 1;
   1794	} else {
   1795		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
   1796				      path->slots[0]);
   1797		*devid_ret = found_key.offset + 1;
   1798	}
   1799	ret = 0;
   1800error:
   1801	btrfs_free_path(path);
   1802	return ret;
   1803}
   1804
   1805/*
   1806 * the device information is stored in the chunk root
   1807 * the btrfs_device struct should be fully filled in
   1808 */
   1809static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
   1810			    struct btrfs_device *device)
   1811{
   1812	int ret;
   1813	struct btrfs_path *path;
   1814	struct btrfs_dev_item *dev_item;
   1815	struct extent_buffer *leaf;
   1816	struct btrfs_key key;
   1817	unsigned long ptr;
   1818
   1819	path = btrfs_alloc_path();
   1820	if (!path)
   1821		return -ENOMEM;
   1822
   1823	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
   1824	key.type = BTRFS_DEV_ITEM_KEY;
   1825	key.offset = device->devid;
   1826
   1827	btrfs_reserve_chunk_metadata(trans, true);
   1828	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
   1829				      &key, sizeof(*dev_item));
   1830	btrfs_trans_release_chunk_metadata(trans);
   1831	if (ret)
   1832		goto out;
   1833
   1834	leaf = path->nodes[0];
   1835	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
   1836
   1837	btrfs_set_device_id(leaf, dev_item, device->devid);
   1838	btrfs_set_device_generation(leaf, dev_item, 0);
   1839	btrfs_set_device_type(leaf, dev_item, device->type);
   1840	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
   1841	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
   1842	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
   1843	btrfs_set_device_total_bytes(leaf, dev_item,
   1844				     btrfs_device_get_disk_total_bytes(device));
   1845	btrfs_set_device_bytes_used(leaf, dev_item,
   1846				    btrfs_device_get_bytes_used(device));
   1847	btrfs_set_device_group(leaf, dev_item, 0);
   1848	btrfs_set_device_seek_speed(leaf, dev_item, 0);
   1849	btrfs_set_device_bandwidth(leaf, dev_item, 0);
   1850	btrfs_set_device_start_offset(leaf, dev_item, 0);
   1851
   1852	ptr = btrfs_device_uuid(dev_item);
   1853	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
   1854	ptr = btrfs_device_fsid(dev_item);
   1855	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
   1856			    ptr, BTRFS_FSID_SIZE);
   1857	btrfs_mark_buffer_dirty(leaf);
   1858
   1859	ret = 0;
   1860out:
   1861	btrfs_free_path(path);
   1862	return ret;
   1863}
   1864
   1865/*
   1866 * Function to update ctime/mtime for a given device path.
   1867 * Mainly used for ctime/mtime based probe like libblkid.
   1868 *
   1869 * We don't care about errors here, this is just to be kind to userspace.
   1870 */
   1871static void update_dev_time(const char *device_path)
   1872{
   1873	struct path path;
   1874	struct timespec64 now;
   1875	int ret;
   1876
   1877	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
   1878	if (ret)
   1879		return;
   1880
   1881	now = current_time(d_inode(path.dentry));
   1882	inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
   1883	path_put(&path);
   1884}
   1885
   1886static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
   1887			     struct btrfs_device *device)
   1888{
   1889	struct btrfs_root *root = device->fs_info->chunk_root;
   1890	int ret;
   1891	struct btrfs_path *path;
   1892	struct btrfs_key key;
   1893
   1894	path = btrfs_alloc_path();
   1895	if (!path)
   1896		return -ENOMEM;
   1897
   1898	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
   1899	key.type = BTRFS_DEV_ITEM_KEY;
   1900	key.offset = device->devid;
   1901
   1902	btrfs_reserve_chunk_metadata(trans, false);
   1903	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
   1904	btrfs_trans_release_chunk_metadata(trans);
   1905	if (ret) {
   1906		if (ret > 0)
   1907			ret = -ENOENT;
   1908		goto out;
   1909	}
   1910
   1911	ret = btrfs_del_item(trans, root, path);
   1912out:
   1913	btrfs_free_path(path);
   1914	return ret;
   1915}
   1916
   1917/*
   1918 * Verify that @num_devices satisfies the RAID profile constraints in the whole
   1919 * filesystem. It's up to the caller to adjust that number regarding eg. device
   1920 * replace.
   1921 */
   1922static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
   1923		u64 num_devices)
   1924{
   1925	u64 all_avail;
   1926	unsigned seq;
   1927	int i;
   1928
   1929	do {
   1930		seq = read_seqbegin(&fs_info->profiles_lock);
   1931
   1932		all_avail = fs_info->avail_data_alloc_bits |
   1933			    fs_info->avail_system_alloc_bits |
   1934			    fs_info->avail_metadata_alloc_bits;
   1935	} while (read_seqretry(&fs_info->profiles_lock, seq));
   1936
   1937	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
   1938		if (!(all_avail & btrfs_raid_array[i].bg_flag))
   1939			continue;
   1940
   1941		if (num_devices < btrfs_raid_array[i].devs_min)
   1942			return btrfs_raid_array[i].mindev_error;
   1943	}
   1944
   1945	return 0;
   1946}
   1947
   1948static struct btrfs_device * btrfs_find_next_active_device(
   1949		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
   1950{
   1951	struct btrfs_device *next_device;
   1952
   1953	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
   1954		if (next_device != device &&
   1955		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
   1956		    && next_device->bdev)
   1957			return next_device;
   1958	}
   1959
   1960	return NULL;
   1961}
   1962
   1963/*
   1964 * Helper function to check if the given device is part of s_bdev / latest_dev
   1965 * and replace it with the provided or the next active device, in the context
   1966 * where this function called, there should be always be another device (or
   1967 * this_dev) which is active.
   1968 */
   1969void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
   1970					    struct btrfs_device *next_device)
   1971{
   1972	struct btrfs_fs_info *fs_info = device->fs_info;
   1973
   1974	if (!next_device)
   1975		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
   1976							    device);
   1977	ASSERT(next_device);
   1978
   1979	if (fs_info->sb->s_bdev &&
   1980			(fs_info->sb->s_bdev == device->bdev))
   1981		fs_info->sb->s_bdev = next_device->bdev;
   1982
   1983	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
   1984		fs_info->fs_devices->latest_dev = next_device;
   1985}
   1986
   1987/*
   1988 * Return btrfs_fs_devices::num_devices excluding the device that's being
   1989 * currently replaced.
   1990 */
   1991static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
   1992{
   1993	u64 num_devices = fs_info->fs_devices->num_devices;
   1994
   1995	down_read(&fs_info->dev_replace.rwsem);
   1996	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
   1997		ASSERT(num_devices > 1);
   1998		num_devices--;
   1999	}
   2000	up_read(&fs_info->dev_replace.rwsem);
   2001
   2002	return num_devices;
   2003}
   2004
   2005void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
   2006			       struct block_device *bdev,
   2007			       const char *device_path)
   2008{
   2009	struct btrfs_super_block *disk_super;
   2010	int copy_num;
   2011
   2012	if (!bdev)
   2013		return;
   2014
   2015	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
   2016		struct page *page;
   2017		int ret;
   2018
   2019		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
   2020		if (IS_ERR(disk_super))
   2021			continue;
   2022
   2023		if (bdev_is_zoned(bdev)) {
   2024			btrfs_reset_sb_log_zones(bdev, copy_num);
   2025			continue;
   2026		}
   2027
   2028		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
   2029
   2030		page = virt_to_page(disk_super);
   2031		set_page_dirty(page);
   2032		lock_page(page);
   2033		/* write_on_page() unlocks the page */
   2034		ret = write_one_page(page);
   2035		if (ret)
   2036			btrfs_warn(fs_info,
   2037				"error clearing superblock number %d (%d)",
   2038				copy_num, ret);
   2039		btrfs_release_disk_super(disk_super);
   2040
   2041	}
   2042
   2043	/* Notify udev that device has changed */
   2044	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
   2045
   2046	/* Update ctime/mtime for device path for libblkid */
   2047	update_dev_time(device_path);
   2048}
   2049
   2050int btrfs_rm_device(struct btrfs_fs_info *fs_info,
   2051		    struct btrfs_dev_lookup_args *args,
   2052		    struct block_device **bdev, fmode_t *mode)
   2053{
   2054	struct btrfs_trans_handle *trans;
   2055	struct btrfs_device *device;
   2056	struct btrfs_fs_devices *cur_devices;
   2057	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   2058	u64 num_devices;
   2059	int ret = 0;
   2060
   2061	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
   2062		btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
   2063		return -EINVAL;
   2064	}
   2065
   2066	/*
   2067	 * The device list in fs_devices is accessed without locks (neither
   2068	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
   2069	 * filesystem and another device rm cannot run.
   2070	 */
   2071	num_devices = btrfs_num_devices(fs_info);
   2072
   2073	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
   2074	if (ret)
   2075		return ret;
   2076
   2077	device = btrfs_find_device(fs_info->fs_devices, args);
   2078	if (!device) {
   2079		if (args->missing)
   2080			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
   2081		else
   2082			ret = -ENOENT;
   2083		return ret;
   2084	}
   2085
   2086	if (btrfs_pinned_by_swapfile(fs_info, device)) {
   2087		btrfs_warn_in_rcu(fs_info,
   2088		  "cannot remove device %s (devid %llu) due to active swapfile",
   2089				  rcu_str_deref(device->name), device->devid);
   2090		return -ETXTBSY;
   2091	}
   2092
   2093	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
   2094		return BTRFS_ERROR_DEV_TGT_REPLACE;
   2095
   2096	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
   2097	    fs_info->fs_devices->rw_devices == 1)
   2098		return BTRFS_ERROR_DEV_ONLY_WRITABLE;
   2099
   2100	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
   2101		mutex_lock(&fs_info->chunk_mutex);
   2102		list_del_init(&device->dev_alloc_list);
   2103		device->fs_devices->rw_devices--;
   2104		mutex_unlock(&fs_info->chunk_mutex);
   2105	}
   2106
   2107	ret = btrfs_shrink_device(device, 0);
   2108	if (ret)
   2109		goto error_undo;
   2110
   2111	trans = btrfs_start_transaction(fs_info->chunk_root, 0);
   2112	if (IS_ERR(trans)) {
   2113		ret = PTR_ERR(trans);
   2114		goto error_undo;
   2115	}
   2116
   2117	ret = btrfs_rm_dev_item(trans, device);
   2118	if (ret) {
   2119		/* Any error in dev item removal is critical */
   2120		btrfs_crit(fs_info,
   2121			   "failed to remove device item for devid %llu: %d",
   2122			   device->devid, ret);
   2123		btrfs_abort_transaction(trans, ret);
   2124		btrfs_end_transaction(trans);
   2125		return ret;
   2126	}
   2127
   2128	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
   2129	btrfs_scrub_cancel_dev(device);
   2130
   2131	/*
   2132	 * the device list mutex makes sure that we don't change
   2133	 * the device list while someone else is writing out all
   2134	 * the device supers. Whoever is writing all supers, should
   2135	 * lock the device list mutex before getting the number of
   2136	 * devices in the super block (super_copy). Conversely,
   2137	 * whoever updates the number of devices in the super block
   2138	 * (super_copy) should hold the device list mutex.
   2139	 */
   2140
   2141	/*
   2142	 * In normal cases the cur_devices == fs_devices. But in case
   2143	 * of deleting a seed device, the cur_devices should point to
   2144	 * its own fs_devices listed under the fs_devices->seed_list.
   2145	 */
   2146	cur_devices = device->fs_devices;
   2147	mutex_lock(&fs_devices->device_list_mutex);
   2148	list_del_rcu(&device->dev_list);
   2149
   2150	cur_devices->num_devices--;
   2151	cur_devices->total_devices--;
   2152	/* Update total_devices of the parent fs_devices if it's seed */
   2153	if (cur_devices != fs_devices)
   2154		fs_devices->total_devices--;
   2155
   2156	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
   2157		cur_devices->missing_devices--;
   2158
   2159	btrfs_assign_next_active_device(device, NULL);
   2160
   2161	if (device->bdev) {
   2162		cur_devices->open_devices--;
   2163		/* remove sysfs entry */
   2164		btrfs_sysfs_remove_device(device);
   2165	}
   2166
   2167	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
   2168	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
   2169	mutex_unlock(&fs_devices->device_list_mutex);
   2170
   2171	/*
   2172	 * At this point, the device is zero sized and detached from the
   2173	 * devices list.  All that's left is to zero out the old supers and
   2174	 * free the device.
   2175	 *
   2176	 * We cannot call btrfs_close_bdev() here because we're holding the sb
   2177	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
   2178	 * block device and it's dependencies.  Instead just flush the device
   2179	 * and let the caller do the final blkdev_put.
   2180	 */
   2181	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
   2182		btrfs_scratch_superblocks(fs_info, device->bdev,
   2183					  device->name->str);
   2184		if (device->bdev) {
   2185			sync_blockdev(device->bdev);
   2186			invalidate_bdev(device->bdev);
   2187		}
   2188	}
   2189
   2190	*bdev = device->bdev;
   2191	*mode = device->mode;
   2192	synchronize_rcu();
   2193	btrfs_free_device(device);
   2194
   2195	/*
   2196	 * This can happen if cur_devices is the private seed devices list.  We
   2197	 * cannot call close_fs_devices() here because it expects the uuid_mutex
   2198	 * to be held, but in fact we don't need that for the private
   2199	 * seed_devices, we can simply decrement cur_devices->opened and then
   2200	 * remove it from our list and free the fs_devices.
   2201	 */
   2202	if (cur_devices->num_devices == 0) {
   2203		list_del_init(&cur_devices->seed_list);
   2204		ASSERT(cur_devices->opened == 1);
   2205		cur_devices->opened--;
   2206		free_fs_devices(cur_devices);
   2207	}
   2208
   2209	ret = btrfs_commit_transaction(trans);
   2210
   2211	return ret;
   2212
   2213error_undo:
   2214	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
   2215		mutex_lock(&fs_info->chunk_mutex);
   2216		list_add(&device->dev_alloc_list,
   2217			 &fs_devices->alloc_list);
   2218		device->fs_devices->rw_devices++;
   2219		mutex_unlock(&fs_info->chunk_mutex);
   2220	}
   2221	return ret;
   2222}
   2223
   2224void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
   2225{
   2226	struct btrfs_fs_devices *fs_devices;
   2227
   2228	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
   2229
   2230	/*
   2231	 * in case of fs with no seed, srcdev->fs_devices will point
   2232	 * to fs_devices of fs_info. However when the dev being replaced is
   2233	 * a seed dev it will point to the seed's local fs_devices. In short
   2234	 * srcdev will have its correct fs_devices in both the cases.
   2235	 */
   2236	fs_devices = srcdev->fs_devices;
   2237
   2238	list_del_rcu(&srcdev->dev_list);
   2239	list_del(&srcdev->dev_alloc_list);
   2240	fs_devices->num_devices--;
   2241	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
   2242		fs_devices->missing_devices--;
   2243
   2244	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
   2245		fs_devices->rw_devices--;
   2246
   2247	if (srcdev->bdev)
   2248		fs_devices->open_devices--;
   2249}
   2250
   2251void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
   2252{
   2253	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
   2254
   2255	mutex_lock(&uuid_mutex);
   2256
   2257	btrfs_close_bdev(srcdev);
   2258	synchronize_rcu();
   2259	btrfs_free_device(srcdev);
   2260
   2261	/* if this is no devs we rather delete the fs_devices */
   2262	if (!fs_devices->num_devices) {
   2263		/*
   2264		 * On a mounted FS, num_devices can't be zero unless it's a
   2265		 * seed. In case of a seed device being replaced, the replace
   2266		 * target added to the sprout FS, so there will be no more
   2267		 * device left under the seed FS.
   2268		 */
   2269		ASSERT(fs_devices->seeding);
   2270
   2271		list_del_init(&fs_devices->seed_list);
   2272		close_fs_devices(fs_devices);
   2273		free_fs_devices(fs_devices);
   2274	}
   2275	mutex_unlock(&uuid_mutex);
   2276}
   2277
   2278void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
   2279{
   2280	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
   2281
   2282	mutex_lock(&fs_devices->device_list_mutex);
   2283
   2284	btrfs_sysfs_remove_device(tgtdev);
   2285
   2286	if (tgtdev->bdev)
   2287		fs_devices->open_devices--;
   2288
   2289	fs_devices->num_devices--;
   2290
   2291	btrfs_assign_next_active_device(tgtdev, NULL);
   2292
   2293	list_del_rcu(&tgtdev->dev_list);
   2294
   2295	mutex_unlock(&fs_devices->device_list_mutex);
   2296
   2297	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
   2298				  tgtdev->name->str);
   2299
   2300	btrfs_close_bdev(tgtdev);
   2301	synchronize_rcu();
   2302	btrfs_free_device(tgtdev);
   2303}
   2304
   2305/**
   2306 * Populate args from device at path
   2307 *
   2308 * @fs_info:	the filesystem
   2309 * @args:	the args to populate
   2310 * @path:	the path to the device
   2311 *
   2312 * This will read the super block of the device at @path and populate @args with
   2313 * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
   2314 * lookup a device to operate on, but need to do it before we take any locks.
   2315 * This properly handles the special case of "missing" that a user may pass in,
   2316 * and does some basic sanity checks.  The caller must make sure that @path is
   2317 * properly NUL terminated before calling in, and must call
   2318 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
   2319 * uuid buffers.
   2320 *
   2321 * Return: 0 for success, -errno for failure
   2322 */
   2323int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
   2324				 struct btrfs_dev_lookup_args *args,
   2325				 const char *path)
   2326{
   2327	struct btrfs_super_block *disk_super;
   2328	struct block_device *bdev;
   2329	int ret;
   2330
   2331	if (!path || !path[0])
   2332		return -EINVAL;
   2333	if (!strcmp(path, "missing")) {
   2334		args->missing = true;
   2335		return 0;
   2336	}
   2337
   2338	args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
   2339	args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
   2340	if (!args->uuid || !args->fsid) {
   2341		btrfs_put_dev_args_from_path(args);
   2342		return -ENOMEM;
   2343	}
   2344
   2345	ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
   2346				    &bdev, &disk_super);
   2347	if (ret)
   2348		return ret;
   2349	args->devid = btrfs_stack_device_id(&disk_super->dev_item);
   2350	memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
   2351	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
   2352		memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
   2353	else
   2354		memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
   2355	btrfs_release_disk_super(disk_super);
   2356	blkdev_put(bdev, FMODE_READ);
   2357	return 0;
   2358}
   2359
   2360/*
   2361 * Only use this jointly with btrfs_get_dev_args_from_path() because we will
   2362 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
   2363 * that don't need to be freed.
   2364 */
   2365void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
   2366{
   2367	kfree(args->uuid);
   2368	kfree(args->fsid);
   2369	args->uuid = NULL;
   2370	args->fsid = NULL;
   2371}
   2372
   2373struct btrfs_device *btrfs_find_device_by_devspec(
   2374		struct btrfs_fs_info *fs_info, u64 devid,
   2375		const char *device_path)
   2376{
   2377	BTRFS_DEV_LOOKUP_ARGS(args);
   2378	struct btrfs_device *device;
   2379	int ret;
   2380
   2381	if (devid) {
   2382		args.devid = devid;
   2383		device = btrfs_find_device(fs_info->fs_devices, &args);
   2384		if (!device)
   2385			return ERR_PTR(-ENOENT);
   2386		return device;
   2387	}
   2388
   2389	ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
   2390	if (ret)
   2391		return ERR_PTR(ret);
   2392	device = btrfs_find_device(fs_info->fs_devices, &args);
   2393	btrfs_put_dev_args_from_path(&args);
   2394	if (!device)
   2395		return ERR_PTR(-ENOENT);
   2396	return device;
   2397}
   2398
   2399static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
   2400{
   2401	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   2402	struct btrfs_fs_devices *old_devices;
   2403	struct btrfs_fs_devices *seed_devices;
   2404
   2405	lockdep_assert_held(&uuid_mutex);
   2406	if (!fs_devices->seeding)
   2407		return ERR_PTR(-EINVAL);
   2408
   2409	/*
   2410	 * Private copy of the seed devices, anchored at
   2411	 * fs_info->fs_devices->seed_list
   2412	 */
   2413	seed_devices = alloc_fs_devices(NULL, NULL);
   2414	if (IS_ERR(seed_devices))
   2415		return seed_devices;
   2416
   2417	/*
   2418	 * It's necessary to retain a copy of the original seed fs_devices in
   2419	 * fs_uuids so that filesystems which have been seeded can successfully
   2420	 * reference the seed device from open_seed_devices. This also supports
   2421	 * multiple fs seed.
   2422	 */
   2423	old_devices = clone_fs_devices(fs_devices);
   2424	if (IS_ERR(old_devices)) {
   2425		kfree(seed_devices);
   2426		return old_devices;
   2427	}
   2428
   2429	list_add(&old_devices->fs_list, &fs_uuids);
   2430
   2431	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
   2432	seed_devices->opened = 1;
   2433	INIT_LIST_HEAD(&seed_devices->devices);
   2434	INIT_LIST_HEAD(&seed_devices->alloc_list);
   2435	mutex_init(&seed_devices->device_list_mutex);
   2436
   2437	return seed_devices;
   2438}
   2439
   2440/*
   2441 * Splice seed devices into the sprout fs_devices.
   2442 * Generate a new fsid for the sprouted read-write filesystem.
   2443 */
   2444static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
   2445			       struct btrfs_fs_devices *seed_devices)
   2446{
   2447	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   2448	struct btrfs_super_block *disk_super = fs_info->super_copy;
   2449	struct btrfs_device *device;
   2450	u64 super_flags;
   2451
   2452	/*
   2453	 * We are updating the fsid, the thread leading to device_list_add()
   2454	 * could race, so uuid_mutex is needed.
   2455	 */
   2456	lockdep_assert_held(&uuid_mutex);
   2457
   2458	/*
   2459	 * The threads listed below may traverse dev_list but can do that without
   2460	 * device_list_mutex:
   2461	 * - All device ops and balance - as we are in btrfs_exclop_start.
   2462	 * - Various dev_list readers - are using RCU.
   2463	 * - btrfs_ioctl_fitrim() - is using RCU.
   2464	 *
   2465	 * For-read threads as below are using device_list_mutex:
   2466	 * - Readonly scrub btrfs_scrub_dev()
   2467	 * - Readonly scrub btrfs_scrub_progress()
   2468	 * - btrfs_get_dev_stats()
   2469	 */
   2470	lockdep_assert_held(&fs_devices->device_list_mutex);
   2471
   2472	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
   2473			      synchronize_rcu);
   2474	list_for_each_entry(device, &seed_devices->devices, dev_list)
   2475		device->fs_devices = seed_devices;
   2476
   2477	fs_devices->seeding = false;
   2478	fs_devices->num_devices = 0;
   2479	fs_devices->open_devices = 0;
   2480	fs_devices->missing_devices = 0;
   2481	fs_devices->rotating = false;
   2482	list_add(&seed_devices->seed_list, &fs_devices->seed_list);
   2483
   2484	generate_random_uuid(fs_devices->fsid);
   2485	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
   2486	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
   2487
   2488	super_flags = btrfs_super_flags(disk_super) &
   2489		      ~BTRFS_SUPER_FLAG_SEEDING;
   2490	btrfs_set_super_flags(disk_super, super_flags);
   2491}
   2492
   2493/*
   2494 * Store the expected generation for seed devices in device items.
   2495 */
   2496static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
   2497{
   2498	BTRFS_DEV_LOOKUP_ARGS(args);
   2499	struct btrfs_fs_info *fs_info = trans->fs_info;
   2500	struct btrfs_root *root = fs_info->chunk_root;
   2501	struct btrfs_path *path;
   2502	struct extent_buffer *leaf;
   2503	struct btrfs_dev_item *dev_item;
   2504	struct btrfs_device *device;
   2505	struct btrfs_key key;
   2506	u8 fs_uuid[BTRFS_FSID_SIZE];
   2507	u8 dev_uuid[BTRFS_UUID_SIZE];
   2508	int ret;
   2509
   2510	path = btrfs_alloc_path();
   2511	if (!path)
   2512		return -ENOMEM;
   2513
   2514	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
   2515	key.offset = 0;
   2516	key.type = BTRFS_DEV_ITEM_KEY;
   2517
   2518	while (1) {
   2519		btrfs_reserve_chunk_metadata(trans, false);
   2520		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
   2521		btrfs_trans_release_chunk_metadata(trans);
   2522		if (ret < 0)
   2523			goto error;
   2524
   2525		leaf = path->nodes[0];
   2526next_slot:
   2527		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
   2528			ret = btrfs_next_leaf(root, path);
   2529			if (ret > 0)
   2530				break;
   2531			if (ret < 0)
   2532				goto error;
   2533			leaf = path->nodes[0];
   2534			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   2535			btrfs_release_path(path);
   2536			continue;
   2537		}
   2538
   2539		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   2540		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
   2541		    key.type != BTRFS_DEV_ITEM_KEY)
   2542			break;
   2543
   2544		dev_item = btrfs_item_ptr(leaf, path->slots[0],
   2545					  struct btrfs_dev_item);
   2546		args.devid = btrfs_device_id(leaf, dev_item);
   2547		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
   2548				   BTRFS_UUID_SIZE);
   2549		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
   2550				   BTRFS_FSID_SIZE);
   2551		args.uuid = dev_uuid;
   2552		args.fsid = fs_uuid;
   2553		device = btrfs_find_device(fs_info->fs_devices, &args);
   2554		BUG_ON(!device); /* Logic error */
   2555
   2556		if (device->fs_devices->seeding) {
   2557			btrfs_set_device_generation(leaf, dev_item,
   2558						    device->generation);
   2559			btrfs_mark_buffer_dirty(leaf);
   2560		}
   2561
   2562		path->slots[0]++;
   2563		goto next_slot;
   2564	}
   2565	ret = 0;
   2566error:
   2567	btrfs_free_path(path);
   2568	return ret;
   2569}
   2570
   2571int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
   2572{
   2573	struct btrfs_root *root = fs_info->dev_root;
   2574	struct btrfs_trans_handle *trans;
   2575	struct btrfs_device *device;
   2576	struct block_device *bdev;
   2577	struct super_block *sb = fs_info->sb;
   2578	struct rcu_string *name;
   2579	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   2580	struct btrfs_fs_devices *seed_devices;
   2581	u64 orig_super_total_bytes;
   2582	u64 orig_super_num_devices;
   2583	int ret = 0;
   2584	bool seeding_dev = false;
   2585	bool locked = false;
   2586
   2587	if (sb_rdonly(sb) && !fs_devices->seeding)
   2588		return -EROFS;
   2589
   2590	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
   2591				  fs_info->bdev_holder);
   2592	if (IS_ERR(bdev))
   2593		return PTR_ERR(bdev);
   2594
   2595	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
   2596		ret = -EINVAL;
   2597		goto error;
   2598	}
   2599
   2600	if (fs_devices->seeding) {
   2601		seeding_dev = true;
   2602		down_write(&sb->s_umount);
   2603		mutex_lock(&uuid_mutex);
   2604		locked = true;
   2605	}
   2606
   2607	sync_blockdev(bdev);
   2608
   2609	rcu_read_lock();
   2610	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
   2611		if (device->bdev == bdev) {
   2612			ret = -EEXIST;
   2613			rcu_read_unlock();
   2614			goto error;
   2615		}
   2616	}
   2617	rcu_read_unlock();
   2618
   2619	device = btrfs_alloc_device(fs_info, NULL, NULL);
   2620	if (IS_ERR(device)) {
   2621		/* we can safely leave the fs_devices entry around */
   2622		ret = PTR_ERR(device);
   2623		goto error;
   2624	}
   2625
   2626	name = rcu_string_strdup(device_path, GFP_KERNEL);
   2627	if (!name) {
   2628		ret = -ENOMEM;
   2629		goto error_free_device;
   2630	}
   2631	rcu_assign_pointer(device->name, name);
   2632
   2633	device->fs_info = fs_info;
   2634	device->bdev = bdev;
   2635	ret = lookup_bdev(device_path, &device->devt);
   2636	if (ret)
   2637		goto error_free_device;
   2638
   2639	ret = btrfs_get_dev_zone_info(device, false);
   2640	if (ret)
   2641		goto error_free_device;
   2642
   2643	trans = btrfs_start_transaction(root, 0);
   2644	if (IS_ERR(trans)) {
   2645		ret = PTR_ERR(trans);
   2646		goto error_free_zone;
   2647	}
   2648
   2649	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
   2650	device->generation = trans->transid;
   2651	device->io_width = fs_info->sectorsize;
   2652	device->io_align = fs_info->sectorsize;
   2653	device->sector_size = fs_info->sectorsize;
   2654	device->total_bytes =
   2655		round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
   2656	device->disk_total_bytes = device->total_bytes;
   2657	device->commit_total_bytes = device->total_bytes;
   2658	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
   2659	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
   2660	device->mode = FMODE_EXCL;
   2661	device->dev_stats_valid = 1;
   2662	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
   2663
   2664	if (seeding_dev) {
   2665		btrfs_clear_sb_rdonly(sb);
   2666
   2667		/* GFP_KERNEL allocation must not be under device_list_mutex */
   2668		seed_devices = btrfs_init_sprout(fs_info);
   2669		if (IS_ERR(seed_devices)) {
   2670			ret = PTR_ERR(seed_devices);
   2671			btrfs_abort_transaction(trans, ret);
   2672			goto error_trans;
   2673		}
   2674	}
   2675
   2676	mutex_lock(&fs_devices->device_list_mutex);
   2677	if (seeding_dev) {
   2678		btrfs_setup_sprout(fs_info, seed_devices);
   2679		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
   2680						device);
   2681	}
   2682
   2683	device->fs_devices = fs_devices;
   2684
   2685	mutex_lock(&fs_info->chunk_mutex);
   2686	list_add_rcu(&device->dev_list, &fs_devices->devices);
   2687	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
   2688	fs_devices->num_devices++;
   2689	fs_devices->open_devices++;
   2690	fs_devices->rw_devices++;
   2691	fs_devices->total_devices++;
   2692	fs_devices->total_rw_bytes += device->total_bytes;
   2693
   2694	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
   2695
   2696	if (!bdev_nonrot(bdev))
   2697		fs_devices->rotating = true;
   2698
   2699	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
   2700	btrfs_set_super_total_bytes(fs_info->super_copy,
   2701		round_down(orig_super_total_bytes + device->total_bytes,
   2702			   fs_info->sectorsize));
   2703
   2704	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
   2705	btrfs_set_super_num_devices(fs_info->super_copy,
   2706				    orig_super_num_devices + 1);
   2707
   2708	/*
   2709	 * we've got more storage, clear any full flags on the space
   2710	 * infos
   2711	 */
   2712	btrfs_clear_space_info_full(fs_info);
   2713
   2714	mutex_unlock(&fs_info->chunk_mutex);
   2715
   2716	/* Add sysfs device entry */
   2717	btrfs_sysfs_add_device(device);
   2718
   2719	mutex_unlock(&fs_devices->device_list_mutex);
   2720
   2721	if (seeding_dev) {
   2722		mutex_lock(&fs_info->chunk_mutex);
   2723		ret = init_first_rw_device(trans);
   2724		mutex_unlock(&fs_info->chunk_mutex);
   2725		if (ret) {
   2726			btrfs_abort_transaction(trans, ret);
   2727			goto error_sysfs;
   2728		}
   2729	}
   2730
   2731	ret = btrfs_add_dev_item(trans, device);
   2732	if (ret) {
   2733		btrfs_abort_transaction(trans, ret);
   2734		goto error_sysfs;
   2735	}
   2736
   2737	if (seeding_dev) {
   2738		ret = btrfs_finish_sprout(trans);
   2739		if (ret) {
   2740			btrfs_abort_transaction(trans, ret);
   2741			goto error_sysfs;
   2742		}
   2743
   2744		/*
   2745		 * fs_devices now represents the newly sprouted filesystem and
   2746		 * its fsid has been changed by btrfs_sprout_splice().
   2747		 */
   2748		btrfs_sysfs_update_sprout_fsid(fs_devices);
   2749	}
   2750
   2751	ret = btrfs_commit_transaction(trans);
   2752
   2753	if (seeding_dev) {
   2754		mutex_unlock(&uuid_mutex);
   2755		up_write(&sb->s_umount);
   2756		locked = false;
   2757
   2758		if (ret) /* transaction commit */
   2759			return ret;
   2760
   2761		ret = btrfs_relocate_sys_chunks(fs_info);
   2762		if (ret < 0)
   2763			btrfs_handle_fs_error(fs_info, ret,
   2764				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
   2765		trans = btrfs_attach_transaction(root);
   2766		if (IS_ERR(trans)) {
   2767			if (PTR_ERR(trans) == -ENOENT)
   2768				return 0;
   2769			ret = PTR_ERR(trans);
   2770			trans = NULL;
   2771			goto error_sysfs;
   2772		}
   2773		ret = btrfs_commit_transaction(trans);
   2774	}
   2775
   2776	/*
   2777	 * Now that we have written a new super block to this device, check all
   2778	 * other fs_devices list if device_path alienates any other scanned
   2779	 * device.
   2780	 * We can ignore the return value as it typically returns -EINVAL and
   2781	 * only succeeds if the device was an alien.
   2782	 */
   2783	btrfs_forget_devices(device->devt);
   2784
   2785	/* Update ctime/mtime for blkid or udev */
   2786	update_dev_time(device_path);
   2787
   2788	return ret;
   2789
   2790error_sysfs:
   2791	btrfs_sysfs_remove_device(device);
   2792	mutex_lock(&fs_info->fs_devices->device_list_mutex);
   2793	mutex_lock(&fs_info->chunk_mutex);
   2794	list_del_rcu(&device->dev_list);
   2795	list_del(&device->dev_alloc_list);
   2796	fs_info->fs_devices->num_devices--;
   2797	fs_info->fs_devices->open_devices--;
   2798	fs_info->fs_devices->rw_devices--;
   2799	fs_info->fs_devices->total_devices--;
   2800	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
   2801	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
   2802	btrfs_set_super_total_bytes(fs_info->super_copy,
   2803				    orig_super_total_bytes);
   2804	btrfs_set_super_num_devices(fs_info->super_copy,
   2805				    orig_super_num_devices);
   2806	mutex_unlock(&fs_info->chunk_mutex);
   2807	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   2808error_trans:
   2809	if (seeding_dev)
   2810		btrfs_set_sb_rdonly(sb);
   2811	if (trans)
   2812		btrfs_end_transaction(trans);
   2813error_free_zone:
   2814	btrfs_destroy_dev_zone_info(device);
   2815error_free_device:
   2816	btrfs_free_device(device);
   2817error:
   2818	blkdev_put(bdev, FMODE_EXCL);
   2819	if (locked) {
   2820		mutex_unlock(&uuid_mutex);
   2821		up_write(&sb->s_umount);
   2822	}
   2823	return ret;
   2824}
   2825
   2826static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
   2827					struct btrfs_device *device)
   2828{
   2829	int ret;
   2830	struct btrfs_path *path;
   2831	struct btrfs_root *root = device->fs_info->chunk_root;
   2832	struct btrfs_dev_item *dev_item;
   2833	struct extent_buffer *leaf;
   2834	struct btrfs_key key;
   2835
   2836	path = btrfs_alloc_path();
   2837	if (!path)
   2838		return -ENOMEM;
   2839
   2840	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
   2841	key.type = BTRFS_DEV_ITEM_KEY;
   2842	key.offset = device->devid;
   2843
   2844	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
   2845	if (ret < 0)
   2846		goto out;
   2847
   2848	if (ret > 0) {
   2849		ret = -ENOENT;
   2850		goto out;
   2851	}
   2852
   2853	leaf = path->nodes[0];
   2854	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
   2855
   2856	btrfs_set_device_id(leaf, dev_item, device->devid);
   2857	btrfs_set_device_type(leaf, dev_item, device->type);
   2858	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
   2859	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
   2860	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
   2861	btrfs_set_device_total_bytes(leaf, dev_item,
   2862				     btrfs_device_get_disk_total_bytes(device));
   2863	btrfs_set_device_bytes_used(leaf, dev_item,
   2864				    btrfs_device_get_bytes_used(device));
   2865	btrfs_mark_buffer_dirty(leaf);
   2866
   2867out:
   2868	btrfs_free_path(path);
   2869	return ret;
   2870}
   2871
   2872int btrfs_grow_device(struct btrfs_trans_handle *trans,
   2873		      struct btrfs_device *device, u64 new_size)
   2874{
   2875	struct btrfs_fs_info *fs_info = device->fs_info;
   2876	struct btrfs_super_block *super_copy = fs_info->super_copy;
   2877	u64 old_total;
   2878	u64 diff;
   2879	int ret;
   2880
   2881	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
   2882		return -EACCES;
   2883
   2884	new_size = round_down(new_size, fs_info->sectorsize);
   2885
   2886	mutex_lock(&fs_info->chunk_mutex);
   2887	old_total = btrfs_super_total_bytes(super_copy);
   2888	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
   2889
   2890	if (new_size <= device->total_bytes ||
   2891	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
   2892		mutex_unlock(&fs_info->chunk_mutex);
   2893		return -EINVAL;
   2894	}
   2895
   2896	btrfs_set_super_total_bytes(super_copy,
   2897			round_down(old_total + diff, fs_info->sectorsize));
   2898	device->fs_devices->total_rw_bytes += diff;
   2899
   2900	btrfs_device_set_total_bytes(device, new_size);
   2901	btrfs_device_set_disk_total_bytes(device, new_size);
   2902	btrfs_clear_space_info_full(device->fs_info);
   2903	if (list_empty(&device->post_commit_list))
   2904		list_add_tail(&device->post_commit_list,
   2905			      &trans->transaction->dev_update_list);
   2906	mutex_unlock(&fs_info->chunk_mutex);
   2907
   2908	btrfs_reserve_chunk_metadata(trans, false);
   2909	ret = btrfs_update_device(trans, device);
   2910	btrfs_trans_release_chunk_metadata(trans);
   2911
   2912	return ret;
   2913}
   2914
   2915static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
   2916{
   2917	struct btrfs_fs_info *fs_info = trans->fs_info;
   2918	struct btrfs_root *root = fs_info->chunk_root;
   2919	int ret;
   2920	struct btrfs_path *path;
   2921	struct btrfs_key key;
   2922
   2923	path = btrfs_alloc_path();
   2924	if (!path)
   2925		return -ENOMEM;
   2926
   2927	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
   2928	key.offset = chunk_offset;
   2929	key.type = BTRFS_CHUNK_ITEM_KEY;
   2930
   2931	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
   2932	if (ret < 0)
   2933		goto out;
   2934	else if (ret > 0) { /* Logic error or corruption */
   2935		btrfs_handle_fs_error(fs_info, -ENOENT,
   2936				      "Failed lookup while freeing chunk.");
   2937		ret = -ENOENT;
   2938		goto out;
   2939	}
   2940
   2941	ret = btrfs_del_item(trans, root, path);
   2942	if (ret < 0)
   2943		btrfs_handle_fs_error(fs_info, ret,
   2944				      "Failed to delete chunk item.");
   2945out:
   2946	btrfs_free_path(path);
   2947	return ret;
   2948}
   2949
   2950static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
   2951{
   2952	struct btrfs_super_block *super_copy = fs_info->super_copy;
   2953	struct btrfs_disk_key *disk_key;
   2954	struct btrfs_chunk *chunk;
   2955	u8 *ptr;
   2956	int ret = 0;
   2957	u32 num_stripes;
   2958	u32 array_size;
   2959	u32 len = 0;
   2960	u32 cur;
   2961	struct btrfs_key key;
   2962
   2963	lockdep_assert_held(&fs_info->chunk_mutex);
   2964	array_size = btrfs_super_sys_array_size(super_copy);
   2965
   2966	ptr = super_copy->sys_chunk_array;
   2967	cur = 0;
   2968
   2969	while (cur < array_size) {
   2970		disk_key = (struct btrfs_disk_key *)ptr;
   2971		btrfs_disk_key_to_cpu(&key, disk_key);
   2972
   2973		len = sizeof(*disk_key);
   2974
   2975		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
   2976			chunk = (struct btrfs_chunk *)(ptr + len);
   2977			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
   2978			len += btrfs_chunk_item_size(num_stripes);
   2979		} else {
   2980			ret = -EIO;
   2981			break;
   2982		}
   2983		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
   2984		    key.offset == chunk_offset) {
   2985			memmove(ptr, ptr + len, array_size - (cur + len));
   2986			array_size -= len;
   2987			btrfs_set_super_sys_array_size(super_copy, array_size);
   2988		} else {
   2989			ptr += len;
   2990			cur += len;
   2991		}
   2992	}
   2993	return ret;
   2994}
   2995
   2996/*
   2997 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
   2998 * @logical: Logical block offset in bytes.
   2999 * @length: Length of extent in bytes.
   3000 *
   3001 * Return: Chunk mapping or ERR_PTR.
   3002 */
   3003struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
   3004				       u64 logical, u64 length)
   3005{
   3006	struct extent_map_tree *em_tree;
   3007	struct extent_map *em;
   3008
   3009	em_tree = &fs_info->mapping_tree;
   3010	read_lock(&em_tree->lock);
   3011	em = lookup_extent_mapping(em_tree, logical, length);
   3012	read_unlock(&em_tree->lock);
   3013
   3014	if (!em) {
   3015		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
   3016			   logical, length);
   3017		return ERR_PTR(-EINVAL);
   3018	}
   3019
   3020	if (em->start > logical || em->start + em->len < logical) {
   3021		btrfs_crit(fs_info,
   3022			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
   3023			   logical, length, em->start, em->start + em->len);
   3024		free_extent_map(em);
   3025		return ERR_PTR(-EINVAL);
   3026	}
   3027
   3028	/* callers are responsible for dropping em's ref. */
   3029	return em;
   3030}
   3031
   3032static int remove_chunk_item(struct btrfs_trans_handle *trans,
   3033			     struct map_lookup *map, u64 chunk_offset)
   3034{
   3035	int i;
   3036
   3037	/*
   3038	 * Removing chunk items and updating the device items in the chunks btree
   3039	 * requires holding the chunk_mutex.
   3040	 * See the comment at btrfs_chunk_alloc() for the details.
   3041	 */
   3042	lockdep_assert_held(&trans->fs_info->chunk_mutex);
   3043
   3044	for (i = 0; i < map->num_stripes; i++) {
   3045		int ret;
   3046
   3047		ret = btrfs_update_device(trans, map->stripes[i].dev);
   3048		if (ret)
   3049			return ret;
   3050	}
   3051
   3052	return btrfs_free_chunk(trans, chunk_offset);
   3053}
   3054
   3055int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
   3056{
   3057	struct btrfs_fs_info *fs_info = trans->fs_info;
   3058	struct extent_map *em;
   3059	struct map_lookup *map;
   3060	u64 dev_extent_len = 0;
   3061	int i, ret = 0;
   3062	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   3063
   3064	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
   3065	if (IS_ERR(em)) {
   3066		/*
   3067		 * This is a logic error, but we don't want to just rely on the
   3068		 * user having built with ASSERT enabled, so if ASSERT doesn't
   3069		 * do anything we still error out.
   3070		 */
   3071		ASSERT(0);
   3072		return PTR_ERR(em);
   3073	}
   3074	map = em->map_lookup;
   3075
   3076	/*
   3077	 * First delete the device extent items from the devices btree.
   3078	 * We take the device_list_mutex to avoid racing with the finishing phase
   3079	 * of a device replace operation. See the comment below before acquiring
   3080	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
   3081	 * because that can result in a deadlock when deleting the device extent
   3082	 * items from the devices btree - COWing an extent buffer from the btree
   3083	 * may result in allocating a new metadata chunk, which would attempt to
   3084	 * lock again fs_info->chunk_mutex.
   3085	 */
   3086	mutex_lock(&fs_devices->device_list_mutex);
   3087	for (i = 0; i < map->num_stripes; i++) {
   3088		struct btrfs_device *device = map->stripes[i].dev;
   3089		ret = btrfs_free_dev_extent(trans, device,
   3090					    map->stripes[i].physical,
   3091					    &dev_extent_len);
   3092		if (ret) {
   3093			mutex_unlock(&fs_devices->device_list_mutex);
   3094			btrfs_abort_transaction(trans, ret);
   3095			goto out;
   3096		}
   3097
   3098		if (device->bytes_used > 0) {
   3099			mutex_lock(&fs_info->chunk_mutex);
   3100			btrfs_device_set_bytes_used(device,
   3101					device->bytes_used - dev_extent_len);
   3102			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
   3103			btrfs_clear_space_info_full(fs_info);
   3104			mutex_unlock(&fs_info->chunk_mutex);
   3105		}
   3106	}
   3107	mutex_unlock(&fs_devices->device_list_mutex);
   3108
   3109	/*
   3110	 * We acquire fs_info->chunk_mutex for 2 reasons:
   3111	 *
   3112	 * 1) Just like with the first phase of the chunk allocation, we must
   3113	 *    reserve system space, do all chunk btree updates and deletions, and
   3114	 *    update the system chunk array in the superblock while holding this
   3115	 *    mutex. This is for similar reasons as explained on the comment at
   3116	 *    the top of btrfs_chunk_alloc();
   3117	 *
   3118	 * 2) Prevent races with the final phase of a device replace operation
   3119	 *    that replaces the device object associated with the map's stripes,
   3120	 *    because the device object's id can change at any time during that
   3121	 *    final phase of the device replace operation
   3122	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
   3123	 *    replaced device and then see it with an ID of
   3124	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
   3125	 *    the device item, which does not exists on the chunk btree.
   3126	 *    The finishing phase of device replace acquires both the
   3127	 *    device_list_mutex and the chunk_mutex, in that order, so we are
   3128	 *    safe by just acquiring the chunk_mutex.
   3129	 */
   3130	trans->removing_chunk = true;
   3131	mutex_lock(&fs_info->chunk_mutex);
   3132
   3133	check_system_chunk(trans, map->type);
   3134
   3135	ret = remove_chunk_item(trans, map, chunk_offset);
   3136	/*
   3137	 * Normally we should not get -ENOSPC since we reserved space before
   3138	 * through the call to check_system_chunk().
   3139	 *
   3140	 * Despite our system space_info having enough free space, we may not
   3141	 * be able to allocate extents from its block groups, because all have
   3142	 * an incompatible profile, which will force us to allocate a new system
   3143	 * block group with the right profile, or right after we called
   3144	 * check_system_space() above, a scrub turned the only system block group
   3145	 * with enough free space into RO mode.
   3146	 * This is explained with more detail at do_chunk_alloc().
   3147	 *
   3148	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
   3149	 */
   3150	if (ret == -ENOSPC) {
   3151		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
   3152		struct btrfs_block_group *sys_bg;
   3153
   3154		sys_bg = btrfs_create_chunk(trans, sys_flags);
   3155		if (IS_ERR(sys_bg)) {
   3156			ret = PTR_ERR(sys_bg);
   3157			btrfs_abort_transaction(trans, ret);
   3158			goto out;
   3159		}
   3160
   3161		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
   3162		if (ret) {
   3163			btrfs_abort_transaction(trans, ret);
   3164			goto out;
   3165		}
   3166
   3167		ret = remove_chunk_item(trans, map, chunk_offset);
   3168		if (ret) {
   3169			btrfs_abort_transaction(trans, ret);
   3170			goto out;
   3171		}
   3172	} else if (ret) {
   3173		btrfs_abort_transaction(trans, ret);
   3174		goto out;
   3175	}
   3176
   3177	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
   3178
   3179	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
   3180		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
   3181		if (ret) {
   3182			btrfs_abort_transaction(trans, ret);
   3183			goto out;
   3184		}
   3185	}
   3186
   3187	mutex_unlock(&fs_info->chunk_mutex);
   3188	trans->removing_chunk = false;
   3189
   3190	/*
   3191	 * We are done with chunk btree updates and deletions, so release the
   3192	 * system space we previously reserved (with check_system_chunk()).
   3193	 */
   3194	btrfs_trans_release_chunk_metadata(trans);
   3195
   3196	ret = btrfs_remove_block_group(trans, chunk_offset, em);
   3197	if (ret) {
   3198		btrfs_abort_transaction(trans, ret);
   3199		goto out;
   3200	}
   3201
   3202out:
   3203	if (trans->removing_chunk) {
   3204		mutex_unlock(&fs_info->chunk_mutex);
   3205		trans->removing_chunk = false;
   3206	}
   3207	/* once for us */
   3208	free_extent_map(em);
   3209	return ret;
   3210}
   3211
   3212int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
   3213{
   3214	struct btrfs_root *root = fs_info->chunk_root;
   3215	struct btrfs_trans_handle *trans;
   3216	struct btrfs_block_group *block_group;
   3217	u64 length;
   3218	int ret;
   3219
   3220	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
   3221		btrfs_err(fs_info,
   3222			  "relocate: not supported on extent tree v2 yet");
   3223		return -EINVAL;
   3224	}
   3225
   3226	/*
   3227	 * Prevent races with automatic removal of unused block groups.
   3228	 * After we relocate and before we remove the chunk with offset
   3229	 * chunk_offset, automatic removal of the block group can kick in,
   3230	 * resulting in a failure when calling btrfs_remove_chunk() below.
   3231	 *
   3232	 * Make sure to acquire this mutex before doing a tree search (dev
   3233	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
   3234	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
   3235	 * we release the path used to search the chunk/dev tree and before
   3236	 * the current task acquires this mutex and calls us.
   3237	 */
   3238	lockdep_assert_held(&fs_info->reclaim_bgs_lock);
   3239
   3240	/* step one, relocate all the extents inside this chunk */
   3241	btrfs_scrub_pause(fs_info);
   3242	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
   3243	btrfs_scrub_continue(fs_info);
   3244	if (ret)
   3245		return ret;
   3246
   3247	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
   3248	if (!block_group)
   3249		return -ENOENT;
   3250	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
   3251	length = block_group->length;
   3252	btrfs_put_block_group(block_group);
   3253
   3254	/*
   3255	 * On a zoned file system, discard the whole block group, this will
   3256	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
   3257	 * resetting the zone fails, don't treat it as a fatal problem from the
   3258	 * filesystem's point of view.
   3259	 */
   3260	if (btrfs_is_zoned(fs_info)) {
   3261		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
   3262		if (ret)
   3263			btrfs_info(fs_info,
   3264				"failed to reset zone %llu after relocation",
   3265				chunk_offset);
   3266	}
   3267
   3268	trans = btrfs_start_trans_remove_block_group(root->fs_info,
   3269						     chunk_offset);
   3270	if (IS_ERR(trans)) {
   3271		ret = PTR_ERR(trans);
   3272		btrfs_handle_fs_error(root->fs_info, ret, NULL);
   3273		return ret;
   3274	}
   3275
   3276	/*
   3277	 * step two, delete the device extents and the
   3278	 * chunk tree entries
   3279	 */
   3280	ret = btrfs_remove_chunk(trans, chunk_offset);
   3281	btrfs_end_transaction(trans);
   3282	return ret;
   3283}
   3284
   3285static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
   3286{
   3287	struct btrfs_root *chunk_root = fs_info->chunk_root;
   3288	struct btrfs_path *path;
   3289	struct extent_buffer *leaf;
   3290	struct btrfs_chunk *chunk;
   3291	struct btrfs_key key;
   3292	struct btrfs_key found_key;
   3293	u64 chunk_type;
   3294	bool retried = false;
   3295	int failed = 0;
   3296	int ret;
   3297
   3298	path = btrfs_alloc_path();
   3299	if (!path)
   3300		return -ENOMEM;
   3301
   3302again:
   3303	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
   3304	key.offset = (u64)-1;
   3305	key.type = BTRFS_CHUNK_ITEM_KEY;
   3306
   3307	while (1) {
   3308		mutex_lock(&fs_info->reclaim_bgs_lock);
   3309		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
   3310		if (ret < 0) {
   3311			mutex_unlock(&fs_info->reclaim_bgs_lock);
   3312			goto error;
   3313		}
   3314		BUG_ON(ret == 0); /* Corruption */
   3315
   3316		ret = btrfs_previous_item(chunk_root, path, key.objectid,
   3317					  key.type);
   3318		if (ret)
   3319			mutex_unlock(&fs_info->reclaim_bgs_lock);
   3320		if (ret < 0)
   3321			goto error;
   3322		if (ret > 0)
   3323			break;
   3324
   3325		leaf = path->nodes[0];
   3326		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   3327
   3328		chunk = btrfs_item_ptr(leaf, path->slots[0],
   3329				       struct btrfs_chunk);
   3330		chunk_type = btrfs_chunk_type(leaf, chunk);
   3331		btrfs_release_path(path);
   3332
   3333		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
   3334			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
   3335			if (ret == -ENOSPC)
   3336				failed++;
   3337			else
   3338				BUG_ON(ret);
   3339		}
   3340		mutex_unlock(&fs_info->reclaim_bgs_lock);
   3341
   3342		if (found_key.offset == 0)
   3343			break;
   3344		key.offset = found_key.offset - 1;
   3345	}
   3346	ret = 0;
   3347	if (failed && !retried) {
   3348		failed = 0;
   3349		retried = true;
   3350		goto again;
   3351	} else if (WARN_ON(failed && retried)) {
   3352		ret = -ENOSPC;
   3353	}
   3354error:
   3355	btrfs_free_path(path);
   3356	return ret;
   3357}
   3358
   3359/*
   3360 * return 1 : allocate a data chunk successfully,
   3361 * return <0: errors during allocating a data chunk,
   3362 * return 0 : no need to allocate a data chunk.
   3363 */
   3364static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
   3365				      u64 chunk_offset)
   3366{
   3367	struct btrfs_block_group *cache;
   3368	u64 bytes_used;
   3369	u64 chunk_type;
   3370
   3371	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
   3372	ASSERT(cache);
   3373	chunk_type = cache->flags;
   3374	btrfs_put_block_group(cache);
   3375
   3376	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
   3377		return 0;
   3378
   3379	spin_lock(&fs_info->data_sinfo->lock);
   3380	bytes_used = fs_info->data_sinfo->bytes_used;
   3381	spin_unlock(&fs_info->data_sinfo->lock);
   3382
   3383	if (!bytes_used) {
   3384		struct btrfs_trans_handle *trans;
   3385		int ret;
   3386
   3387		trans =	btrfs_join_transaction(fs_info->tree_root);
   3388		if (IS_ERR(trans))
   3389			return PTR_ERR(trans);
   3390
   3391		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
   3392		btrfs_end_transaction(trans);
   3393		if (ret < 0)
   3394			return ret;
   3395		return 1;
   3396	}
   3397
   3398	return 0;
   3399}
   3400
   3401static int insert_balance_item(struct btrfs_fs_info *fs_info,
   3402			       struct btrfs_balance_control *bctl)
   3403{
   3404	struct btrfs_root *root = fs_info->tree_root;
   3405	struct btrfs_trans_handle *trans;
   3406	struct btrfs_balance_item *item;
   3407	struct btrfs_disk_balance_args disk_bargs;
   3408	struct btrfs_path *path;
   3409	struct extent_buffer *leaf;
   3410	struct btrfs_key key;
   3411	int ret, err;
   3412
   3413	path = btrfs_alloc_path();
   3414	if (!path)
   3415		return -ENOMEM;
   3416
   3417	trans = btrfs_start_transaction(root, 0);
   3418	if (IS_ERR(trans)) {
   3419		btrfs_free_path(path);
   3420		return PTR_ERR(trans);
   3421	}
   3422
   3423	key.objectid = BTRFS_BALANCE_OBJECTID;
   3424	key.type = BTRFS_TEMPORARY_ITEM_KEY;
   3425	key.offset = 0;
   3426
   3427	ret = btrfs_insert_empty_item(trans, root, path, &key,
   3428				      sizeof(*item));
   3429	if (ret)
   3430		goto out;
   3431
   3432	leaf = path->nodes[0];
   3433	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
   3434
   3435	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
   3436
   3437	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
   3438	btrfs_set_balance_data(leaf, item, &disk_bargs);
   3439	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
   3440	btrfs_set_balance_meta(leaf, item, &disk_bargs);
   3441	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
   3442	btrfs_set_balance_sys(leaf, item, &disk_bargs);
   3443
   3444	btrfs_set_balance_flags(leaf, item, bctl->flags);
   3445
   3446	btrfs_mark_buffer_dirty(leaf);
   3447out:
   3448	btrfs_free_path(path);
   3449	err = btrfs_commit_transaction(trans);
   3450	if (err && !ret)
   3451		ret = err;
   3452	return ret;
   3453}
   3454
   3455static int del_balance_item(struct btrfs_fs_info *fs_info)
   3456{
   3457	struct btrfs_root *root = fs_info->tree_root;
   3458	struct btrfs_trans_handle *trans;
   3459	struct btrfs_path *path;
   3460	struct btrfs_key key;
   3461	int ret, err;
   3462
   3463	path = btrfs_alloc_path();
   3464	if (!path)
   3465		return -ENOMEM;
   3466
   3467	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
   3468	if (IS_ERR(trans)) {
   3469		btrfs_free_path(path);
   3470		return PTR_ERR(trans);
   3471	}
   3472
   3473	key.objectid = BTRFS_BALANCE_OBJECTID;
   3474	key.type = BTRFS_TEMPORARY_ITEM_KEY;
   3475	key.offset = 0;
   3476
   3477	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
   3478	if (ret < 0)
   3479		goto out;
   3480	if (ret > 0) {
   3481		ret = -ENOENT;
   3482		goto out;
   3483	}
   3484
   3485	ret = btrfs_del_item(trans, root, path);
   3486out:
   3487	btrfs_free_path(path);
   3488	err = btrfs_commit_transaction(trans);
   3489	if (err && !ret)
   3490		ret = err;
   3491	return ret;
   3492}
   3493
   3494/*
   3495 * This is a heuristic used to reduce the number of chunks balanced on
   3496 * resume after balance was interrupted.
   3497 */
   3498static void update_balance_args(struct btrfs_balance_control *bctl)
   3499{
   3500	/*
   3501	 * Turn on soft mode for chunk types that were being converted.
   3502	 */
   3503	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
   3504		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
   3505	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
   3506		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
   3507	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
   3508		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
   3509
   3510	/*
   3511	 * Turn on usage filter if is not already used.  The idea is
   3512	 * that chunks that we have already balanced should be
   3513	 * reasonably full.  Don't do it for chunks that are being
   3514	 * converted - that will keep us from relocating unconverted
   3515	 * (albeit full) chunks.
   3516	 */
   3517	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
   3518	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
   3519	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
   3520		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
   3521		bctl->data.usage = 90;
   3522	}
   3523	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
   3524	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
   3525	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
   3526		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
   3527		bctl->sys.usage = 90;
   3528	}
   3529	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
   3530	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
   3531	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
   3532		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
   3533		bctl->meta.usage = 90;
   3534	}
   3535}
   3536
   3537/*
   3538 * Clear the balance status in fs_info and delete the balance item from disk.
   3539 */
   3540static void reset_balance_state(struct btrfs_fs_info *fs_info)
   3541{
   3542	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
   3543	int ret;
   3544
   3545	BUG_ON(!fs_info->balance_ctl);
   3546
   3547	spin_lock(&fs_info->balance_lock);
   3548	fs_info->balance_ctl = NULL;
   3549	spin_unlock(&fs_info->balance_lock);
   3550
   3551	kfree(bctl);
   3552	ret = del_balance_item(fs_info);
   3553	if (ret)
   3554		btrfs_handle_fs_error(fs_info, ret, NULL);
   3555}
   3556
   3557/*
   3558 * Balance filters.  Return 1 if chunk should be filtered out
   3559 * (should not be balanced).
   3560 */
   3561static int chunk_profiles_filter(u64 chunk_type,
   3562				 struct btrfs_balance_args *bargs)
   3563{
   3564	chunk_type = chunk_to_extended(chunk_type) &
   3565				BTRFS_EXTENDED_PROFILE_MASK;
   3566
   3567	if (bargs->profiles & chunk_type)
   3568		return 0;
   3569
   3570	return 1;
   3571}
   3572
   3573static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
   3574			      struct btrfs_balance_args *bargs)
   3575{
   3576	struct btrfs_block_group *cache;
   3577	u64 chunk_used;
   3578	u64 user_thresh_min;
   3579	u64 user_thresh_max;
   3580	int ret = 1;
   3581
   3582	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
   3583	chunk_used = cache->used;
   3584
   3585	if (bargs->usage_min == 0)
   3586		user_thresh_min = 0;
   3587	else
   3588		user_thresh_min = div_factor_fine(cache->length,
   3589						  bargs->usage_min);
   3590
   3591	if (bargs->usage_max == 0)
   3592		user_thresh_max = 1;
   3593	else if (bargs->usage_max > 100)
   3594		user_thresh_max = cache->length;
   3595	else
   3596		user_thresh_max = div_factor_fine(cache->length,
   3597						  bargs->usage_max);
   3598
   3599	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
   3600		ret = 0;
   3601
   3602	btrfs_put_block_group(cache);
   3603	return ret;
   3604}
   3605
   3606static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
   3607		u64 chunk_offset, struct btrfs_balance_args *bargs)
   3608{
   3609	struct btrfs_block_group *cache;
   3610	u64 chunk_used, user_thresh;
   3611	int ret = 1;
   3612
   3613	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
   3614	chunk_used = cache->used;
   3615
   3616	if (bargs->usage_min == 0)
   3617		user_thresh = 1;
   3618	else if (bargs->usage > 100)
   3619		user_thresh = cache->length;
   3620	else
   3621		user_thresh = div_factor_fine(cache->length, bargs->usage);
   3622
   3623	if (chunk_used < user_thresh)
   3624		ret = 0;
   3625
   3626	btrfs_put_block_group(cache);
   3627	return ret;
   3628}
   3629
   3630static int chunk_devid_filter(struct extent_buffer *leaf,
   3631			      struct btrfs_chunk *chunk,
   3632			      struct btrfs_balance_args *bargs)
   3633{
   3634	struct btrfs_stripe *stripe;
   3635	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
   3636	int i;
   3637
   3638	for (i = 0; i < num_stripes; i++) {
   3639		stripe = btrfs_stripe_nr(chunk, i);
   3640		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
   3641			return 0;
   3642	}
   3643
   3644	return 1;
   3645}
   3646
   3647static u64 calc_data_stripes(u64 type, int num_stripes)
   3648{
   3649	const int index = btrfs_bg_flags_to_raid_index(type);
   3650	const int ncopies = btrfs_raid_array[index].ncopies;
   3651	const int nparity = btrfs_raid_array[index].nparity;
   3652
   3653	return (num_stripes - nparity) / ncopies;
   3654}
   3655
   3656/* [pstart, pend) */
   3657static int chunk_drange_filter(struct extent_buffer *leaf,
   3658			       struct btrfs_chunk *chunk,
   3659			       struct btrfs_balance_args *bargs)
   3660{
   3661	struct btrfs_stripe *stripe;
   3662	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
   3663	u64 stripe_offset;
   3664	u64 stripe_length;
   3665	u64 type;
   3666	int factor;
   3667	int i;
   3668
   3669	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
   3670		return 0;
   3671
   3672	type = btrfs_chunk_type(leaf, chunk);
   3673	factor = calc_data_stripes(type, num_stripes);
   3674
   3675	for (i = 0; i < num_stripes; i++) {
   3676		stripe = btrfs_stripe_nr(chunk, i);
   3677		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
   3678			continue;
   3679
   3680		stripe_offset = btrfs_stripe_offset(leaf, stripe);
   3681		stripe_length = btrfs_chunk_length(leaf, chunk);
   3682		stripe_length = div_u64(stripe_length, factor);
   3683
   3684		if (stripe_offset < bargs->pend &&
   3685		    stripe_offset + stripe_length > bargs->pstart)
   3686			return 0;
   3687	}
   3688
   3689	return 1;
   3690}
   3691
   3692/* [vstart, vend) */
   3693static int chunk_vrange_filter(struct extent_buffer *leaf,
   3694			       struct btrfs_chunk *chunk,
   3695			       u64 chunk_offset,
   3696			       struct btrfs_balance_args *bargs)
   3697{
   3698	if (chunk_offset < bargs->vend &&
   3699	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
   3700		/* at least part of the chunk is inside this vrange */
   3701		return 0;
   3702
   3703	return 1;
   3704}
   3705
   3706static int chunk_stripes_range_filter(struct extent_buffer *leaf,
   3707			       struct btrfs_chunk *chunk,
   3708			       struct btrfs_balance_args *bargs)
   3709{
   3710	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
   3711
   3712	if (bargs->stripes_min <= num_stripes
   3713			&& num_stripes <= bargs->stripes_max)
   3714		return 0;
   3715
   3716	return 1;
   3717}
   3718
   3719static int chunk_soft_convert_filter(u64 chunk_type,
   3720				     struct btrfs_balance_args *bargs)
   3721{
   3722	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
   3723		return 0;
   3724
   3725	chunk_type = chunk_to_extended(chunk_type) &
   3726				BTRFS_EXTENDED_PROFILE_MASK;
   3727
   3728	if (bargs->target == chunk_type)
   3729		return 1;
   3730
   3731	return 0;
   3732}
   3733
   3734static int should_balance_chunk(struct extent_buffer *leaf,
   3735				struct btrfs_chunk *chunk, u64 chunk_offset)
   3736{
   3737	struct btrfs_fs_info *fs_info = leaf->fs_info;
   3738	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
   3739	struct btrfs_balance_args *bargs = NULL;
   3740	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
   3741
   3742	/* type filter */
   3743	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
   3744	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
   3745		return 0;
   3746	}
   3747
   3748	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
   3749		bargs = &bctl->data;
   3750	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
   3751		bargs = &bctl->sys;
   3752	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
   3753		bargs = &bctl->meta;
   3754
   3755	/* profiles filter */
   3756	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
   3757	    chunk_profiles_filter(chunk_type, bargs)) {
   3758		return 0;
   3759	}
   3760
   3761	/* usage filter */
   3762	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
   3763	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
   3764		return 0;
   3765	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
   3766	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
   3767		return 0;
   3768	}
   3769
   3770	/* devid filter */
   3771	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
   3772	    chunk_devid_filter(leaf, chunk, bargs)) {
   3773		return 0;
   3774	}
   3775
   3776	/* drange filter, makes sense only with devid filter */
   3777	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
   3778	    chunk_drange_filter(leaf, chunk, bargs)) {
   3779		return 0;
   3780	}
   3781
   3782	/* vrange filter */
   3783	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
   3784	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
   3785		return 0;
   3786	}
   3787
   3788	/* stripes filter */
   3789	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
   3790	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
   3791		return 0;
   3792	}
   3793
   3794	/* soft profile changing mode */
   3795	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
   3796	    chunk_soft_convert_filter(chunk_type, bargs)) {
   3797		return 0;
   3798	}
   3799
   3800	/*
   3801	 * limited by count, must be the last filter
   3802	 */
   3803	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
   3804		if (bargs->limit == 0)
   3805			return 0;
   3806		else
   3807			bargs->limit--;
   3808	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
   3809		/*
   3810		 * Same logic as the 'limit' filter; the minimum cannot be
   3811		 * determined here because we do not have the global information
   3812		 * about the count of all chunks that satisfy the filters.
   3813		 */
   3814		if (bargs->limit_max == 0)
   3815			return 0;
   3816		else
   3817			bargs->limit_max--;
   3818	}
   3819
   3820	return 1;
   3821}
   3822
   3823static int __btrfs_balance(struct btrfs_fs_info *fs_info)
   3824{
   3825	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
   3826	struct btrfs_root *chunk_root = fs_info->chunk_root;
   3827	u64 chunk_type;
   3828	struct btrfs_chunk *chunk;
   3829	struct btrfs_path *path = NULL;
   3830	struct btrfs_key key;
   3831	struct btrfs_key found_key;
   3832	struct extent_buffer *leaf;
   3833	int slot;
   3834	int ret;
   3835	int enospc_errors = 0;
   3836	bool counting = true;
   3837	/* The single value limit and min/max limits use the same bytes in the */
   3838	u64 limit_data = bctl->data.limit;
   3839	u64 limit_meta = bctl->meta.limit;
   3840	u64 limit_sys = bctl->sys.limit;
   3841	u32 count_data = 0;
   3842	u32 count_meta = 0;
   3843	u32 count_sys = 0;
   3844	int chunk_reserved = 0;
   3845
   3846	path = btrfs_alloc_path();
   3847	if (!path) {
   3848		ret = -ENOMEM;
   3849		goto error;
   3850	}
   3851
   3852	/* zero out stat counters */
   3853	spin_lock(&fs_info->balance_lock);
   3854	memset(&bctl->stat, 0, sizeof(bctl->stat));
   3855	spin_unlock(&fs_info->balance_lock);
   3856again:
   3857	if (!counting) {
   3858		/*
   3859		 * The single value limit and min/max limits use the same bytes
   3860		 * in the
   3861		 */
   3862		bctl->data.limit = limit_data;
   3863		bctl->meta.limit = limit_meta;
   3864		bctl->sys.limit = limit_sys;
   3865	}
   3866	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
   3867	key.offset = (u64)-1;
   3868	key.type = BTRFS_CHUNK_ITEM_KEY;
   3869
   3870	while (1) {
   3871		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
   3872		    atomic_read(&fs_info->balance_cancel_req)) {
   3873			ret = -ECANCELED;
   3874			goto error;
   3875		}
   3876
   3877		mutex_lock(&fs_info->reclaim_bgs_lock);
   3878		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
   3879		if (ret < 0) {
   3880			mutex_unlock(&fs_info->reclaim_bgs_lock);
   3881			goto error;
   3882		}
   3883
   3884		/*
   3885		 * this shouldn't happen, it means the last relocate
   3886		 * failed
   3887		 */
   3888		if (ret == 0)
   3889			BUG(); /* FIXME break ? */
   3890
   3891		ret = btrfs_previous_item(chunk_root, path, 0,
   3892					  BTRFS_CHUNK_ITEM_KEY);
   3893		if (ret) {
   3894			mutex_unlock(&fs_info->reclaim_bgs_lock);
   3895			ret = 0;
   3896			break;
   3897		}
   3898
   3899		leaf = path->nodes[0];
   3900		slot = path->slots[0];
   3901		btrfs_item_key_to_cpu(leaf, &found_key, slot);
   3902
   3903		if (found_key.objectid != key.objectid) {
   3904			mutex_unlock(&fs_info->reclaim_bgs_lock);
   3905			break;
   3906		}
   3907
   3908		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
   3909		chunk_type = btrfs_chunk_type(leaf, chunk);
   3910
   3911		if (!counting) {
   3912			spin_lock(&fs_info->balance_lock);
   3913			bctl->stat.considered++;
   3914			spin_unlock(&fs_info->balance_lock);
   3915		}
   3916
   3917		ret = should_balance_chunk(leaf, chunk, found_key.offset);
   3918
   3919		btrfs_release_path(path);
   3920		if (!ret) {
   3921			mutex_unlock(&fs_info->reclaim_bgs_lock);
   3922			goto loop;
   3923		}
   3924
   3925		if (counting) {
   3926			mutex_unlock(&fs_info->reclaim_bgs_lock);
   3927			spin_lock(&fs_info->balance_lock);
   3928			bctl->stat.expected++;
   3929			spin_unlock(&fs_info->balance_lock);
   3930
   3931			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
   3932				count_data++;
   3933			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
   3934				count_sys++;
   3935			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
   3936				count_meta++;
   3937
   3938			goto loop;
   3939		}
   3940
   3941		/*
   3942		 * Apply limit_min filter, no need to check if the LIMITS
   3943		 * filter is used, limit_min is 0 by default
   3944		 */
   3945		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
   3946					count_data < bctl->data.limit_min)
   3947				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
   3948					count_meta < bctl->meta.limit_min)
   3949				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
   3950					count_sys < bctl->sys.limit_min)) {
   3951			mutex_unlock(&fs_info->reclaim_bgs_lock);
   3952			goto loop;
   3953		}
   3954
   3955		if (!chunk_reserved) {
   3956			/*
   3957			 * We may be relocating the only data chunk we have,
   3958			 * which could potentially end up with losing data's
   3959			 * raid profile, so lets allocate an empty one in
   3960			 * advance.
   3961			 */
   3962			ret = btrfs_may_alloc_data_chunk(fs_info,
   3963							 found_key.offset);
   3964			if (ret < 0) {
   3965				mutex_unlock(&fs_info->reclaim_bgs_lock);
   3966				goto error;
   3967			} else if (ret == 1) {
   3968				chunk_reserved = 1;
   3969			}
   3970		}
   3971
   3972		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
   3973		mutex_unlock(&fs_info->reclaim_bgs_lock);
   3974		if (ret == -ENOSPC) {
   3975			enospc_errors++;
   3976		} else if (ret == -ETXTBSY) {
   3977			btrfs_info(fs_info,
   3978	   "skipping relocation of block group %llu due to active swapfile",
   3979				   found_key.offset);
   3980			ret = 0;
   3981		} else if (ret) {
   3982			goto error;
   3983		} else {
   3984			spin_lock(&fs_info->balance_lock);
   3985			bctl->stat.completed++;
   3986			spin_unlock(&fs_info->balance_lock);
   3987		}
   3988loop:
   3989		if (found_key.offset == 0)
   3990			break;
   3991		key.offset = found_key.offset - 1;
   3992	}
   3993
   3994	if (counting) {
   3995		btrfs_release_path(path);
   3996		counting = false;
   3997		goto again;
   3998	}
   3999error:
   4000	btrfs_free_path(path);
   4001	if (enospc_errors) {
   4002		btrfs_info(fs_info, "%d enospc errors during balance",
   4003			   enospc_errors);
   4004		if (!ret)
   4005			ret = -ENOSPC;
   4006	}
   4007
   4008	return ret;
   4009}
   4010
   4011/**
   4012 * alloc_profile_is_valid - see if a given profile is valid and reduced
   4013 * @flags: profile to validate
   4014 * @extended: if true @flags is treated as an extended profile
   4015 */
   4016static int alloc_profile_is_valid(u64 flags, int extended)
   4017{
   4018	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
   4019			       BTRFS_BLOCK_GROUP_PROFILE_MASK);
   4020
   4021	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
   4022
   4023	/* 1) check that all other bits are zeroed */
   4024	if (flags & ~mask)
   4025		return 0;
   4026
   4027	/* 2) see if profile is reduced */
   4028	if (flags == 0)
   4029		return !extended; /* "0" is valid for usual profiles */
   4030
   4031	return has_single_bit_set(flags);
   4032}
   4033
   4034static inline int balance_need_close(struct btrfs_fs_info *fs_info)
   4035{
   4036	/* cancel requested || normal exit path */
   4037	return atomic_read(&fs_info->balance_cancel_req) ||
   4038		(atomic_read(&fs_info->balance_pause_req) == 0 &&
   4039		 atomic_read(&fs_info->balance_cancel_req) == 0);
   4040}
   4041
   4042/*
   4043 * Validate target profile against allowed profiles and return true if it's OK.
   4044 * Otherwise print the error message and return false.
   4045 */
   4046static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
   4047		const struct btrfs_balance_args *bargs,
   4048		u64 allowed, const char *type)
   4049{
   4050	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
   4051		return true;
   4052
   4053	/* Profile is valid and does not have bits outside of the allowed set */
   4054	if (alloc_profile_is_valid(bargs->target, 1) &&
   4055	    (bargs->target & ~allowed) == 0)
   4056		return true;
   4057
   4058	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
   4059			type, btrfs_bg_type_to_raid_name(bargs->target));
   4060	return false;
   4061}
   4062
   4063/*
   4064 * Fill @buf with textual description of balance filter flags @bargs, up to
   4065 * @size_buf including the terminating null. The output may be trimmed if it
   4066 * does not fit into the provided buffer.
   4067 */
   4068static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
   4069				 u32 size_buf)
   4070{
   4071	int ret;
   4072	u32 size_bp = size_buf;
   4073	char *bp = buf;
   4074	u64 flags = bargs->flags;
   4075	char tmp_buf[128] = {'\0'};
   4076
   4077	if (!flags)
   4078		return;
   4079
   4080#define CHECK_APPEND_NOARG(a)						\
   4081	do {								\
   4082		ret = snprintf(bp, size_bp, (a));			\
   4083		if (ret < 0 || ret >= size_bp)				\
   4084			goto out_overflow;				\
   4085		size_bp -= ret;						\
   4086		bp += ret;						\
   4087	} while (0)
   4088
   4089#define CHECK_APPEND_1ARG(a, v1)					\
   4090	do {								\
   4091		ret = snprintf(bp, size_bp, (a), (v1));			\
   4092		if (ret < 0 || ret >= size_bp)				\
   4093			goto out_overflow;				\
   4094		size_bp -= ret;						\
   4095		bp += ret;						\
   4096	} while (0)
   4097
   4098#define CHECK_APPEND_2ARG(a, v1, v2)					\
   4099	do {								\
   4100		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
   4101		if (ret < 0 || ret >= size_bp)				\
   4102			goto out_overflow;				\
   4103		size_bp -= ret;						\
   4104		bp += ret;						\
   4105	} while (0)
   4106
   4107	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
   4108		CHECK_APPEND_1ARG("convert=%s,",
   4109				  btrfs_bg_type_to_raid_name(bargs->target));
   4110
   4111	if (flags & BTRFS_BALANCE_ARGS_SOFT)
   4112		CHECK_APPEND_NOARG("soft,");
   4113
   4114	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
   4115		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
   4116					    sizeof(tmp_buf));
   4117		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
   4118	}
   4119
   4120	if (flags & BTRFS_BALANCE_ARGS_USAGE)
   4121		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
   4122
   4123	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
   4124		CHECK_APPEND_2ARG("usage=%u..%u,",
   4125				  bargs->usage_min, bargs->usage_max);
   4126
   4127	if (flags & BTRFS_BALANCE_ARGS_DEVID)
   4128		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
   4129
   4130	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
   4131		CHECK_APPEND_2ARG("drange=%llu..%llu,",
   4132				  bargs->pstart, bargs->pend);
   4133
   4134	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
   4135		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
   4136				  bargs->vstart, bargs->vend);
   4137
   4138	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
   4139		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
   4140
   4141	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
   4142		CHECK_APPEND_2ARG("limit=%u..%u,",
   4143				bargs->limit_min, bargs->limit_max);
   4144
   4145	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
   4146		CHECK_APPEND_2ARG("stripes=%u..%u,",
   4147				  bargs->stripes_min, bargs->stripes_max);
   4148
   4149#undef CHECK_APPEND_2ARG
   4150#undef CHECK_APPEND_1ARG
   4151#undef CHECK_APPEND_NOARG
   4152
   4153out_overflow:
   4154
   4155	if (size_bp < size_buf)
   4156		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
   4157	else
   4158		buf[0] = '\0';
   4159}
   4160
   4161static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
   4162{
   4163	u32 size_buf = 1024;
   4164	char tmp_buf[192] = {'\0'};
   4165	char *buf;
   4166	char *bp;
   4167	u32 size_bp = size_buf;
   4168	int ret;
   4169	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
   4170
   4171	buf = kzalloc(size_buf, GFP_KERNEL);
   4172	if (!buf)
   4173		return;
   4174
   4175	bp = buf;
   4176
   4177#define CHECK_APPEND_1ARG(a, v1)					\
   4178	do {								\
   4179		ret = snprintf(bp, size_bp, (a), (v1));			\
   4180		if (ret < 0 || ret >= size_bp)				\
   4181			goto out_overflow;				\
   4182		size_bp -= ret;						\
   4183		bp += ret;						\
   4184	} while (0)
   4185
   4186	if (bctl->flags & BTRFS_BALANCE_FORCE)
   4187		CHECK_APPEND_1ARG("%s", "-f ");
   4188
   4189	if (bctl->flags & BTRFS_BALANCE_DATA) {
   4190		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
   4191		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
   4192	}
   4193
   4194	if (bctl->flags & BTRFS_BALANCE_METADATA) {
   4195		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
   4196		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
   4197	}
   4198
   4199	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
   4200		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
   4201		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
   4202	}
   4203
   4204#undef CHECK_APPEND_1ARG
   4205
   4206out_overflow:
   4207
   4208	if (size_bp < size_buf)
   4209		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
   4210	btrfs_info(fs_info, "balance: %s %s",
   4211		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
   4212		   "resume" : "start", buf);
   4213
   4214	kfree(buf);
   4215}
   4216
   4217/*
   4218 * Should be called with balance mutexe held
   4219 */
   4220int btrfs_balance(struct btrfs_fs_info *fs_info,
   4221		  struct btrfs_balance_control *bctl,
   4222		  struct btrfs_ioctl_balance_args *bargs)
   4223{
   4224	u64 meta_target, data_target;
   4225	u64 allowed;
   4226	int mixed = 0;
   4227	int ret;
   4228	u64 num_devices;
   4229	unsigned seq;
   4230	bool reducing_redundancy;
   4231	int i;
   4232
   4233	if (btrfs_fs_closing(fs_info) ||
   4234	    atomic_read(&fs_info->balance_pause_req) ||
   4235	    btrfs_should_cancel_balance(fs_info)) {
   4236		ret = -EINVAL;
   4237		goto out;
   4238	}
   4239
   4240	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
   4241	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
   4242		mixed = 1;
   4243
   4244	/*
   4245	 * In case of mixed groups both data and meta should be picked,
   4246	 * and identical options should be given for both of them.
   4247	 */
   4248	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
   4249	if (mixed && (bctl->flags & allowed)) {
   4250		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
   4251		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
   4252		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
   4253			btrfs_err(fs_info,
   4254	  "balance: mixed groups data and metadata options must be the same");
   4255			ret = -EINVAL;
   4256			goto out;
   4257		}
   4258	}
   4259
   4260	/*
   4261	 * rw_devices will not change at the moment, device add/delete/replace
   4262	 * are exclusive
   4263	 */
   4264	num_devices = fs_info->fs_devices->rw_devices;
   4265
   4266	/*
   4267	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
   4268	 * special bit for it, to make it easier to distinguish.  Thus we need
   4269	 * to set it manually, or balance would refuse the profile.
   4270	 */
   4271	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
   4272	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
   4273		if (num_devices >= btrfs_raid_array[i].devs_min)
   4274			allowed |= btrfs_raid_array[i].bg_flag;
   4275
   4276	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
   4277	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
   4278	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
   4279		ret = -EINVAL;
   4280		goto out;
   4281	}
   4282
   4283	/*
   4284	 * Allow to reduce metadata or system integrity only if force set for
   4285	 * profiles with redundancy (copies, parity)
   4286	 */
   4287	allowed = 0;
   4288	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
   4289		if (btrfs_raid_array[i].ncopies >= 2 ||
   4290		    btrfs_raid_array[i].tolerated_failures >= 1)
   4291			allowed |= btrfs_raid_array[i].bg_flag;
   4292	}
   4293	do {
   4294		seq = read_seqbegin(&fs_info->profiles_lock);
   4295
   4296		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
   4297		     (fs_info->avail_system_alloc_bits & allowed) &&
   4298		     !(bctl->sys.target & allowed)) ||
   4299		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
   4300		     (fs_info->avail_metadata_alloc_bits & allowed) &&
   4301		     !(bctl->meta.target & allowed)))
   4302			reducing_redundancy = true;
   4303		else
   4304			reducing_redundancy = false;
   4305
   4306		/* if we're not converting, the target field is uninitialized */
   4307		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
   4308			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
   4309		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
   4310			bctl->data.target : fs_info->avail_data_alloc_bits;
   4311	} while (read_seqretry(&fs_info->profiles_lock, seq));
   4312
   4313	if (reducing_redundancy) {
   4314		if (bctl->flags & BTRFS_BALANCE_FORCE) {
   4315			btrfs_info(fs_info,
   4316			   "balance: force reducing metadata redundancy");
   4317		} else {
   4318			btrfs_err(fs_info,
   4319	"balance: reduces metadata redundancy, use --force if you want this");
   4320			ret = -EINVAL;
   4321			goto out;
   4322		}
   4323	}
   4324
   4325	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
   4326		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
   4327		btrfs_warn(fs_info,
   4328	"balance: metadata profile %s has lower redundancy than data profile %s",
   4329				btrfs_bg_type_to_raid_name(meta_target),
   4330				btrfs_bg_type_to_raid_name(data_target));
   4331	}
   4332
   4333	ret = insert_balance_item(fs_info, bctl);
   4334	if (ret && ret != -EEXIST)
   4335		goto out;
   4336
   4337	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
   4338		BUG_ON(ret == -EEXIST);
   4339		BUG_ON(fs_info->balance_ctl);
   4340		spin_lock(&fs_info->balance_lock);
   4341		fs_info->balance_ctl = bctl;
   4342		spin_unlock(&fs_info->balance_lock);
   4343	} else {
   4344		BUG_ON(ret != -EEXIST);
   4345		spin_lock(&fs_info->balance_lock);
   4346		update_balance_args(bctl);
   4347		spin_unlock(&fs_info->balance_lock);
   4348	}
   4349
   4350	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
   4351	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
   4352	describe_balance_start_or_resume(fs_info);
   4353	mutex_unlock(&fs_info->balance_mutex);
   4354
   4355	ret = __btrfs_balance(fs_info);
   4356
   4357	mutex_lock(&fs_info->balance_mutex);
   4358	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
   4359		btrfs_info(fs_info, "balance: paused");
   4360		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
   4361	}
   4362	/*
   4363	 * Balance can be canceled by:
   4364	 *
   4365	 * - Regular cancel request
   4366	 *   Then ret == -ECANCELED and balance_cancel_req > 0
   4367	 *
   4368	 * - Fatal signal to "btrfs" process
   4369	 *   Either the signal caught by wait_reserve_ticket() and callers
   4370	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
   4371	 *   got -ECANCELED.
   4372	 *   Either way, in this case balance_cancel_req = 0, and
   4373	 *   ret == -EINTR or ret == -ECANCELED.
   4374	 *
   4375	 * So here we only check the return value to catch canceled balance.
   4376	 */
   4377	else if (ret == -ECANCELED || ret == -EINTR)
   4378		btrfs_info(fs_info, "balance: canceled");
   4379	else
   4380		btrfs_info(fs_info, "balance: ended with status: %d", ret);
   4381
   4382	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
   4383
   4384	if (bargs) {
   4385		memset(bargs, 0, sizeof(*bargs));
   4386		btrfs_update_ioctl_balance_args(fs_info, bargs);
   4387	}
   4388
   4389	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
   4390	    balance_need_close(fs_info)) {
   4391		reset_balance_state(fs_info);
   4392		btrfs_exclop_finish(fs_info);
   4393	}
   4394
   4395	wake_up(&fs_info->balance_wait_q);
   4396
   4397	return ret;
   4398out:
   4399	if (bctl->flags & BTRFS_BALANCE_RESUME)
   4400		reset_balance_state(fs_info);
   4401	else
   4402		kfree(bctl);
   4403	btrfs_exclop_finish(fs_info);
   4404
   4405	return ret;
   4406}
   4407
   4408static int balance_kthread(void *data)
   4409{
   4410	struct btrfs_fs_info *fs_info = data;
   4411	int ret = 0;
   4412
   4413	sb_start_write(fs_info->sb);
   4414	mutex_lock(&fs_info->balance_mutex);
   4415	if (fs_info->balance_ctl)
   4416		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
   4417	mutex_unlock(&fs_info->balance_mutex);
   4418	sb_end_write(fs_info->sb);
   4419
   4420	return ret;
   4421}
   4422
   4423int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
   4424{
   4425	struct task_struct *tsk;
   4426
   4427	mutex_lock(&fs_info->balance_mutex);
   4428	if (!fs_info->balance_ctl) {
   4429		mutex_unlock(&fs_info->balance_mutex);
   4430		return 0;
   4431	}
   4432	mutex_unlock(&fs_info->balance_mutex);
   4433
   4434	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
   4435		btrfs_info(fs_info, "balance: resume skipped");
   4436		return 0;
   4437	}
   4438
   4439	spin_lock(&fs_info->super_lock);
   4440	ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
   4441	fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
   4442	spin_unlock(&fs_info->super_lock);
   4443	/*
   4444	 * A ro->rw remount sequence should continue with the paused balance
   4445	 * regardless of who pauses it, system or the user as of now, so set
   4446	 * the resume flag.
   4447	 */
   4448	spin_lock(&fs_info->balance_lock);
   4449	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
   4450	spin_unlock(&fs_info->balance_lock);
   4451
   4452	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
   4453	return PTR_ERR_OR_ZERO(tsk);
   4454}
   4455
   4456int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
   4457{
   4458	struct btrfs_balance_control *bctl;
   4459	struct btrfs_balance_item *item;
   4460	struct btrfs_disk_balance_args disk_bargs;
   4461	struct btrfs_path *path;
   4462	struct extent_buffer *leaf;
   4463	struct btrfs_key key;
   4464	int ret;
   4465
   4466	path = btrfs_alloc_path();
   4467	if (!path)
   4468		return -ENOMEM;
   4469
   4470	key.objectid = BTRFS_BALANCE_OBJECTID;
   4471	key.type = BTRFS_TEMPORARY_ITEM_KEY;
   4472	key.offset = 0;
   4473
   4474	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
   4475	if (ret < 0)
   4476		goto out;
   4477	if (ret > 0) { /* ret = -ENOENT; */
   4478		ret = 0;
   4479		goto out;
   4480	}
   4481
   4482	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
   4483	if (!bctl) {
   4484		ret = -ENOMEM;
   4485		goto out;
   4486	}
   4487
   4488	leaf = path->nodes[0];
   4489	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
   4490
   4491	bctl->flags = btrfs_balance_flags(leaf, item);
   4492	bctl->flags |= BTRFS_BALANCE_RESUME;
   4493
   4494	btrfs_balance_data(leaf, item, &disk_bargs);
   4495	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
   4496	btrfs_balance_meta(leaf, item, &disk_bargs);
   4497	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
   4498	btrfs_balance_sys(leaf, item, &disk_bargs);
   4499	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
   4500
   4501	/*
   4502	 * This should never happen, as the paused balance state is recovered
   4503	 * during mount without any chance of other exclusive ops to collide.
   4504	 *
   4505	 * This gives the exclusive op status to balance and keeps in paused
   4506	 * state until user intervention (cancel or umount). If the ownership
   4507	 * cannot be assigned, show a message but do not fail. The balance
   4508	 * is in a paused state and must have fs_info::balance_ctl properly
   4509	 * set up.
   4510	 */
   4511	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
   4512		btrfs_warn(fs_info,
   4513	"balance: cannot set exclusive op status, resume manually");
   4514
   4515	btrfs_release_path(path);
   4516
   4517	mutex_lock(&fs_info->balance_mutex);
   4518	BUG_ON(fs_info->balance_ctl);
   4519	spin_lock(&fs_info->balance_lock);
   4520	fs_info->balance_ctl = bctl;
   4521	spin_unlock(&fs_info->balance_lock);
   4522	mutex_unlock(&fs_info->balance_mutex);
   4523out:
   4524	btrfs_free_path(path);
   4525	return ret;
   4526}
   4527
   4528int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
   4529{
   4530	int ret = 0;
   4531
   4532	mutex_lock(&fs_info->balance_mutex);
   4533	if (!fs_info->balance_ctl) {
   4534		mutex_unlock(&fs_info->balance_mutex);
   4535		return -ENOTCONN;
   4536	}
   4537
   4538	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
   4539		atomic_inc(&fs_info->balance_pause_req);
   4540		mutex_unlock(&fs_info->balance_mutex);
   4541
   4542		wait_event(fs_info->balance_wait_q,
   4543			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
   4544
   4545		mutex_lock(&fs_info->balance_mutex);
   4546		/* we are good with balance_ctl ripped off from under us */
   4547		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
   4548		atomic_dec(&fs_info->balance_pause_req);
   4549	} else {
   4550		ret = -ENOTCONN;
   4551	}
   4552
   4553	mutex_unlock(&fs_info->balance_mutex);
   4554	return ret;
   4555}
   4556
   4557int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
   4558{
   4559	mutex_lock(&fs_info->balance_mutex);
   4560	if (!fs_info->balance_ctl) {
   4561		mutex_unlock(&fs_info->balance_mutex);
   4562		return -ENOTCONN;
   4563	}
   4564
   4565	/*
   4566	 * A paused balance with the item stored on disk can be resumed at
   4567	 * mount time if the mount is read-write. Otherwise it's still paused
   4568	 * and we must not allow cancelling as it deletes the item.
   4569	 */
   4570	if (sb_rdonly(fs_info->sb)) {
   4571		mutex_unlock(&fs_info->balance_mutex);
   4572		return -EROFS;
   4573	}
   4574
   4575	atomic_inc(&fs_info->balance_cancel_req);
   4576	/*
   4577	 * if we are running just wait and return, balance item is
   4578	 * deleted in btrfs_balance in this case
   4579	 */
   4580	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
   4581		mutex_unlock(&fs_info->balance_mutex);
   4582		wait_event(fs_info->balance_wait_q,
   4583			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
   4584		mutex_lock(&fs_info->balance_mutex);
   4585	} else {
   4586		mutex_unlock(&fs_info->balance_mutex);
   4587		/*
   4588		 * Lock released to allow other waiters to continue, we'll
   4589		 * reexamine the status again.
   4590		 */
   4591		mutex_lock(&fs_info->balance_mutex);
   4592
   4593		if (fs_info->balance_ctl) {
   4594			reset_balance_state(fs_info);
   4595			btrfs_exclop_finish(fs_info);
   4596			btrfs_info(fs_info, "balance: canceled");
   4597		}
   4598	}
   4599
   4600	BUG_ON(fs_info->balance_ctl ||
   4601		test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
   4602	atomic_dec(&fs_info->balance_cancel_req);
   4603	mutex_unlock(&fs_info->balance_mutex);
   4604	return 0;
   4605}
   4606
   4607int btrfs_uuid_scan_kthread(void *data)
   4608{
   4609	struct btrfs_fs_info *fs_info = data;
   4610	struct btrfs_root *root = fs_info->tree_root;
   4611	struct btrfs_key key;
   4612	struct btrfs_path *path = NULL;
   4613	int ret = 0;
   4614	struct extent_buffer *eb;
   4615	int slot;
   4616	struct btrfs_root_item root_item;
   4617	u32 item_size;
   4618	struct btrfs_trans_handle *trans = NULL;
   4619	bool closing = false;
   4620
   4621	path = btrfs_alloc_path();
   4622	if (!path) {
   4623		ret = -ENOMEM;
   4624		goto out;
   4625	}
   4626
   4627	key.objectid = 0;
   4628	key.type = BTRFS_ROOT_ITEM_KEY;
   4629	key.offset = 0;
   4630
   4631	while (1) {
   4632		if (btrfs_fs_closing(fs_info)) {
   4633			closing = true;
   4634			break;
   4635		}
   4636		ret = btrfs_search_forward(root, &key, path,
   4637				BTRFS_OLDEST_GENERATION);
   4638		if (ret) {
   4639			if (ret > 0)
   4640				ret = 0;
   4641			break;
   4642		}
   4643
   4644		if (key.type != BTRFS_ROOT_ITEM_KEY ||
   4645		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
   4646		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
   4647		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
   4648			goto skip;
   4649
   4650		eb = path->nodes[0];
   4651		slot = path->slots[0];
   4652		item_size = btrfs_item_size(eb, slot);
   4653		if (item_size < sizeof(root_item))
   4654			goto skip;
   4655
   4656		read_extent_buffer(eb, &root_item,
   4657				   btrfs_item_ptr_offset(eb, slot),
   4658				   (int)sizeof(root_item));
   4659		if (btrfs_root_refs(&root_item) == 0)
   4660			goto skip;
   4661
   4662		if (!btrfs_is_empty_uuid(root_item.uuid) ||
   4663		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
   4664			if (trans)
   4665				goto update_tree;
   4666
   4667			btrfs_release_path(path);
   4668			/*
   4669			 * 1 - subvol uuid item
   4670			 * 1 - received_subvol uuid item
   4671			 */
   4672			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
   4673			if (IS_ERR(trans)) {
   4674				ret = PTR_ERR(trans);
   4675				break;
   4676			}
   4677			continue;
   4678		} else {
   4679			goto skip;
   4680		}
   4681update_tree:
   4682		btrfs_release_path(path);
   4683		if (!btrfs_is_empty_uuid(root_item.uuid)) {
   4684			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
   4685						  BTRFS_UUID_KEY_SUBVOL,
   4686						  key.objectid);
   4687			if (ret < 0) {
   4688				btrfs_warn(fs_info, "uuid_tree_add failed %d",
   4689					ret);
   4690				break;
   4691			}
   4692		}
   4693
   4694		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
   4695			ret = btrfs_uuid_tree_add(trans,
   4696						  root_item.received_uuid,
   4697						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
   4698						  key.objectid);
   4699			if (ret < 0) {
   4700				btrfs_warn(fs_info, "uuid_tree_add failed %d",
   4701					ret);
   4702				break;
   4703			}
   4704		}
   4705
   4706skip:
   4707		btrfs_release_path(path);
   4708		if (trans) {
   4709			ret = btrfs_end_transaction(trans);
   4710			trans = NULL;
   4711			if (ret)
   4712				break;
   4713		}
   4714
   4715		if (key.offset < (u64)-1) {
   4716			key.offset++;
   4717		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
   4718			key.offset = 0;
   4719			key.type = BTRFS_ROOT_ITEM_KEY;
   4720		} else if (key.objectid < (u64)-1) {
   4721			key.offset = 0;
   4722			key.type = BTRFS_ROOT_ITEM_KEY;
   4723			key.objectid++;
   4724		} else {
   4725			break;
   4726		}
   4727		cond_resched();
   4728	}
   4729
   4730out:
   4731	btrfs_free_path(path);
   4732	if (trans && !IS_ERR(trans))
   4733		btrfs_end_transaction(trans);
   4734	if (ret)
   4735		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
   4736	else if (!closing)
   4737		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
   4738	up(&fs_info->uuid_tree_rescan_sem);
   4739	return 0;
   4740}
   4741
   4742int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
   4743{
   4744	struct btrfs_trans_handle *trans;
   4745	struct btrfs_root *tree_root = fs_info->tree_root;
   4746	struct btrfs_root *uuid_root;
   4747	struct task_struct *task;
   4748	int ret;
   4749
   4750	/*
   4751	 * 1 - root node
   4752	 * 1 - root item
   4753	 */
   4754	trans = btrfs_start_transaction(tree_root, 2);
   4755	if (IS_ERR(trans))
   4756		return PTR_ERR(trans);
   4757
   4758	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
   4759	if (IS_ERR(uuid_root)) {
   4760		ret = PTR_ERR(uuid_root);
   4761		btrfs_abort_transaction(trans, ret);
   4762		btrfs_end_transaction(trans);
   4763		return ret;
   4764	}
   4765
   4766	fs_info->uuid_root = uuid_root;
   4767
   4768	ret = btrfs_commit_transaction(trans);
   4769	if (ret)
   4770		return ret;
   4771
   4772	down(&fs_info->uuid_tree_rescan_sem);
   4773	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
   4774	if (IS_ERR(task)) {
   4775		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
   4776		btrfs_warn(fs_info, "failed to start uuid_scan task");
   4777		up(&fs_info->uuid_tree_rescan_sem);
   4778		return PTR_ERR(task);
   4779	}
   4780
   4781	return 0;
   4782}
   4783
   4784/*
   4785 * shrinking a device means finding all of the device extents past
   4786 * the new size, and then following the back refs to the chunks.
   4787 * The chunk relocation code actually frees the device extent
   4788 */
   4789int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
   4790{
   4791	struct btrfs_fs_info *fs_info = device->fs_info;
   4792	struct btrfs_root *root = fs_info->dev_root;
   4793	struct btrfs_trans_handle *trans;
   4794	struct btrfs_dev_extent *dev_extent = NULL;
   4795	struct btrfs_path *path;
   4796	u64 length;
   4797	u64 chunk_offset;
   4798	int ret;
   4799	int slot;
   4800	int failed = 0;
   4801	bool retried = false;
   4802	struct extent_buffer *l;
   4803	struct btrfs_key key;
   4804	struct btrfs_super_block *super_copy = fs_info->super_copy;
   4805	u64 old_total = btrfs_super_total_bytes(super_copy);
   4806	u64 old_size = btrfs_device_get_total_bytes(device);
   4807	u64 diff;
   4808	u64 start;
   4809
   4810	new_size = round_down(new_size, fs_info->sectorsize);
   4811	start = new_size;
   4812	diff = round_down(old_size - new_size, fs_info->sectorsize);
   4813
   4814	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
   4815		return -EINVAL;
   4816
   4817	path = btrfs_alloc_path();
   4818	if (!path)
   4819		return -ENOMEM;
   4820
   4821	path->reada = READA_BACK;
   4822
   4823	trans = btrfs_start_transaction(root, 0);
   4824	if (IS_ERR(trans)) {
   4825		btrfs_free_path(path);
   4826		return PTR_ERR(trans);
   4827	}
   4828
   4829	mutex_lock(&fs_info->chunk_mutex);
   4830
   4831	btrfs_device_set_total_bytes(device, new_size);
   4832	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
   4833		device->fs_devices->total_rw_bytes -= diff;
   4834		atomic64_sub(diff, &fs_info->free_chunk_space);
   4835	}
   4836
   4837	/*
   4838	 * Once the device's size has been set to the new size, ensure all
   4839	 * in-memory chunks are synced to disk so that the loop below sees them
   4840	 * and relocates them accordingly.
   4841	 */
   4842	if (contains_pending_extent(device, &start, diff)) {
   4843		mutex_unlock(&fs_info->chunk_mutex);
   4844		ret = btrfs_commit_transaction(trans);
   4845		if (ret)
   4846			goto done;
   4847	} else {
   4848		mutex_unlock(&fs_info->chunk_mutex);
   4849		btrfs_end_transaction(trans);
   4850	}
   4851
   4852again:
   4853	key.objectid = device->devid;
   4854	key.offset = (u64)-1;
   4855	key.type = BTRFS_DEV_EXTENT_KEY;
   4856
   4857	do {
   4858		mutex_lock(&fs_info->reclaim_bgs_lock);
   4859		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   4860		if (ret < 0) {
   4861			mutex_unlock(&fs_info->reclaim_bgs_lock);
   4862			goto done;
   4863		}
   4864
   4865		ret = btrfs_previous_item(root, path, 0, key.type);
   4866		if (ret) {
   4867			mutex_unlock(&fs_info->reclaim_bgs_lock);
   4868			if (ret < 0)
   4869				goto done;
   4870			ret = 0;
   4871			btrfs_release_path(path);
   4872			break;
   4873		}
   4874
   4875		l = path->nodes[0];
   4876		slot = path->slots[0];
   4877		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
   4878
   4879		if (key.objectid != device->devid) {
   4880			mutex_unlock(&fs_info->reclaim_bgs_lock);
   4881			btrfs_release_path(path);
   4882			break;
   4883		}
   4884
   4885		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
   4886		length = btrfs_dev_extent_length(l, dev_extent);
   4887
   4888		if (key.offset + length <= new_size) {
   4889			mutex_unlock(&fs_info->reclaim_bgs_lock);
   4890			btrfs_release_path(path);
   4891			break;
   4892		}
   4893
   4894		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
   4895		btrfs_release_path(path);
   4896
   4897		/*
   4898		 * We may be relocating the only data chunk we have,
   4899		 * which could potentially end up with losing data's
   4900		 * raid profile, so lets allocate an empty one in
   4901		 * advance.
   4902		 */
   4903		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
   4904		if (ret < 0) {
   4905			mutex_unlock(&fs_info->reclaim_bgs_lock);
   4906			goto done;
   4907		}
   4908
   4909		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
   4910		mutex_unlock(&fs_info->reclaim_bgs_lock);
   4911		if (ret == -ENOSPC) {
   4912			failed++;
   4913		} else if (ret) {
   4914			if (ret == -ETXTBSY) {
   4915				btrfs_warn(fs_info,
   4916		   "could not shrink block group %llu due to active swapfile",
   4917					   chunk_offset);
   4918			}
   4919			goto done;
   4920		}
   4921	} while (key.offset-- > 0);
   4922
   4923	if (failed && !retried) {
   4924		failed = 0;
   4925		retried = true;
   4926		goto again;
   4927	} else if (failed && retried) {
   4928		ret = -ENOSPC;
   4929		goto done;
   4930	}
   4931
   4932	/* Shrinking succeeded, else we would be at "done". */
   4933	trans = btrfs_start_transaction(root, 0);
   4934	if (IS_ERR(trans)) {
   4935		ret = PTR_ERR(trans);
   4936		goto done;
   4937	}
   4938
   4939	mutex_lock(&fs_info->chunk_mutex);
   4940	/* Clear all state bits beyond the shrunk device size */
   4941	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
   4942			  CHUNK_STATE_MASK);
   4943
   4944	btrfs_device_set_disk_total_bytes(device, new_size);
   4945	if (list_empty(&device->post_commit_list))
   4946		list_add_tail(&device->post_commit_list,
   4947			      &trans->transaction->dev_update_list);
   4948
   4949	WARN_ON(diff > old_total);
   4950	btrfs_set_super_total_bytes(super_copy,
   4951			round_down(old_total - diff, fs_info->sectorsize));
   4952	mutex_unlock(&fs_info->chunk_mutex);
   4953
   4954	btrfs_reserve_chunk_metadata(trans, false);
   4955	/* Now btrfs_update_device() will change the on-disk size. */
   4956	ret = btrfs_update_device(trans, device);
   4957	btrfs_trans_release_chunk_metadata(trans);
   4958	if (ret < 0) {
   4959		btrfs_abort_transaction(trans, ret);
   4960		btrfs_end_transaction(trans);
   4961	} else {
   4962		ret = btrfs_commit_transaction(trans);
   4963	}
   4964done:
   4965	btrfs_free_path(path);
   4966	if (ret) {
   4967		mutex_lock(&fs_info->chunk_mutex);
   4968		btrfs_device_set_total_bytes(device, old_size);
   4969		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
   4970			device->fs_devices->total_rw_bytes += diff;
   4971		atomic64_add(diff, &fs_info->free_chunk_space);
   4972		mutex_unlock(&fs_info->chunk_mutex);
   4973	}
   4974	return ret;
   4975}
   4976
   4977static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
   4978			   struct btrfs_key *key,
   4979			   struct btrfs_chunk *chunk, int item_size)
   4980{
   4981	struct btrfs_super_block *super_copy = fs_info->super_copy;
   4982	struct btrfs_disk_key disk_key;
   4983	u32 array_size;
   4984	u8 *ptr;
   4985
   4986	lockdep_assert_held(&fs_info->chunk_mutex);
   4987
   4988	array_size = btrfs_super_sys_array_size(super_copy);
   4989	if (array_size + item_size + sizeof(disk_key)
   4990			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
   4991		return -EFBIG;
   4992
   4993	ptr = super_copy->sys_chunk_array + array_size;
   4994	btrfs_cpu_key_to_disk(&disk_key, key);
   4995	memcpy(ptr, &disk_key, sizeof(disk_key));
   4996	ptr += sizeof(disk_key);
   4997	memcpy(ptr, chunk, item_size);
   4998	item_size += sizeof(disk_key);
   4999	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
   5000
   5001	return 0;
   5002}
   5003
   5004/*
   5005 * sort the devices in descending order by max_avail, total_avail
   5006 */
   5007static int btrfs_cmp_device_info(const void *a, const void *b)
   5008{
   5009	const struct btrfs_device_info *di_a = a;
   5010	const struct btrfs_device_info *di_b = b;
   5011
   5012	if (di_a->max_avail > di_b->max_avail)
   5013		return -1;
   5014	if (di_a->max_avail < di_b->max_avail)
   5015		return 1;
   5016	if (di_a->total_avail > di_b->total_avail)
   5017		return -1;
   5018	if (di_a->total_avail < di_b->total_avail)
   5019		return 1;
   5020	return 0;
   5021}
   5022
   5023static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
   5024{
   5025	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
   5026		return;
   5027
   5028	btrfs_set_fs_incompat(info, RAID56);
   5029}
   5030
   5031static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
   5032{
   5033	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
   5034		return;
   5035
   5036	btrfs_set_fs_incompat(info, RAID1C34);
   5037}
   5038
   5039/*
   5040 * Structure used internally for btrfs_create_chunk() function.
   5041 * Wraps needed parameters.
   5042 */
   5043struct alloc_chunk_ctl {
   5044	u64 start;
   5045	u64 type;
   5046	/* Total number of stripes to allocate */
   5047	int num_stripes;
   5048	/* sub_stripes info for map */
   5049	int sub_stripes;
   5050	/* Stripes per device */
   5051	int dev_stripes;
   5052	/* Maximum number of devices to use */
   5053	int devs_max;
   5054	/* Minimum number of devices to use */
   5055	int devs_min;
   5056	/* ndevs has to be a multiple of this */
   5057	int devs_increment;
   5058	/* Number of copies */
   5059	int ncopies;
   5060	/* Number of stripes worth of bytes to store parity information */
   5061	int nparity;
   5062	u64 max_stripe_size;
   5063	u64 max_chunk_size;
   5064	u64 dev_extent_min;
   5065	u64 stripe_size;
   5066	u64 chunk_size;
   5067	int ndevs;
   5068};
   5069
   5070static void init_alloc_chunk_ctl_policy_regular(
   5071				struct btrfs_fs_devices *fs_devices,
   5072				struct alloc_chunk_ctl *ctl)
   5073{
   5074	u64 type = ctl->type;
   5075
   5076	if (type & BTRFS_BLOCK_GROUP_DATA) {
   5077		ctl->max_stripe_size = SZ_1G;
   5078		ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
   5079	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
   5080		/* For larger filesystems, use larger metadata chunks */
   5081		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
   5082			ctl->max_stripe_size = SZ_1G;
   5083		else
   5084			ctl->max_stripe_size = SZ_256M;
   5085		ctl->max_chunk_size = ctl->max_stripe_size;
   5086	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
   5087		ctl->max_stripe_size = SZ_32M;
   5088		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
   5089		ctl->devs_max = min_t(int, ctl->devs_max,
   5090				      BTRFS_MAX_DEVS_SYS_CHUNK);
   5091	} else {
   5092		BUG();
   5093	}
   5094
   5095	/* We don't want a chunk larger than 10% of writable space */
   5096	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
   5097				  ctl->max_chunk_size);
   5098	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
   5099}
   5100
   5101static void init_alloc_chunk_ctl_policy_zoned(
   5102				      struct btrfs_fs_devices *fs_devices,
   5103				      struct alloc_chunk_ctl *ctl)
   5104{
   5105	u64 zone_size = fs_devices->fs_info->zone_size;
   5106	u64 limit;
   5107	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
   5108	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
   5109	u64 min_chunk_size = min_data_stripes * zone_size;
   5110	u64 type = ctl->type;
   5111
   5112	ctl->max_stripe_size = zone_size;
   5113	if (type & BTRFS_BLOCK_GROUP_DATA) {
   5114		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
   5115						 zone_size);
   5116	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
   5117		ctl->max_chunk_size = ctl->max_stripe_size;
   5118	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
   5119		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
   5120		ctl->devs_max = min_t(int, ctl->devs_max,
   5121				      BTRFS_MAX_DEVS_SYS_CHUNK);
   5122	} else {
   5123		BUG();
   5124	}
   5125
   5126	/* We don't want a chunk larger than 10% of writable space */
   5127	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
   5128			       zone_size),
   5129		    min_chunk_size);
   5130	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
   5131	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
   5132}
   5133
   5134static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
   5135				 struct alloc_chunk_ctl *ctl)
   5136{
   5137	int index = btrfs_bg_flags_to_raid_index(ctl->type);
   5138
   5139	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
   5140	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
   5141	ctl->devs_max = btrfs_raid_array[index].devs_max;
   5142	if (!ctl->devs_max)
   5143		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
   5144	ctl->devs_min = btrfs_raid_array[index].devs_min;
   5145	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
   5146	ctl->ncopies = btrfs_raid_array[index].ncopies;
   5147	ctl->nparity = btrfs_raid_array[index].nparity;
   5148	ctl->ndevs = 0;
   5149
   5150	switch (fs_devices->chunk_alloc_policy) {
   5151	case BTRFS_CHUNK_ALLOC_REGULAR:
   5152		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
   5153		break;
   5154	case BTRFS_CHUNK_ALLOC_ZONED:
   5155		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
   5156		break;
   5157	default:
   5158		BUG();
   5159	}
   5160}
   5161
   5162static int gather_device_info(struct btrfs_fs_devices *fs_devices,
   5163			      struct alloc_chunk_ctl *ctl,
   5164			      struct btrfs_device_info *devices_info)
   5165{
   5166	struct btrfs_fs_info *info = fs_devices->fs_info;
   5167	struct btrfs_device *device;
   5168	u64 total_avail;
   5169	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
   5170	int ret;
   5171	int ndevs = 0;
   5172	u64 max_avail;
   5173	u64 dev_offset;
   5174
   5175	/*
   5176	 * in the first pass through the devices list, we gather information
   5177	 * about the available holes on each device.
   5178	 */
   5179	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
   5180		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
   5181			WARN(1, KERN_ERR
   5182			       "BTRFS: read-only device in alloc_list\n");
   5183			continue;
   5184		}
   5185
   5186		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
   5187					&device->dev_state) ||
   5188		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
   5189			continue;
   5190
   5191		if (device->total_bytes > device->bytes_used)
   5192			total_avail = device->total_bytes - device->bytes_used;
   5193		else
   5194			total_avail = 0;
   5195
   5196		/* If there is no space on this device, skip it. */
   5197		if (total_avail < ctl->dev_extent_min)
   5198			continue;
   5199
   5200		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
   5201					   &max_avail);
   5202		if (ret && ret != -ENOSPC)
   5203			return ret;
   5204
   5205		if (ret == 0)
   5206			max_avail = dev_extent_want;
   5207
   5208		if (max_avail < ctl->dev_extent_min) {
   5209			if (btrfs_test_opt(info, ENOSPC_DEBUG))
   5210				btrfs_debug(info,
   5211			"%s: devid %llu has no free space, have=%llu want=%llu",
   5212					    __func__, device->devid, max_avail,
   5213					    ctl->dev_extent_min);
   5214			continue;
   5215		}
   5216
   5217		if (ndevs == fs_devices->rw_devices) {
   5218			WARN(1, "%s: found more than %llu devices\n",
   5219			     __func__, fs_devices->rw_devices);
   5220			break;
   5221		}
   5222		devices_info[ndevs].dev_offset = dev_offset;
   5223		devices_info[ndevs].max_avail = max_avail;
   5224		devices_info[ndevs].total_avail = total_avail;
   5225		devices_info[ndevs].dev = device;
   5226		++ndevs;
   5227	}
   5228	ctl->ndevs = ndevs;
   5229
   5230	/*
   5231	 * now sort the devices by hole size / available space
   5232	 */
   5233	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
   5234	     btrfs_cmp_device_info, NULL);
   5235
   5236	return 0;
   5237}
   5238
   5239static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
   5240				      struct btrfs_device_info *devices_info)
   5241{
   5242	/* Number of stripes that count for block group size */
   5243	int data_stripes;
   5244
   5245	/*
   5246	 * The primary goal is to maximize the number of stripes, so use as
   5247	 * many devices as possible, even if the stripes are not maximum sized.
   5248	 *
   5249	 * The DUP profile stores more than one stripe per device, the
   5250	 * max_avail is the total size so we have to adjust.
   5251	 */
   5252	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
   5253				   ctl->dev_stripes);
   5254	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
   5255
   5256	/* This will have to be fixed for RAID1 and RAID10 over more drives */
   5257	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
   5258
   5259	/*
   5260	 * Use the number of data stripes to figure out how big this chunk is
   5261	 * really going to be in terms of logical address space, and compare
   5262	 * that answer with the max chunk size. If it's higher, we try to
   5263	 * reduce stripe_size.
   5264	 */
   5265	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
   5266		/*
   5267		 * Reduce stripe_size, round it up to a 16MB boundary again and
   5268		 * then use it, unless it ends up being even bigger than the
   5269		 * previous value we had already.
   5270		 */
   5271		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
   5272							data_stripes), SZ_16M),
   5273				       ctl->stripe_size);
   5274	}
   5275
   5276	/* Align to BTRFS_STRIPE_LEN */
   5277	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
   5278	ctl->chunk_size = ctl->stripe_size * data_stripes;
   5279
   5280	return 0;
   5281}
   5282
   5283static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
   5284				    struct btrfs_device_info *devices_info)
   5285{
   5286	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
   5287	/* Number of stripes that count for block group size */
   5288	int data_stripes;
   5289
   5290	/*
   5291	 * It should hold because:
   5292	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
   5293	 */
   5294	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
   5295
   5296	ctl->stripe_size = zone_size;
   5297	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
   5298	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
   5299
   5300	/* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
   5301	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
   5302		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
   5303					     ctl->stripe_size) + ctl->nparity,
   5304				     ctl->dev_stripes);
   5305		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
   5306		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
   5307		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
   5308	}
   5309
   5310	ctl->chunk_size = ctl->stripe_size * data_stripes;
   5311
   5312	return 0;
   5313}
   5314
   5315static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
   5316			      struct alloc_chunk_ctl *ctl,
   5317			      struct btrfs_device_info *devices_info)
   5318{
   5319	struct btrfs_fs_info *info = fs_devices->fs_info;
   5320
   5321	/*
   5322	 * Round down to number of usable stripes, devs_increment can be any
   5323	 * number so we can't use round_down() that requires power of 2, while
   5324	 * rounddown is safe.
   5325	 */
   5326	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
   5327
   5328	if (ctl->ndevs < ctl->devs_min) {
   5329		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
   5330			btrfs_debug(info,
   5331	"%s: not enough devices with free space: have=%d minimum required=%d",
   5332				    __func__, ctl->ndevs, ctl->devs_min);
   5333		}
   5334		return -ENOSPC;
   5335	}
   5336
   5337	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
   5338
   5339	switch (fs_devices->chunk_alloc_policy) {
   5340	case BTRFS_CHUNK_ALLOC_REGULAR:
   5341		return decide_stripe_size_regular(ctl, devices_info);
   5342	case BTRFS_CHUNK_ALLOC_ZONED:
   5343		return decide_stripe_size_zoned(ctl, devices_info);
   5344	default:
   5345		BUG();
   5346	}
   5347}
   5348
   5349static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
   5350			struct alloc_chunk_ctl *ctl,
   5351			struct btrfs_device_info *devices_info)
   5352{
   5353	struct btrfs_fs_info *info = trans->fs_info;
   5354	struct map_lookup *map = NULL;
   5355	struct extent_map_tree *em_tree;
   5356	struct btrfs_block_group *block_group;
   5357	struct extent_map *em;
   5358	u64 start = ctl->start;
   5359	u64 type = ctl->type;
   5360	int ret;
   5361	int i;
   5362	int j;
   5363
   5364	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
   5365	if (!map)
   5366		return ERR_PTR(-ENOMEM);
   5367	map->num_stripes = ctl->num_stripes;
   5368
   5369	for (i = 0; i < ctl->ndevs; ++i) {
   5370		for (j = 0; j < ctl->dev_stripes; ++j) {
   5371			int s = i * ctl->dev_stripes + j;
   5372			map->stripes[s].dev = devices_info[i].dev;
   5373			map->stripes[s].physical = devices_info[i].dev_offset +
   5374						   j * ctl->stripe_size;
   5375		}
   5376	}
   5377	map->stripe_len = BTRFS_STRIPE_LEN;
   5378	map->io_align = BTRFS_STRIPE_LEN;
   5379	map->io_width = BTRFS_STRIPE_LEN;
   5380	map->type = type;
   5381	map->sub_stripes = ctl->sub_stripes;
   5382
   5383	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
   5384
   5385	em = alloc_extent_map();
   5386	if (!em) {
   5387		kfree(map);
   5388		return ERR_PTR(-ENOMEM);
   5389	}
   5390	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
   5391	em->map_lookup = map;
   5392	em->start = start;
   5393	em->len = ctl->chunk_size;
   5394	em->block_start = 0;
   5395	em->block_len = em->len;
   5396	em->orig_block_len = ctl->stripe_size;
   5397
   5398	em_tree = &info->mapping_tree;
   5399	write_lock(&em_tree->lock);
   5400	ret = add_extent_mapping(em_tree, em, 0);
   5401	if (ret) {
   5402		write_unlock(&em_tree->lock);
   5403		free_extent_map(em);
   5404		return ERR_PTR(ret);
   5405	}
   5406	write_unlock(&em_tree->lock);
   5407
   5408	block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
   5409	if (IS_ERR(block_group))
   5410		goto error_del_extent;
   5411
   5412	for (i = 0; i < map->num_stripes; i++) {
   5413		struct btrfs_device *dev = map->stripes[i].dev;
   5414
   5415		btrfs_device_set_bytes_used(dev,
   5416					    dev->bytes_used + ctl->stripe_size);
   5417		if (list_empty(&dev->post_commit_list))
   5418			list_add_tail(&dev->post_commit_list,
   5419				      &trans->transaction->dev_update_list);
   5420	}
   5421
   5422	atomic64_sub(ctl->stripe_size * map->num_stripes,
   5423		     &info->free_chunk_space);
   5424
   5425	free_extent_map(em);
   5426	check_raid56_incompat_flag(info, type);
   5427	check_raid1c34_incompat_flag(info, type);
   5428
   5429	return block_group;
   5430
   5431error_del_extent:
   5432	write_lock(&em_tree->lock);
   5433	remove_extent_mapping(em_tree, em);
   5434	write_unlock(&em_tree->lock);
   5435
   5436	/* One for our allocation */
   5437	free_extent_map(em);
   5438	/* One for the tree reference */
   5439	free_extent_map(em);
   5440
   5441	return block_group;
   5442}
   5443
   5444struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
   5445					    u64 type)
   5446{
   5447	struct btrfs_fs_info *info = trans->fs_info;
   5448	struct btrfs_fs_devices *fs_devices = info->fs_devices;
   5449	struct btrfs_device_info *devices_info = NULL;
   5450	struct alloc_chunk_ctl ctl;
   5451	struct btrfs_block_group *block_group;
   5452	int ret;
   5453
   5454	lockdep_assert_held(&info->chunk_mutex);
   5455
   5456	if (!alloc_profile_is_valid(type, 0)) {
   5457		ASSERT(0);
   5458		return ERR_PTR(-EINVAL);
   5459	}
   5460
   5461	if (list_empty(&fs_devices->alloc_list)) {
   5462		if (btrfs_test_opt(info, ENOSPC_DEBUG))
   5463			btrfs_debug(info, "%s: no writable device", __func__);
   5464		return ERR_PTR(-ENOSPC);
   5465	}
   5466
   5467	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
   5468		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
   5469		ASSERT(0);
   5470		return ERR_PTR(-EINVAL);
   5471	}
   5472
   5473	ctl.start = find_next_chunk(info);
   5474	ctl.type = type;
   5475	init_alloc_chunk_ctl(fs_devices, &ctl);
   5476
   5477	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
   5478			       GFP_NOFS);
   5479	if (!devices_info)
   5480		return ERR_PTR(-ENOMEM);
   5481
   5482	ret = gather_device_info(fs_devices, &ctl, devices_info);
   5483	if (ret < 0) {
   5484		block_group = ERR_PTR(ret);
   5485		goto out;
   5486	}
   5487
   5488	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
   5489	if (ret < 0) {
   5490		block_group = ERR_PTR(ret);
   5491		goto out;
   5492	}
   5493
   5494	block_group = create_chunk(trans, &ctl, devices_info);
   5495
   5496out:
   5497	kfree(devices_info);
   5498	return block_group;
   5499}
   5500
   5501/*
   5502 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
   5503 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
   5504 * chunks.
   5505 *
   5506 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
   5507 * phases.
   5508 */
   5509int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
   5510				     struct btrfs_block_group *bg)
   5511{
   5512	struct btrfs_fs_info *fs_info = trans->fs_info;
   5513	struct btrfs_root *chunk_root = fs_info->chunk_root;
   5514	struct btrfs_key key;
   5515	struct btrfs_chunk *chunk;
   5516	struct btrfs_stripe *stripe;
   5517	struct extent_map *em;
   5518	struct map_lookup *map;
   5519	size_t item_size;
   5520	int i;
   5521	int ret;
   5522
   5523	/*
   5524	 * We take the chunk_mutex for 2 reasons:
   5525	 *
   5526	 * 1) Updates and insertions in the chunk btree must be done while holding
   5527	 *    the chunk_mutex, as well as updating the system chunk array in the
   5528	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
   5529	 *    details;
   5530	 *
   5531	 * 2) To prevent races with the final phase of a device replace operation
   5532	 *    that replaces the device object associated with the map's stripes,
   5533	 *    because the device object's id can change at any time during that
   5534	 *    final phase of the device replace operation
   5535	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
   5536	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
   5537	 *    which would cause a failure when updating the device item, which does
   5538	 *    not exists, or persisting a stripe of the chunk item with such ID.
   5539	 *    Here we can't use the device_list_mutex because our caller already
   5540	 *    has locked the chunk_mutex, and the final phase of device replace
   5541	 *    acquires both mutexes - first the device_list_mutex and then the
   5542	 *    chunk_mutex. Using any of those two mutexes protects us from a
   5543	 *    concurrent device replace.
   5544	 */
   5545	lockdep_assert_held(&fs_info->chunk_mutex);
   5546
   5547	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
   5548	if (IS_ERR(em)) {
   5549		ret = PTR_ERR(em);
   5550		btrfs_abort_transaction(trans, ret);
   5551		return ret;
   5552	}
   5553
   5554	map = em->map_lookup;
   5555	item_size = btrfs_chunk_item_size(map->num_stripes);
   5556
   5557	chunk = kzalloc(item_size, GFP_NOFS);
   5558	if (!chunk) {
   5559		ret = -ENOMEM;
   5560		btrfs_abort_transaction(trans, ret);
   5561		goto out;
   5562	}
   5563
   5564	for (i = 0; i < map->num_stripes; i++) {
   5565		struct btrfs_device *device = map->stripes[i].dev;
   5566
   5567		ret = btrfs_update_device(trans, device);
   5568		if (ret)
   5569			goto out;
   5570	}
   5571
   5572	stripe = &chunk->stripe;
   5573	for (i = 0; i < map->num_stripes; i++) {
   5574		struct btrfs_device *device = map->stripes[i].dev;
   5575		const u64 dev_offset = map->stripes[i].physical;
   5576
   5577		btrfs_set_stack_stripe_devid(stripe, device->devid);
   5578		btrfs_set_stack_stripe_offset(stripe, dev_offset);
   5579		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
   5580		stripe++;
   5581	}
   5582
   5583	btrfs_set_stack_chunk_length(chunk, bg->length);
   5584	btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
   5585	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
   5586	btrfs_set_stack_chunk_type(chunk, map->type);
   5587	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
   5588	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
   5589	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
   5590	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
   5591	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
   5592
   5593	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
   5594	key.type = BTRFS_CHUNK_ITEM_KEY;
   5595	key.offset = bg->start;
   5596
   5597	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
   5598	if (ret)
   5599		goto out;
   5600
   5601	bg->chunk_item_inserted = 1;
   5602
   5603	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
   5604		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
   5605		if (ret)
   5606			goto out;
   5607	}
   5608
   5609out:
   5610	kfree(chunk);
   5611	free_extent_map(em);
   5612	return ret;
   5613}
   5614
   5615static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
   5616{
   5617	struct btrfs_fs_info *fs_info = trans->fs_info;
   5618	u64 alloc_profile;
   5619	struct btrfs_block_group *meta_bg;
   5620	struct btrfs_block_group *sys_bg;
   5621
   5622	/*
   5623	 * When adding a new device for sprouting, the seed device is read-only
   5624	 * so we must first allocate a metadata and a system chunk. But before
   5625	 * adding the block group items to the extent, device and chunk btrees,
   5626	 * we must first:
   5627	 *
   5628	 * 1) Create both chunks without doing any changes to the btrees, as
   5629	 *    otherwise we would get -ENOSPC since the block groups from the
   5630	 *    seed device are read-only;
   5631	 *
   5632	 * 2) Add the device item for the new sprout device - finishing the setup
   5633	 *    of a new block group requires updating the device item in the chunk
   5634	 *    btree, so it must exist when we attempt to do it. The previous step
   5635	 *    ensures this does not fail with -ENOSPC.
   5636	 *
   5637	 * After that we can add the block group items to their btrees:
   5638	 * update existing device item in the chunk btree, add a new block group
   5639	 * item to the extent btree, add a new chunk item to the chunk btree and
   5640	 * finally add the new device extent items to the devices btree.
   5641	 */
   5642
   5643	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
   5644	meta_bg = btrfs_create_chunk(trans, alloc_profile);
   5645	if (IS_ERR(meta_bg))
   5646		return PTR_ERR(meta_bg);
   5647
   5648	alloc_profile = btrfs_system_alloc_profile(fs_info);
   5649	sys_bg = btrfs_create_chunk(trans, alloc_profile);
   5650	if (IS_ERR(sys_bg))
   5651		return PTR_ERR(sys_bg);
   5652
   5653	return 0;
   5654}
   5655
   5656static inline int btrfs_chunk_max_errors(struct map_lookup *map)
   5657{
   5658	const int index = btrfs_bg_flags_to_raid_index(map->type);
   5659
   5660	return btrfs_raid_array[index].tolerated_failures;
   5661}
   5662
   5663bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
   5664{
   5665	struct extent_map *em;
   5666	struct map_lookup *map;
   5667	int miss_ndevs = 0;
   5668	int i;
   5669	bool ret = true;
   5670
   5671	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
   5672	if (IS_ERR(em))
   5673		return false;
   5674
   5675	map = em->map_lookup;
   5676	for (i = 0; i < map->num_stripes; i++) {
   5677		if (test_bit(BTRFS_DEV_STATE_MISSING,
   5678					&map->stripes[i].dev->dev_state)) {
   5679			miss_ndevs++;
   5680			continue;
   5681		}
   5682		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
   5683					&map->stripes[i].dev->dev_state)) {
   5684			ret = false;
   5685			goto end;
   5686		}
   5687	}
   5688
   5689	/*
   5690	 * If the number of missing devices is larger than max errors, we can
   5691	 * not write the data into that chunk successfully.
   5692	 */
   5693	if (miss_ndevs > btrfs_chunk_max_errors(map))
   5694		ret = false;
   5695end:
   5696	free_extent_map(em);
   5697	return ret;
   5698}
   5699
   5700void btrfs_mapping_tree_free(struct extent_map_tree *tree)
   5701{
   5702	struct extent_map *em;
   5703
   5704	while (1) {
   5705		write_lock(&tree->lock);
   5706		em = lookup_extent_mapping(tree, 0, (u64)-1);
   5707		if (em)
   5708			remove_extent_mapping(tree, em);
   5709		write_unlock(&tree->lock);
   5710		if (!em)
   5711			break;
   5712		/* once for us */
   5713		free_extent_map(em);
   5714		/* once for the tree */
   5715		free_extent_map(em);
   5716	}
   5717}
   5718
   5719int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
   5720{
   5721	struct extent_map *em;
   5722	struct map_lookup *map;
   5723	int ret;
   5724
   5725	em = btrfs_get_chunk_map(fs_info, logical, len);
   5726	if (IS_ERR(em))
   5727		/*
   5728		 * We could return errors for these cases, but that could get
   5729		 * ugly and we'd probably do the same thing which is just not do
   5730		 * anything else and exit, so return 1 so the callers don't try
   5731		 * to use other copies.
   5732		 */
   5733		return 1;
   5734
   5735	map = em->map_lookup;
   5736	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
   5737		ret = map->num_stripes;
   5738	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
   5739		ret = map->sub_stripes;
   5740	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
   5741		ret = 2;
   5742	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
   5743		/*
   5744		 * There could be two corrupted data stripes, we need
   5745		 * to loop retry in order to rebuild the correct data.
   5746		 *
   5747		 * Fail a stripe at a time on every retry except the
   5748		 * stripe under reconstruction.
   5749		 */
   5750		ret = map->num_stripes;
   5751	else
   5752		ret = 1;
   5753	free_extent_map(em);
   5754
   5755	down_read(&fs_info->dev_replace.rwsem);
   5756	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
   5757	    fs_info->dev_replace.tgtdev)
   5758		ret++;
   5759	up_read(&fs_info->dev_replace.rwsem);
   5760
   5761	return ret;
   5762}
   5763
   5764unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
   5765				    u64 logical)
   5766{
   5767	struct extent_map *em;
   5768	struct map_lookup *map;
   5769	unsigned long len = fs_info->sectorsize;
   5770
   5771	em = btrfs_get_chunk_map(fs_info, logical, len);
   5772
   5773	if (!WARN_ON(IS_ERR(em))) {
   5774		map = em->map_lookup;
   5775		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
   5776			len = map->stripe_len * nr_data_stripes(map);
   5777		free_extent_map(em);
   5778	}
   5779	return len;
   5780}
   5781
   5782int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
   5783{
   5784	struct extent_map *em;
   5785	struct map_lookup *map;
   5786	int ret = 0;
   5787
   5788	em = btrfs_get_chunk_map(fs_info, logical, len);
   5789
   5790	if(!WARN_ON(IS_ERR(em))) {
   5791		map = em->map_lookup;
   5792		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
   5793			ret = 1;
   5794		free_extent_map(em);
   5795	}
   5796	return ret;
   5797}
   5798
   5799static int find_live_mirror(struct btrfs_fs_info *fs_info,
   5800			    struct map_lookup *map, int first,
   5801			    int dev_replace_is_ongoing)
   5802{
   5803	int i;
   5804	int num_stripes;
   5805	int preferred_mirror;
   5806	int tolerance;
   5807	struct btrfs_device *srcdev;
   5808
   5809	ASSERT((map->type &
   5810		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
   5811
   5812	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
   5813		num_stripes = map->sub_stripes;
   5814	else
   5815		num_stripes = map->num_stripes;
   5816
   5817	switch (fs_info->fs_devices->read_policy) {
   5818	default:
   5819		/* Shouldn't happen, just warn and use pid instead of failing */
   5820		btrfs_warn_rl(fs_info,
   5821			      "unknown read_policy type %u, reset to pid",
   5822			      fs_info->fs_devices->read_policy);
   5823		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
   5824		fallthrough;
   5825	case BTRFS_READ_POLICY_PID:
   5826		preferred_mirror = first + (current->pid % num_stripes);
   5827		break;
   5828	}
   5829
   5830	if (dev_replace_is_ongoing &&
   5831	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
   5832	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
   5833		srcdev = fs_info->dev_replace.srcdev;
   5834	else
   5835		srcdev = NULL;
   5836
   5837	/*
   5838	 * try to avoid the drive that is the source drive for a
   5839	 * dev-replace procedure, only choose it if no other non-missing
   5840	 * mirror is available
   5841	 */
   5842	for (tolerance = 0; tolerance < 2; tolerance++) {
   5843		if (map->stripes[preferred_mirror].dev->bdev &&
   5844		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
   5845			return preferred_mirror;
   5846		for (i = first; i < first + num_stripes; i++) {
   5847			if (map->stripes[i].dev->bdev &&
   5848			    (tolerance || map->stripes[i].dev != srcdev))
   5849				return i;
   5850		}
   5851	}
   5852
   5853	/* we couldn't find one that doesn't fail.  Just return something
   5854	 * and the io error handling code will clean up eventually
   5855	 */
   5856	return preferred_mirror;
   5857}
   5858
   5859/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
   5860static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
   5861{
   5862	int i;
   5863	int again = 1;
   5864
   5865	while (again) {
   5866		again = 0;
   5867		for (i = 0; i < num_stripes - 1; i++) {
   5868			/* Swap if parity is on a smaller index */
   5869			if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
   5870				swap(bioc->stripes[i], bioc->stripes[i + 1]);
   5871				swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
   5872				again = 1;
   5873			}
   5874		}
   5875	}
   5876}
   5877
   5878static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
   5879						       int total_stripes,
   5880						       int real_stripes)
   5881{
   5882	struct btrfs_io_context *bioc = kzalloc(
   5883		 /* The size of btrfs_io_context */
   5884		sizeof(struct btrfs_io_context) +
   5885		/* Plus the variable array for the stripes */
   5886		sizeof(struct btrfs_io_stripe) * (total_stripes) +
   5887		/* Plus the variable array for the tgt dev */
   5888		sizeof(int) * (real_stripes) +
   5889		/*
   5890		 * Plus the raid_map, which includes both the tgt dev
   5891		 * and the stripes.
   5892		 */
   5893		sizeof(u64) * (total_stripes),
   5894		GFP_NOFS|__GFP_NOFAIL);
   5895
   5896	atomic_set(&bioc->error, 0);
   5897	refcount_set(&bioc->refs, 1);
   5898
   5899	bioc->fs_info = fs_info;
   5900	bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
   5901	bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
   5902
   5903	return bioc;
   5904}
   5905
   5906void btrfs_get_bioc(struct btrfs_io_context *bioc)
   5907{
   5908	WARN_ON(!refcount_read(&bioc->refs));
   5909	refcount_inc(&bioc->refs);
   5910}
   5911
   5912void btrfs_put_bioc(struct btrfs_io_context *bioc)
   5913{
   5914	if (!bioc)
   5915		return;
   5916	if (refcount_dec_and_test(&bioc->refs))
   5917		kfree(bioc);
   5918}
   5919
   5920/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
   5921/*
   5922 * Please note that, discard won't be sent to target device of device
   5923 * replace.
   5924 */
   5925static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
   5926					 u64 logical, u64 *length_ret,
   5927					 struct btrfs_io_context **bioc_ret)
   5928{
   5929	struct extent_map *em;
   5930	struct map_lookup *map;
   5931	struct btrfs_io_context *bioc;
   5932	u64 length = *length_ret;
   5933	u64 offset;
   5934	u64 stripe_nr;
   5935	u64 stripe_nr_end;
   5936	u64 stripe_end_offset;
   5937	u64 stripe_cnt;
   5938	u64 stripe_len;
   5939	u64 stripe_offset;
   5940	u64 num_stripes;
   5941	u32 stripe_index;
   5942	u32 factor = 0;
   5943	u32 sub_stripes = 0;
   5944	u64 stripes_per_dev = 0;
   5945	u32 remaining_stripes = 0;
   5946	u32 last_stripe = 0;
   5947	int ret = 0;
   5948	int i;
   5949
   5950	/* Discard always returns a bioc. */
   5951	ASSERT(bioc_ret);
   5952
   5953	em = btrfs_get_chunk_map(fs_info, logical, length);
   5954	if (IS_ERR(em))
   5955		return PTR_ERR(em);
   5956
   5957	map = em->map_lookup;
   5958	/* we don't discard raid56 yet */
   5959	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
   5960		ret = -EOPNOTSUPP;
   5961		goto out;
   5962	}
   5963
   5964	offset = logical - em->start;
   5965	length = min_t(u64, em->start + em->len - logical, length);
   5966	*length_ret = length;
   5967
   5968	stripe_len = map->stripe_len;
   5969	/*
   5970	 * stripe_nr counts the total number of stripes we have to stride
   5971	 * to get to this block
   5972	 */
   5973	stripe_nr = div64_u64(offset, stripe_len);
   5974
   5975	/* stripe_offset is the offset of this block in its stripe */
   5976	stripe_offset = offset - stripe_nr * stripe_len;
   5977
   5978	stripe_nr_end = round_up(offset + length, map->stripe_len);
   5979	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
   5980	stripe_cnt = stripe_nr_end - stripe_nr;
   5981	stripe_end_offset = stripe_nr_end * map->stripe_len -
   5982			    (offset + length);
   5983	/*
   5984	 * after this, stripe_nr is the number of stripes on this
   5985	 * device we have to walk to find the data, and stripe_index is
   5986	 * the number of our device in the stripe array
   5987	 */
   5988	num_stripes = 1;
   5989	stripe_index = 0;
   5990	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
   5991			 BTRFS_BLOCK_GROUP_RAID10)) {
   5992		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
   5993			sub_stripes = 1;
   5994		else
   5995			sub_stripes = map->sub_stripes;
   5996
   5997		factor = map->num_stripes / sub_stripes;
   5998		num_stripes = min_t(u64, map->num_stripes,
   5999				    sub_stripes * stripe_cnt);
   6000		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
   6001		stripe_index *= sub_stripes;
   6002		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
   6003					      &remaining_stripes);
   6004		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
   6005		last_stripe *= sub_stripes;
   6006	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
   6007				BTRFS_BLOCK_GROUP_DUP)) {
   6008		num_stripes = map->num_stripes;
   6009	} else {
   6010		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
   6011					&stripe_index);
   6012	}
   6013
   6014	bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
   6015	if (!bioc) {
   6016		ret = -ENOMEM;
   6017		goto out;
   6018	}
   6019
   6020	for (i = 0; i < num_stripes; i++) {
   6021		bioc->stripes[i].physical =
   6022			map->stripes[stripe_index].physical +
   6023			stripe_offset + stripe_nr * map->stripe_len;
   6024		bioc->stripes[i].dev = map->stripes[stripe_index].dev;
   6025
   6026		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
   6027				 BTRFS_BLOCK_GROUP_RAID10)) {
   6028			bioc->stripes[i].length = stripes_per_dev *
   6029				map->stripe_len;
   6030
   6031			if (i / sub_stripes < remaining_stripes)
   6032				bioc->stripes[i].length += map->stripe_len;
   6033
   6034			/*
   6035			 * Special for the first stripe and
   6036			 * the last stripe:
   6037			 *
   6038			 * |-------|...|-------|
   6039			 *     |----------|
   6040			 *    off     end_off
   6041			 */
   6042			if (i < sub_stripes)
   6043				bioc->stripes[i].length -= stripe_offset;
   6044
   6045			if (stripe_index >= last_stripe &&
   6046			    stripe_index <= (last_stripe +
   6047					     sub_stripes - 1))
   6048				bioc->stripes[i].length -= stripe_end_offset;
   6049
   6050			if (i == sub_stripes - 1)
   6051				stripe_offset = 0;
   6052		} else {
   6053			bioc->stripes[i].length = length;
   6054		}
   6055
   6056		stripe_index++;
   6057		if (stripe_index == map->num_stripes) {
   6058			stripe_index = 0;
   6059			stripe_nr++;
   6060		}
   6061	}
   6062
   6063	*bioc_ret = bioc;
   6064	bioc->map_type = map->type;
   6065	bioc->num_stripes = num_stripes;
   6066out:
   6067	free_extent_map(em);
   6068	return ret;
   6069}
   6070
   6071/*
   6072 * In dev-replace case, for repair case (that's the only case where the mirror
   6073 * is selected explicitly when calling btrfs_map_block), blocks left of the
   6074 * left cursor can also be read from the target drive.
   6075 *
   6076 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
   6077 * array of stripes.
   6078 * For READ, it also needs to be supported using the same mirror number.
   6079 *
   6080 * If the requested block is not left of the left cursor, EIO is returned. This
   6081 * can happen because btrfs_num_copies() returns one more in the dev-replace
   6082 * case.
   6083 */
   6084static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
   6085					 u64 logical, u64 length,
   6086					 u64 srcdev_devid, int *mirror_num,
   6087					 u64 *physical)
   6088{
   6089	struct btrfs_io_context *bioc = NULL;
   6090	int num_stripes;
   6091	int index_srcdev = 0;
   6092	int found = 0;
   6093	u64 physical_of_found = 0;
   6094	int i;
   6095	int ret = 0;
   6096
   6097	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
   6098				logical, &length, &bioc, 0, 0);
   6099	if (ret) {
   6100		ASSERT(bioc == NULL);
   6101		return ret;
   6102	}
   6103
   6104	num_stripes = bioc->num_stripes;
   6105	if (*mirror_num > num_stripes) {
   6106		/*
   6107		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
   6108		 * that means that the requested area is not left of the left
   6109		 * cursor
   6110		 */
   6111		btrfs_put_bioc(bioc);
   6112		return -EIO;
   6113	}
   6114
   6115	/*
   6116	 * process the rest of the function using the mirror_num of the source
   6117	 * drive. Therefore look it up first.  At the end, patch the device
   6118	 * pointer to the one of the target drive.
   6119	 */
   6120	for (i = 0; i < num_stripes; i++) {
   6121		if (bioc->stripes[i].dev->devid != srcdev_devid)
   6122			continue;
   6123
   6124		/*
   6125		 * In case of DUP, in order to keep it simple, only add the
   6126		 * mirror with the lowest physical address
   6127		 */
   6128		if (found &&
   6129		    physical_of_found <= bioc->stripes[i].physical)
   6130			continue;
   6131
   6132		index_srcdev = i;
   6133		found = 1;
   6134		physical_of_found = bioc->stripes[i].physical;
   6135	}
   6136
   6137	btrfs_put_bioc(bioc);
   6138
   6139	ASSERT(found);
   6140	if (!found)
   6141		return -EIO;
   6142
   6143	*mirror_num = index_srcdev + 1;
   6144	*physical = physical_of_found;
   6145	return ret;
   6146}
   6147
   6148static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
   6149{
   6150	struct btrfs_block_group *cache;
   6151	bool ret;
   6152
   6153	/* Non zoned filesystem does not use "to_copy" flag */
   6154	if (!btrfs_is_zoned(fs_info))
   6155		return false;
   6156
   6157	cache = btrfs_lookup_block_group(fs_info, logical);
   6158
   6159	spin_lock(&cache->lock);
   6160	ret = cache->to_copy;
   6161	spin_unlock(&cache->lock);
   6162
   6163	btrfs_put_block_group(cache);
   6164	return ret;
   6165}
   6166
   6167static void handle_ops_on_dev_replace(enum btrfs_map_op op,
   6168				      struct btrfs_io_context **bioc_ret,
   6169				      struct btrfs_dev_replace *dev_replace,
   6170				      u64 logical,
   6171				      int *num_stripes_ret, int *max_errors_ret)
   6172{
   6173	struct btrfs_io_context *bioc = *bioc_ret;
   6174	u64 srcdev_devid = dev_replace->srcdev->devid;
   6175	int tgtdev_indexes = 0;
   6176	int num_stripes = *num_stripes_ret;
   6177	int max_errors = *max_errors_ret;
   6178	int i;
   6179
   6180	if (op == BTRFS_MAP_WRITE) {
   6181		int index_where_to_add;
   6182
   6183		/*
   6184		 * A block group which have "to_copy" set will eventually
   6185		 * copied by dev-replace process. We can avoid cloning IO here.
   6186		 */
   6187		if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
   6188			return;
   6189
   6190		/*
   6191		 * duplicate the write operations while the dev replace
   6192		 * procedure is running. Since the copying of the old disk to
   6193		 * the new disk takes place at run time while the filesystem is
   6194		 * mounted writable, the regular write operations to the old
   6195		 * disk have to be duplicated to go to the new disk as well.
   6196		 *
   6197		 * Note that device->missing is handled by the caller, and that
   6198		 * the write to the old disk is already set up in the stripes
   6199		 * array.
   6200		 */
   6201		index_where_to_add = num_stripes;
   6202		for (i = 0; i < num_stripes; i++) {
   6203			if (bioc->stripes[i].dev->devid == srcdev_devid) {
   6204				/* write to new disk, too */
   6205				struct btrfs_io_stripe *new =
   6206					bioc->stripes + index_where_to_add;
   6207				struct btrfs_io_stripe *old =
   6208					bioc->stripes + i;
   6209
   6210				new->physical = old->physical;
   6211				new->length = old->length;
   6212				new->dev = dev_replace->tgtdev;
   6213				bioc->tgtdev_map[i] = index_where_to_add;
   6214				index_where_to_add++;
   6215				max_errors++;
   6216				tgtdev_indexes++;
   6217			}
   6218		}
   6219		num_stripes = index_where_to_add;
   6220	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
   6221		int index_srcdev = 0;
   6222		int found = 0;
   6223		u64 physical_of_found = 0;
   6224
   6225		/*
   6226		 * During the dev-replace procedure, the target drive can also
   6227		 * be used to read data in case it is needed to repair a corrupt
   6228		 * block elsewhere. This is possible if the requested area is
   6229		 * left of the left cursor. In this area, the target drive is a
   6230		 * full copy of the source drive.
   6231		 */
   6232		for (i = 0; i < num_stripes; i++) {
   6233			if (bioc->stripes[i].dev->devid == srcdev_devid) {
   6234				/*
   6235				 * In case of DUP, in order to keep it simple,
   6236				 * only add the mirror with the lowest physical
   6237				 * address
   6238				 */
   6239				if (found &&
   6240				    physical_of_found <= bioc->stripes[i].physical)
   6241					continue;
   6242				index_srcdev = i;
   6243				found = 1;
   6244				physical_of_found = bioc->stripes[i].physical;
   6245			}
   6246		}
   6247		if (found) {
   6248			struct btrfs_io_stripe *tgtdev_stripe =
   6249				bioc->stripes + num_stripes;
   6250
   6251			tgtdev_stripe->physical = physical_of_found;
   6252			tgtdev_stripe->length =
   6253				bioc->stripes[index_srcdev].length;
   6254			tgtdev_stripe->dev = dev_replace->tgtdev;
   6255			bioc->tgtdev_map[index_srcdev] = num_stripes;
   6256
   6257			tgtdev_indexes++;
   6258			num_stripes++;
   6259		}
   6260	}
   6261
   6262	*num_stripes_ret = num_stripes;
   6263	*max_errors_ret = max_errors;
   6264	bioc->num_tgtdevs = tgtdev_indexes;
   6265	*bioc_ret = bioc;
   6266}
   6267
   6268static bool need_full_stripe(enum btrfs_map_op op)
   6269{
   6270	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
   6271}
   6272
   6273/*
   6274 * Calculate the geometry of a particular (address, len) tuple. This
   6275 * information is used to calculate how big a particular bio can get before it
   6276 * straddles a stripe.
   6277 *
   6278 * @fs_info: the filesystem
   6279 * @em:      mapping containing the logical extent
   6280 * @op:      type of operation - write or read
   6281 * @logical: address that we want to figure out the geometry of
   6282 * @io_geom: pointer used to return values
   6283 *
   6284 * Returns < 0 in case a chunk for the given logical address cannot be found,
   6285 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
   6286 */
   6287int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
   6288			  enum btrfs_map_op op, u64 logical,
   6289			  struct btrfs_io_geometry *io_geom)
   6290{
   6291	struct map_lookup *map;
   6292	u64 len;
   6293	u64 offset;
   6294	u64 stripe_offset;
   6295	u64 stripe_nr;
   6296	u32 stripe_len;
   6297	u64 raid56_full_stripe_start = (u64)-1;
   6298	int data_stripes;
   6299
   6300	ASSERT(op != BTRFS_MAP_DISCARD);
   6301
   6302	map = em->map_lookup;
   6303	/* Offset of this logical address in the chunk */
   6304	offset = logical - em->start;
   6305	/* Len of a stripe in a chunk */
   6306	stripe_len = map->stripe_len;
   6307	/*
   6308	 * Stripe_nr is where this block falls in
   6309	 * stripe_offset is the offset of this block in its stripe.
   6310	 */
   6311	stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
   6312	ASSERT(stripe_offset < U32_MAX);
   6313
   6314	data_stripes = nr_data_stripes(map);
   6315
   6316	/* Only stripe based profiles needs to check against stripe length. */
   6317	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
   6318		u64 max_len = stripe_len - stripe_offset;
   6319
   6320		/*
   6321		 * In case of raid56, we need to know the stripe aligned start
   6322		 */
   6323		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
   6324			unsigned long full_stripe_len = stripe_len * data_stripes;
   6325			raid56_full_stripe_start = offset;
   6326
   6327			/*
   6328			 * Allow a write of a full stripe, but make sure we
   6329			 * don't allow straddling of stripes
   6330			 */
   6331			raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
   6332					full_stripe_len);
   6333			raid56_full_stripe_start *= full_stripe_len;
   6334
   6335			/*
   6336			 * For writes to RAID[56], allow a full stripeset across
   6337			 * all disks. For other RAID types and for RAID[56]
   6338			 * reads, just allow a single stripe (on a single disk).
   6339			 */
   6340			if (op == BTRFS_MAP_WRITE) {
   6341				max_len = stripe_len * data_stripes -
   6342					  (offset - raid56_full_stripe_start);
   6343			}
   6344		}
   6345		len = min_t(u64, em->len - offset, max_len);
   6346	} else {
   6347		len = em->len - offset;
   6348	}
   6349
   6350	io_geom->len = len;
   6351	io_geom->offset = offset;
   6352	io_geom->stripe_len = stripe_len;
   6353	io_geom->stripe_nr = stripe_nr;
   6354	io_geom->stripe_offset = stripe_offset;
   6355	io_geom->raid56_stripe_offset = raid56_full_stripe_start;
   6356
   6357	return 0;
   6358}
   6359
   6360static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
   6361			     enum btrfs_map_op op,
   6362			     u64 logical, u64 *length,
   6363			     struct btrfs_io_context **bioc_ret,
   6364			     int mirror_num, int need_raid_map)
   6365{
   6366	struct extent_map *em;
   6367	struct map_lookup *map;
   6368	u64 stripe_offset;
   6369	u64 stripe_nr;
   6370	u64 stripe_len;
   6371	u32 stripe_index;
   6372	int data_stripes;
   6373	int i;
   6374	int ret = 0;
   6375	int num_stripes;
   6376	int max_errors = 0;
   6377	int tgtdev_indexes = 0;
   6378	struct btrfs_io_context *bioc = NULL;
   6379	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   6380	int dev_replace_is_ongoing = 0;
   6381	int num_alloc_stripes;
   6382	int patch_the_first_stripe_for_dev_replace = 0;
   6383	u64 physical_to_patch_in_first_stripe = 0;
   6384	u64 raid56_full_stripe_start = (u64)-1;
   6385	struct btrfs_io_geometry geom;
   6386
   6387	ASSERT(bioc_ret);
   6388	ASSERT(op != BTRFS_MAP_DISCARD);
   6389
   6390	em = btrfs_get_chunk_map(fs_info, logical, *length);
   6391	ASSERT(!IS_ERR(em));
   6392
   6393	ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
   6394	if (ret < 0)
   6395		return ret;
   6396
   6397	map = em->map_lookup;
   6398
   6399	*length = geom.len;
   6400	stripe_len = geom.stripe_len;
   6401	stripe_nr = geom.stripe_nr;
   6402	stripe_offset = geom.stripe_offset;
   6403	raid56_full_stripe_start = geom.raid56_stripe_offset;
   6404	data_stripes = nr_data_stripes(map);
   6405
   6406	down_read(&dev_replace->rwsem);
   6407	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
   6408	/*
   6409	 * Hold the semaphore for read during the whole operation, write is
   6410	 * requested at commit time but must wait.
   6411	 */
   6412	if (!dev_replace_is_ongoing)
   6413		up_read(&dev_replace->rwsem);
   6414
   6415	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
   6416	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
   6417		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
   6418						    dev_replace->srcdev->devid,
   6419						    &mirror_num,
   6420					    &physical_to_patch_in_first_stripe);
   6421		if (ret)
   6422			goto out;
   6423		else
   6424			patch_the_first_stripe_for_dev_replace = 1;
   6425	} else if (mirror_num > map->num_stripes) {
   6426		mirror_num = 0;
   6427	}
   6428
   6429	num_stripes = 1;
   6430	stripe_index = 0;
   6431	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
   6432		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
   6433				&stripe_index);
   6434		if (!need_full_stripe(op))
   6435			mirror_num = 1;
   6436	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
   6437		if (need_full_stripe(op))
   6438			num_stripes = map->num_stripes;
   6439		else if (mirror_num)
   6440			stripe_index = mirror_num - 1;
   6441		else {
   6442			stripe_index = find_live_mirror(fs_info, map, 0,
   6443					    dev_replace_is_ongoing);
   6444			mirror_num = stripe_index + 1;
   6445		}
   6446
   6447	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
   6448		if (need_full_stripe(op)) {
   6449			num_stripes = map->num_stripes;
   6450		} else if (mirror_num) {
   6451			stripe_index = mirror_num - 1;
   6452		} else {
   6453			mirror_num = 1;
   6454		}
   6455
   6456	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
   6457		u32 factor = map->num_stripes / map->sub_stripes;
   6458
   6459		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
   6460		stripe_index *= map->sub_stripes;
   6461
   6462		if (need_full_stripe(op))
   6463			num_stripes = map->sub_stripes;
   6464		else if (mirror_num)
   6465			stripe_index += mirror_num - 1;
   6466		else {
   6467			int old_stripe_index = stripe_index;
   6468			stripe_index = find_live_mirror(fs_info, map,
   6469					      stripe_index,
   6470					      dev_replace_is_ongoing);
   6471			mirror_num = stripe_index - old_stripe_index + 1;
   6472		}
   6473
   6474	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
   6475		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
   6476			/* push stripe_nr back to the start of the full stripe */
   6477			stripe_nr = div64_u64(raid56_full_stripe_start,
   6478					stripe_len * data_stripes);
   6479
   6480			/* RAID[56] write or recovery. Return all stripes */
   6481			num_stripes = map->num_stripes;
   6482			max_errors = nr_parity_stripes(map);
   6483
   6484			*length = map->stripe_len;
   6485			stripe_index = 0;
   6486			stripe_offset = 0;
   6487		} else {
   6488			/*
   6489			 * Mirror #0 or #1 means the original data block.
   6490			 * Mirror #2 is RAID5 parity block.
   6491			 * Mirror #3 is RAID6 Q block.
   6492			 */
   6493			stripe_nr = div_u64_rem(stripe_nr,
   6494					data_stripes, &stripe_index);
   6495			if (mirror_num > 1)
   6496				stripe_index = data_stripes + mirror_num - 2;
   6497
   6498			/* We distribute the parity blocks across stripes */
   6499			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
   6500					&stripe_index);
   6501			if (!need_full_stripe(op) && mirror_num <= 1)
   6502				mirror_num = 1;
   6503		}
   6504	} else {
   6505		/*
   6506		 * after this, stripe_nr is the number of stripes on this
   6507		 * device we have to walk to find the data, and stripe_index is
   6508		 * the number of our device in the stripe array
   6509		 */
   6510		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
   6511				&stripe_index);
   6512		mirror_num = stripe_index + 1;
   6513	}
   6514	if (stripe_index >= map->num_stripes) {
   6515		btrfs_crit(fs_info,
   6516			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
   6517			   stripe_index, map->num_stripes);
   6518		ret = -EINVAL;
   6519		goto out;
   6520	}
   6521
   6522	num_alloc_stripes = num_stripes;
   6523	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
   6524		if (op == BTRFS_MAP_WRITE)
   6525			num_alloc_stripes <<= 1;
   6526		if (op == BTRFS_MAP_GET_READ_MIRRORS)
   6527			num_alloc_stripes++;
   6528		tgtdev_indexes = num_stripes;
   6529	}
   6530
   6531	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
   6532	if (!bioc) {
   6533		ret = -ENOMEM;
   6534		goto out;
   6535	}
   6536
   6537	for (i = 0; i < num_stripes; i++) {
   6538		bioc->stripes[i].physical = map->stripes[stripe_index].physical +
   6539			stripe_offset + stripe_nr * map->stripe_len;
   6540		bioc->stripes[i].dev = map->stripes[stripe_index].dev;
   6541		stripe_index++;
   6542	}
   6543
   6544	/* Build raid_map */
   6545	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
   6546	    (need_full_stripe(op) || mirror_num > 1)) {
   6547		u64 tmp;
   6548		unsigned rot;
   6549
   6550		/* Work out the disk rotation on this stripe-set */
   6551		div_u64_rem(stripe_nr, num_stripes, &rot);
   6552
   6553		/* Fill in the logical address of each stripe */
   6554		tmp = stripe_nr * data_stripes;
   6555		for (i = 0; i < data_stripes; i++)
   6556			bioc->raid_map[(i + rot) % num_stripes] =
   6557				em->start + (tmp + i) * map->stripe_len;
   6558
   6559		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
   6560		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
   6561			bioc->raid_map[(i + rot + 1) % num_stripes] =
   6562				RAID6_Q_STRIPE;
   6563
   6564		sort_parity_stripes(bioc, num_stripes);
   6565	}
   6566
   6567	if (need_full_stripe(op))
   6568		max_errors = btrfs_chunk_max_errors(map);
   6569
   6570	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
   6571	    need_full_stripe(op)) {
   6572		handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
   6573					  &num_stripes, &max_errors);
   6574	}
   6575
   6576	*bioc_ret = bioc;
   6577	bioc->map_type = map->type;
   6578	bioc->num_stripes = num_stripes;
   6579	bioc->max_errors = max_errors;
   6580	bioc->mirror_num = mirror_num;
   6581
   6582	/*
   6583	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
   6584	 * mirror_num == num_stripes + 1 && dev_replace target drive is
   6585	 * available as a mirror
   6586	 */
   6587	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
   6588		WARN_ON(num_stripes > 1);
   6589		bioc->stripes[0].dev = dev_replace->tgtdev;
   6590		bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
   6591		bioc->mirror_num = map->num_stripes + 1;
   6592	}
   6593out:
   6594	if (dev_replace_is_ongoing) {
   6595		lockdep_assert_held(&dev_replace->rwsem);
   6596		/* Unlock and let waiting writers proceed */
   6597		up_read(&dev_replace->rwsem);
   6598	}
   6599	free_extent_map(em);
   6600	return ret;
   6601}
   6602
   6603int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
   6604		      u64 logical, u64 *length,
   6605		      struct btrfs_io_context **bioc_ret, int mirror_num)
   6606{
   6607	if (op == BTRFS_MAP_DISCARD)
   6608		return __btrfs_map_block_for_discard(fs_info, logical,
   6609						     length, bioc_ret);
   6610
   6611	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
   6612				 mirror_num, 0);
   6613}
   6614
   6615/* For Scrub/replace */
   6616int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
   6617		     u64 logical, u64 *length,
   6618		     struct btrfs_io_context **bioc_ret)
   6619{
   6620	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
   6621}
   6622
   6623static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
   6624{
   6625	bio->bi_private = bioc->private;
   6626	bio->bi_end_io = bioc->end_io;
   6627	bio_endio(bio);
   6628
   6629	btrfs_put_bioc(bioc);
   6630}
   6631
   6632static void btrfs_end_bio(struct bio *bio)
   6633{
   6634	struct btrfs_io_context *bioc = bio->bi_private;
   6635	int is_orig_bio = 0;
   6636
   6637	if (bio->bi_status) {
   6638		atomic_inc(&bioc->error);
   6639		if (bio->bi_status == BLK_STS_IOERR ||
   6640		    bio->bi_status == BLK_STS_TARGET) {
   6641			struct btrfs_device *dev = btrfs_bio(bio)->device;
   6642
   6643			ASSERT(dev->bdev);
   6644			if (btrfs_op(bio) == BTRFS_MAP_WRITE)
   6645				btrfs_dev_stat_inc_and_print(dev,
   6646						BTRFS_DEV_STAT_WRITE_ERRS);
   6647			else if (!(bio->bi_opf & REQ_RAHEAD))
   6648				btrfs_dev_stat_inc_and_print(dev,
   6649						BTRFS_DEV_STAT_READ_ERRS);
   6650			if (bio->bi_opf & REQ_PREFLUSH)
   6651				btrfs_dev_stat_inc_and_print(dev,
   6652						BTRFS_DEV_STAT_FLUSH_ERRS);
   6653		}
   6654	}
   6655
   6656	if (bio == bioc->orig_bio)
   6657		is_orig_bio = 1;
   6658
   6659	btrfs_bio_counter_dec(bioc->fs_info);
   6660
   6661	if (atomic_dec_and_test(&bioc->stripes_pending)) {
   6662		if (!is_orig_bio) {
   6663			bio_put(bio);
   6664			bio = bioc->orig_bio;
   6665		}
   6666
   6667		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
   6668		/* only send an error to the higher layers if it is
   6669		 * beyond the tolerance of the btrfs bio
   6670		 */
   6671		if (atomic_read(&bioc->error) > bioc->max_errors) {
   6672			bio->bi_status = BLK_STS_IOERR;
   6673		} else {
   6674			/*
   6675			 * this bio is actually up to date, we didn't
   6676			 * go over the max number of errors
   6677			 */
   6678			bio->bi_status = BLK_STS_OK;
   6679		}
   6680
   6681		btrfs_end_bioc(bioc, bio);
   6682	} else if (!is_orig_bio) {
   6683		bio_put(bio);
   6684	}
   6685}
   6686
   6687static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
   6688			      u64 physical, struct btrfs_device *dev)
   6689{
   6690	struct btrfs_fs_info *fs_info = bioc->fs_info;
   6691
   6692	bio->bi_private = bioc;
   6693	btrfs_bio(bio)->device = dev;
   6694	bio->bi_end_io = btrfs_end_bio;
   6695	bio->bi_iter.bi_sector = physical >> 9;
   6696	/*
   6697	 * For zone append writing, bi_sector must point the beginning of the
   6698	 * zone
   6699	 */
   6700	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
   6701		if (btrfs_dev_is_sequential(dev, physical)) {
   6702			u64 zone_start = round_down(physical, fs_info->zone_size);
   6703
   6704			bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
   6705		} else {
   6706			bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
   6707			bio->bi_opf |= REQ_OP_WRITE;
   6708		}
   6709	}
   6710	btrfs_debug_in_rcu(fs_info,
   6711	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
   6712		bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
   6713		(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
   6714		dev->devid, bio->bi_iter.bi_size);
   6715
   6716	btrfs_bio_counter_inc_noblocked(fs_info);
   6717
   6718	btrfsic_check_bio(bio);
   6719	submit_bio(bio);
   6720}
   6721
   6722static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
   6723{
   6724	atomic_inc(&bioc->error);
   6725	if (atomic_dec_and_test(&bioc->stripes_pending)) {
   6726		/* Should be the original bio. */
   6727		WARN_ON(bio != bioc->orig_bio);
   6728
   6729		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
   6730		bio->bi_iter.bi_sector = logical >> 9;
   6731		if (atomic_read(&bioc->error) > bioc->max_errors)
   6732			bio->bi_status = BLK_STS_IOERR;
   6733		else
   6734			bio->bi_status = BLK_STS_OK;
   6735		btrfs_end_bioc(bioc, bio);
   6736	}
   6737}
   6738
   6739blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
   6740			   int mirror_num)
   6741{
   6742	struct btrfs_device *dev;
   6743	struct bio *first_bio = bio;
   6744	u64 logical = bio->bi_iter.bi_sector << 9;
   6745	u64 length = 0;
   6746	u64 map_length;
   6747	int ret;
   6748	int dev_nr;
   6749	int total_devs;
   6750	struct btrfs_io_context *bioc = NULL;
   6751
   6752	length = bio->bi_iter.bi_size;
   6753	map_length = length;
   6754
   6755	btrfs_bio_counter_inc_blocked(fs_info);
   6756	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
   6757				&map_length, &bioc, mirror_num, 1);
   6758	if (ret) {
   6759		btrfs_bio_counter_dec(fs_info);
   6760		return errno_to_blk_status(ret);
   6761	}
   6762
   6763	total_devs = bioc->num_stripes;
   6764	bioc->orig_bio = first_bio;
   6765	bioc->private = first_bio->bi_private;
   6766	bioc->end_io = first_bio->bi_end_io;
   6767	atomic_set(&bioc->stripes_pending, bioc->num_stripes);
   6768
   6769	if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
   6770	    ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
   6771		/* In this case, map_length has been set to the length of
   6772		   a single stripe; not the whole write */
   6773		if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
   6774			ret = raid56_parity_write(bio, bioc, map_length);
   6775		} else {
   6776			ret = raid56_parity_recover(bio, bioc, map_length,
   6777						    mirror_num, 1);
   6778		}
   6779
   6780		btrfs_bio_counter_dec(fs_info);
   6781		return errno_to_blk_status(ret);
   6782	}
   6783
   6784	if (map_length < length) {
   6785		btrfs_crit(fs_info,
   6786			   "mapping failed logical %llu bio len %llu len %llu",
   6787			   logical, length, map_length);
   6788		BUG();
   6789	}
   6790
   6791	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
   6792		dev = bioc->stripes[dev_nr].dev;
   6793		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
   6794						   &dev->dev_state) ||
   6795		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
   6796		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
   6797			bioc_error(bioc, first_bio, logical);
   6798			continue;
   6799		}
   6800
   6801		if (dev_nr < total_devs - 1) {
   6802			bio = btrfs_bio_clone(dev->bdev, first_bio);
   6803		} else {
   6804			bio = first_bio;
   6805			bio_set_dev(bio, dev->bdev);
   6806		}
   6807
   6808		submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
   6809	}
   6810	btrfs_bio_counter_dec(fs_info);
   6811	return BLK_STS_OK;
   6812}
   6813
   6814static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
   6815				      const struct btrfs_fs_devices *fs_devices)
   6816{
   6817	if (args->fsid == NULL)
   6818		return true;
   6819	if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
   6820		return true;
   6821	return false;
   6822}
   6823
   6824static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
   6825				  const struct btrfs_device *device)
   6826{
   6827	ASSERT((args->devid != (u64)-1) || args->missing);
   6828
   6829	if ((args->devid != (u64)-1) && device->devid != args->devid)
   6830		return false;
   6831	if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
   6832		return false;
   6833	if (!args->missing)
   6834		return true;
   6835	if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
   6836	    !device->bdev)
   6837		return true;
   6838	return false;
   6839}
   6840
   6841/*
   6842 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
   6843 * return NULL.
   6844 *
   6845 * If devid and uuid are both specified, the match must be exact, otherwise
   6846 * only devid is used.
   6847 */
   6848struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
   6849				       const struct btrfs_dev_lookup_args *args)
   6850{
   6851	struct btrfs_device *device;
   6852	struct btrfs_fs_devices *seed_devs;
   6853
   6854	if (dev_args_match_fs_devices(args, fs_devices)) {
   6855		list_for_each_entry(device, &fs_devices->devices, dev_list) {
   6856			if (dev_args_match_device(args, device))
   6857				return device;
   6858		}
   6859	}
   6860
   6861	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
   6862		if (!dev_args_match_fs_devices(args, seed_devs))
   6863			continue;
   6864		list_for_each_entry(device, &seed_devs->devices, dev_list) {
   6865			if (dev_args_match_device(args, device))
   6866				return device;
   6867		}
   6868	}
   6869
   6870	return NULL;
   6871}
   6872
   6873static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
   6874					    u64 devid, u8 *dev_uuid)
   6875{
   6876	struct btrfs_device *device;
   6877	unsigned int nofs_flag;
   6878
   6879	/*
   6880	 * We call this under the chunk_mutex, so we want to use NOFS for this
   6881	 * allocation, however we don't want to change btrfs_alloc_device() to
   6882	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
   6883	 * places.
   6884	 */
   6885	nofs_flag = memalloc_nofs_save();
   6886	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
   6887	memalloc_nofs_restore(nofs_flag);
   6888	if (IS_ERR(device))
   6889		return device;
   6890
   6891	list_add(&device->dev_list, &fs_devices->devices);
   6892	device->fs_devices = fs_devices;
   6893	fs_devices->num_devices++;
   6894
   6895	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
   6896	fs_devices->missing_devices++;
   6897
   6898	return device;
   6899}
   6900
   6901/**
   6902 * btrfs_alloc_device - allocate struct btrfs_device
   6903 * @fs_info:	used only for generating a new devid, can be NULL if
   6904 *		devid is provided (i.e. @devid != NULL).
   6905 * @devid:	a pointer to devid for this device.  If NULL a new devid
   6906 *		is generated.
   6907 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
   6908 *		is generated.
   6909 *
   6910 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
   6911 * on error.  Returned struct is not linked onto any lists and must be
   6912 * destroyed with btrfs_free_device.
   6913 */
   6914struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
   6915					const u64 *devid,
   6916					const u8 *uuid)
   6917{
   6918	struct btrfs_device *dev;
   6919	u64 tmp;
   6920
   6921	if (WARN_ON(!devid && !fs_info))
   6922		return ERR_PTR(-EINVAL);
   6923
   6924	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
   6925	if (!dev)
   6926		return ERR_PTR(-ENOMEM);
   6927
   6928	INIT_LIST_HEAD(&dev->dev_list);
   6929	INIT_LIST_HEAD(&dev->dev_alloc_list);
   6930	INIT_LIST_HEAD(&dev->post_commit_list);
   6931
   6932	atomic_set(&dev->dev_stats_ccnt, 0);
   6933	btrfs_device_data_ordered_init(dev);
   6934	extent_io_tree_init(fs_info, &dev->alloc_state,
   6935			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
   6936
   6937	if (devid)
   6938		tmp = *devid;
   6939	else {
   6940		int ret;
   6941
   6942		ret = find_next_devid(fs_info, &tmp);
   6943		if (ret) {
   6944			btrfs_free_device(dev);
   6945			return ERR_PTR(ret);
   6946		}
   6947	}
   6948	dev->devid = tmp;
   6949
   6950	if (uuid)
   6951		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
   6952	else
   6953		generate_random_uuid(dev->uuid);
   6954
   6955	return dev;
   6956}
   6957
   6958static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
   6959					u64 devid, u8 *uuid, bool error)
   6960{
   6961	if (error)
   6962		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
   6963			      devid, uuid);
   6964	else
   6965		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
   6966			      devid, uuid);
   6967}
   6968
   6969static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
   6970{
   6971	const int data_stripes = calc_data_stripes(type, num_stripes);
   6972
   6973	return div_u64(chunk_len, data_stripes);
   6974}
   6975
   6976#if BITS_PER_LONG == 32
   6977/*
   6978 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
   6979 * can't be accessed on 32bit systems.
   6980 *
   6981 * This function do mount time check to reject the fs if it already has
   6982 * metadata chunk beyond that limit.
   6983 */
   6984static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
   6985				  u64 logical, u64 length, u64 type)
   6986{
   6987	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
   6988		return 0;
   6989
   6990	if (logical + length < MAX_LFS_FILESIZE)
   6991		return 0;
   6992
   6993	btrfs_err_32bit_limit(fs_info);
   6994	return -EOVERFLOW;
   6995}
   6996
   6997/*
   6998 * This is to give early warning for any metadata chunk reaching
   6999 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
   7000 * Although we can still access the metadata, it's not going to be possible
   7001 * once the limit is reached.
   7002 */
   7003static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
   7004				  u64 logical, u64 length, u64 type)
   7005{
   7006	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
   7007		return;
   7008
   7009	if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
   7010		return;
   7011
   7012	btrfs_warn_32bit_limit(fs_info);
   7013}
   7014#endif
   7015
   7016static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
   7017						  u64 devid, u8 *uuid)
   7018{
   7019	struct btrfs_device *dev;
   7020
   7021	if (!btrfs_test_opt(fs_info, DEGRADED)) {
   7022		btrfs_report_missing_device(fs_info, devid, uuid, true);
   7023		return ERR_PTR(-ENOENT);
   7024	}
   7025
   7026	dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
   7027	if (IS_ERR(dev)) {
   7028		btrfs_err(fs_info, "failed to init missing device %llu: %ld",
   7029			  devid, PTR_ERR(dev));
   7030		return dev;
   7031	}
   7032	btrfs_report_missing_device(fs_info, devid, uuid, false);
   7033
   7034	return dev;
   7035}
   7036
   7037static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
   7038			  struct btrfs_chunk *chunk)
   7039{
   7040	BTRFS_DEV_LOOKUP_ARGS(args);
   7041	struct btrfs_fs_info *fs_info = leaf->fs_info;
   7042	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
   7043	struct map_lookup *map;
   7044	struct extent_map *em;
   7045	u64 logical;
   7046	u64 length;
   7047	u64 devid;
   7048	u64 type;
   7049	u8 uuid[BTRFS_UUID_SIZE];
   7050	int num_stripes;
   7051	int ret;
   7052	int i;
   7053
   7054	logical = key->offset;
   7055	length = btrfs_chunk_length(leaf, chunk);
   7056	type = btrfs_chunk_type(leaf, chunk);
   7057	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
   7058
   7059#if BITS_PER_LONG == 32
   7060	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
   7061	if (ret < 0)
   7062		return ret;
   7063	warn_32bit_meta_chunk(fs_info, logical, length, type);
   7064#endif
   7065
   7066	/*
   7067	 * Only need to verify chunk item if we're reading from sys chunk array,
   7068	 * as chunk item in tree block is already verified by tree-checker.
   7069	 */
   7070	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
   7071		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
   7072		if (ret)
   7073			return ret;
   7074	}
   7075
   7076	read_lock(&map_tree->lock);
   7077	em = lookup_extent_mapping(map_tree, logical, 1);
   7078	read_unlock(&map_tree->lock);
   7079
   7080	/* already mapped? */
   7081	if (em && em->start <= logical && em->start + em->len > logical) {
   7082		free_extent_map(em);
   7083		return 0;
   7084	} else if (em) {
   7085		free_extent_map(em);
   7086	}
   7087
   7088	em = alloc_extent_map();
   7089	if (!em)
   7090		return -ENOMEM;
   7091	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
   7092	if (!map) {
   7093		free_extent_map(em);
   7094		return -ENOMEM;
   7095	}
   7096
   7097	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
   7098	em->map_lookup = map;
   7099	em->start = logical;
   7100	em->len = length;
   7101	em->orig_start = 0;
   7102	em->block_start = 0;
   7103	em->block_len = em->len;
   7104
   7105	map->num_stripes = num_stripes;
   7106	map->io_width = btrfs_chunk_io_width(leaf, chunk);
   7107	map->io_align = btrfs_chunk_io_align(leaf, chunk);
   7108	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
   7109	map->type = type;
   7110	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
   7111	map->verified_stripes = 0;
   7112	em->orig_block_len = calc_stripe_length(type, em->len,
   7113						map->num_stripes);
   7114	for (i = 0; i < num_stripes; i++) {
   7115		map->stripes[i].physical =
   7116			btrfs_stripe_offset_nr(leaf, chunk, i);
   7117		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
   7118		args.devid = devid;
   7119		read_extent_buffer(leaf, uuid, (unsigned long)
   7120				   btrfs_stripe_dev_uuid_nr(chunk, i),
   7121				   BTRFS_UUID_SIZE);
   7122		args.uuid = uuid;
   7123		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
   7124		if (!map->stripes[i].dev) {
   7125			map->stripes[i].dev = handle_missing_device(fs_info,
   7126								    devid, uuid);
   7127			if (IS_ERR(map->stripes[i].dev)) {
   7128				free_extent_map(em);
   7129				return PTR_ERR(map->stripes[i].dev);
   7130			}
   7131		}
   7132
   7133		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
   7134				&(map->stripes[i].dev->dev_state));
   7135	}
   7136
   7137	write_lock(&map_tree->lock);
   7138	ret = add_extent_mapping(map_tree, em, 0);
   7139	write_unlock(&map_tree->lock);
   7140	if (ret < 0) {
   7141		btrfs_err(fs_info,
   7142			  "failed to add chunk map, start=%llu len=%llu: %d",
   7143			  em->start, em->len, ret);
   7144	}
   7145	free_extent_map(em);
   7146
   7147	return ret;
   7148}
   7149
   7150static void fill_device_from_item(struct extent_buffer *leaf,
   7151				 struct btrfs_dev_item *dev_item,
   7152				 struct btrfs_device *device)
   7153{
   7154	unsigned long ptr;
   7155
   7156	device->devid = btrfs_device_id(leaf, dev_item);
   7157	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
   7158	device->total_bytes = device->disk_total_bytes;
   7159	device->commit_total_bytes = device->disk_total_bytes;
   7160	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
   7161	device->commit_bytes_used = device->bytes_used;
   7162	device->type = btrfs_device_type(leaf, dev_item);
   7163	device->io_align = btrfs_device_io_align(leaf, dev_item);
   7164	device->io_width = btrfs_device_io_width(leaf, dev_item);
   7165	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
   7166	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
   7167	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
   7168
   7169	ptr = btrfs_device_uuid(dev_item);
   7170	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
   7171}
   7172
   7173static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
   7174						  u8 *fsid)
   7175{
   7176	struct btrfs_fs_devices *fs_devices;
   7177	int ret;
   7178
   7179	lockdep_assert_held(&uuid_mutex);
   7180	ASSERT(fsid);
   7181
   7182	/* This will match only for multi-device seed fs */
   7183	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
   7184		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
   7185			return fs_devices;
   7186
   7187
   7188	fs_devices = find_fsid(fsid, NULL);
   7189	if (!fs_devices) {
   7190		if (!btrfs_test_opt(fs_info, DEGRADED))
   7191			return ERR_PTR(-ENOENT);
   7192
   7193		fs_devices = alloc_fs_devices(fsid, NULL);
   7194		if (IS_ERR(fs_devices))
   7195			return fs_devices;
   7196
   7197		fs_devices->seeding = true;
   7198		fs_devices->opened = 1;
   7199		return fs_devices;
   7200	}
   7201
   7202	/*
   7203	 * Upon first call for a seed fs fsid, just create a private copy of the
   7204	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
   7205	 */
   7206	fs_devices = clone_fs_devices(fs_devices);
   7207	if (IS_ERR(fs_devices))
   7208		return fs_devices;
   7209
   7210	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
   7211	if (ret) {
   7212		free_fs_devices(fs_devices);
   7213		return ERR_PTR(ret);
   7214	}
   7215
   7216	if (!fs_devices->seeding) {
   7217		close_fs_devices(fs_devices);
   7218		free_fs_devices(fs_devices);
   7219		return ERR_PTR(-EINVAL);
   7220	}
   7221
   7222	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
   7223
   7224	return fs_devices;
   7225}
   7226
   7227static int read_one_dev(struct extent_buffer *leaf,
   7228			struct btrfs_dev_item *dev_item)
   7229{
   7230	BTRFS_DEV_LOOKUP_ARGS(args);
   7231	struct btrfs_fs_info *fs_info = leaf->fs_info;
   7232	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   7233	struct btrfs_device *device;
   7234	u64 devid;
   7235	int ret;
   7236	u8 fs_uuid[BTRFS_FSID_SIZE];
   7237	u8 dev_uuid[BTRFS_UUID_SIZE];
   7238
   7239	devid = args.devid = btrfs_device_id(leaf, dev_item);
   7240	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
   7241			   BTRFS_UUID_SIZE);
   7242	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
   7243			   BTRFS_FSID_SIZE);
   7244	args.uuid = dev_uuid;
   7245	args.fsid = fs_uuid;
   7246
   7247	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
   7248		fs_devices = open_seed_devices(fs_info, fs_uuid);
   7249		if (IS_ERR(fs_devices))
   7250			return PTR_ERR(fs_devices);
   7251	}
   7252
   7253	device = btrfs_find_device(fs_info->fs_devices, &args);
   7254	if (!device) {
   7255		if (!btrfs_test_opt(fs_info, DEGRADED)) {
   7256			btrfs_report_missing_device(fs_info, devid,
   7257							dev_uuid, true);
   7258			return -ENOENT;
   7259		}
   7260
   7261		device = add_missing_dev(fs_devices, devid, dev_uuid);
   7262		if (IS_ERR(device)) {
   7263			btrfs_err(fs_info,
   7264				"failed to add missing dev %llu: %ld",
   7265				devid, PTR_ERR(device));
   7266			return PTR_ERR(device);
   7267		}
   7268		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
   7269	} else {
   7270		if (!device->bdev) {
   7271			if (!btrfs_test_opt(fs_info, DEGRADED)) {
   7272				btrfs_report_missing_device(fs_info,
   7273						devid, dev_uuid, true);
   7274				return -ENOENT;
   7275			}
   7276			btrfs_report_missing_device(fs_info, devid,
   7277							dev_uuid, false);
   7278		}
   7279
   7280		if (!device->bdev &&
   7281		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
   7282			/*
   7283			 * this happens when a device that was properly setup
   7284			 * in the device info lists suddenly goes bad.
   7285			 * device->bdev is NULL, and so we have to set
   7286			 * device->missing to one here
   7287			 */
   7288			device->fs_devices->missing_devices++;
   7289			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
   7290		}
   7291
   7292		/* Move the device to its own fs_devices */
   7293		if (device->fs_devices != fs_devices) {
   7294			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
   7295							&device->dev_state));
   7296
   7297			list_move(&device->dev_list, &fs_devices->devices);
   7298			device->fs_devices->num_devices--;
   7299			fs_devices->num_devices++;
   7300
   7301			device->fs_devices->missing_devices--;
   7302			fs_devices->missing_devices++;
   7303
   7304			device->fs_devices = fs_devices;
   7305		}
   7306	}
   7307
   7308	if (device->fs_devices != fs_info->fs_devices) {
   7309		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
   7310		if (device->generation !=
   7311		    btrfs_device_generation(leaf, dev_item))
   7312			return -EINVAL;
   7313	}
   7314
   7315	fill_device_from_item(leaf, dev_item, device);
   7316	if (device->bdev) {
   7317		u64 max_total_bytes = bdev_nr_bytes(device->bdev);
   7318
   7319		if (device->total_bytes > max_total_bytes) {
   7320			btrfs_err(fs_info,
   7321			"device total_bytes should be at most %llu but found %llu",
   7322				  max_total_bytes, device->total_bytes);
   7323			return -EINVAL;
   7324		}
   7325	}
   7326	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
   7327	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
   7328	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
   7329		device->fs_devices->total_rw_bytes += device->total_bytes;
   7330		atomic64_add(device->total_bytes - device->bytes_used,
   7331				&fs_info->free_chunk_space);
   7332	}
   7333	ret = 0;
   7334	return ret;
   7335}
   7336
   7337int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
   7338{
   7339	struct btrfs_super_block *super_copy = fs_info->super_copy;
   7340	struct extent_buffer *sb;
   7341	struct btrfs_disk_key *disk_key;
   7342	struct btrfs_chunk *chunk;
   7343	u8 *array_ptr;
   7344	unsigned long sb_array_offset;
   7345	int ret = 0;
   7346	u32 num_stripes;
   7347	u32 array_size;
   7348	u32 len = 0;
   7349	u32 cur_offset;
   7350	u64 type;
   7351	struct btrfs_key key;
   7352
   7353	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
   7354
   7355	/*
   7356	 * We allocated a dummy extent, just to use extent buffer accessors.
   7357	 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
   7358	 * that's fine, we will not go beyond system chunk array anyway.
   7359	 */
   7360	sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
   7361	if (!sb)
   7362		return -ENOMEM;
   7363	set_extent_buffer_uptodate(sb);
   7364
   7365	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
   7366	array_size = btrfs_super_sys_array_size(super_copy);
   7367
   7368	array_ptr = super_copy->sys_chunk_array;
   7369	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
   7370	cur_offset = 0;
   7371
   7372	while (cur_offset < array_size) {
   7373		disk_key = (struct btrfs_disk_key *)array_ptr;
   7374		len = sizeof(*disk_key);
   7375		if (cur_offset + len > array_size)
   7376			goto out_short_read;
   7377
   7378		btrfs_disk_key_to_cpu(&key, disk_key);
   7379
   7380		array_ptr += len;
   7381		sb_array_offset += len;
   7382		cur_offset += len;
   7383
   7384		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
   7385			btrfs_err(fs_info,
   7386			    "unexpected item type %u in sys_array at offset %u",
   7387				  (u32)key.type, cur_offset);
   7388			ret = -EIO;
   7389			break;
   7390		}
   7391
   7392		chunk = (struct btrfs_chunk *)sb_array_offset;
   7393		/*
   7394		 * At least one btrfs_chunk with one stripe must be present,
   7395		 * exact stripe count check comes afterwards
   7396		 */
   7397		len = btrfs_chunk_item_size(1);
   7398		if (cur_offset + len > array_size)
   7399			goto out_short_read;
   7400
   7401		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
   7402		if (!num_stripes) {
   7403			btrfs_err(fs_info,
   7404			"invalid number of stripes %u in sys_array at offset %u",
   7405				  num_stripes, cur_offset);
   7406			ret = -EIO;
   7407			break;
   7408		}
   7409
   7410		type = btrfs_chunk_type(sb, chunk);
   7411		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
   7412			btrfs_err(fs_info,
   7413			"invalid chunk type %llu in sys_array at offset %u",
   7414				  type, cur_offset);
   7415			ret = -EIO;
   7416			break;
   7417		}
   7418
   7419		len = btrfs_chunk_item_size(num_stripes);
   7420		if (cur_offset + len > array_size)
   7421			goto out_short_read;
   7422
   7423		ret = read_one_chunk(&key, sb, chunk);
   7424		if (ret)
   7425			break;
   7426
   7427		array_ptr += len;
   7428		sb_array_offset += len;
   7429		cur_offset += len;
   7430	}
   7431	clear_extent_buffer_uptodate(sb);
   7432	free_extent_buffer_stale(sb);
   7433	return ret;
   7434
   7435out_short_read:
   7436	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
   7437			len, cur_offset);
   7438	clear_extent_buffer_uptodate(sb);
   7439	free_extent_buffer_stale(sb);
   7440	return -EIO;
   7441}
   7442
   7443/*
   7444 * Check if all chunks in the fs are OK for read-write degraded mount
   7445 *
   7446 * If the @failing_dev is specified, it's accounted as missing.
   7447 *
   7448 * Return true if all chunks meet the minimal RW mount requirements.
   7449 * Return false if any chunk doesn't meet the minimal RW mount requirements.
   7450 */
   7451bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
   7452					struct btrfs_device *failing_dev)
   7453{
   7454	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
   7455	struct extent_map *em;
   7456	u64 next_start = 0;
   7457	bool ret = true;
   7458
   7459	read_lock(&map_tree->lock);
   7460	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
   7461	read_unlock(&map_tree->lock);
   7462	/* No chunk at all? Return false anyway */
   7463	if (!em) {
   7464		ret = false;
   7465		goto out;
   7466	}
   7467	while (em) {
   7468		struct map_lookup *map;
   7469		int missing = 0;
   7470		int max_tolerated;
   7471		int i;
   7472
   7473		map = em->map_lookup;
   7474		max_tolerated =
   7475			btrfs_get_num_tolerated_disk_barrier_failures(
   7476					map->type);
   7477		for (i = 0; i < map->num_stripes; i++) {
   7478			struct btrfs_device *dev = map->stripes[i].dev;
   7479
   7480			if (!dev || !dev->bdev ||
   7481			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
   7482			    dev->last_flush_error)
   7483				missing++;
   7484			else if (failing_dev && failing_dev == dev)
   7485				missing++;
   7486		}
   7487		if (missing > max_tolerated) {
   7488			if (!failing_dev)
   7489				btrfs_warn(fs_info,
   7490	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
   7491				   em->start, missing, max_tolerated);
   7492			free_extent_map(em);
   7493			ret = false;
   7494			goto out;
   7495		}
   7496		next_start = extent_map_end(em);
   7497		free_extent_map(em);
   7498
   7499		read_lock(&map_tree->lock);
   7500		em = lookup_extent_mapping(map_tree, next_start,
   7501					   (u64)(-1) - next_start);
   7502		read_unlock(&map_tree->lock);
   7503	}
   7504out:
   7505	return ret;
   7506}
   7507
   7508static void readahead_tree_node_children(struct extent_buffer *node)
   7509{
   7510	int i;
   7511	const int nr_items = btrfs_header_nritems(node);
   7512
   7513	for (i = 0; i < nr_items; i++)
   7514		btrfs_readahead_node_child(node, i);
   7515}
   7516
   7517int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
   7518{
   7519	struct btrfs_root *root = fs_info->chunk_root;
   7520	struct btrfs_path *path;
   7521	struct extent_buffer *leaf;
   7522	struct btrfs_key key;
   7523	struct btrfs_key found_key;
   7524	int ret;
   7525	int slot;
   7526	int iter_ret = 0;
   7527	u64 total_dev = 0;
   7528	u64 last_ra_node = 0;
   7529
   7530	path = btrfs_alloc_path();
   7531	if (!path)
   7532		return -ENOMEM;
   7533
   7534	/*
   7535	 * uuid_mutex is needed only if we are mounting a sprout FS
   7536	 * otherwise we don't need it.
   7537	 */
   7538	mutex_lock(&uuid_mutex);
   7539
   7540	/*
   7541	 * It is possible for mount and umount to race in such a way that
   7542	 * we execute this code path, but open_fs_devices failed to clear
   7543	 * total_rw_bytes. We certainly want it cleared before reading the
   7544	 * device items, so clear it here.
   7545	 */
   7546	fs_info->fs_devices->total_rw_bytes = 0;
   7547
   7548	/*
   7549	 * Lockdep complains about possible circular locking dependency between
   7550	 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
   7551	 * used for freeze procection of a fs (struct super_block.s_writers),
   7552	 * which we take when starting a transaction, and extent buffers of the
   7553	 * chunk tree if we call read_one_dev() while holding a lock on an
   7554	 * extent buffer of the chunk tree. Since we are mounting the filesystem
   7555	 * and at this point there can't be any concurrent task modifying the
   7556	 * chunk tree, to keep it simple, just skip locking on the chunk tree.
   7557	 */
   7558	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
   7559	path->skip_locking = 1;
   7560
   7561	/*
   7562	 * Read all device items, and then all the chunk items. All
   7563	 * device items are found before any chunk item (their object id
   7564	 * is smaller than the lowest possible object id for a chunk
   7565	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
   7566	 */
   7567	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
   7568	key.offset = 0;
   7569	key.type = 0;
   7570	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
   7571		struct extent_buffer *node = path->nodes[1];
   7572
   7573		leaf = path->nodes[0];
   7574		slot = path->slots[0];
   7575
   7576		if (node) {
   7577			if (last_ra_node != node->start) {
   7578				readahead_tree_node_children(node);
   7579				last_ra_node = node->start;
   7580			}
   7581		}
   7582		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
   7583			struct btrfs_dev_item *dev_item;
   7584			dev_item = btrfs_item_ptr(leaf, slot,
   7585						  struct btrfs_dev_item);
   7586			ret = read_one_dev(leaf, dev_item);
   7587			if (ret)
   7588				goto error;
   7589			total_dev++;
   7590		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
   7591			struct btrfs_chunk *chunk;
   7592
   7593			/*
   7594			 * We are only called at mount time, so no need to take
   7595			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
   7596			 * we always lock first fs_info->chunk_mutex before
   7597			 * acquiring any locks on the chunk tree. This is a
   7598			 * requirement for chunk allocation, see the comment on
   7599			 * top of btrfs_chunk_alloc() for details.
   7600			 */
   7601			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
   7602			ret = read_one_chunk(&found_key, leaf, chunk);
   7603			if (ret)
   7604				goto error;
   7605		}
   7606	}
   7607	/* Catch error found during iteration */
   7608	if (iter_ret < 0) {
   7609		ret = iter_ret;
   7610		goto error;
   7611	}
   7612
   7613	/*
   7614	 * After loading chunk tree, we've got all device information,
   7615	 * do another round of validation checks.
   7616	 */
   7617	if (total_dev != fs_info->fs_devices->total_devices) {
   7618		btrfs_warn(fs_info,
   7619"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
   7620			  btrfs_super_num_devices(fs_info->super_copy),
   7621			  total_dev);
   7622		fs_info->fs_devices->total_devices = total_dev;
   7623		btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
   7624	}
   7625	if (btrfs_super_total_bytes(fs_info->super_copy) <
   7626	    fs_info->fs_devices->total_rw_bytes) {
   7627		btrfs_err(fs_info,
   7628	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
   7629			  btrfs_super_total_bytes(fs_info->super_copy),
   7630			  fs_info->fs_devices->total_rw_bytes);
   7631		ret = -EINVAL;
   7632		goto error;
   7633	}
   7634	ret = 0;
   7635error:
   7636	mutex_unlock(&uuid_mutex);
   7637
   7638	btrfs_free_path(path);
   7639	return ret;
   7640}
   7641
   7642void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
   7643{
   7644	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
   7645	struct btrfs_device *device;
   7646
   7647	fs_devices->fs_info = fs_info;
   7648
   7649	mutex_lock(&fs_devices->device_list_mutex);
   7650	list_for_each_entry(device, &fs_devices->devices, dev_list)
   7651		device->fs_info = fs_info;
   7652
   7653	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
   7654		list_for_each_entry(device, &seed_devs->devices, dev_list)
   7655			device->fs_info = fs_info;
   7656
   7657		seed_devs->fs_info = fs_info;
   7658	}
   7659	mutex_unlock(&fs_devices->device_list_mutex);
   7660}
   7661
   7662static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
   7663				 const struct btrfs_dev_stats_item *ptr,
   7664				 int index)
   7665{
   7666	u64 val;
   7667
   7668	read_extent_buffer(eb, &val,
   7669			   offsetof(struct btrfs_dev_stats_item, values) +
   7670			    ((unsigned long)ptr) + (index * sizeof(u64)),
   7671			   sizeof(val));
   7672	return val;
   7673}
   7674
   7675static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
   7676				      struct btrfs_dev_stats_item *ptr,
   7677				      int index, u64 val)
   7678{
   7679	write_extent_buffer(eb, &val,
   7680			    offsetof(struct btrfs_dev_stats_item, values) +
   7681			     ((unsigned long)ptr) + (index * sizeof(u64)),
   7682			    sizeof(val));
   7683}
   7684
   7685static int btrfs_device_init_dev_stats(struct btrfs_device *device,
   7686				       struct btrfs_path *path)
   7687{
   7688	struct btrfs_dev_stats_item *ptr;
   7689	struct extent_buffer *eb;
   7690	struct btrfs_key key;
   7691	int item_size;
   7692	int i, ret, slot;
   7693
   7694	if (!device->fs_info->dev_root)
   7695		return 0;
   7696
   7697	key.objectid = BTRFS_DEV_STATS_OBJECTID;
   7698	key.type = BTRFS_PERSISTENT_ITEM_KEY;
   7699	key.offset = device->devid;
   7700	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
   7701	if (ret) {
   7702		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
   7703			btrfs_dev_stat_set(device, i, 0);
   7704		device->dev_stats_valid = 1;
   7705		btrfs_release_path(path);
   7706		return ret < 0 ? ret : 0;
   7707	}
   7708	slot = path->slots[0];
   7709	eb = path->nodes[0];
   7710	item_size = btrfs_item_size(eb, slot);
   7711
   7712	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
   7713
   7714	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
   7715		if (item_size >= (1 + i) * sizeof(__le64))
   7716			btrfs_dev_stat_set(device, i,
   7717					   btrfs_dev_stats_value(eb, ptr, i));
   7718		else
   7719			btrfs_dev_stat_set(device, i, 0);
   7720	}
   7721
   7722	device->dev_stats_valid = 1;
   7723	btrfs_dev_stat_print_on_load(device);
   7724	btrfs_release_path(path);
   7725
   7726	return 0;
   7727}
   7728
   7729int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
   7730{
   7731	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
   7732	struct btrfs_device *device;
   7733	struct btrfs_path *path = NULL;
   7734	int ret = 0;
   7735
   7736	path = btrfs_alloc_path();
   7737	if (!path)
   7738		return -ENOMEM;
   7739
   7740	mutex_lock(&fs_devices->device_list_mutex);
   7741	list_for_each_entry(device, &fs_devices->devices, dev_list) {
   7742		ret = btrfs_device_init_dev_stats(device, path);
   7743		if (ret)
   7744			goto out;
   7745	}
   7746	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
   7747		list_for_each_entry(device, &seed_devs->devices, dev_list) {
   7748			ret = btrfs_device_init_dev_stats(device, path);
   7749			if (ret)
   7750				goto out;
   7751		}
   7752	}
   7753out:
   7754	mutex_unlock(&fs_devices->device_list_mutex);
   7755
   7756	btrfs_free_path(path);
   7757	return ret;
   7758}
   7759
   7760static int update_dev_stat_item(struct btrfs_trans_handle *trans,
   7761				struct btrfs_device *device)
   7762{
   7763	struct btrfs_fs_info *fs_info = trans->fs_info;
   7764	struct btrfs_root *dev_root = fs_info->dev_root;
   7765	struct btrfs_path *path;
   7766	struct btrfs_key key;
   7767	struct extent_buffer *eb;
   7768	struct btrfs_dev_stats_item *ptr;
   7769	int ret;
   7770	int i;
   7771
   7772	key.objectid = BTRFS_DEV_STATS_OBJECTID;
   7773	key.type = BTRFS_PERSISTENT_ITEM_KEY;
   7774	key.offset = device->devid;
   7775
   7776	path = btrfs_alloc_path();
   7777	if (!path)
   7778		return -ENOMEM;
   7779	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
   7780	if (ret < 0) {
   7781		btrfs_warn_in_rcu(fs_info,
   7782			"error %d while searching for dev_stats item for device %s",
   7783			      ret, rcu_str_deref(device->name));
   7784		goto out;
   7785	}
   7786
   7787	if (ret == 0 &&
   7788	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
   7789		/* need to delete old one and insert a new one */
   7790		ret = btrfs_del_item(trans, dev_root, path);
   7791		if (ret != 0) {
   7792			btrfs_warn_in_rcu(fs_info,
   7793				"delete too small dev_stats item for device %s failed %d",
   7794				      rcu_str_deref(device->name), ret);
   7795			goto out;
   7796		}
   7797		ret = 1;
   7798	}
   7799
   7800	if (ret == 1) {
   7801		/* need to insert a new item */
   7802		btrfs_release_path(path);
   7803		ret = btrfs_insert_empty_item(trans, dev_root, path,
   7804					      &key, sizeof(*ptr));
   7805		if (ret < 0) {
   7806			btrfs_warn_in_rcu(fs_info,
   7807				"insert dev_stats item for device %s failed %d",
   7808				rcu_str_deref(device->name), ret);
   7809			goto out;
   7810		}
   7811	}
   7812
   7813	eb = path->nodes[0];
   7814	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
   7815	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
   7816		btrfs_set_dev_stats_value(eb, ptr, i,
   7817					  btrfs_dev_stat_read(device, i));
   7818	btrfs_mark_buffer_dirty(eb);
   7819
   7820out:
   7821	btrfs_free_path(path);
   7822	return ret;
   7823}
   7824
   7825/*
   7826 * called from commit_transaction. Writes all changed device stats to disk.
   7827 */
   7828int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
   7829{
   7830	struct btrfs_fs_info *fs_info = trans->fs_info;
   7831	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   7832	struct btrfs_device *device;
   7833	int stats_cnt;
   7834	int ret = 0;
   7835
   7836	mutex_lock(&fs_devices->device_list_mutex);
   7837	list_for_each_entry(device, &fs_devices->devices, dev_list) {
   7838		stats_cnt = atomic_read(&device->dev_stats_ccnt);
   7839		if (!device->dev_stats_valid || stats_cnt == 0)
   7840			continue;
   7841
   7842
   7843		/*
   7844		 * There is a LOAD-LOAD control dependency between the value of
   7845		 * dev_stats_ccnt and updating the on-disk values which requires
   7846		 * reading the in-memory counters. Such control dependencies
   7847		 * require explicit read memory barriers.
   7848		 *
   7849		 * This memory barriers pairs with smp_mb__before_atomic in
   7850		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
   7851		 * barrier implied by atomic_xchg in
   7852		 * btrfs_dev_stats_read_and_reset
   7853		 */
   7854		smp_rmb();
   7855
   7856		ret = update_dev_stat_item(trans, device);
   7857		if (!ret)
   7858			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
   7859	}
   7860	mutex_unlock(&fs_devices->device_list_mutex);
   7861
   7862	return ret;
   7863}
   7864
   7865void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
   7866{
   7867	btrfs_dev_stat_inc(dev, index);
   7868	btrfs_dev_stat_print_on_error(dev);
   7869}
   7870
   7871static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
   7872{
   7873	if (!dev->dev_stats_valid)
   7874		return;
   7875	btrfs_err_rl_in_rcu(dev->fs_info,
   7876		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
   7877			   rcu_str_deref(dev->name),
   7878			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
   7879			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
   7880			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
   7881			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
   7882			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
   7883}
   7884
   7885static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
   7886{
   7887	int i;
   7888
   7889	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
   7890		if (btrfs_dev_stat_read(dev, i) != 0)
   7891			break;
   7892	if (i == BTRFS_DEV_STAT_VALUES_MAX)
   7893		return; /* all values == 0, suppress message */
   7894
   7895	btrfs_info_in_rcu(dev->fs_info,
   7896		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
   7897	       rcu_str_deref(dev->name),
   7898	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
   7899	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
   7900	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
   7901	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
   7902	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
   7903}
   7904
   7905int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
   7906			struct btrfs_ioctl_get_dev_stats *stats)
   7907{
   7908	BTRFS_DEV_LOOKUP_ARGS(args);
   7909	struct btrfs_device *dev;
   7910	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   7911	int i;
   7912
   7913	mutex_lock(&fs_devices->device_list_mutex);
   7914	args.devid = stats->devid;
   7915	dev = btrfs_find_device(fs_info->fs_devices, &args);
   7916	mutex_unlock(&fs_devices->device_list_mutex);
   7917
   7918	if (!dev) {
   7919		btrfs_warn(fs_info, "get dev_stats failed, device not found");
   7920		return -ENODEV;
   7921	} else if (!dev->dev_stats_valid) {
   7922		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
   7923		return -ENODEV;
   7924	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
   7925		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
   7926			if (stats->nr_items > i)
   7927				stats->values[i] =
   7928					btrfs_dev_stat_read_and_reset(dev, i);
   7929			else
   7930				btrfs_dev_stat_set(dev, i, 0);
   7931		}
   7932		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
   7933			   current->comm, task_pid_nr(current));
   7934	} else {
   7935		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
   7936			if (stats->nr_items > i)
   7937				stats->values[i] = btrfs_dev_stat_read(dev, i);
   7938	}
   7939	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
   7940		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
   7941	return 0;
   7942}
   7943
   7944/*
   7945 * Update the size and bytes used for each device where it changed.  This is
   7946 * delayed since we would otherwise get errors while writing out the
   7947 * superblocks.
   7948 *
   7949 * Must be invoked during transaction commit.
   7950 */
   7951void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
   7952{
   7953	struct btrfs_device *curr, *next;
   7954
   7955	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
   7956
   7957	if (list_empty(&trans->dev_update_list))
   7958		return;
   7959
   7960	/*
   7961	 * We don't need the device_list_mutex here.  This list is owned by the
   7962	 * transaction and the transaction must complete before the device is
   7963	 * released.
   7964	 */
   7965	mutex_lock(&trans->fs_info->chunk_mutex);
   7966	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
   7967				 post_commit_list) {
   7968		list_del_init(&curr->post_commit_list);
   7969		curr->commit_total_bytes = curr->disk_total_bytes;
   7970		curr->commit_bytes_used = curr->bytes_used;
   7971	}
   7972	mutex_unlock(&trans->fs_info->chunk_mutex);
   7973}
   7974
   7975/*
   7976 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
   7977 */
   7978int btrfs_bg_type_to_factor(u64 flags)
   7979{
   7980	const int index = btrfs_bg_flags_to_raid_index(flags);
   7981
   7982	return btrfs_raid_array[index].ncopies;
   7983}
   7984
   7985
   7986
   7987static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
   7988				 u64 chunk_offset, u64 devid,
   7989				 u64 physical_offset, u64 physical_len)
   7990{
   7991	struct btrfs_dev_lookup_args args = { .devid = devid };
   7992	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
   7993	struct extent_map *em;
   7994	struct map_lookup *map;
   7995	struct btrfs_device *dev;
   7996	u64 stripe_len;
   7997	bool found = false;
   7998	int ret = 0;
   7999	int i;
   8000
   8001	read_lock(&em_tree->lock);
   8002	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
   8003	read_unlock(&em_tree->lock);
   8004
   8005	if (!em) {
   8006		btrfs_err(fs_info,
   8007"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
   8008			  physical_offset, devid);
   8009		ret = -EUCLEAN;
   8010		goto out;
   8011	}
   8012
   8013	map = em->map_lookup;
   8014	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
   8015	if (physical_len != stripe_len) {
   8016		btrfs_err(fs_info,
   8017"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
   8018			  physical_offset, devid, em->start, physical_len,
   8019			  stripe_len);
   8020		ret = -EUCLEAN;
   8021		goto out;
   8022	}
   8023
   8024	for (i = 0; i < map->num_stripes; i++) {
   8025		if (map->stripes[i].dev->devid == devid &&
   8026		    map->stripes[i].physical == physical_offset) {
   8027			found = true;
   8028			if (map->verified_stripes >= map->num_stripes) {
   8029				btrfs_err(fs_info,
   8030				"too many dev extents for chunk %llu found",
   8031					  em->start);
   8032				ret = -EUCLEAN;
   8033				goto out;
   8034			}
   8035			map->verified_stripes++;
   8036			break;
   8037		}
   8038	}
   8039	if (!found) {
   8040		btrfs_err(fs_info,
   8041	"dev extent physical offset %llu devid %llu has no corresponding chunk",
   8042			physical_offset, devid);
   8043		ret = -EUCLEAN;
   8044	}
   8045
   8046	/* Make sure no dev extent is beyond device boundary */
   8047	dev = btrfs_find_device(fs_info->fs_devices, &args);
   8048	if (!dev) {
   8049		btrfs_err(fs_info, "failed to find devid %llu", devid);
   8050		ret = -EUCLEAN;
   8051		goto out;
   8052	}
   8053
   8054	if (physical_offset + physical_len > dev->disk_total_bytes) {
   8055		btrfs_err(fs_info,
   8056"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
   8057			  devid, physical_offset, physical_len,
   8058			  dev->disk_total_bytes);
   8059		ret = -EUCLEAN;
   8060		goto out;
   8061	}
   8062
   8063	if (dev->zone_info) {
   8064		u64 zone_size = dev->zone_info->zone_size;
   8065
   8066		if (!IS_ALIGNED(physical_offset, zone_size) ||
   8067		    !IS_ALIGNED(physical_len, zone_size)) {
   8068			btrfs_err(fs_info,
   8069"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
   8070				  devid, physical_offset, physical_len);
   8071			ret = -EUCLEAN;
   8072			goto out;
   8073		}
   8074	}
   8075
   8076out:
   8077	free_extent_map(em);
   8078	return ret;
   8079}
   8080
   8081static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
   8082{
   8083	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
   8084	struct extent_map *em;
   8085	struct rb_node *node;
   8086	int ret = 0;
   8087
   8088	read_lock(&em_tree->lock);
   8089	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
   8090		em = rb_entry(node, struct extent_map, rb_node);
   8091		if (em->map_lookup->num_stripes !=
   8092		    em->map_lookup->verified_stripes) {
   8093			btrfs_err(fs_info,
   8094			"chunk %llu has missing dev extent, have %d expect %d",
   8095				  em->start, em->map_lookup->verified_stripes,
   8096				  em->map_lookup->num_stripes);
   8097			ret = -EUCLEAN;
   8098			goto out;
   8099		}
   8100	}
   8101out:
   8102	read_unlock(&em_tree->lock);
   8103	return ret;
   8104}
   8105
   8106/*
   8107 * Ensure that all dev extents are mapped to correct chunk, otherwise
   8108 * later chunk allocation/free would cause unexpected behavior.
   8109 *
   8110 * NOTE: This will iterate through the whole device tree, which should be of
   8111 * the same size level as the chunk tree.  This slightly increases mount time.
   8112 */
   8113int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
   8114{
   8115	struct btrfs_path *path;
   8116	struct btrfs_root *root = fs_info->dev_root;
   8117	struct btrfs_key key;
   8118	u64 prev_devid = 0;
   8119	u64 prev_dev_ext_end = 0;
   8120	int ret = 0;
   8121
   8122	/*
   8123	 * We don't have a dev_root because we mounted with ignorebadroots and
   8124	 * failed to load the root, so we want to skip the verification in this
   8125	 * case for sure.
   8126	 *
   8127	 * However if the dev root is fine, but the tree itself is corrupted
   8128	 * we'd still fail to mount.  This verification is only to make sure
   8129	 * writes can happen safely, so instead just bypass this check
   8130	 * completely in the case of IGNOREBADROOTS.
   8131	 */
   8132	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
   8133		return 0;
   8134
   8135	key.objectid = 1;
   8136	key.type = BTRFS_DEV_EXTENT_KEY;
   8137	key.offset = 0;
   8138
   8139	path = btrfs_alloc_path();
   8140	if (!path)
   8141		return -ENOMEM;
   8142
   8143	path->reada = READA_FORWARD;
   8144	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   8145	if (ret < 0)
   8146		goto out;
   8147
   8148	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
   8149		ret = btrfs_next_leaf(root, path);
   8150		if (ret < 0)
   8151			goto out;
   8152		/* No dev extents at all? Not good */
   8153		if (ret > 0) {
   8154			ret = -EUCLEAN;
   8155			goto out;
   8156		}
   8157	}
   8158	while (1) {
   8159		struct extent_buffer *leaf = path->nodes[0];
   8160		struct btrfs_dev_extent *dext;
   8161		int slot = path->slots[0];
   8162		u64 chunk_offset;
   8163		u64 physical_offset;
   8164		u64 physical_len;
   8165		u64 devid;
   8166
   8167		btrfs_item_key_to_cpu(leaf, &key, slot);
   8168		if (key.type != BTRFS_DEV_EXTENT_KEY)
   8169			break;
   8170		devid = key.objectid;
   8171		physical_offset = key.offset;
   8172
   8173		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
   8174		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
   8175		physical_len = btrfs_dev_extent_length(leaf, dext);
   8176
   8177		/* Check if this dev extent overlaps with the previous one */
   8178		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
   8179			btrfs_err(fs_info,
   8180"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
   8181				  devid, physical_offset, prev_dev_ext_end);
   8182			ret = -EUCLEAN;
   8183			goto out;
   8184		}
   8185
   8186		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
   8187					    physical_offset, physical_len);
   8188		if (ret < 0)
   8189			goto out;
   8190		prev_devid = devid;
   8191		prev_dev_ext_end = physical_offset + physical_len;
   8192
   8193		ret = btrfs_next_item(root, path);
   8194		if (ret < 0)
   8195			goto out;
   8196		if (ret > 0) {
   8197			ret = 0;
   8198			break;
   8199		}
   8200	}
   8201
   8202	/* Ensure all chunks have corresponding dev extents */
   8203	ret = verify_chunk_dev_extent_mapping(fs_info);
   8204out:
   8205	btrfs_free_path(path);
   8206	return ret;
   8207}
   8208
   8209/*
   8210 * Check whether the given block group or device is pinned by any inode being
   8211 * used as a swapfile.
   8212 */
   8213bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
   8214{
   8215	struct btrfs_swapfile_pin *sp;
   8216	struct rb_node *node;
   8217
   8218	spin_lock(&fs_info->swapfile_pins_lock);
   8219	node = fs_info->swapfile_pins.rb_node;
   8220	while (node) {
   8221		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
   8222		if (ptr < sp->ptr)
   8223			node = node->rb_left;
   8224		else if (ptr > sp->ptr)
   8225			node = node->rb_right;
   8226		else
   8227			break;
   8228	}
   8229	spin_unlock(&fs_info->swapfile_pins_lock);
   8230	return node != NULL;
   8231}
   8232
   8233static int relocating_repair_kthread(void *data)
   8234{
   8235	struct btrfs_block_group *cache = data;
   8236	struct btrfs_fs_info *fs_info = cache->fs_info;
   8237	u64 target;
   8238	int ret = 0;
   8239
   8240	target = cache->start;
   8241	btrfs_put_block_group(cache);
   8242
   8243	sb_start_write(fs_info->sb);
   8244	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
   8245		btrfs_info(fs_info,
   8246			   "zoned: skip relocating block group %llu to repair: EBUSY",
   8247			   target);
   8248		sb_end_write(fs_info->sb);
   8249		return -EBUSY;
   8250	}
   8251
   8252	mutex_lock(&fs_info->reclaim_bgs_lock);
   8253
   8254	/* Ensure block group still exists */
   8255	cache = btrfs_lookup_block_group(fs_info, target);
   8256	if (!cache)
   8257		goto out;
   8258
   8259	if (!cache->relocating_repair)
   8260		goto out;
   8261
   8262	ret = btrfs_may_alloc_data_chunk(fs_info, target);
   8263	if (ret < 0)
   8264		goto out;
   8265
   8266	btrfs_info(fs_info,
   8267		   "zoned: relocating block group %llu to repair IO failure",
   8268		   target);
   8269	ret = btrfs_relocate_chunk(fs_info, target);
   8270
   8271out:
   8272	if (cache)
   8273		btrfs_put_block_group(cache);
   8274	mutex_unlock(&fs_info->reclaim_bgs_lock);
   8275	btrfs_exclop_finish(fs_info);
   8276	sb_end_write(fs_info->sb);
   8277
   8278	return ret;
   8279}
   8280
   8281bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
   8282{
   8283	struct btrfs_block_group *cache;
   8284
   8285	if (!btrfs_is_zoned(fs_info))
   8286		return false;
   8287
   8288	/* Do not attempt to repair in degraded state */
   8289	if (btrfs_test_opt(fs_info, DEGRADED))
   8290		return true;
   8291
   8292	cache = btrfs_lookup_block_group(fs_info, logical);
   8293	if (!cache)
   8294		return true;
   8295
   8296	spin_lock(&cache->lock);
   8297	if (cache->relocating_repair) {
   8298		spin_unlock(&cache->lock);
   8299		btrfs_put_block_group(cache);
   8300		return true;
   8301	}
   8302	cache->relocating_repair = 1;
   8303	spin_unlock(&cache->lock);
   8304
   8305	kthread_run(relocating_repair_kthread, cache,
   8306		    "btrfs-relocating-repair");
   8307
   8308	return true;
   8309}