cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

dev-replace.c (38969B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) STRATO AG 2012.  All rights reserved.
      4 */
      5
      6#include <linux/sched.h>
      7#include <linux/bio.h>
      8#include <linux/slab.h>
      9#include <linux/blkdev.h>
     10#include <linux/kthread.h>
     11#include <linux/math64.h>
     12#include "misc.h"
     13#include "ctree.h"
     14#include "extent_map.h"
     15#include "disk-io.h"
     16#include "transaction.h"
     17#include "print-tree.h"
     18#include "volumes.h"
     19#include "async-thread.h"
     20#include "check-integrity.h"
     21#include "rcu-string.h"
     22#include "dev-replace.h"
     23#include "sysfs.h"
     24#include "zoned.h"
     25#include "block-group.h"
     26
     27/*
     28 * Device replace overview
     29 *
     30 * [Objective]
     31 * To copy all extents (both new and on-disk) from source device to target
     32 * device, while still keeping the filesystem read-write.
     33 *
     34 * [Method]
     35 * There are two main methods involved:
     36 *
     37 * - Write duplication
     38 *
     39 *   All new writes will be written to both target and source devices, so even
     40 *   if replace gets canceled, the source device still contains up-to-date data.
     41 *
     42 *   Location:		handle_ops_on_dev_replace() from __btrfs_map_block()
     43 *   Start:		btrfs_dev_replace_start()
     44 *   End:		btrfs_dev_replace_finishing()
     45 *   Content:		Latest data/metadata
     46 *
     47 * - Copy existing extents
     48 *
     49 *   This happens by re-using the scrub facility, as scrub also iterates
     50 *   through existing extents from the commit root.
     51 *
     52 *   Location:		scrub_write_block_to_dev_replace() from
     53 *   			scrub_block_complete()
     54 *   Content:		Data/meta from commit root.
     55 *
     56 * Due to the content difference, we need to avoid nocow write when dev-replace
     57 * is happening.  This is done by marking the block group read-only and waiting
     58 * for NOCOW writes.
     59 *
     60 * After replace is done, the finishing part is done by swapping the target and
     61 * source devices.
     62 *
     63 *   Location:		btrfs_dev_replace_update_device_in_mapping_tree() from
     64 *   			btrfs_dev_replace_finishing()
     65 */
     66
     67static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
     68				       int scrub_ret);
     69static int btrfs_dev_replace_kthread(void *data);
     70
     71int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
     72{
     73	struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
     74	struct btrfs_key key;
     75	struct btrfs_root *dev_root = fs_info->dev_root;
     76	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
     77	struct extent_buffer *eb;
     78	int slot;
     79	int ret = 0;
     80	struct btrfs_path *path = NULL;
     81	int item_size;
     82	struct btrfs_dev_replace_item *ptr;
     83	u64 src_devid;
     84
     85	if (!dev_root)
     86		return 0;
     87
     88	path = btrfs_alloc_path();
     89	if (!path) {
     90		ret = -ENOMEM;
     91		goto out;
     92	}
     93
     94	key.objectid = 0;
     95	key.type = BTRFS_DEV_REPLACE_KEY;
     96	key.offset = 0;
     97	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
     98	if (ret) {
     99no_valid_dev_replace_entry_found:
    100		/*
    101		 * We don't have a replace item or it's corrupted.  If there is
    102		 * a replace target, fail the mount.
    103		 */
    104		if (btrfs_find_device(fs_info->fs_devices, &args)) {
    105			btrfs_err(fs_info,
    106			"found replace target device without a valid replace item");
    107			ret = -EUCLEAN;
    108			goto out;
    109		}
    110		ret = 0;
    111		dev_replace->replace_state =
    112			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
    113		dev_replace->cont_reading_from_srcdev_mode =
    114		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
    115		dev_replace->time_started = 0;
    116		dev_replace->time_stopped = 0;
    117		atomic64_set(&dev_replace->num_write_errors, 0);
    118		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
    119		dev_replace->cursor_left = 0;
    120		dev_replace->committed_cursor_left = 0;
    121		dev_replace->cursor_left_last_write_of_item = 0;
    122		dev_replace->cursor_right = 0;
    123		dev_replace->srcdev = NULL;
    124		dev_replace->tgtdev = NULL;
    125		dev_replace->is_valid = 0;
    126		dev_replace->item_needs_writeback = 0;
    127		goto out;
    128	}
    129	slot = path->slots[0];
    130	eb = path->nodes[0];
    131	item_size = btrfs_item_size(eb, slot);
    132	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
    133
    134	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
    135		btrfs_warn(fs_info,
    136			"dev_replace entry found has unexpected size, ignore entry");
    137		goto no_valid_dev_replace_entry_found;
    138	}
    139
    140	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
    141	dev_replace->cont_reading_from_srcdev_mode =
    142		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
    143	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
    144	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
    145	dev_replace->time_stopped =
    146		btrfs_dev_replace_time_stopped(eb, ptr);
    147	atomic64_set(&dev_replace->num_write_errors,
    148		     btrfs_dev_replace_num_write_errors(eb, ptr));
    149	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
    150		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
    151	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
    152	dev_replace->committed_cursor_left = dev_replace->cursor_left;
    153	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
    154	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
    155	dev_replace->is_valid = 1;
    156
    157	dev_replace->item_needs_writeback = 0;
    158	switch (dev_replace->replace_state) {
    159	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    160	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    161	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
    162		/*
    163		 * We don't have an active replace item but if there is a
    164		 * replace target, fail the mount.
    165		 */
    166		if (btrfs_find_device(fs_info->fs_devices, &args)) {
    167			btrfs_err(fs_info,
    168			"replace devid present without an active replace item");
    169			ret = -EUCLEAN;
    170		} else {
    171			dev_replace->srcdev = NULL;
    172			dev_replace->tgtdev = NULL;
    173		}
    174		break;
    175	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    176	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
    177		dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
    178		args.devid = src_devid;
    179		dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
    180
    181		/*
    182		 * allow 'btrfs dev replace_cancel' if src/tgt device is
    183		 * missing
    184		 */
    185		if (!dev_replace->srcdev &&
    186		    !btrfs_test_opt(fs_info, DEGRADED)) {
    187			ret = -EIO;
    188			btrfs_warn(fs_info,
    189			   "cannot mount because device replace operation is ongoing and");
    190			btrfs_warn(fs_info,
    191			   "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
    192			   src_devid);
    193		}
    194		if (!dev_replace->tgtdev &&
    195		    !btrfs_test_opt(fs_info, DEGRADED)) {
    196			ret = -EIO;
    197			btrfs_warn(fs_info,
    198			   "cannot mount because device replace operation is ongoing and");
    199			btrfs_warn(fs_info,
    200			   "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
    201				BTRFS_DEV_REPLACE_DEVID);
    202		}
    203		if (dev_replace->tgtdev) {
    204			if (dev_replace->srcdev) {
    205				dev_replace->tgtdev->total_bytes =
    206					dev_replace->srcdev->total_bytes;
    207				dev_replace->tgtdev->disk_total_bytes =
    208					dev_replace->srcdev->disk_total_bytes;
    209				dev_replace->tgtdev->commit_total_bytes =
    210					dev_replace->srcdev->commit_total_bytes;
    211				dev_replace->tgtdev->bytes_used =
    212					dev_replace->srcdev->bytes_used;
    213				dev_replace->tgtdev->commit_bytes_used =
    214					dev_replace->srcdev->commit_bytes_used;
    215			}
    216			set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
    217				&dev_replace->tgtdev->dev_state);
    218
    219			WARN_ON(fs_info->fs_devices->rw_devices == 0);
    220			dev_replace->tgtdev->io_width = fs_info->sectorsize;
    221			dev_replace->tgtdev->io_align = fs_info->sectorsize;
    222			dev_replace->tgtdev->sector_size = fs_info->sectorsize;
    223			dev_replace->tgtdev->fs_info = fs_info;
    224			set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
    225				&dev_replace->tgtdev->dev_state);
    226		}
    227		break;
    228	}
    229
    230out:
    231	btrfs_free_path(path);
    232	return ret;
    233}
    234
    235/*
    236 * Initialize a new device for device replace target from a given source dev
    237 * and path.
    238 *
    239 * Return 0 and new device in @device_out, otherwise return < 0
    240 */
    241static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
    242				  const char *device_path,
    243				  struct btrfs_device *srcdev,
    244				  struct btrfs_device **device_out)
    245{
    246	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    247	struct btrfs_device *device;
    248	struct block_device *bdev;
    249	struct rcu_string *name;
    250	u64 devid = BTRFS_DEV_REPLACE_DEVID;
    251	int ret = 0;
    252
    253	*device_out = NULL;
    254	if (srcdev->fs_devices->seeding) {
    255		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
    256		return -EINVAL;
    257	}
    258
    259	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
    260				  fs_info->bdev_holder);
    261	if (IS_ERR(bdev)) {
    262		btrfs_err(fs_info, "target device %s is invalid!", device_path);
    263		return PTR_ERR(bdev);
    264	}
    265
    266	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
    267		btrfs_err(fs_info,
    268		"dev-replace: zoned type of target device mismatch with filesystem");
    269		ret = -EINVAL;
    270		goto error;
    271	}
    272
    273	sync_blockdev(bdev);
    274
    275	list_for_each_entry(device, &fs_devices->devices, dev_list) {
    276		if (device->bdev == bdev) {
    277			btrfs_err(fs_info,
    278				  "target device is in the filesystem!");
    279			ret = -EEXIST;
    280			goto error;
    281		}
    282	}
    283
    284
    285	if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
    286		btrfs_err(fs_info,
    287			  "target device is smaller than source device!");
    288		ret = -EINVAL;
    289		goto error;
    290	}
    291
    292
    293	device = btrfs_alloc_device(NULL, &devid, NULL);
    294	if (IS_ERR(device)) {
    295		ret = PTR_ERR(device);
    296		goto error;
    297	}
    298
    299	name = rcu_string_strdup(device_path, GFP_KERNEL);
    300	if (!name) {
    301		btrfs_free_device(device);
    302		ret = -ENOMEM;
    303		goto error;
    304	}
    305	rcu_assign_pointer(device->name, name);
    306	ret = lookup_bdev(device_path, &device->devt);
    307	if (ret)
    308		goto error;
    309
    310	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
    311	device->generation = 0;
    312	device->io_width = fs_info->sectorsize;
    313	device->io_align = fs_info->sectorsize;
    314	device->sector_size = fs_info->sectorsize;
    315	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
    316	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
    317	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
    318	device->commit_total_bytes = srcdev->commit_total_bytes;
    319	device->commit_bytes_used = device->bytes_used;
    320	device->fs_info = fs_info;
    321	device->bdev = bdev;
    322	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
    323	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
    324	device->mode = FMODE_EXCL;
    325	device->dev_stats_valid = 1;
    326	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
    327	device->fs_devices = fs_devices;
    328
    329	ret = btrfs_get_dev_zone_info(device, false);
    330	if (ret)
    331		goto error;
    332
    333	mutex_lock(&fs_devices->device_list_mutex);
    334	list_add(&device->dev_list, &fs_devices->devices);
    335	fs_devices->num_devices++;
    336	fs_devices->open_devices++;
    337	mutex_unlock(&fs_devices->device_list_mutex);
    338
    339	*device_out = device;
    340	return 0;
    341
    342error:
    343	blkdev_put(bdev, FMODE_EXCL);
    344	return ret;
    345}
    346
    347/*
    348 * called from commit_transaction. Writes changed device replace state to
    349 * disk.
    350 */
    351int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
    352{
    353	struct btrfs_fs_info *fs_info = trans->fs_info;
    354	int ret;
    355	struct btrfs_root *dev_root = fs_info->dev_root;
    356	struct btrfs_path *path;
    357	struct btrfs_key key;
    358	struct extent_buffer *eb;
    359	struct btrfs_dev_replace_item *ptr;
    360	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    361
    362	down_read(&dev_replace->rwsem);
    363	if (!dev_replace->is_valid ||
    364	    !dev_replace->item_needs_writeback) {
    365		up_read(&dev_replace->rwsem);
    366		return 0;
    367	}
    368	up_read(&dev_replace->rwsem);
    369
    370	key.objectid = 0;
    371	key.type = BTRFS_DEV_REPLACE_KEY;
    372	key.offset = 0;
    373
    374	path = btrfs_alloc_path();
    375	if (!path) {
    376		ret = -ENOMEM;
    377		goto out;
    378	}
    379	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
    380	if (ret < 0) {
    381		btrfs_warn(fs_info,
    382			   "error %d while searching for dev_replace item!",
    383			   ret);
    384		goto out;
    385	}
    386
    387	if (ret == 0 &&
    388	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
    389		/*
    390		 * need to delete old one and insert a new one.
    391		 * Since no attempt is made to recover any old state, if the
    392		 * dev_replace state is 'running', the data on the target
    393		 * drive is lost.
    394		 * It would be possible to recover the state: just make sure
    395		 * that the beginning of the item is never changed and always
    396		 * contains all the essential information. Then read this
    397		 * minimal set of information and use it as a base for the
    398		 * new state.
    399		 */
    400		ret = btrfs_del_item(trans, dev_root, path);
    401		if (ret != 0) {
    402			btrfs_warn(fs_info,
    403				   "delete too small dev_replace item failed %d!",
    404				   ret);
    405			goto out;
    406		}
    407		ret = 1;
    408	}
    409
    410	if (ret == 1) {
    411		/* need to insert a new item */
    412		btrfs_release_path(path);
    413		ret = btrfs_insert_empty_item(trans, dev_root, path,
    414					      &key, sizeof(*ptr));
    415		if (ret < 0) {
    416			btrfs_warn(fs_info,
    417				   "insert dev_replace item failed %d!", ret);
    418			goto out;
    419		}
    420	}
    421
    422	eb = path->nodes[0];
    423	ptr = btrfs_item_ptr(eb, path->slots[0],
    424			     struct btrfs_dev_replace_item);
    425
    426	down_write(&dev_replace->rwsem);
    427	if (dev_replace->srcdev)
    428		btrfs_set_dev_replace_src_devid(eb, ptr,
    429			dev_replace->srcdev->devid);
    430	else
    431		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
    432	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
    433		dev_replace->cont_reading_from_srcdev_mode);
    434	btrfs_set_dev_replace_replace_state(eb, ptr,
    435		dev_replace->replace_state);
    436	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
    437	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
    438	btrfs_set_dev_replace_num_write_errors(eb, ptr,
    439		atomic64_read(&dev_replace->num_write_errors));
    440	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
    441		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
    442	dev_replace->cursor_left_last_write_of_item =
    443		dev_replace->cursor_left;
    444	btrfs_set_dev_replace_cursor_left(eb, ptr,
    445		dev_replace->cursor_left_last_write_of_item);
    446	btrfs_set_dev_replace_cursor_right(eb, ptr,
    447		dev_replace->cursor_right);
    448	dev_replace->item_needs_writeback = 0;
    449	up_write(&dev_replace->rwsem);
    450
    451	btrfs_mark_buffer_dirty(eb);
    452
    453out:
    454	btrfs_free_path(path);
    455
    456	return ret;
    457}
    458
    459static char* btrfs_dev_name(struct btrfs_device *device)
    460{
    461	if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
    462		return "<missing disk>";
    463	else
    464		return rcu_str_deref(device->name);
    465}
    466
    467static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
    468				    struct btrfs_device *src_dev)
    469{
    470	struct btrfs_path *path;
    471	struct btrfs_key key;
    472	struct btrfs_key found_key;
    473	struct btrfs_root *root = fs_info->dev_root;
    474	struct btrfs_dev_extent *dev_extent = NULL;
    475	struct btrfs_block_group *cache;
    476	struct btrfs_trans_handle *trans;
    477	int iter_ret = 0;
    478	int ret = 0;
    479	u64 chunk_offset;
    480
    481	/* Do not use "to_copy" on non zoned filesystem for now */
    482	if (!btrfs_is_zoned(fs_info))
    483		return 0;
    484
    485	mutex_lock(&fs_info->chunk_mutex);
    486
    487	/* Ensure we don't have pending new block group */
    488	spin_lock(&fs_info->trans_lock);
    489	while (fs_info->running_transaction &&
    490	       !list_empty(&fs_info->running_transaction->dev_update_list)) {
    491		spin_unlock(&fs_info->trans_lock);
    492		mutex_unlock(&fs_info->chunk_mutex);
    493		trans = btrfs_attach_transaction(root);
    494		if (IS_ERR(trans)) {
    495			ret = PTR_ERR(trans);
    496			mutex_lock(&fs_info->chunk_mutex);
    497			if (ret == -ENOENT) {
    498				spin_lock(&fs_info->trans_lock);
    499				continue;
    500			} else {
    501				goto unlock;
    502			}
    503		}
    504
    505		ret = btrfs_commit_transaction(trans);
    506		mutex_lock(&fs_info->chunk_mutex);
    507		if (ret)
    508			goto unlock;
    509
    510		spin_lock(&fs_info->trans_lock);
    511	}
    512	spin_unlock(&fs_info->trans_lock);
    513
    514	path = btrfs_alloc_path();
    515	if (!path) {
    516		ret = -ENOMEM;
    517		goto unlock;
    518	}
    519
    520	path->reada = READA_FORWARD;
    521	path->search_commit_root = 1;
    522	path->skip_locking = 1;
    523
    524	key.objectid = src_dev->devid;
    525	key.type = BTRFS_DEV_EXTENT_KEY;
    526	key.offset = 0;
    527
    528	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
    529		struct extent_buffer *leaf = path->nodes[0];
    530
    531		if (found_key.objectid != src_dev->devid)
    532			break;
    533
    534		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
    535			break;
    536
    537		if (found_key.offset < key.offset)
    538			break;
    539
    540		dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
    541
    542		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
    543
    544		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
    545		if (!cache)
    546			continue;
    547
    548		spin_lock(&cache->lock);
    549		cache->to_copy = 1;
    550		spin_unlock(&cache->lock);
    551
    552		btrfs_put_block_group(cache);
    553	}
    554	if (iter_ret < 0)
    555		ret = iter_ret;
    556
    557	btrfs_free_path(path);
    558unlock:
    559	mutex_unlock(&fs_info->chunk_mutex);
    560
    561	return ret;
    562}
    563
    564bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
    565				      struct btrfs_block_group *cache,
    566				      u64 physical)
    567{
    568	struct btrfs_fs_info *fs_info = cache->fs_info;
    569	struct extent_map *em;
    570	struct map_lookup *map;
    571	u64 chunk_offset = cache->start;
    572	int num_extents, cur_extent;
    573	int i;
    574
    575	/* Do not use "to_copy" on non zoned filesystem for now */
    576	if (!btrfs_is_zoned(fs_info))
    577		return true;
    578
    579	spin_lock(&cache->lock);
    580	if (cache->removed) {
    581		spin_unlock(&cache->lock);
    582		return true;
    583	}
    584	spin_unlock(&cache->lock);
    585
    586	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
    587	ASSERT(!IS_ERR(em));
    588	map = em->map_lookup;
    589
    590	num_extents = cur_extent = 0;
    591	for (i = 0; i < map->num_stripes; i++) {
    592		/* We have more device extent to copy */
    593		if (srcdev != map->stripes[i].dev)
    594			continue;
    595
    596		num_extents++;
    597		if (physical == map->stripes[i].physical)
    598			cur_extent = i;
    599	}
    600
    601	free_extent_map(em);
    602
    603	if (num_extents > 1 && cur_extent < num_extents - 1) {
    604		/*
    605		 * Has more stripes on this device. Keep this block group
    606		 * readonly until we finish all the stripes.
    607		 */
    608		return false;
    609	}
    610
    611	/* Last stripe on this device */
    612	spin_lock(&cache->lock);
    613	cache->to_copy = 0;
    614	spin_unlock(&cache->lock);
    615
    616	return true;
    617}
    618
    619static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
    620		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
    621		int read_src)
    622{
    623	struct btrfs_root *root = fs_info->dev_root;
    624	struct btrfs_trans_handle *trans;
    625	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    626	int ret;
    627	struct btrfs_device *tgt_device = NULL;
    628	struct btrfs_device *src_device = NULL;
    629
    630	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
    631						  srcdev_name);
    632	if (IS_ERR(src_device))
    633		return PTR_ERR(src_device);
    634
    635	if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
    636		btrfs_warn_in_rcu(fs_info,
    637	  "cannot replace device %s (devid %llu) due to active swapfile",
    638			btrfs_dev_name(src_device), src_device->devid);
    639		return -ETXTBSY;
    640	}
    641
    642	/*
    643	 * Here we commit the transaction to make sure commit_total_bytes
    644	 * of all the devices are updated.
    645	 */
    646	trans = btrfs_attach_transaction(root);
    647	if (!IS_ERR(trans)) {
    648		ret = btrfs_commit_transaction(trans);
    649		if (ret)
    650			return ret;
    651	} else if (PTR_ERR(trans) != -ENOENT) {
    652		return PTR_ERR(trans);
    653	}
    654
    655	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
    656					    src_device, &tgt_device);
    657	if (ret)
    658		return ret;
    659
    660	ret = mark_block_group_to_copy(fs_info, src_device);
    661	if (ret)
    662		return ret;
    663
    664	down_write(&dev_replace->rwsem);
    665	switch (dev_replace->replace_state) {
    666	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    667	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    668	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
    669		break;
    670	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    671	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
    672		ASSERT(0);
    673		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
    674		up_write(&dev_replace->rwsem);
    675		goto leave;
    676	}
    677
    678	dev_replace->cont_reading_from_srcdev_mode = read_src;
    679	dev_replace->srcdev = src_device;
    680	dev_replace->tgtdev = tgt_device;
    681
    682	btrfs_info_in_rcu(fs_info,
    683		      "dev_replace from %s (devid %llu) to %s started",
    684		      btrfs_dev_name(src_device),
    685		      src_device->devid,
    686		      rcu_str_deref(tgt_device->name));
    687
    688	/*
    689	 * from now on, the writes to the srcdev are all duplicated to
    690	 * go to the tgtdev as well (refer to btrfs_map_block()).
    691	 */
    692	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
    693	dev_replace->time_started = ktime_get_real_seconds();
    694	dev_replace->cursor_left = 0;
    695	dev_replace->committed_cursor_left = 0;
    696	dev_replace->cursor_left_last_write_of_item = 0;
    697	dev_replace->cursor_right = 0;
    698	dev_replace->is_valid = 1;
    699	dev_replace->item_needs_writeback = 1;
    700	atomic64_set(&dev_replace->num_write_errors, 0);
    701	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
    702	up_write(&dev_replace->rwsem);
    703
    704	ret = btrfs_sysfs_add_device(tgt_device);
    705	if (ret)
    706		btrfs_err(fs_info, "kobj add dev failed %d", ret);
    707
    708	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
    709
    710	/*
    711	 * Commit dev_replace state and reserve 1 item for it.
    712	 * This is crucial to ensure we won't miss copying extents for new block
    713	 * groups that are allocated after we started the device replace, and
    714	 * must be done after setting up the device replace state.
    715	 */
    716	trans = btrfs_start_transaction(root, 1);
    717	if (IS_ERR(trans)) {
    718		ret = PTR_ERR(trans);
    719		down_write(&dev_replace->rwsem);
    720		dev_replace->replace_state =
    721			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
    722		dev_replace->srcdev = NULL;
    723		dev_replace->tgtdev = NULL;
    724		up_write(&dev_replace->rwsem);
    725		goto leave;
    726	}
    727
    728	ret = btrfs_commit_transaction(trans);
    729	WARN_ON(ret);
    730
    731	/* the disk copy procedure reuses the scrub code */
    732	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
    733			      btrfs_device_get_total_bytes(src_device),
    734			      &dev_replace->scrub_progress, 0, 1);
    735
    736	ret = btrfs_dev_replace_finishing(fs_info, ret);
    737	if (ret == -EINPROGRESS)
    738		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
    739
    740	return ret;
    741
    742leave:
    743	btrfs_destroy_dev_replace_tgtdev(tgt_device);
    744	return ret;
    745}
    746
    747int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
    748			    struct btrfs_ioctl_dev_replace_args *args)
    749{
    750	int ret;
    751
    752	switch (args->start.cont_reading_from_srcdev_mode) {
    753	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
    754	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
    755		break;
    756	default:
    757		return -EINVAL;
    758	}
    759
    760	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
    761	    args->start.tgtdev_name[0] == '\0')
    762		return -EINVAL;
    763
    764	ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
    765					args->start.srcdevid,
    766					args->start.srcdev_name,
    767					args->start.cont_reading_from_srcdev_mode);
    768	args->result = ret;
    769	/* don't warn if EINPROGRESS, someone else might be running scrub */
    770	if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
    771	    ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
    772		return 0;
    773
    774	return ret;
    775}
    776
    777/*
    778 * blocked until all in-flight bios operations are finished.
    779 */
    780static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
    781{
    782	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
    783	wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
    784		   &fs_info->dev_replace.bio_counter));
    785}
    786
    787/*
    788 * we have removed target device, it is safe to allow new bios request.
    789 */
    790static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
    791{
    792	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
    793	wake_up(&fs_info->dev_replace.replace_wait);
    794}
    795
    796/*
    797 * When finishing the device replace, before swapping the source device with the
    798 * target device we must update the chunk allocation state in the target device,
    799 * as it is empty because replace works by directly copying the chunks and not
    800 * through the normal chunk allocation path.
    801 */
    802static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
    803					struct btrfs_device *tgtdev)
    804{
    805	struct extent_state *cached_state = NULL;
    806	u64 start = 0;
    807	u64 found_start;
    808	u64 found_end;
    809	int ret = 0;
    810
    811	lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
    812
    813	while (!find_first_extent_bit(&srcdev->alloc_state, start,
    814				      &found_start, &found_end,
    815				      CHUNK_ALLOCATED, &cached_state)) {
    816		ret = set_extent_bits(&tgtdev->alloc_state, found_start,
    817				      found_end, CHUNK_ALLOCATED);
    818		if (ret)
    819			break;
    820		start = found_end + 1;
    821	}
    822
    823	free_extent_state(cached_state);
    824	return ret;
    825}
    826
    827static void btrfs_dev_replace_update_device_in_mapping_tree(
    828						struct btrfs_fs_info *fs_info,
    829						struct btrfs_device *srcdev,
    830						struct btrfs_device *tgtdev)
    831{
    832	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    833	struct extent_map *em;
    834	struct map_lookup *map;
    835	u64 start = 0;
    836	int i;
    837
    838	write_lock(&em_tree->lock);
    839	do {
    840		em = lookup_extent_mapping(em_tree, start, (u64)-1);
    841		if (!em)
    842			break;
    843		map = em->map_lookup;
    844		for (i = 0; i < map->num_stripes; i++)
    845			if (srcdev == map->stripes[i].dev)
    846				map->stripes[i].dev = tgtdev;
    847		start = em->start + em->len;
    848		free_extent_map(em);
    849	} while (start);
    850	write_unlock(&em_tree->lock);
    851}
    852
    853static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
    854				       int scrub_ret)
    855{
    856	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    857	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    858	struct btrfs_device *tgt_device;
    859	struct btrfs_device *src_device;
    860	struct btrfs_root *root = fs_info->tree_root;
    861	u8 uuid_tmp[BTRFS_UUID_SIZE];
    862	struct btrfs_trans_handle *trans;
    863	int ret = 0;
    864
    865	/* don't allow cancel or unmount to disturb the finishing procedure */
    866	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
    867
    868	down_read(&dev_replace->rwsem);
    869	/* was the operation canceled, or is it finished? */
    870	if (dev_replace->replace_state !=
    871	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
    872		up_read(&dev_replace->rwsem);
    873		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
    874		return 0;
    875	}
    876
    877	tgt_device = dev_replace->tgtdev;
    878	src_device = dev_replace->srcdev;
    879	up_read(&dev_replace->rwsem);
    880
    881	/*
    882	 * flush all outstanding I/O and inode extent mappings before the
    883	 * copy operation is declared as being finished
    884	 */
    885	ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
    886	if (ret) {
    887		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
    888		return ret;
    889	}
    890	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
    891
    892	/*
    893	 * We have to use this loop approach because at this point src_device
    894	 * has to be available for transaction commit to complete, yet new
    895	 * chunks shouldn't be allocated on the device.
    896	 */
    897	while (1) {
    898		trans = btrfs_start_transaction(root, 0);
    899		if (IS_ERR(trans)) {
    900			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
    901			return PTR_ERR(trans);
    902		}
    903		ret = btrfs_commit_transaction(trans);
    904		WARN_ON(ret);
    905
    906		/* Prevent write_all_supers() during the finishing procedure */
    907		mutex_lock(&fs_devices->device_list_mutex);
    908		/* Prevent new chunks being allocated on the source device */
    909		mutex_lock(&fs_info->chunk_mutex);
    910
    911		if (!list_empty(&src_device->post_commit_list)) {
    912			mutex_unlock(&fs_devices->device_list_mutex);
    913			mutex_unlock(&fs_info->chunk_mutex);
    914		} else {
    915			break;
    916		}
    917	}
    918
    919	down_write(&dev_replace->rwsem);
    920	dev_replace->replace_state =
    921		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
    922			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
    923	dev_replace->tgtdev = NULL;
    924	dev_replace->srcdev = NULL;
    925	dev_replace->time_stopped = ktime_get_real_seconds();
    926	dev_replace->item_needs_writeback = 1;
    927
    928	/*
    929	 * Update allocation state in the new device and replace the old device
    930	 * with the new one in the mapping tree.
    931	 */
    932	if (!scrub_ret) {
    933		scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
    934		if (scrub_ret)
    935			goto error;
    936		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
    937								src_device,
    938								tgt_device);
    939	} else {
    940		if (scrub_ret != -ECANCELED)
    941			btrfs_err_in_rcu(fs_info,
    942				 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
    943				 btrfs_dev_name(src_device),
    944				 src_device->devid,
    945				 rcu_str_deref(tgt_device->name), scrub_ret);
    946error:
    947		up_write(&dev_replace->rwsem);
    948		mutex_unlock(&fs_info->chunk_mutex);
    949		mutex_unlock(&fs_devices->device_list_mutex);
    950		btrfs_rm_dev_replace_blocked(fs_info);
    951		if (tgt_device)
    952			btrfs_destroy_dev_replace_tgtdev(tgt_device);
    953		btrfs_rm_dev_replace_unblocked(fs_info);
    954		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
    955
    956		return scrub_ret;
    957	}
    958
    959	btrfs_info_in_rcu(fs_info,
    960			  "dev_replace from %s (devid %llu) to %s finished",
    961			  btrfs_dev_name(src_device),
    962			  src_device->devid,
    963			  rcu_str_deref(tgt_device->name));
    964	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
    965	tgt_device->devid = src_device->devid;
    966	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
    967	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
    968	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
    969	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
    970	btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
    971	btrfs_device_set_disk_total_bytes(tgt_device,
    972					  src_device->disk_total_bytes);
    973	btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
    974	tgt_device->commit_bytes_used = src_device->bytes_used;
    975
    976	btrfs_assign_next_active_device(src_device, tgt_device);
    977
    978	list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
    979	fs_devices->rw_devices++;
    980
    981	up_write(&dev_replace->rwsem);
    982	btrfs_rm_dev_replace_blocked(fs_info);
    983
    984	btrfs_rm_dev_replace_remove_srcdev(src_device);
    985
    986	btrfs_rm_dev_replace_unblocked(fs_info);
    987
    988	/*
    989	 * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
    990	 * update on-disk dev stats value during commit transaction
    991	 */
    992	atomic_inc(&tgt_device->dev_stats_ccnt);
    993
    994	/*
    995	 * this is again a consistent state where no dev_replace procedure
    996	 * is running, the target device is part of the filesystem, the
    997	 * source device is not part of the filesystem anymore and its 1st
    998	 * superblock is scratched out so that it is no longer marked to
    999	 * belong to this filesystem.
   1000	 */
   1001	mutex_unlock(&fs_info->chunk_mutex);
   1002	mutex_unlock(&fs_devices->device_list_mutex);
   1003
   1004	/* replace the sysfs entry */
   1005	btrfs_sysfs_remove_device(src_device);
   1006	btrfs_sysfs_update_devid(tgt_device);
   1007	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
   1008		btrfs_scratch_superblocks(fs_info, src_device->bdev,
   1009					  src_device->name->str);
   1010
   1011	/* write back the superblocks */
   1012	trans = btrfs_start_transaction(root, 0);
   1013	if (!IS_ERR(trans))
   1014		btrfs_commit_transaction(trans);
   1015
   1016	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
   1017
   1018	btrfs_rm_dev_replace_free_srcdev(src_device);
   1019
   1020	return 0;
   1021}
   1022
   1023/*
   1024 * Read progress of device replace status according to the state and last
   1025 * stored position. The value format is the same as for
   1026 * btrfs_dev_replace::progress_1000
   1027 */
   1028static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
   1029{
   1030	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   1031	u64 ret = 0;
   1032
   1033	switch (dev_replace->replace_state) {
   1034	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
   1035	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
   1036		ret = 0;
   1037		break;
   1038	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
   1039		ret = 1000;
   1040		break;
   1041	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
   1042	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
   1043		ret = div64_u64(dev_replace->cursor_left,
   1044				div_u64(btrfs_device_get_total_bytes(
   1045						dev_replace->srcdev), 1000));
   1046		break;
   1047	}
   1048
   1049	return ret;
   1050}
   1051
   1052void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
   1053			      struct btrfs_ioctl_dev_replace_args *args)
   1054{
   1055	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   1056
   1057	down_read(&dev_replace->rwsem);
   1058	/* even if !dev_replace_is_valid, the values are good enough for
   1059	 * the replace_status ioctl */
   1060	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
   1061	args->status.replace_state = dev_replace->replace_state;
   1062	args->status.time_started = dev_replace->time_started;
   1063	args->status.time_stopped = dev_replace->time_stopped;
   1064	args->status.num_write_errors =
   1065		atomic64_read(&dev_replace->num_write_errors);
   1066	args->status.num_uncorrectable_read_errors =
   1067		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
   1068	args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
   1069	up_read(&dev_replace->rwsem);
   1070}
   1071
   1072int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
   1073{
   1074	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   1075	struct btrfs_device *tgt_device = NULL;
   1076	struct btrfs_device *src_device = NULL;
   1077	struct btrfs_trans_handle *trans;
   1078	struct btrfs_root *root = fs_info->tree_root;
   1079	int result;
   1080	int ret;
   1081
   1082	if (sb_rdonly(fs_info->sb))
   1083		return -EROFS;
   1084
   1085	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
   1086	down_write(&dev_replace->rwsem);
   1087	switch (dev_replace->replace_state) {
   1088	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
   1089	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
   1090	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
   1091		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
   1092		up_write(&dev_replace->rwsem);
   1093		break;
   1094	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
   1095		tgt_device = dev_replace->tgtdev;
   1096		src_device = dev_replace->srcdev;
   1097		up_write(&dev_replace->rwsem);
   1098		ret = btrfs_scrub_cancel(fs_info);
   1099		if (ret < 0) {
   1100			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
   1101		} else {
   1102			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
   1103			/*
   1104			 * btrfs_dev_replace_finishing() will handle the
   1105			 * cleanup part
   1106			 */
   1107			btrfs_info_in_rcu(fs_info,
   1108				"dev_replace from %s (devid %llu) to %s canceled",
   1109				btrfs_dev_name(src_device), src_device->devid,
   1110				btrfs_dev_name(tgt_device));
   1111		}
   1112		break;
   1113	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
   1114		/*
   1115		 * Scrub doing the replace isn't running so we need to do the
   1116		 * cleanup step of btrfs_dev_replace_finishing() here
   1117		 */
   1118		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
   1119		tgt_device = dev_replace->tgtdev;
   1120		src_device = dev_replace->srcdev;
   1121		dev_replace->tgtdev = NULL;
   1122		dev_replace->srcdev = NULL;
   1123		dev_replace->replace_state =
   1124				BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
   1125		dev_replace->time_stopped = ktime_get_real_seconds();
   1126		dev_replace->item_needs_writeback = 1;
   1127
   1128		up_write(&dev_replace->rwsem);
   1129
   1130		/* Scrub for replace must not be running in suspended state */
   1131		ret = btrfs_scrub_cancel(fs_info);
   1132		ASSERT(ret != -ENOTCONN);
   1133
   1134		trans = btrfs_start_transaction(root, 0);
   1135		if (IS_ERR(trans)) {
   1136			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
   1137			return PTR_ERR(trans);
   1138		}
   1139		ret = btrfs_commit_transaction(trans);
   1140		WARN_ON(ret);
   1141
   1142		btrfs_info_in_rcu(fs_info,
   1143		"suspended dev_replace from %s (devid %llu) to %s canceled",
   1144			btrfs_dev_name(src_device), src_device->devid,
   1145			btrfs_dev_name(tgt_device));
   1146
   1147		if (tgt_device)
   1148			btrfs_destroy_dev_replace_tgtdev(tgt_device);
   1149		break;
   1150	default:
   1151		up_write(&dev_replace->rwsem);
   1152		result = -EINVAL;
   1153	}
   1154
   1155	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
   1156	return result;
   1157}
   1158
   1159void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
   1160{
   1161	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   1162
   1163	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
   1164	down_write(&dev_replace->rwsem);
   1165
   1166	switch (dev_replace->replace_state) {
   1167	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
   1168	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
   1169	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
   1170	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
   1171		break;
   1172	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
   1173		dev_replace->replace_state =
   1174			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
   1175		dev_replace->time_stopped = ktime_get_real_seconds();
   1176		dev_replace->item_needs_writeback = 1;
   1177		btrfs_info(fs_info, "suspending dev_replace for unmount");
   1178		break;
   1179	}
   1180
   1181	up_write(&dev_replace->rwsem);
   1182	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
   1183}
   1184
   1185/* resume dev_replace procedure that was interrupted by unmount */
   1186int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
   1187{
   1188	struct task_struct *task;
   1189	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   1190
   1191	down_write(&dev_replace->rwsem);
   1192
   1193	switch (dev_replace->replace_state) {
   1194	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
   1195	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
   1196	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
   1197		up_write(&dev_replace->rwsem);
   1198		return 0;
   1199	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
   1200		break;
   1201	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
   1202		dev_replace->replace_state =
   1203			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
   1204		break;
   1205	}
   1206	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
   1207		btrfs_info(fs_info,
   1208			   "cannot continue dev_replace, tgtdev is missing");
   1209		btrfs_info(fs_info,
   1210			   "you may cancel the operation after 'mount -o degraded'");
   1211		dev_replace->replace_state =
   1212					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
   1213		up_write(&dev_replace->rwsem);
   1214		return 0;
   1215	}
   1216	up_write(&dev_replace->rwsem);
   1217
   1218	/*
   1219	 * This could collide with a paused balance, but the exclusive op logic
   1220	 * should never allow both to start and pause. We don't want to allow
   1221	 * dev-replace to start anyway.
   1222	 */
   1223	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
   1224		down_write(&dev_replace->rwsem);
   1225		dev_replace->replace_state =
   1226					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
   1227		up_write(&dev_replace->rwsem);
   1228		btrfs_info(fs_info,
   1229		"cannot resume dev-replace, other exclusive operation running");
   1230		return 0;
   1231	}
   1232
   1233	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
   1234	return PTR_ERR_OR_ZERO(task);
   1235}
   1236
   1237static int btrfs_dev_replace_kthread(void *data)
   1238{
   1239	struct btrfs_fs_info *fs_info = data;
   1240	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   1241	u64 progress;
   1242	int ret;
   1243
   1244	progress = btrfs_dev_replace_progress(fs_info);
   1245	progress = div_u64(progress, 10);
   1246	btrfs_info_in_rcu(fs_info,
   1247		"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
   1248		btrfs_dev_name(dev_replace->srcdev),
   1249		dev_replace->srcdev->devid,
   1250		btrfs_dev_name(dev_replace->tgtdev),
   1251		(unsigned int)progress);
   1252
   1253	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
   1254			      dev_replace->committed_cursor_left,
   1255			      btrfs_device_get_total_bytes(dev_replace->srcdev),
   1256			      &dev_replace->scrub_progress, 0, 1);
   1257	ret = btrfs_dev_replace_finishing(fs_info, ret);
   1258	WARN_ON(ret && ret != -ECANCELED);
   1259
   1260	btrfs_exclop_finish(fs_info);
   1261	return 0;
   1262}
   1263
   1264int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
   1265{
   1266	if (!dev_replace->is_valid)
   1267		return 0;
   1268
   1269	switch (dev_replace->replace_state) {
   1270	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
   1271	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
   1272	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
   1273		return 0;
   1274	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
   1275	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
   1276		/*
   1277		 * return true even if tgtdev is missing (this is
   1278		 * something that can happen if the dev_replace
   1279		 * procedure is suspended by an umount and then
   1280		 * the tgtdev is missing (or "btrfs dev scan") was
   1281		 * not called and the filesystem is remounted
   1282		 * in degraded state. This does not stop the
   1283		 * dev_replace procedure. It needs to be canceled
   1284		 * manually if the cancellation is wanted.
   1285		 */
   1286		break;
   1287	}
   1288	return 1;
   1289}
   1290
   1291void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
   1292{
   1293	percpu_counter_inc(&fs_info->dev_replace.bio_counter);
   1294}
   1295
   1296void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
   1297{
   1298	percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
   1299	cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
   1300}
   1301
   1302void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
   1303{
   1304	while (1) {
   1305		percpu_counter_inc(&fs_info->dev_replace.bio_counter);
   1306		if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
   1307				     &fs_info->fs_state)))
   1308			break;
   1309
   1310		btrfs_bio_counter_dec(fs_info);
   1311		wait_event(fs_info->dev_replace.replace_wait,
   1312			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
   1313				     &fs_info->fs_state));
   1314	}
   1315}