scrub.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
scrub.c (122498B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
      4 */
      5
      6#include <linux/blkdev.h>
      7#include <linux/ratelimit.h>
      8#include <linux/sched/mm.h>
      9#include <crypto/hash.h>
     10#include "ctree.h"
     11#include "discard.h"
     12#include "volumes.h"
     13#include "disk-io.h"
     14#include "ordered-data.h"
     15#include "transaction.h"
     16#include "backref.h"
     17#include "extent_io.h"
     18#include "dev-replace.h"
     19#include "check-integrity.h"
     20#include "rcu-string.h"
     21#include "raid56.h"
     22#include "block-group.h"
     23#include "zoned.h"
     24
     25/*
     26 * This is only the first step towards a full-features scrub. It reads all
     27 * extent and super block and verifies the checksums. In case a bad checksum
     28 * is found or the extent cannot be read, good data will be written back if
     29 * any can be found.
     30 *
     31 * Future enhancements:
     32 *  - In case an unrepairable extent is encountered, track which files are
     33 *    affected and report them
     34 *  - track and record media errors, throw out bad devices
     35 *  - add a mode to also read unallocated space
     36 */
     37
     38struct scrub_block;
     39struct scrub_ctx;
     40
     41/*
     42 * The following three values only influence the performance.
     43 *
     44 * The last one configures the number of parallel and outstanding I/O
     45 * operations. The first one configures an upper limit for the number
     46 * of (dynamically allocated) pages that are added to a bio.
     47 */
     48#define SCRUB_SECTORS_PER_BIO	32	/* 128KiB per bio for 4KiB pages */
     49#define SCRUB_BIOS_PER_SCTX	64	/* 8MiB per device in flight for 4KiB pages */
     50
     51/*
     52 * The following value times PAGE_SIZE needs to be large enough to match the
     53 * largest node/leaf/sector size that shall be supported.
     54 */
     55#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
     56
     57struct scrub_recover {
     58	refcount_t		refs;
     59	struct btrfs_io_context	*bioc;
     60	u64			map_length;
     61};
     62
     63struct scrub_sector {
     64	struct scrub_block	*sblock;
     65	struct page		*page;
     66	struct btrfs_device	*dev;
     67	struct list_head	list;
     68	u64			flags;  /* extent flags */
     69	u64			generation;
     70	u64			logical;
     71	u64			physical;
     72	u64			physical_for_dev_replace;
     73	atomic_t		refs;
     74	u8			mirror_num;
     75	unsigned int		have_csum:1;
     76	unsigned int		io_error:1;
     77	u8			csum[BTRFS_CSUM_SIZE];
     78
     79	struct scrub_recover	*recover;
     80};
     81
     82struct scrub_bio {
     83	int			index;
     84	struct scrub_ctx	*sctx;
     85	struct btrfs_device	*dev;
     86	struct bio		*bio;
     87	blk_status_t		status;
     88	u64			logical;
     89	u64			physical;
     90	struct scrub_sector	*sectors[SCRUB_SECTORS_PER_BIO];
     91	int			sector_count;
     92	int			next_free;
     93	struct work_struct	work;
     94};
     95
     96struct scrub_block {
     97	struct scrub_sector	*sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
     98	int			sector_count;
     99	atomic_t		outstanding_sectors;
    100	refcount_t		refs; /* free mem on transition to zero */
    101	struct scrub_ctx	*sctx;
    102	struct scrub_parity	*sparity;
    103	struct {
    104		unsigned int	header_error:1;
    105		unsigned int	checksum_error:1;
    106		unsigned int	no_io_error_seen:1;
    107		unsigned int	generation_error:1; /* also sets header_error */
    108
    109		/* The following is for the data used to check parity */
    110		/* It is for the data with checksum */
    111		unsigned int	data_corrected:1;
    112	};
    113	struct work_struct	work;
    114};
    115
    116/* Used for the chunks with parity stripe such RAID5/6 */
    117struct scrub_parity {
    118	struct scrub_ctx	*sctx;
    119
    120	struct btrfs_device	*scrub_dev;
    121
    122	u64			logic_start;
    123
    124	u64			logic_end;
    125
    126	int			nsectors;
    127
    128	u32			stripe_len;
    129
    130	refcount_t		refs;
    131
    132	struct list_head	sectors_list;
    133
    134	/* Work of parity check and repair */
    135	struct work_struct	work;
    136
    137	/* Mark the parity blocks which have data */
    138	unsigned long		*dbitmap;
    139
    140	/*
    141	 * Mark the parity blocks which have data, but errors happen when
    142	 * read data or check data
    143	 */
    144	unsigned long		*ebitmap;
    145
    146	unsigned long		bitmap[];
    147};
    148
    149struct scrub_ctx {
    150	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
    151	struct btrfs_fs_info	*fs_info;
    152	int			first_free;
    153	int			curr;
    154	atomic_t		bios_in_flight;
    155	atomic_t		workers_pending;
    156	spinlock_t		list_lock;
    157	wait_queue_head_t	list_wait;
    158	struct list_head	csum_list;
    159	atomic_t		cancel_req;
    160	int			readonly;
    161	int			sectors_per_bio;
    162
    163	/* State of IO submission throttling affecting the associated device */
    164	ktime_t			throttle_deadline;
    165	u64			throttle_sent;
    166
    167	int			is_dev_replace;
    168	u64			write_pointer;
    169
    170	struct scrub_bio        *wr_curr_bio;
    171	struct mutex            wr_lock;
    172	struct btrfs_device     *wr_tgtdev;
    173	bool                    flush_all_writes;
    174
    175	/*
    176	 * statistics
    177	 */
    178	struct btrfs_scrub_progress stat;
    179	spinlock_t		stat_lock;
    180
    181	/*
    182	 * Use a ref counter to avoid use-after-free issues. Scrub workers
    183	 * decrement bios_in_flight and workers_pending and then do a wakeup
    184	 * on the list_wait wait queue. We must ensure the main scrub task
    185	 * doesn't free the scrub context before or while the workers are
    186	 * doing the wakeup() call.
    187	 */
    188	refcount_t              refs;
    189};
    190
    191struct scrub_warning {
    192	struct btrfs_path	*path;
    193	u64			extent_item_size;
    194	const char		*errstr;
    195	u64			physical;
    196	u64			logical;
    197	struct btrfs_device	*dev;
    198};
    199
    200struct full_stripe_lock {
    201	struct rb_node node;
    202	u64 logical;
    203	u64 refs;
    204	struct mutex mutex;
    205};
    206
    207static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
    208				     struct scrub_block *sblocks_for_recheck);
    209static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
    210				struct scrub_block *sblock,
    211				int retry_failed_mirror);
    212static void scrub_recheck_block_checksum(struct scrub_block *sblock);
    213static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
    214					     struct scrub_block *sblock_good);
    215static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
    216					    struct scrub_block *sblock_good,
    217					    int sector_num, int force_write);
    218static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
    219static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
    220					     int sector_num);
    221static int scrub_checksum_data(struct scrub_block *sblock);
    222static int scrub_checksum_tree_block(struct scrub_block *sblock);
    223static int scrub_checksum_super(struct scrub_block *sblock);
    224static void scrub_block_put(struct scrub_block *sblock);
    225static void scrub_sector_get(struct scrub_sector *sector);
    226static void scrub_sector_put(struct scrub_sector *sector);
    227static void scrub_parity_get(struct scrub_parity *sparity);
    228static void scrub_parity_put(struct scrub_parity *sparity);
    229static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
    230			 u64 physical, struct btrfs_device *dev, u64 flags,
    231			 u64 gen, int mirror_num, u8 *csum,
    232			 u64 physical_for_dev_replace);
    233static void scrub_bio_end_io(struct bio *bio);
    234static void scrub_bio_end_io_worker(struct work_struct *work);
    235static void scrub_block_complete(struct scrub_block *sblock);
    236static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
    237				 u64 extent_logical, u32 extent_len,
    238				 u64 *extent_physical,
    239				 struct btrfs_device **extent_dev,
    240				 int *extent_mirror_num);
    241static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
    242				      struct scrub_sector *sector);
    243static void scrub_wr_submit(struct scrub_ctx *sctx);
    244static void scrub_wr_bio_end_io(struct bio *bio);
    245static void scrub_wr_bio_end_io_worker(struct work_struct *work);
    246static void scrub_put_ctx(struct scrub_ctx *sctx);
    247
    248static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
    249{
    250	return sector->recover &&
    251	       (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
    252}
    253
    254static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
    255{
    256	refcount_inc(&sctx->refs);
    257	atomic_inc(&sctx->bios_in_flight);
    258}
    259
    260static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
    261{
    262	atomic_dec(&sctx->bios_in_flight);
    263	wake_up(&sctx->list_wait);
    264	scrub_put_ctx(sctx);
    265}
    266
    267static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
    268{
    269	while (atomic_read(&fs_info->scrub_pause_req)) {
    270		mutex_unlock(&fs_info->scrub_lock);
    271		wait_event(fs_info->scrub_pause_wait,
    272		   atomic_read(&fs_info->scrub_pause_req) == 0);
    273		mutex_lock(&fs_info->scrub_lock);
    274	}
    275}
    276
    277static void scrub_pause_on(struct btrfs_fs_info *fs_info)
    278{
    279	atomic_inc(&fs_info->scrubs_paused);
    280	wake_up(&fs_info->scrub_pause_wait);
    281}
    282
    283static void scrub_pause_off(struct btrfs_fs_info *fs_info)
    284{
    285	mutex_lock(&fs_info->scrub_lock);
    286	__scrub_blocked_if_needed(fs_info);
    287	atomic_dec(&fs_info->scrubs_paused);
    288	mutex_unlock(&fs_info->scrub_lock);
    289
    290	wake_up(&fs_info->scrub_pause_wait);
    291}
    292
    293static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
    294{
    295	scrub_pause_on(fs_info);
    296	scrub_pause_off(fs_info);
    297}
    298
    299/*
    300 * Insert new full stripe lock into full stripe locks tree
    301 *
    302 * Return pointer to existing or newly inserted full_stripe_lock structure if
    303 * everything works well.
    304 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
    305 *
    306 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
    307 * function
    308 */
    309static struct full_stripe_lock *insert_full_stripe_lock(
    310		struct btrfs_full_stripe_locks_tree *locks_root,
    311		u64 fstripe_logical)
    312{
    313	struct rb_node **p;
    314	struct rb_node *parent = NULL;
    315	struct full_stripe_lock *entry;
    316	struct full_stripe_lock *ret;
    317
    318	lockdep_assert_held(&locks_root->lock);
    319
    320	p = &locks_root->root.rb_node;
    321	while (*p) {
    322		parent = *p;
    323		entry = rb_entry(parent, struct full_stripe_lock, node);
    324		if (fstripe_logical < entry->logical) {
    325			p = &(*p)->rb_left;
    326		} else if (fstripe_logical > entry->logical) {
    327			p = &(*p)->rb_right;
    328		} else {
    329			entry->refs++;
    330			return entry;
    331		}
    332	}
    333
    334	/*
    335	 * Insert new lock.
    336	 */
    337	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
    338	if (!ret)
    339		return ERR_PTR(-ENOMEM);
    340	ret->logical = fstripe_logical;
    341	ret->refs = 1;
    342	mutex_init(&ret->mutex);
    343
    344	rb_link_node(&ret->node, parent, p);
    345	rb_insert_color(&ret->node, &locks_root->root);
    346	return ret;
    347}
    348
    349/*
    350 * Search for a full stripe lock of a block group
    351 *
    352 * Return pointer to existing full stripe lock if found
    353 * Return NULL if not found
    354 */
    355static struct full_stripe_lock *search_full_stripe_lock(
    356		struct btrfs_full_stripe_locks_tree *locks_root,
    357		u64 fstripe_logical)
    358{
    359	struct rb_node *node;
    360	struct full_stripe_lock *entry;
    361
    362	lockdep_assert_held(&locks_root->lock);
    363
    364	node = locks_root->root.rb_node;
    365	while (node) {
    366		entry = rb_entry(node, struct full_stripe_lock, node);
    367		if (fstripe_logical < entry->logical)
    368			node = node->rb_left;
    369		else if (fstripe_logical > entry->logical)
    370			node = node->rb_right;
    371		else
    372			return entry;
    373	}
    374	return NULL;
    375}
    376
    377/*
    378 * Helper to get full stripe logical from a normal bytenr.
    379 *
    380 * Caller must ensure @cache is a RAID56 block group.
    381 */
    382static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
    383{
    384	u64 ret;
    385
    386	/*
    387	 * Due to chunk item size limit, full stripe length should not be
    388	 * larger than U32_MAX. Just a sanity check here.
    389	 */
    390	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
    391
    392	/*
    393	 * round_down() can only handle power of 2, while RAID56 full
    394	 * stripe length can be 64KiB * n, so we need to manually round down.
    395	 */
    396	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
    397			cache->full_stripe_len + cache->start;
    398	return ret;
    399}
    400
    401/*
    402 * Lock a full stripe to avoid concurrency of recovery and read
    403 *
    404 * It's only used for profiles with parities (RAID5/6), for other profiles it
    405 * does nothing.
    406 *
    407 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
    408 * So caller must call unlock_full_stripe() at the same context.
    409 *
    410 * Return <0 if encounters error.
    411 */
    412static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
    413			    bool *locked_ret)
    414{
    415	struct btrfs_block_group *bg_cache;
    416	struct btrfs_full_stripe_locks_tree *locks_root;
    417	struct full_stripe_lock *existing;
    418	u64 fstripe_start;
    419	int ret = 0;
    420
    421	*locked_ret = false;
    422	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
    423	if (!bg_cache) {
    424		ASSERT(0);
    425		return -ENOENT;
    426	}
    427
    428	/* Profiles not based on parity don't need full stripe lock */
    429	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
    430		goto out;
    431	locks_root = &bg_cache->full_stripe_locks_root;
    432
    433	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
    434
    435	/* Now insert the full stripe lock */
    436	mutex_lock(&locks_root->lock);
    437	existing = insert_full_stripe_lock(locks_root, fstripe_start);
    438	mutex_unlock(&locks_root->lock);
    439	if (IS_ERR(existing)) {
    440		ret = PTR_ERR(existing);
    441		goto out;
    442	}
    443	mutex_lock(&existing->mutex);
    444	*locked_ret = true;
    445out:
    446	btrfs_put_block_group(bg_cache);
    447	return ret;
    448}
    449
    450/*
    451 * Unlock a full stripe.
    452 *
    453 * NOTE: Caller must ensure it's the same context calling corresponding
    454 * lock_full_stripe().
    455 *
    456 * Return 0 if we unlock full stripe without problem.
    457 * Return <0 for error
    458 */
    459static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
    460			      bool locked)
    461{
    462	struct btrfs_block_group *bg_cache;
    463	struct btrfs_full_stripe_locks_tree *locks_root;
    464	struct full_stripe_lock *fstripe_lock;
    465	u64 fstripe_start;
    466	bool freeit = false;
    467	int ret = 0;
    468
    469	/* If we didn't acquire full stripe lock, no need to continue */
    470	if (!locked)
    471		return 0;
    472
    473	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
    474	if (!bg_cache) {
    475		ASSERT(0);
    476		return -ENOENT;
    477	}
    478	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
    479		goto out;
    480
    481	locks_root = &bg_cache->full_stripe_locks_root;
    482	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
    483
    484	mutex_lock(&locks_root->lock);
    485	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
    486	/* Unpaired unlock_full_stripe() detected */
    487	if (!fstripe_lock) {
    488		WARN_ON(1);
    489		ret = -ENOENT;
    490		mutex_unlock(&locks_root->lock);
    491		goto out;
    492	}
    493
    494	if (fstripe_lock->refs == 0) {
    495		WARN_ON(1);
    496		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
    497			fstripe_lock->logical);
    498	} else {
    499		fstripe_lock->refs--;
    500	}
    501
    502	if (fstripe_lock->refs == 0) {
    503		rb_erase(&fstripe_lock->node, &locks_root->root);
    504		freeit = true;
    505	}
    506	mutex_unlock(&locks_root->lock);
    507
    508	mutex_unlock(&fstripe_lock->mutex);
    509	if (freeit)
    510		kfree(fstripe_lock);
    511out:
    512	btrfs_put_block_group(bg_cache);
    513	return ret;
    514}
    515
    516static void scrub_free_csums(struct scrub_ctx *sctx)
    517{
    518	while (!list_empty(&sctx->csum_list)) {
    519		struct btrfs_ordered_sum *sum;
    520		sum = list_first_entry(&sctx->csum_list,
    521				       struct btrfs_ordered_sum, list);
    522		list_del(&sum->list);
    523		kfree(sum);
    524	}
    525}
    526
    527static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
    528{
    529	int i;
    530
    531	if (!sctx)
    532		return;
    533
    534	/* this can happen when scrub is cancelled */
    535	if (sctx->curr != -1) {
    536		struct scrub_bio *sbio = sctx->bios[sctx->curr];
    537
    538		for (i = 0; i < sbio->sector_count; i++) {
    539			WARN_ON(!sbio->sectors[i]->page);
    540			scrub_block_put(sbio->sectors[i]->sblock);
    541		}
    542		bio_put(sbio->bio);
    543	}
    544
    545	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
    546		struct scrub_bio *sbio = sctx->bios[i];
    547
    548		if (!sbio)
    549			break;
    550		kfree(sbio);
    551	}
    552
    553	kfree(sctx->wr_curr_bio);
    554	scrub_free_csums(sctx);
    555	kfree(sctx);
    556}
    557
    558static void scrub_put_ctx(struct scrub_ctx *sctx)
    559{
    560	if (refcount_dec_and_test(&sctx->refs))
    561		scrub_free_ctx(sctx);
    562}
    563
    564static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
    565		struct btrfs_fs_info *fs_info, int is_dev_replace)
    566{
    567	struct scrub_ctx *sctx;
    568	int		i;
    569
    570	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
    571	if (!sctx)
    572		goto nomem;
    573	refcount_set(&sctx->refs, 1);
    574	sctx->is_dev_replace = is_dev_replace;
    575	sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
    576	sctx->curr = -1;
    577	sctx->fs_info = fs_info;
    578	INIT_LIST_HEAD(&sctx->csum_list);
    579	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
    580		struct scrub_bio *sbio;
    581
    582		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
    583		if (!sbio)
    584			goto nomem;
    585		sctx->bios[i] = sbio;
    586
    587		sbio->index = i;
    588		sbio->sctx = sctx;
    589		sbio->sector_count = 0;
    590		INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
    591
    592		if (i != SCRUB_BIOS_PER_SCTX - 1)
    593			sctx->bios[i]->next_free = i + 1;
    594		else
    595			sctx->bios[i]->next_free = -1;
    596	}
    597	sctx->first_free = 0;
    598	atomic_set(&sctx->bios_in_flight, 0);
    599	atomic_set(&sctx->workers_pending, 0);
    600	atomic_set(&sctx->cancel_req, 0);
    601
    602	spin_lock_init(&sctx->list_lock);
    603	spin_lock_init(&sctx->stat_lock);
    604	init_waitqueue_head(&sctx->list_wait);
    605	sctx->throttle_deadline = 0;
    606
    607	WARN_ON(sctx->wr_curr_bio != NULL);
    608	mutex_init(&sctx->wr_lock);
    609	sctx->wr_curr_bio = NULL;
    610	if (is_dev_replace) {
    611		WARN_ON(!fs_info->dev_replace.tgtdev);
    612		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
    613		sctx->flush_all_writes = false;
    614	}
    615
    616	return sctx;
    617
    618nomem:
    619	scrub_free_ctx(sctx);
    620	return ERR_PTR(-ENOMEM);
    621}
    622
    623static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
    624				     void *warn_ctx)
    625{
    626	u32 nlink;
    627	int ret;
    628	int i;
    629	unsigned nofs_flag;
    630	struct extent_buffer *eb;
    631	struct btrfs_inode_item *inode_item;
    632	struct scrub_warning *swarn = warn_ctx;
    633	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
    634	struct inode_fs_paths *ipath = NULL;
    635	struct btrfs_root *local_root;
    636	struct btrfs_key key;
    637
    638	local_root = btrfs_get_fs_root(fs_info, root, true);
    639	if (IS_ERR(local_root)) {
    640		ret = PTR_ERR(local_root);
    641		goto err;
    642	}
    643
    644	/*
    645	 * this makes the path point to (inum INODE_ITEM ioff)
    646	 */
    647	key.objectid = inum;
    648	key.type = BTRFS_INODE_ITEM_KEY;
    649	key.offset = 0;
    650
    651	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
    652	if (ret) {
    653		btrfs_put_root(local_root);
    654		btrfs_release_path(swarn->path);
    655		goto err;
    656	}
    657
    658	eb = swarn->path->nodes[0];
    659	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
    660					struct btrfs_inode_item);
    661	nlink = btrfs_inode_nlink(eb, inode_item);
    662	btrfs_release_path(swarn->path);
    663
    664	/*
    665	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
    666	 * uses GFP_NOFS in this context, so we keep it consistent but it does
    667	 * not seem to be strictly necessary.
    668	 */
    669	nofs_flag = memalloc_nofs_save();
    670	ipath = init_ipath(4096, local_root, swarn->path);
    671	memalloc_nofs_restore(nofs_flag);
    672	if (IS_ERR(ipath)) {
    673		btrfs_put_root(local_root);
    674		ret = PTR_ERR(ipath);
    675		ipath = NULL;
    676		goto err;
    677	}
    678	ret = paths_from_inode(inum, ipath);
    679
    680	if (ret < 0)
    681		goto err;
    682
    683	/*
    684	 * we deliberately ignore the bit ipath might have been too small to
    685	 * hold all of the paths here
    686	 */
    687	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
    688		btrfs_warn_in_rcu(fs_info,
    689"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
    690				  swarn->errstr, swarn->logical,
    691				  rcu_str_deref(swarn->dev->name),
    692				  swarn->physical,
    693				  root, inum, offset,
    694				  fs_info->sectorsize, nlink,
    695				  (char *)(unsigned long)ipath->fspath->val[i]);
    696
    697	btrfs_put_root(local_root);
    698	free_ipath(ipath);
    699	return 0;
    700
    701err:
    702	btrfs_warn_in_rcu(fs_info,
    703			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
    704			  swarn->errstr, swarn->logical,
    705			  rcu_str_deref(swarn->dev->name),
    706			  swarn->physical,
    707			  root, inum, offset, ret);
    708
    709	free_ipath(ipath);
    710	return 0;
    711}
    712
    713static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
    714{
    715	struct btrfs_device *dev;
    716	struct btrfs_fs_info *fs_info;
    717	struct btrfs_path *path;
    718	struct btrfs_key found_key;
    719	struct extent_buffer *eb;
    720	struct btrfs_extent_item *ei;
    721	struct scrub_warning swarn;
    722	unsigned long ptr = 0;
    723	u64 extent_item_pos;
    724	u64 flags = 0;
    725	u64 ref_root;
    726	u32 item_size;
    727	u8 ref_level = 0;
    728	int ret;
    729
    730	WARN_ON(sblock->sector_count < 1);
    731	dev = sblock->sectors[0]->dev;
    732	fs_info = sblock->sctx->fs_info;
    733
    734	path = btrfs_alloc_path();
    735	if (!path)
    736		return;
    737
    738	swarn.physical = sblock->sectors[0]->physical;
    739	swarn.logical = sblock->sectors[0]->logical;
    740	swarn.errstr = errstr;
    741	swarn.dev = NULL;
    742
    743	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
    744				  &flags);
    745	if (ret < 0)
    746		goto out;
    747
    748	extent_item_pos = swarn.logical - found_key.objectid;
    749	swarn.extent_item_size = found_key.offset;
    750
    751	eb = path->nodes[0];
    752	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
    753	item_size = btrfs_item_size(eb, path->slots[0]);
    754
    755	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
    756		do {
    757			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
    758						      item_size, &ref_root,
    759						      &ref_level);
    760			btrfs_warn_in_rcu(fs_info,
    761"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
    762				errstr, swarn.logical,
    763				rcu_str_deref(dev->name),
    764				swarn.physical,
    765				ref_level ? "node" : "leaf",
    766				ret < 0 ? -1 : ref_level,
    767				ret < 0 ? -1 : ref_root);
    768		} while (ret != 1);
    769		btrfs_release_path(path);
    770	} else {
    771		btrfs_release_path(path);
    772		swarn.path = path;
    773		swarn.dev = dev;
    774		iterate_extent_inodes(fs_info, found_key.objectid,
    775					extent_item_pos, 1,
    776					scrub_print_warning_inode, &swarn, false);
    777	}
    778
    779out:
    780	btrfs_free_path(path);
    781}
    782
    783static inline void scrub_get_recover(struct scrub_recover *recover)
    784{
    785	refcount_inc(&recover->refs);
    786}
    787
    788static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
    789				     struct scrub_recover *recover)
    790{
    791	if (refcount_dec_and_test(&recover->refs)) {
    792		btrfs_bio_counter_dec(fs_info);
    793		btrfs_put_bioc(recover->bioc);
    794		kfree(recover);
    795	}
    796}
    797
    798/*
    799 * scrub_handle_errored_block gets called when either verification of the
    800 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
    801 * case, this function handles all sectors in the bio, even though only one
    802 * may be bad.
    803 * The goal of this function is to repair the errored block by using the
    804 * contents of one of the mirrors.
    805 */
    806static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
    807{
    808	struct scrub_ctx *sctx = sblock_to_check->sctx;
    809	struct btrfs_device *dev;
    810	struct btrfs_fs_info *fs_info;
    811	u64 logical;
    812	unsigned int failed_mirror_index;
    813	unsigned int is_metadata;
    814	unsigned int have_csum;
    815	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
    816	struct scrub_block *sblock_bad;
    817	int ret;
    818	int mirror_index;
    819	int sector_num;
    820	int success;
    821	bool full_stripe_locked;
    822	unsigned int nofs_flag;
    823	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
    824				      DEFAULT_RATELIMIT_BURST);
    825
    826	BUG_ON(sblock_to_check->sector_count < 1);
    827	fs_info = sctx->fs_info;
    828	if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
    829		/*
    830		 * if we find an error in a super block, we just report it.
    831		 * They will get written with the next transaction commit
    832		 * anyway
    833		 */
    834		spin_lock(&sctx->stat_lock);
    835		++sctx->stat.super_errors;
    836		spin_unlock(&sctx->stat_lock);
    837		return 0;
    838	}
    839	logical = sblock_to_check->sectors[0]->logical;
    840	BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
    841	failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
    842	is_metadata = !(sblock_to_check->sectors[0]->flags &
    843			BTRFS_EXTENT_FLAG_DATA);
    844	have_csum = sblock_to_check->sectors[0]->have_csum;
    845	dev = sblock_to_check->sectors[0]->dev;
    846
    847	if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
    848		return 0;
    849
    850	/*
    851	 * We must use GFP_NOFS because the scrub task might be waiting for a
    852	 * worker task executing this function and in turn a transaction commit
    853	 * might be waiting the scrub task to pause (which needs to wait for all
    854	 * the worker tasks to complete before pausing).
    855	 * We do allocations in the workers through insert_full_stripe_lock()
    856	 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
    857	 * this function.
    858	 */
    859	nofs_flag = memalloc_nofs_save();
    860	/*
    861	 * For RAID5/6, race can happen for a different device scrub thread.
    862	 * For data corruption, Parity and Data threads will both try
    863	 * to recovery the data.
    864	 * Race can lead to doubly added csum error, or even unrecoverable
    865	 * error.
    866	 */
    867	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
    868	if (ret < 0) {
    869		memalloc_nofs_restore(nofs_flag);
    870		spin_lock(&sctx->stat_lock);
    871		if (ret == -ENOMEM)
    872			sctx->stat.malloc_errors++;
    873		sctx->stat.read_errors++;
    874		sctx->stat.uncorrectable_errors++;
    875		spin_unlock(&sctx->stat_lock);
    876		return ret;
    877	}
    878
    879	/*
    880	 * read all mirrors one after the other. This includes to
    881	 * re-read the extent or metadata block that failed (that was
    882	 * the cause that this fixup code is called) another time,
    883	 * sector by sector this time in order to know which sectors
    884	 * caused I/O errors and which ones are good (for all mirrors).
    885	 * It is the goal to handle the situation when more than one
    886	 * mirror contains I/O errors, but the errors do not
    887	 * overlap, i.e. the data can be repaired by selecting the
    888	 * sectors from those mirrors without I/O error on the
    889	 * particular sectors. One example (with blocks >= 2 * sectorsize)
    890	 * would be that mirror #1 has an I/O error on the first sector,
    891	 * the second sector is good, and mirror #2 has an I/O error on
    892	 * the second sector, but the first sector is good.
    893	 * Then the first sector of the first mirror can be repaired by
    894	 * taking the first sector of the second mirror, and the
    895	 * second sector of the second mirror can be repaired by
    896	 * copying the contents of the 2nd sector of the 1st mirror.
    897	 * One more note: if the sectors of one mirror contain I/O
    898	 * errors, the checksum cannot be verified. In order to get
    899	 * the best data for repairing, the first attempt is to find
    900	 * a mirror without I/O errors and with a validated checksum.
    901	 * Only if this is not possible, the sectors are picked from
    902	 * mirrors with I/O errors without considering the checksum.
    903	 * If the latter is the case, at the end, the checksum of the
    904	 * repaired area is verified in order to correctly maintain
    905	 * the statistics.
    906	 */
    907
    908	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
    909				      sizeof(*sblocks_for_recheck), GFP_KERNEL);
    910	if (!sblocks_for_recheck) {
    911		spin_lock(&sctx->stat_lock);
    912		sctx->stat.malloc_errors++;
    913		sctx->stat.read_errors++;
    914		sctx->stat.uncorrectable_errors++;
    915		spin_unlock(&sctx->stat_lock);
    916		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
    917		goto out;
    918	}
    919
    920	/* Setup the context, map the logical blocks and alloc the sectors */
    921	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
    922	if (ret) {
    923		spin_lock(&sctx->stat_lock);
    924		sctx->stat.read_errors++;
    925		sctx->stat.uncorrectable_errors++;
    926		spin_unlock(&sctx->stat_lock);
    927		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
    928		goto out;
    929	}
    930	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
    931	sblock_bad = sblocks_for_recheck + failed_mirror_index;
    932
    933	/* build and submit the bios for the failed mirror, check checksums */
    934	scrub_recheck_block(fs_info, sblock_bad, 1);
    935
    936	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
    937	    sblock_bad->no_io_error_seen) {
    938		/*
    939		 * The error disappeared after reading sector by sector, or
    940		 * the area was part of a huge bio and other parts of the
    941		 * bio caused I/O errors, or the block layer merged several
    942		 * read requests into one and the error is caused by a
    943		 * different bio (usually one of the two latter cases is
    944		 * the cause)
    945		 */
    946		spin_lock(&sctx->stat_lock);
    947		sctx->stat.unverified_errors++;
    948		sblock_to_check->data_corrected = 1;
    949		spin_unlock(&sctx->stat_lock);
    950
    951		if (sctx->is_dev_replace)
    952			scrub_write_block_to_dev_replace(sblock_bad);
    953		goto out;
    954	}
    955
    956	if (!sblock_bad->no_io_error_seen) {
    957		spin_lock(&sctx->stat_lock);
    958		sctx->stat.read_errors++;
    959		spin_unlock(&sctx->stat_lock);
    960		if (__ratelimit(&rs))
    961			scrub_print_warning("i/o error", sblock_to_check);
    962		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
    963	} else if (sblock_bad->checksum_error) {
    964		spin_lock(&sctx->stat_lock);
    965		sctx->stat.csum_errors++;
    966		spin_unlock(&sctx->stat_lock);
    967		if (__ratelimit(&rs))
    968			scrub_print_warning("checksum error", sblock_to_check);
    969		btrfs_dev_stat_inc_and_print(dev,
    970					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
    971	} else if (sblock_bad->header_error) {
    972		spin_lock(&sctx->stat_lock);
    973		sctx->stat.verify_errors++;
    974		spin_unlock(&sctx->stat_lock);
    975		if (__ratelimit(&rs))
    976			scrub_print_warning("checksum/header error",
    977					    sblock_to_check);
    978		if (sblock_bad->generation_error)
    979			btrfs_dev_stat_inc_and_print(dev,
    980				BTRFS_DEV_STAT_GENERATION_ERRS);
    981		else
    982			btrfs_dev_stat_inc_and_print(dev,
    983				BTRFS_DEV_STAT_CORRUPTION_ERRS);
    984	}
    985
    986	if (sctx->readonly) {
    987		ASSERT(!sctx->is_dev_replace);
    988		goto out;
    989	}
    990
    991	/*
    992	 * now build and submit the bios for the other mirrors, check
    993	 * checksums.
    994	 * First try to pick the mirror which is completely without I/O
    995	 * errors and also does not have a checksum error.
    996	 * If one is found, and if a checksum is present, the full block
    997	 * that is known to contain an error is rewritten. Afterwards
    998	 * the block is known to be corrected.
    999	 * If a mirror is found which is completely correct, and no
   1000	 * checksum is present, only those sectors are rewritten that had
   1001	 * an I/O error in the block to be repaired, since it cannot be
   1002	 * determined, which copy of the other sectors is better (and it
   1003	 * could happen otherwise that a correct sector would be
   1004	 * overwritten by a bad one).
   1005	 */
   1006	for (mirror_index = 0; ;mirror_index++) {
   1007		struct scrub_block *sblock_other;
   1008
   1009		if (mirror_index == failed_mirror_index)
   1010			continue;
   1011
   1012		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
   1013		if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
   1014			if (mirror_index >= BTRFS_MAX_MIRRORS)
   1015				break;
   1016			if (!sblocks_for_recheck[mirror_index].sector_count)
   1017				break;
   1018
   1019			sblock_other = sblocks_for_recheck + mirror_index;
   1020		} else {
   1021			struct scrub_recover *r = sblock_bad->sectors[0]->recover;
   1022			int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
   1023
   1024			if (mirror_index >= max_allowed)
   1025				break;
   1026			if (!sblocks_for_recheck[1].sector_count)
   1027				break;
   1028
   1029			ASSERT(failed_mirror_index == 0);
   1030			sblock_other = sblocks_for_recheck + 1;
   1031			sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
   1032		}
   1033
   1034		/* build and submit the bios, check checksums */
   1035		scrub_recheck_block(fs_info, sblock_other, 0);
   1036
   1037		if (!sblock_other->header_error &&
   1038		    !sblock_other->checksum_error &&
   1039		    sblock_other->no_io_error_seen) {
   1040			if (sctx->is_dev_replace) {
   1041				scrub_write_block_to_dev_replace(sblock_other);
   1042				goto corrected_error;
   1043			} else {
   1044				ret = scrub_repair_block_from_good_copy(
   1045						sblock_bad, sblock_other);
   1046				if (!ret)
   1047					goto corrected_error;
   1048			}
   1049		}
   1050	}
   1051
   1052	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
   1053		goto did_not_correct_error;
   1054
   1055	/*
   1056	 * In case of I/O errors in the area that is supposed to be
   1057	 * repaired, continue by picking good copies of those sectors.
   1058	 * Select the good sectors from mirrors to rewrite bad sectors from
   1059	 * the area to fix. Afterwards verify the checksum of the block
   1060	 * that is supposed to be repaired. This verification step is
   1061	 * only done for the purpose of statistic counting and for the
   1062	 * final scrub report, whether errors remain.
   1063	 * A perfect algorithm could make use of the checksum and try
   1064	 * all possible combinations of sectors from the different mirrors
   1065	 * until the checksum verification succeeds. For example, when
   1066	 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
   1067	 * of mirror #2 is readable but the final checksum test fails,
   1068	 * then the 2nd sector of mirror #3 could be tried, whether now
   1069	 * the final checksum succeeds. But this would be a rare
   1070	 * exception and is therefore not implemented. At least it is
   1071	 * avoided that the good copy is overwritten.
   1072	 * A more useful improvement would be to pick the sectors
   1073	 * without I/O error based on sector sizes (512 bytes on legacy
   1074	 * disks) instead of on sectorsize. Then maybe 512 byte of one
   1075	 * mirror could be repaired by taking 512 byte of a different
   1076	 * mirror, even if other 512 byte sectors in the same sectorsize
   1077	 * area are unreadable.
   1078	 */
   1079	success = 1;
   1080	for (sector_num = 0; sector_num < sblock_bad->sector_count;
   1081	     sector_num++) {
   1082		struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
   1083		struct scrub_block *sblock_other = NULL;
   1084
   1085		/* Skip no-io-error sectors in scrub */
   1086		if (!sector_bad->io_error && !sctx->is_dev_replace)
   1087			continue;
   1088
   1089		if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
   1090			/*
   1091			 * In case of dev replace, if raid56 rebuild process
   1092			 * didn't work out correct data, then copy the content
   1093			 * in sblock_bad to make sure target device is identical
   1094			 * to source device, instead of writing garbage data in
   1095			 * sblock_for_recheck array to target device.
   1096			 */
   1097			sblock_other = NULL;
   1098		} else if (sector_bad->io_error) {
   1099			/* Try to find no-io-error sector in mirrors */
   1100			for (mirror_index = 0;
   1101			     mirror_index < BTRFS_MAX_MIRRORS &&
   1102			     sblocks_for_recheck[mirror_index].sector_count > 0;
   1103			     mirror_index++) {
   1104				if (!sblocks_for_recheck[mirror_index].
   1105				    sectors[sector_num]->io_error) {
   1106					sblock_other = sblocks_for_recheck +
   1107						       mirror_index;
   1108					break;
   1109				}
   1110			}
   1111			if (!sblock_other)
   1112				success = 0;
   1113		}
   1114
   1115		if (sctx->is_dev_replace) {
   1116			/*
   1117			 * Did not find a mirror to fetch the sector from.
   1118			 * scrub_write_sector_to_dev_replace() handles this
   1119			 * case (sector->io_error), by filling the block with
   1120			 * zeros before submitting the write request
   1121			 */
   1122			if (!sblock_other)
   1123				sblock_other = sblock_bad;
   1124
   1125			if (scrub_write_sector_to_dev_replace(sblock_other,
   1126							      sector_num) != 0) {
   1127				atomic64_inc(
   1128					&fs_info->dev_replace.num_write_errors);
   1129				success = 0;
   1130			}
   1131		} else if (sblock_other) {
   1132			ret = scrub_repair_sector_from_good_copy(sblock_bad,
   1133								 sblock_other,
   1134								 sector_num, 0);
   1135			if (0 == ret)
   1136				sector_bad->io_error = 0;
   1137			else
   1138				success = 0;
   1139		}
   1140	}
   1141
   1142	if (success && !sctx->is_dev_replace) {
   1143		if (is_metadata || have_csum) {
   1144			/*
   1145			 * need to verify the checksum now that all
   1146			 * sectors on disk are repaired (the write
   1147			 * request for data to be repaired is on its way).
   1148			 * Just be lazy and use scrub_recheck_block()
   1149			 * which re-reads the data before the checksum
   1150			 * is verified, but most likely the data comes out
   1151			 * of the page cache.
   1152			 */
   1153			scrub_recheck_block(fs_info, sblock_bad, 1);
   1154			if (!sblock_bad->header_error &&
   1155			    !sblock_bad->checksum_error &&
   1156			    sblock_bad->no_io_error_seen)
   1157				goto corrected_error;
   1158			else
   1159				goto did_not_correct_error;
   1160		} else {
   1161corrected_error:
   1162			spin_lock(&sctx->stat_lock);
   1163			sctx->stat.corrected_errors++;
   1164			sblock_to_check->data_corrected = 1;
   1165			spin_unlock(&sctx->stat_lock);
   1166			btrfs_err_rl_in_rcu(fs_info,
   1167				"fixed up error at logical %llu on dev %s",
   1168				logical, rcu_str_deref(dev->name));
   1169		}
   1170	} else {
   1171did_not_correct_error:
   1172		spin_lock(&sctx->stat_lock);
   1173		sctx->stat.uncorrectable_errors++;
   1174		spin_unlock(&sctx->stat_lock);
   1175		btrfs_err_rl_in_rcu(fs_info,
   1176			"unable to fixup (regular) error at logical %llu on dev %s",
   1177			logical, rcu_str_deref(dev->name));
   1178	}
   1179
   1180out:
   1181	if (sblocks_for_recheck) {
   1182		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
   1183		     mirror_index++) {
   1184			struct scrub_block *sblock = sblocks_for_recheck +
   1185						     mirror_index;
   1186			struct scrub_recover *recover;
   1187			int i;
   1188
   1189			for (i = 0; i < sblock->sector_count; i++) {
   1190				sblock->sectors[i]->sblock = NULL;
   1191				recover = sblock->sectors[i]->recover;
   1192				if (recover) {
   1193					scrub_put_recover(fs_info, recover);
   1194					sblock->sectors[i]->recover = NULL;
   1195				}
   1196				scrub_sector_put(sblock->sectors[i]);
   1197			}
   1198		}
   1199		kfree(sblocks_for_recheck);
   1200	}
   1201
   1202	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
   1203	memalloc_nofs_restore(nofs_flag);
   1204	if (ret < 0)
   1205		return ret;
   1206	return 0;
   1207}
   1208
   1209static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
   1210{
   1211	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
   1212		return 2;
   1213	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
   1214		return 3;
   1215	else
   1216		return (int)bioc->num_stripes;
   1217}
   1218
   1219static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
   1220						 u64 *raid_map,
   1221						 u64 mapped_length,
   1222						 int nstripes, int mirror,
   1223						 int *stripe_index,
   1224						 u64 *stripe_offset)
   1225{
   1226	int i;
   1227
   1228	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
   1229		/* RAID5/6 */
   1230		for (i = 0; i < nstripes; i++) {
   1231			if (raid_map[i] == RAID6_Q_STRIPE ||
   1232			    raid_map[i] == RAID5_P_STRIPE)
   1233				continue;
   1234
   1235			if (logical >= raid_map[i] &&
   1236			    logical < raid_map[i] + mapped_length)
   1237				break;
   1238		}
   1239
   1240		*stripe_index = i;
   1241		*stripe_offset = logical - raid_map[i];
   1242	} else {
   1243		/* The other RAID type */
   1244		*stripe_index = mirror;
   1245		*stripe_offset = 0;
   1246	}
   1247}
   1248
   1249static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
   1250				     struct scrub_block *sblocks_for_recheck)
   1251{
   1252	struct scrub_ctx *sctx = original_sblock->sctx;
   1253	struct btrfs_fs_info *fs_info = sctx->fs_info;
   1254	u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
   1255	u64 logical = original_sblock->sectors[0]->logical;
   1256	u64 generation = original_sblock->sectors[0]->generation;
   1257	u64 flags = original_sblock->sectors[0]->flags;
   1258	u64 have_csum = original_sblock->sectors[0]->have_csum;
   1259	struct scrub_recover *recover;
   1260	struct btrfs_io_context *bioc;
   1261	u64 sublen;
   1262	u64 mapped_length;
   1263	u64 stripe_offset;
   1264	int stripe_index;
   1265	int sector_index = 0;
   1266	int mirror_index;
   1267	int nmirrors;
   1268	int ret;
   1269
   1270	/*
   1271	 * Note: the two members refs and outstanding_sectors are not used (and
   1272	 * not set) in the blocks that are used for the recheck procedure.
   1273	 */
   1274
   1275	while (length > 0) {
   1276		sublen = min_t(u64, length, fs_info->sectorsize);
   1277		mapped_length = sublen;
   1278		bioc = NULL;
   1279
   1280		/*
   1281		 * With a length of sectorsize, each returned stripe represents
   1282		 * one mirror
   1283		 */
   1284		btrfs_bio_counter_inc_blocked(fs_info);
   1285		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
   1286				       logical, &mapped_length, &bioc);
   1287		if (ret || !bioc || mapped_length < sublen) {
   1288			btrfs_put_bioc(bioc);
   1289			btrfs_bio_counter_dec(fs_info);
   1290			return -EIO;
   1291		}
   1292
   1293		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
   1294		if (!recover) {
   1295			btrfs_put_bioc(bioc);
   1296			btrfs_bio_counter_dec(fs_info);
   1297			return -ENOMEM;
   1298		}
   1299
   1300		refcount_set(&recover->refs, 1);
   1301		recover->bioc = bioc;
   1302		recover->map_length = mapped_length;
   1303
   1304		ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
   1305
   1306		nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
   1307
   1308		for (mirror_index = 0; mirror_index < nmirrors;
   1309		     mirror_index++) {
   1310			struct scrub_block *sblock;
   1311			struct scrub_sector *sector;
   1312
   1313			sblock = sblocks_for_recheck + mirror_index;
   1314			sblock->sctx = sctx;
   1315
   1316			sector = kzalloc(sizeof(*sector), GFP_NOFS);
   1317			if (!sector) {
   1318leave_nomem:
   1319				spin_lock(&sctx->stat_lock);
   1320				sctx->stat.malloc_errors++;
   1321				spin_unlock(&sctx->stat_lock);
   1322				scrub_put_recover(fs_info, recover);
   1323				return -ENOMEM;
   1324			}
   1325			scrub_sector_get(sector);
   1326			sblock->sectors[sector_index] = sector;
   1327			sector->sblock = sblock;
   1328			sector->flags = flags;
   1329			sector->generation = generation;
   1330			sector->logical = logical;
   1331			sector->have_csum = have_csum;
   1332			if (have_csum)
   1333				memcpy(sector->csum,
   1334				       original_sblock->sectors[0]->csum,
   1335				       sctx->fs_info->csum_size);
   1336
   1337			scrub_stripe_index_and_offset(logical,
   1338						      bioc->map_type,
   1339						      bioc->raid_map,
   1340						      mapped_length,
   1341						      bioc->num_stripes -
   1342						      bioc->num_tgtdevs,
   1343						      mirror_index,
   1344						      &stripe_index,
   1345						      &stripe_offset);
   1346			sector->physical = bioc->stripes[stripe_index].physical +
   1347					 stripe_offset;
   1348			sector->dev = bioc->stripes[stripe_index].dev;
   1349
   1350			BUG_ON(sector_index >= original_sblock->sector_count);
   1351			sector->physical_for_dev_replace =
   1352				original_sblock->sectors[sector_index]->
   1353				physical_for_dev_replace;
   1354			/* For missing devices, dev->bdev is NULL */
   1355			sector->mirror_num = mirror_index + 1;
   1356			sblock->sector_count++;
   1357			sector->page = alloc_page(GFP_NOFS);
   1358			if (!sector->page)
   1359				goto leave_nomem;
   1360
   1361			scrub_get_recover(recover);
   1362			sector->recover = recover;
   1363		}
   1364		scrub_put_recover(fs_info, recover);
   1365		length -= sublen;
   1366		logical += sublen;
   1367		sector_index++;
   1368	}
   1369
   1370	return 0;
   1371}
   1372
   1373static void scrub_bio_wait_endio(struct bio *bio)
   1374{
   1375	complete(bio->bi_private);
   1376}
   1377
   1378static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
   1379					struct bio *bio,
   1380					struct scrub_sector *sector)
   1381{
   1382	DECLARE_COMPLETION_ONSTACK(done);
   1383	int ret;
   1384	int mirror_num;
   1385
   1386	bio->bi_iter.bi_sector = sector->logical >> 9;
   1387	bio->bi_private = &done;
   1388	bio->bi_end_io = scrub_bio_wait_endio;
   1389
   1390	mirror_num = sector->sblock->sectors[0]->mirror_num;
   1391	ret = raid56_parity_recover(bio, sector->recover->bioc,
   1392				    sector->recover->map_length,
   1393				    mirror_num, 0);
   1394	if (ret)
   1395		return ret;
   1396
   1397	wait_for_completion_io(&done);
   1398	return blk_status_to_errno(bio->bi_status);
   1399}
   1400
   1401static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
   1402					  struct scrub_block *sblock)
   1403{
   1404	struct scrub_sector *first_sector = sblock->sectors[0];
   1405	struct bio *bio;
   1406	int i;
   1407
   1408	/* All sectors in sblock belong to the same stripe on the same device. */
   1409	ASSERT(first_sector->dev);
   1410	if (!first_sector->dev->bdev)
   1411		goto out;
   1412
   1413	bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
   1414
   1415	for (i = 0; i < sblock->sector_count; i++) {
   1416		struct scrub_sector *sector = sblock->sectors[i];
   1417
   1418		WARN_ON(!sector->page);
   1419		bio_add_page(bio, sector->page, PAGE_SIZE, 0);
   1420	}
   1421
   1422	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
   1423		bio_put(bio);
   1424		goto out;
   1425	}
   1426
   1427	bio_put(bio);
   1428
   1429	scrub_recheck_block_checksum(sblock);
   1430
   1431	return;
   1432out:
   1433	for (i = 0; i < sblock->sector_count; i++)
   1434		sblock->sectors[i]->io_error = 1;
   1435
   1436	sblock->no_io_error_seen = 0;
   1437}
   1438
   1439/*
   1440 * This function will check the on disk data for checksum errors, header errors
   1441 * and read I/O errors. If any I/O errors happen, the exact sectors which are
   1442 * errored are marked as being bad. The goal is to enable scrub to take those
   1443 * sectors that are not errored from all the mirrors so that the sectors that
   1444 * are errored in the just handled mirror can be repaired.
   1445 */
   1446static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
   1447				struct scrub_block *sblock,
   1448				int retry_failed_mirror)
   1449{
   1450	int i;
   1451
   1452	sblock->no_io_error_seen = 1;
   1453
   1454	/* short cut for raid56 */
   1455	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
   1456		return scrub_recheck_block_on_raid56(fs_info, sblock);
   1457
   1458	for (i = 0; i < sblock->sector_count; i++) {
   1459		struct scrub_sector *sector = sblock->sectors[i];
   1460		struct bio bio;
   1461		struct bio_vec bvec;
   1462
   1463		if (sector->dev->bdev == NULL) {
   1464			sector->io_error = 1;
   1465			sblock->no_io_error_seen = 0;
   1466			continue;
   1467		}
   1468
   1469		WARN_ON(!sector->page);
   1470		bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
   1471		bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
   1472		bio.bi_iter.bi_sector = sector->physical >> 9;
   1473
   1474		btrfsic_check_bio(&bio);
   1475		if (submit_bio_wait(&bio)) {
   1476			sector->io_error = 1;
   1477			sblock->no_io_error_seen = 0;
   1478		}
   1479
   1480		bio_uninit(&bio);
   1481	}
   1482
   1483	if (sblock->no_io_error_seen)
   1484		scrub_recheck_block_checksum(sblock);
   1485}
   1486
   1487static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
   1488{
   1489	struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
   1490	int ret;
   1491
   1492	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
   1493	return !ret;
   1494}
   1495
   1496static void scrub_recheck_block_checksum(struct scrub_block *sblock)
   1497{
   1498	sblock->header_error = 0;
   1499	sblock->checksum_error = 0;
   1500	sblock->generation_error = 0;
   1501
   1502	if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
   1503		scrub_checksum_data(sblock);
   1504	else
   1505		scrub_checksum_tree_block(sblock);
   1506}
   1507
   1508static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
   1509					     struct scrub_block *sblock_good)
   1510{
   1511	int i;
   1512	int ret = 0;
   1513
   1514	for (i = 0; i < sblock_bad->sector_count; i++) {
   1515		int ret_sub;
   1516
   1517		ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
   1518							     sblock_good, i, 1);
   1519		if (ret_sub)
   1520			ret = ret_sub;
   1521	}
   1522
   1523	return ret;
   1524}
   1525
   1526static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
   1527					      struct scrub_block *sblock_good,
   1528					      int sector_num, int force_write)
   1529{
   1530	struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
   1531	struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
   1532	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
   1533	const u32 sectorsize = fs_info->sectorsize;
   1534
   1535	BUG_ON(sector_bad->page == NULL);
   1536	BUG_ON(sector_good->page == NULL);
   1537	if (force_write || sblock_bad->header_error ||
   1538	    sblock_bad->checksum_error || sector_bad->io_error) {
   1539		struct bio bio;
   1540		struct bio_vec bvec;
   1541		int ret;
   1542
   1543		if (!sector_bad->dev->bdev) {
   1544			btrfs_warn_rl(fs_info,
   1545				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
   1546			return -EIO;
   1547		}
   1548
   1549		bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
   1550		bio.bi_iter.bi_sector = sector_bad->physical >> 9;
   1551		__bio_add_page(&bio, sector_good->page, sectorsize, 0);
   1552
   1553		btrfsic_check_bio(&bio);
   1554		ret = submit_bio_wait(&bio);
   1555		bio_uninit(&bio);
   1556
   1557		if (ret) {
   1558			btrfs_dev_stat_inc_and_print(sector_bad->dev,
   1559				BTRFS_DEV_STAT_WRITE_ERRS);
   1560			atomic64_inc(&fs_info->dev_replace.num_write_errors);
   1561			return -EIO;
   1562		}
   1563	}
   1564
   1565	return 0;
   1566}
   1567
   1568static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
   1569{
   1570	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
   1571	int i;
   1572
   1573	/*
   1574	 * This block is used for the check of the parity on the source device,
   1575	 * so the data needn't be written into the destination device.
   1576	 */
   1577	if (sblock->sparity)
   1578		return;
   1579
   1580	for (i = 0; i < sblock->sector_count; i++) {
   1581		int ret;
   1582
   1583		ret = scrub_write_sector_to_dev_replace(sblock, i);
   1584		if (ret)
   1585			atomic64_inc(&fs_info->dev_replace.num_write_errors);
   1586	}
   1587}
   1588
   1589static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
   1590{
   1591	struct scrub_sector *sector = sblock->sectors[sector_num];
   1592
   1593	BUG_ON(sector->page == NULL);
   1594	if (sector->io_error)
   1595		clear_page(page_address(sector->page));
   1596
   1597	return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
   1598}
   1599
   1600static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
   1601{
   1602	int ret = 0;
   1603	u64 length;
   1604
   1605	if (!btrfs_is_zoned(sctx->fs_info))
   1606		return 0;
   1607
   1608	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
   1609		return 0;
   1610
   1611	if (sctx->write_pointer < physical) {
   1612		length = physical - sctx->write_pointer;
   1613
   1614		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
   1615						sctx->write_pointer, length);
   1616		if (!ret)
   1617			sctx->write_pointer = physical;
   1618	}
   1619	return ret;
   1620}
   1621
   1622static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
   1623				      struct scrub_sector *sector)
   1624{
   1625	struct scrub_bio *sbio;
   1626	int ret;
   1627	const u32 sectorsize = sctx->fs_info->sectorsize;
   1628
   1629	mutex_lock(&sctx->wr_lock);
   1630again:
   1631	if (!sctx->wr_curr_bio) {
   1632		sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
   1633					      GFP_KERNEL);
   1634		if (!sctx->wr_curr_bio) {
   1635			mutex_unlock(&sctx->wr_lock);
   1636			return -ENOMEM;
   1637		}
   1638		sctx->wr_curr_bio->sctx = sctx;
   1639		sctx->wr_curr_bio->sector_count = 0;
   1640	}
   1641	sbio = sctx->wr_curr_bio;
   1642	if (sbio->sector_count == 0) {
   1643		ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
   1644		if (ret) {
   1645			mutex_unlock(&sctx->wr_lock);
   1646			return ret;
   1647		}
   1648
   1649		sbio->physical = sector->physical_for_dev_replace;
   1650		sbio->logical = sector->logical;
   1651		sbio->dev = sctx->wr_tgtdev;
   1652		if (!sbio->bio) {
   1653			sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
   1654					      REQ_OP_WRITE, GFP_NOFS);
   1655		}
   1656		sbio->bio->bi_private = sbio;
   1657		sbio->bio->bi_end_io = scrub_wr_bio_end_io;
   1658		sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
   1659		sbio->status = 0;
   1660	} else if (sbio->physical + sbio->sector_count * sectorsize !=
   1661		   sector->physical_for_dev_replace ||
   1662		   sbio->logical + sbio->sector_count * sectorsize !=
   1663		   sector->logical) {
   1664		scrub_wr_submit(sctx);
   1665		goto again;
   1666	}
   1667
   1668	ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
   1669	if (ret != sectorsize) {
   1670		if (sbio->sector_count < 1) {
   1671			bio_put(sbio->bio);
   1672			sbio->bio = NULL;
   1673			mutex_unlock(&sctx->wr_lock);
   1674			return -EIO;
   1675		}
   1676		scrub_wr_submit(sctx);
   1677		goto again;
   1678	}
   1679
   1680	sbio->sectors[sbio->sector_count] = sector;
   1681	scrub_sector_get(sector);
   1682	sbio->sector_count++;
   1683	if (sbio->sector_count == sctx->sectors_per_bio)
   1684		scrub_wr_submit(sctx);
   1685	mutex_unlock(&sctx->wr_lock);
   1686
   1687	return 0;
   1688}
   1689
   1690static void scrub_wr_submit(struct scrub_ctx *sctx)
   1691{
   1692	struct scrub_bio *sbio;
   1693
   1694	if (!sctx->wr_curr_bio)
   1695		return;
   1696
   1697	sbio = sctx->wr_curr_bio;
   1698	sctx->wr_curr_bio = NULL;
   1699	scrub_pending_bio_inc(sctx);
   1700	/* process all writes in a single worker thread. Then the block layer
   1701	 * orders the requests before sending them to the driver which
   1702	 * doubled the write performance on spinning disks when measured
   1703	 * with Linux 3.5 */
   1704	btrfsic_check_bio(sbio->bio);
   1705	submit_bio(sbio->bio);
   1706
   1707	if (btrfs_is_zoned(sctx->fs_info))
   1708		sctx->write_pointer = sbio->physical + sbio->sector_count *
   1709			sctx->fs_info->sectorsize;
   1710}
   1711
   1712static void scrub_wr_bio_end_io(struct bio *bio)
   1713{
   1714	struct scrub_bio *sbio = bio->bi_private;
   1715	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
   1716
   1717	sbio->status = bio->bi_status;
   1718	sbio->bio = bio;
   1719
   1720	INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
   1721	queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
   1722}
   1723
   1724static void scrub_wr_bio_end_io_worker(struct work_struct *work)
   1725{
   1726	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
   1727	struct scrub_ctx *sctx = sbio->sctx;
   1728	int i;
   1729
   1730	ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
   1731	if (sbio->status) {
   1732		struct btrfs_dev_replace *dev_replace =
   1733			&sbio->sctx->fs_info->dev_replace;
   1734
   1735		for (i = 0; i < sbio->sector_count; i++) {
   1736			struct scrub_sector *sector = sbio->sectors[i];
   1737
   1738			sector->io_error = 1;
   1739			atomic64_inc(&dev_replace->num_write_errors);
   1740		}
   1741	}
   1742
   1743	for (i = 0; i < sbio->sector_count; i++)
   1744		scrub_sector_put(sbio->sectors[i]);
   1745
   1746	bio_put(sbio->bio);
   1747	kfree(sbio);
   1748	scrub_pending_bio_dec(sctx);
   1749}
   1750
   1751static int scrub_checksum(struct scrub_block *sblock)
   1752{
   1753	u64 flags;
   1754	int ret;
   1755
   1756	/*
   1757	 * No need to initialize these stats currently,
   1758	 * because this function only use return value
   1759	 * instead of these stats value.
   1760	 *
   1761	 * Todo:
   1762	 * always use stats
   1763	 */
   1764	sblock->header_error = 0;
   1765	sblock->generation_error = 0;
   1766	sblock->checksum_error = 0;
   1767
   1768	WARN_ON(sblock->sector_count < 1);
   1769	flags = sblock->sectors[0]->flags;
   1770	ret = 0;
   1771	if (flags & BTRFS_EXTENT_FLAG_DATA)
   1772		ret = scrub_checksum_data(sblock);
   1773	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
   1774		ret = scrub_checksum_tree_block(sblock);
   1775	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
   1776		(void)scrub_checksum_super(sblock);
   1777	else
   1778		WARN_ON(1);
   1779	if (ret)
   1780		scrub_handle_errored_block(sblock);
   1781
   1782	return ret;
   1783}
   1784
   1785static int scrub_checksum_data(struct scrub_block *sblock)
   1786{
   1787	struct scrub_ctx *sctx = sblock->sctx;
   1788	struct btrfs_fs_info *fs_info = sctx->fs_info;
   1789	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
   1790	u8 csum[BTRFS_CSUM_SIZE];
   1791	struct scrub_sector *sector;
   1792	char *kaddr;
   1793
   1794	BUG_ON(sblock->sector_count < 1);
   1795	sector = sblock->sectors[0];
   1796	if (!sector->have_csum)
   1797		return 0;
   1798
   1799	kaddr = page_address(sector->page);
   1800
   1801	shash->tfm = fs_info->csum_shash;
   1802	crypto_shash_init(shash);
   1803
   1804	/*
   1805	 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
   1806	 * only contains one sector of data.
   1807	 */
   1808	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
   1809
   1810	if (memcmp(csum, sector->csum, fs_info->csum_size))
   1811		sblock->checksum_error = 1;
   1812	return sblock->checksum_error;
   1813}
   1814
   1815static int scrub_checksum_tree_block(struct scrub_block *sblock)
   1816{
   1817	struct scrub_ctx *sctx = sblock->sctx;
   1818	struct btrfs_header *h;
   1819	struct btrfs_fs_info *fs_info = sctx->fs_info;
   1820	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
   1821	u8 calculated_csum[BTRFS_CSUM_SIZE];
   1822	u8 on_disk_csum[BTRFS_CSUM_SIZE];
   1823	/*
   1824	 * This is done in sectorsize steps even for metadata as there's a
   1825	 * constraint for nodesize to be aligned to sectorsize. This will need
   1826	 * to change so we don't misuse data and metadata units like that.
   1827	 */
   1828	const u32 sectorsize = sctx->fs_info->sectorsize;
   1829	const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
   1830	int i;
   1831	struct scrub_sector *sector;
   1832	char *kaddr;
   1833
   1834	BUG_ON(sblock->sector_count < 1);
   1835
   1836	/* Each member in sectors is just one sector */
   1837	ASSERT(sblock->sector_count == num_sectors);
   1838
   1839	sector = sblock->sectors[0];
   1840	kaddr = page_address(sector->page);
   1841	h = (struct btrfs_header *)kaddr;
   1842	memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
   1843
   1844	/*
   1845	 * we don't use the getter functions here, as we
   1846	 * a) don't have an extent buffer and
   1847	 * b) the page is already kmapped
   1848	 */
   1849	if (sector->logical != btrfs_stack_header_bytenr(h))
   1850		sblock->header_error = 1;
   1851
   1852	if (sector->generation != btrfs_stack_header_generation(h)) {
   1853		sblock->header_error = 1;
   1854		sblock->generation_error = 1;
   1855	}
   1856
   1857	if (!scrub_check_fsid(h->fsid, sector))
   1858		sblock->header_error = 1;
   1859
   1860	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
   1861		   BTRFS_UUID_SIZE))
   1862		sblock->header_error = 1;
   1863
   1864	shash->tfm = fs_info->csum_shash;
   1865	crypto_shash_init(shash);
   1866	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
   1867			    sectorsize - BTRFS_CSUM_SIZE);
   1868
   1869	for (i = 1; i < num_sectors; i++) {
   1870		kaddr = page_address(sblock->sectors[i]->page);
   1871		crypto_shash_update(shash, kaddr, sectorsize);
   1872	}
   1873
   1874	crypto_shash_final(shash, calculated_csum);
   1875	if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
   1876		sblock->checksum_error = 1;
   1877
   1878	return sblock->header_error || sblock->checksum_error;
   1879}
   1880
   1881static int scrub_checksum_super(struct scrub_block *sblock)
   1882{
   1883	struct btrfs_super_block *s;
   1884	struct scrub_ctx *sctx = sblock->sctx;
   1885	struct btrfs_fs_info *fs_info = sctx->fs_info;
   1886	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
   1887	u8 calculated_csum[BTRFS_CSUM_SIZE];
   1888	struct scrub_sector *sector;
   1889	char *kaddr;
   1890	int fail_gen = 0;
   1891	int fail_cor = 0;
   1892
   1893	BUG_ON(sblock->sector_count < 1);
   1894	sector = sblock->sectors[0];
   1895	kaddr = page_address(sector->page);
   1896	s = (struct btrfs_super_block *)kaddr;
   1897
   1898	if (sector->logical != btrfs_super_bytenr(s))
   1899		++fail_cor;
   1900
   1901	if (sector->generation != btrfs_super_generation(s))
   1902		++fail_gen;
   1903
   1904	if (!scrub_check_fsid(s->fsid, sector))
   1905		++fail_cor;
   1906
   1907	shash->tfm = fs_info->csum_shash;
   1908	crypto_shash_init(shash);
   1909	crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
   1910			BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
   1911
   1912	if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
   1913		++fail_cor;
   1914
   1915	if (fail_cor + fail_gen) {
   1916		/*
   1917		 * if we find an error in a super block, we just report it.
   1918		 * They will get written with the next transaction commit
   1919		 * anyway
   1920		 */
   1921		spin_lock(&sctx->stat_lock);
   1922		++sctx->stat.super_errors;
   1923		spin_unlock(&sctx->stat_lock);
   1924		if (fail_cor)
   1925			btrfs_dev_stat_inc_and_print(sector->dev,
   1926				BTRFS_DEV_STAT_CORRUPTION_ERRS);
   1927		else
   1928			btrfs_dev_stat_inc_and_print(sector->dev,
   1929				BTRFS_DEV_STAT_GENERATION_ERRS);
   1930	}
   1931
   1932	return fail_cor + fail_gen;
   1933}
   1934
   1935static void scrub_block_get(struct scrub_block *sblock)
   1936{
   1937	refcount_inc(&sblock->refs);
   1938}
   1939
   1940static void scrub_block_put(struct scrub_block *sblock)
   1941{
   1942	if (refcount_dec_and_test(&sblock->refs)) {
   1943		int i;
   1944
   1945		if (sblock->sparity)
   1946			scrub_parity_put(sblock->sparity);
   1947
   1948		for (i = 0; i < sblock->sector_count; i++)
   1949			scrub_sector_put(sblock->sectors[i]);
   1950		kfree(sblock);
   1951	}
   1952}
   1953
   1954static void scrub_sector_get(struct scrub_sector *sector)
   1955{
   1956	atomic_inc(&sector->refs);
   1957}
   1958
   1959static void scrub_sector_put(struct scrub_sector *sector)
   1960{
   1961	if (atomic_dec_and_test(&sector->refs)) {
   1962		if (sector->page)
   1963			__free_page(sector->page);
   1964		kfree(sector);
   1965	}
   1966}
   1967
   1968/*
   1969 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
   1970 * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
   1971 */
   1972static void scrub_throttle(struct scrub_ctx *sctx)
   1973{
   1974	const int time_slice = 1000;
   1975	struct scrub_bio *sbio;
   1976	struct btrfs_device *device;
   1977	s64 delta;
   1978	ktime_t now;
   1979	u32 div;
   1980	u64 bwlimit;
   1981
   1982	sbio = sctx->bios[sctx->curr];
   1983	device = sbio->dev;
   1984	bwlimit = READ_ONCE(device->scrub_speed_max);
   1985	if (bwlimit == 0)
   1986		return;
   1987
   1988	/*
   1989	 * Slice is divided into intervals when the IO is submitted, adjust by
   1990	 * bwlimit and maximum of 64 intervals.
   1991	 */
   1992	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
   1993	div = min_t(u32, 64, div);
   1994
   1995	/* Start new epoch, set deadline */
   1996	now = ktime_get();
   1997	if (sctx->throttle_deadline == 0) {
   1998		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
   1999		sctx->throttle_sent = 0;
   2000	}
   2001
   2002	/* Still in the time to send? */
   2003	if (ktime_before(now, sctx->throttle_deadline)) {
   2004		/* If current bio is within the limit, send it */
   2005		sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
   2006		if (sctx->throttle_sent <= div_u64(bwlimit, div))
   2007			return;
   2008
   2009		/* We're over the limit, sleep until the rest of the slice */
   2010		delta = ktime_ms_delta(sctx->throttle_deadline, now);
   2011	} else {
   2012		/* New request after deadline, start new epoch */
   2013		delta = 0;
   2014	}
   2015
   2016	if (delta) {
   2017		long timeout;
   2018
   2019		timeout = div_u64(delta * HZ, 1000);
   2020		schedule_timeout_interruptible(timeout);
   2021	}
   2022
   2023	/* Next call will start the deadline period */
   2024	sctx->throttle_deadline = 0;
   2025}
   2026
   2027static void scrub_submit(struct scrub_ctx *sctx)
   2028{
   2029	struct scrub_bio *sbio;
   2030
   2031	if (sctx->curr == -1)
   2032		return;
   2033
   2034	scrub_throttle(sctx);
   2035
   2036	sbio = sctx->bios[sctx->curr];
   2037	sctx->curr = -1;
   2038	scrub_pending_bio_inc(sctx);
   2039	btrfsic_check_bio(sbio->bio);
   2040	submit_bio(sbio->bio);
   2041}
   2042
   2043static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
   2044				      struct scrub_sector *sector)
   2045{
   2046	struct scrub_block *sblock = sector->sblock;
   2047	struct scrub_bio *sbio;
   2048	const u32 sectorsize = sctx->fs_info->sectorsize;
   2049	int ret;
   2050
   2051again:
   2052	/*
   2053	 * grab a fresh bio or wait for one to become available
   2054	 */
   2055	while (sctx->curr == -1) {
   2056		spin_lock(&sctx->list_lock);
   2057		sctx->curr = sctx->first_free;
   2058		if (sctx->curr != -1) {
   2059			sctx->first_free = sctx->bios[sctx->curr]->next_free;
   2060			sctx->bios[sctx->curr]->next_free = -1;
   2061			sctx->bios[sctx->curr]->sector_count = 0;
   2062			spin_unlock(&sctx->list_lock);
   2063		} else {
   2064			spin_unlock(&sctx->list_lock);
   2065			wait_event(sctx->list_wait, sctx->first_free != -1);
   2066		}
   2067	}
   2068	sbio = sctx->bios[sctx->curr];
   2069	if (sbio->sector_count == 0) {
   2070		sbio->physical = sector->physical;
   2071		sbio->logical = sector->logical;
   2072		sbio->dev = sector->dev;
   2073		if (!sbio->bio) {
   2074			sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
   2075					      REQ_OP_READ, GFP_NOFS);
   2076		}
   2077		sbio->bio->bi_private = sbio;
   2078		sbio->bio->bi_end_io = scrub_bio_end_io;
   2079		sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
   2080		sbio->status = 0;
   2081	} else if (sbio->physical + sbio->sector_count * sectorsize !=
   2082		   sector->physical ||
   2083		   sbio->logical + sbio->sector_count * sectorsize !=
   2084		   sector->logical ||
   2085		   sbio->dev != sector->dev) {
   2086		scrub_submit(sctx);
   2087		goto again;
   2088	}
   2089
   2090	sbio->sectors[sbio->sector_count] = sector;
   2091	ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
   2092	if (ret != sectorsize) {
   2093		if (sbio->sector_count < 1) {
   2094			bio_put(sbio->bio);
   2095			sbio->bio = NULL;
   2096			return -EIO;
   2097		}
   2098		scrub_submit(sctx);
   2099		goto again;
   2100	}
   2101
   2102	scrub_block_get(sblock); /* one for the page added to the bio */
   2103	atomic_inc(&sblock->outstanding_sectors);
   2104	sbio->sector_count++;
   2105	if (sbio->sector_count == sctx->sectors_per_bio)
   2106		scrub_submit(sctx);
   2107
   2108	return 0;
   2109}
   2110
   2111static void scrub_missing_raid56_end_io(struct bio *bio)
   2112{
   2113	struct scrub_block *sblock = bio->bi_private;
   2114	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
   2115
   2116	if (bio->bi_status)
   2117		sblock->no_io_error_seen = 0;
   2118
   2119	bio_put(bio);
   2120
   2121	queue_work(fs_info->scrub_workers, &sblock->work);
   2122}
   2123
   2124static void scrub_missing_raid56_worker(struct work_struct *work)
   2125{
   2126	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
   2127	struct scrub_ctx *sctx = sblock->sctx;
   2128	struct btrfs_fs_info *fs_info = sctx->fs_info;
   2129	u64 logical;
   2130	struct btrfs_device *dev;
   2131
   2132	logical = sblock->sectors[0]->logical;
   2133	dev = sblock->sectors[0]->dev;
   2134
   2135	if (sblock->no_io_error_seen)
   2136		scrub_recheck_block_checksum(sblock);
   2137
   2138	if (!sblock->no_io_error_seen) {
   2139		spin_lock(&sctx->stat_lock);
   2140		sctx->stat.read_errors++;
   2141		spin_unlock(&sctx->stat_lock);
   2142		btrfs_err_rl_in_rcu(fs_info,
   2143			"IO error rebuilding logical %llu for dev %s",
   2144			logical, rcu_str_deref(dev->name));
   2145	} else if (sblock->header_error || sblock->checksum_error) {
   2146		spin_lock(&sctx->stat_lock);
   2147		sctx->stat.uncorrectable_errors++;
   2148		spin_unlock(&sctx->stat_lock);
   2149		btrfs_err_rl_in_rcu(fs_info,
   2150			"failed to rebuild valid logical %llu for dev %s",
   2151			logical, rcu_str_deref(dev->name));
   2152	} else {
   2153		scrub_write_block_to_dev_replace(sblock);
   2154	}
   2155
   2156	if (sctx->is_dev_replace && sctx->flush_all_writes) {
   2157		mutex_lock(&sctx->wr_lock);
   2158		scrub_wr_submit(sctx);
   2159		mutex_unlock(&sctx->wr_lock);
   2160	}
   2161
   2162	scrub_block_put(sblock);
   2163	scrub_pending_bio_dec(sctx);
   2164}
   2165
   2166static void scrub_missing_raid56_pages(struct scrub_block *sblock)
   2167{
   2168	struct scrub_ctx *sctx = sblock->sctx;
   2169	struct btrfs_fs_info *fs_info = sctx->fs_info;
   2170	u64 length = sblock->sector_count << fs_info->sectorsize_bits;
   2171	u64 logical = sblock->sectors[0]->logical;
   2172	struct btrfs_io_context *bioc = NULL;
   2173	struct bio *bio;
   2174	struct btrfs_raid_bio *rbio;
   2175	int ret;
   2176	int i;
   2177
   2178	btrfs_bio_counter_inc_blocked(fs_info);
   2179	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
   2180			       &length, &bioc);
   2181	if (ret || !bioc || !bioc->raid_map)
   2182		goto bioc_out;
   2183
   2184	if (WARN_ON(!sctx->is_dev_replace ||
   2185		    !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
   2186		/*
   2187		 * We shouldn't be scrubbing a missing device. Even for dev
   2188		 * replace, we should only get here for RAID 5/6. We either
   2189		 * managed to mount something with no mirrors remaining or
   2190		 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
   2191		 */
   2192		goto bioc_out;
   2193	}
   2194
   2195	bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
   2196	bio->bi_iter.bi_sector = logical >> 9;
   2197	bio->bi_private = sblock;
   2198	bio->bi_end_io = scrub_missing_raid56_end_io;
   2199
   2200	rbio = raid56_alloc_missing_rbio(bio, bioc, length);
   2201	if (!rbio)
   2202		goto rbio_out;
   2203
   2204	for (i = 0; i < sblock->sector_count; i++) {
   2205		struct scrub_sector *sector = sblock->sectors[i];
   2206
   2207		/*
   2208		 * For now, our scrub is still one page per sector, so pgoff
   2209		 * is always 0.
   2210		 */
   2211		raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
   2212	}
   2213
   2214	INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
   2215	scrub_block_get(sblock);
   2216	scrub_pending_bio_inc(sctx);
   2217	raid56_submit_missing_rbio(rbio);
   2218	return;
   2219
   2220rbio_out:
   2221	bio_put(bio);
   2222bioc_out:
   2223	btrfs_bio_counter_dec(fs_info);
   2224	btrfs_put_bioc(bioc);
   2225	spin_lock(&sctx->stat_lock);
   2226	sctx->stat.malloc_errors++;
   2227	spin_unlock(&sctx->stat_lock);
   2228}
   2229
   2230static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
   2231		       u64 physical, struct btrfs_device *dev, u64 flags,
   2232		       u64 gen, int mirror_num, u8 *csum,
   2233		       u64 physical_for_dev_replace)
   2234{
   2235	struct scrub_block *sblock;
   2236	const u32 sectorsize = sctx->fs_info->sectorsize;
   2237	int index;
   2238
   2239	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
   2240	if (!sblock) {
   2241		spin_lock(&sctx->stat_lock);
   2242		sctx->stat.malloc_errors++;
   2243		spin_unlock(&sctx->stat_lock);
   2244		return -ENOMEM;
   2245	}
   2246
   2247	/* one ref inside this function, plus one for each page added to
   2248	 * a bio later on */
   2249	refcount_set(&sblock->refs, 1);
   2250	sblock->sctx = sctx;
   2251	sblock->no_io_error_seen = 1;
   2252
   2253	for (index = 0; len > 0; index++) {
   2254		struct scrub_sector *sector;
   2255		/*
   2256		 * Here we will allocate one page for one sector to scrub.
   2257		 * This is fine if PAGE_SIZE == sectorsize, but will cost
   2258		 * more memory for PAGE_SIZE > sectorsize case.
   2259		 */
   2260		u32 l = min(sectorsize, len);
   2261
   2262		sector = kzalloc(sizeof(*sector), GFP_KERNEL);
   2263		if (!sector) {
   2264leave_nomem:
   2265			spin_lock(&sctx->stat_lock);
   2266			sctx->stat.malloc_errors++;
   2267			spin_unlock(&sctx->stat_lock);
   2268			scrub_block_put(sblock);
   2269			return -ENOMEM;
   2270		}
   2271		ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
   2272		scrub_sector_get(sector);
   2273		sblock->sectors[index] = sector;
   2274		sector->sblock = sblock;
   2275		sector->dev = dev;
   2276		sector->flags = flags;
   2277		sector->generation = gen;
   2278		sector->logical = logical;
   2279		sector->physical = physical;
   2280		sector->physical_for_dev_replace = physical_for_dev_replace;
   2281		sector->mirror_num = mirror_num;
   2282		if (csum) {
   2283			sector->have_csum = 1;
   2284			memcpy(sector->csum, csum, sctx->fs_info->csum_size);
   2285		} else {
   2286			sector->have_csum = 0;
   2287		}
   2288		sblock->sector_count++;
   2289		sector->page = alloc_page(GFP_KERNEL);
   2290		if (!sector->page)
   2291			goto leave_nomem;
   2292		len -= l;
   2293		logical += l;
   2294		physical += l;
   2295		physical_for_dev_replace += l;
   2296	}
   2297
   2298	WARN_ON(sblock->sector_count == 0);
   2299	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
   2300		/*
   2301		 * This case should only be hit for RAID 5/6 device replace. See
   2302		 * the comment in scrub_missing_raid56_pages() for details.
   2303		 */
   2304		scrub_missing_raid56_pages(sblock);
   2305	} else {
   2306		for (index = 0; index < sblock->sector_count; index++) {
   2307			struct scrub_sector *sector = sblock->sectors[index];
   2308			int ret;
   2309
   2310			ret = scrub_add_sector_to_rd_bio(sctx, sector);
   2311			if (ret) {
   2312				scrub_block_put(sblock);
   2313				return ret;
   2314			}
   2315		}
   2316
   2317		if (flags & BTRFS_EXTENT_FLAG_SUPER)
   2318			scrub_submit(sctx);
   2319	}
   2320
   2321	/* last one frees, either here or in bio completion for last page */
   2322	scrub_block_put(sblock);
   2323	return 0;
   2324}
   2325
   2326static void scrub_bio_end_io(struct bio *bio)
   2327{
   2328	struct scrub_bio *sbio = bio->bi_private;
   2329	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
   2330
   2331	sbio->status = bio->bi_status;
   2332	sbio->bio = bio;
   2333
   2334	queue_work(fs_info->scrub_workers, &sbio->work);
   2335}
   2336
   2337static void scrub_bio_end_io_worker(struct work_struct *work)
   2338{
   2339	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
   2340	struct scrub_ctx *sctx = sbio->sctx;
   2341	int i;
   2342
   2343	ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
   2344	if (sbio->status) {
   2345		for (i = 0; i < sbio->sector_count; i++) {
   2346			struct scrub_sector *sector = sbio->sectors[i];
   2347
   2348			sector->io_error = 1;
   2349			sector->sblock->no_io_error_seen = 0;
   2350		}
   2351	}
   2352
   2353	/* Now complete the scrub_block items that have all pages completed */
   2354	for (i = 0; i < sbio->sector_count; i++) {
   2355		struct scrub_sector *sector = sbio->sectors[i];
   2356		struct scrub_block *sblock = sector->sblock;
   2357
   2358		if (atomic_dec_and_test(&sblock->outstanding_sectors))
   2359			scrub_block_complete(sblock);
   2360		scrub_block_put(sblock);
   2361	}
   2362
   2363	bio_put(sbio->bio);
   2364	sbio->bio = NULL;
   2365	spin_lock(&sctx->list_lock);
   2366	sbio->next_free = sctx->first_free;
   2367	sctx->first_free = sbio->index;
   2368	spin_unlock(&sctx->list_lock);
   2369
   2370	if (sctx->is_dev_replace && sctx->flush_all_writes) {
   2371		mutex_lock(&sctx->wr_lock);
   2372		scrub_wr_submit(sctx);
   2373		mutex_unlock(&sctx->wr_lock);
   2374	}
   2375
   2376	scrub_pending_bio_dec(sctx);
   2377}
   2378
   2379static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
   2380				       unsigned long *bitmap,
   2381				       u64 start, u32 len)
   2382{
   2383	u64 offset;
   2384	u32 nsectors;
   2385	u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
   2386
   2387	if (len >= sparity->stripe_len) {
   2388		bitmap_set(bitmap, 0, sparity->nsectors);
   2389		return;
   2390	}
   2391
   2392	start -= sparity->logic_start;
   2393	start = div64_u64_rem(start, sparity->stripe_len, &offset);
   2394	offset = offset >> sectorsize_bits;
   2395	nsectors = len >> sectorsize_bits;
   2396
   2397	if (offset + nsectors <= sparity->nsectors) {
   2398		bitmap_set(bitmap, offset, nsectors);
   2399		return;
   2400	}
   2401
   2402	bitmap_set(bitmap, offset, sparity->nsectors - offset);
   2403	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
   2404}
   2405
   2406static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
   2407						   u64 start, u32 len)
   2408{
   2409	__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
   2410}
   2411
   2412static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
   2413						  u64 start, u32 len)
   2414{
   2415	__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
   2416}
   2417
   2418static void scrub_block_complete(struct scrub_block *sblock)
   2419{
   2420	int corrupted = 0;
   2421
   2422	if (!sblock->no_io_error_seen) {
   2423		corrupted = 1;
   2424		scrub_handle_errored_block(sblock);
   2425	} else {
   2426		/*
   2427		 * if has checksum error, write via repair mechanism in
   2428		 * dev replace case, otherwise write here in dev replace
   2429		 * case.
   2430		 */
   2431		corrupted = scrub_checksum(sblock);
   2432		if (!corrupted && sblock->sctx->is_dev_replace)
   2433			scrub_write_block_to_dev_replace(sblock);
   2434	}
   2435
   2436	if (sblock->sparity && corrupted && !sblock->data_corrected) {
   2437		u64 start = sblock->sectors[0]->logical;
   2438		u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
   2439			  sblock->sctx->fs_info->sectorsize;
   2440
   2441		ASSERT(end - start <= U32_MAX);
   2442		scrub_parity_mark_sectors_error(sblock->sparity,
   2443						start, end - start);
   2444	}
   2445}
   2446
   2447static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
   2448{
   2449	sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
   2450	list_del(&sum->list);
   2451	kfree(sum);
   2452}
   2453
   2454/*
   2455 * Find the desired csum for range [logical, logical + sectorsize), and store
   2456 * the csum into @csum.
   2457 *
   2458 * The search source is sctx->csum_list, which is a pre-populated list
   2459 * storing bytenr ordered csum ranges.  We're responsible to cleanup any range
   2460 * that is before @logical.
   2461 *
   2462 * Return 0 if there is no csum for the range.
   2463 * Return 1 if there is csum for the range and copied to @csum.
   2464 */
   2465static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
   2466{
   2467	bool found = false;
   2468
   2469	while (!list_empty(&sctx->csum_list)) {
   2470		struct btrfs_ordered_sum *sum = NULL;
   2471		unsigned long index;
   2472		unsigned long num_sectors;
   2473
   2474		sum = list_first_entry(&sctx->csum_list,
   2475				       struct btrfs_ordered_sum, list);
   2476		/* The current csum range is beyond our range, no csum found */
   2477		if (sum->bytenr > logical)
   2478			break;
   2479
   2480		/*
   2481		 * The current sum is before our bytenr, since scrub is always
   2482		 * done in bytenr order, the csum will never be used anymore,
   2483		 * clean it up so that later calls won't bother with the range,
   2484		 * and continue search the next range.
   2485		 */
   2486		if (sum->bytenr + sum->len <= logical) {
   2487			drop_csum_range(sctx, sum);
   2488			continue;
   2489		}
   2490
   2491		/* Now the csum range covers our bytenr, copy the csum */
   2492		found = true;
   2493		index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
   2494		num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
   2495
   2496		memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
   2497		       sctx->fs_info->csum_size);
   2498
   2499		/* Cleanup the range if we're at the end of the csum range */
   2500		if (index == num_sectors - 1)
   2501			drop_csum_range(sctx, sum);
   2502		break;
   2503	}
   2504	if (!found)
   2505		return 0;
   2506	return 1;
   2507}
   2508
   2509/* scrub extent tries to collect up to 64 kB for each bio */
   2510static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
   2511			u64 logical, u32 len,
   2512			u64 physical, struct btrfs_device *dev, u64 flags,
   2513			u64 gen, int mirror_num)
   2514{
   2515	struct btrfs_device *src_dev = dev;
   2516	u64 src_physical = physical;
   2517	int src_mirror = mirror_num;
   2518	int ret;
   2519	u8 csum[BTRFS_CSUM_SIZE];
   2520	u32 blocksize;
   2521
   2522	if (flags & BTRFS_EXTENT_FLAG_DATA) {
   2523		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
   2524			blocksize = map->stripe_len;
   2525		else
   2526			blocksize = sctx->fs_info->sectorsize;
   2527		spin_lock(&sctx->stat_lock);
   2528		sctx->stat.data_extents_scrubbed++;
   2529		sctx->stat.data_bytes_scrubbed += len;
   2530		spin_unlock(&sctx->stat_lock);
   2531	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
   2532		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
   2533			blocksize = map->stripe_len;
   2534		else
   2535			blocksize = sctx->fs_info->nodesize;
   2536		spin_lock(&sctx->stat_lock);
   2537		sctx->stat.tree_extents_scrubbed++;
   2538		sctx->stat.tree_bytes_scrubbed += len;
   2539		spin_unlock(&sctx->stat_lock);
   2540	} else {
   2541		blocksize = sctx->fs_info->sectorsize;
   2542		WARN_ON(1);
   2543	}
   2544
   2545	/*
   2546	 * For dev-replace case, we can have @dev being a missing device.
   2547	 * Regular scrub will avoid its execution on missing device at all,
   2548	 * as that would trigger tons of read error.
   2549	 *
   2550	 * Reading from missing device will cause read error counts to
   2551	 * increase unnecessarily.
   2552	 * So here we change the read source to a good mirror.
   2553	 */
   2554	if (sctx->is_dev_replace && !dev->bdev)
   2555		scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
   2556				     &src_dev, &src_mirror);
   2557	while (len) {
   2558		u32 l = min(len, blocksize);
   2559		int have_csum = 0;
   2560
   2561		if (flags & BTRFS_EXTENT_FLAG_DATA) {
   2562			/* push csums to sbio */
   2563			have_csum = scrub_find_csum(sctx, logical, csum);
   2564			if (have_csum == 0)
   2565				++sctx->stat.no_csum;
   2566		}
   2567		ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
   2568				    flags, gen, src_mirror,
   2569				    have_csum ? csum : NULL, physical);
   2570		if (ret)
   2571			return ret;
   2572		len -= l;
   2573		logical += l;
   2574		physical += l;
   2575		src_physical += l;
   2576	}
   2577	return 0;
   2578}
   2579
   2580static int scrub_sectors_for_parity(struct scrub_parity *sparity,
   2581				  u64 logical, u32 len,
   2582				  u64 physical, struct btrfs_device *dev,
   2583				  u64 flags, u64 gen, int mirror_num, u8 *csum)
   2584{
   2585	struct scrub_ctx *sctx = sparity->sctx;
   2586	struct scrub_block *sblock;
   2587	const u32 sectorsize = sctx->fs_info->sectorsize;
   2588	int index;
   2589
   2590	ASSERT(IS_ALIGNED(len, sectorsize));
   2591
   2592	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
   2593	if (!sblock) {
   2594		spin_lock(&sctx->stat_lock);
   2595		sctx->stat.malloc_errors++;
   2596		spin_unlock(&sctx->stat_lock);
   2597		return -ENOMEM;
   2598	}
   2599
   2600	/* one ref inside this function, plus one for each page added to
   2601	 * a bio later on */
   2602	refcount_set(&sblock->refs, 1);
   2603	sblock->sctx = sctx;
   2604	sblock->no_io_error_seen = 1;
   2605	sblock->sparity = sparity;
   2606	scrub_parity_get(sparity);
   2607
   2608	for (index = 0; len > 0; index++) {
   2609		struct scrub_sector *sector;
   2610
   2611		sector = kzalloc(sizeof(*sector), GFP_KERNEL);
   2612		if (!sector) {
   2613leave_nomem:
   2614			spin_lock(&sctx->stat_lock);
   2615			sctx->stat.malloc_errors++;
   2616			spin_unlock(&sctx->stat_lock);
   2617			scrub_block_put(sblock);
   2618			return -ENOMEM;
   2619		}
   2620		ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
   2621		/* For scrub block */
   2622		scrub_sector_get(sector);
   2623		sblock->sectors[index] = sector;
   2624		/* For scrub parity */
   2625		scrub_sector_get(sector);
   2626		list_add_tail(&sector->list, &sparity->sectors_list);
   2627		sector->sblock = sblock;
   2628		sector->dev = dev;
   2629		sector->flags = flags;
   2630		sector->generation = gen;
   2631		sector->logical = logical;
   2632		sector->physical = physical;
   2633		sector->mirror_num = mirror_num;
   2634		if (csum) {
   2635			sector->have_csum = 1;
   2636			memcpy(sector->csum, csum, sctx->fs_info->csum_size);
   2637		} else {
   2638			sector->have_csum = 0;
   2639		}
   2640		sblock->sector_count++;
   2641		sector->page = alloc_page(GFP_KERNEL);
   2642		if (!sector->page)
   2643			goto leave_nomem;
   2644
   2645
   2646		/* Iterate over the stripe range in sectorsize steps */
   2647		len -= sectorsize;
   2648		logical += sectorsize;
   2649		physical += sectorsize;
   2650	}
   2651
   2652	WARN_ON(sblock->sector_count == 0);
   2653	for (index = 0; index < sblock->sector_count; index++) {
   2654		struct scrub_sector *sector = sblock->sectors[index];
   2655		int ret;
   2656
   2657		ret = scrub_add_sector_to_rd_bio(sctx, sector);
   2658		if (ret) {
   2659			scrub_block_put(sblock);
   2660			return ret;
   2661		}
   2662	}
   2663
   2664	/* Last one frees, either here or in bio completion for last sector */
   2665	scrub_block_put(sblock);
   2666	return 0;
   2667}
   2668
   2669static int scrub_extent_for_parity(struct scrub_parity *sparity,
   2670				   u64 logical, u32 len,
   2671				   u64 physical, struct btrfs_device *dev,
   2672				   u64 flags, u64 gen, int mirror_num)
   2673{
   2674	struct scrub_ctx *sctx = sparity->sctx;
   2675	int ret;
   2676	u8 csum[BTRFS_CSUM_SIZE];
   2677	u32 blocksize;
   2678
   2679	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
   2680		scrub_parity_mark_sectors_error(sparity, logical, len);
   2681		return 0;
   2682	}
   2683
   2684	if (flags & BTRFS_EXTENT_FLAG_DATA) {
   2685		blocksize = sparity->stripe_len;
   2686	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
   2687		blocksize = sparity->stripe_len;
   2688	} else {
   2689		blocksize = sctx->fs_info->sectorsize;
   2690		WARN_ON(1);
   2691	}
   2692
   2693	while (len) {
   2694		u32 l = min(len, blocksize);
   2695		int have_csum = 0;
   2696
   2697		if (flags & BTRFS_EXTENT_FLAG_DATA) {
   2698			/* push csums to sbio */
   2699			have_csum = scrub_find_csum(sctx, logical, csum);
   2700			if (have_csum == 0)
   2701				goto skip;
   2702		}
   2703		ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
   2704					     flags, gen, mirror_num,
   2705					     have_csum ? csum : NULL);
   2706		if (ret)
   2707			return ret;
   2708skip:
   2709		len -= l;
   2710		logical += l;
   2711		physical += l;
   2712	}
   2713	return 0;
   2714}
   2715
   2716/*
   2717 * Given a physical address, this will calculate it's
   2718 * logical offset. if this is a parity stripe, it will return
   2719 * the most left data stripe's logical offset.
   2720 *
   2721 * return 0 if it is a data stripe, 1 means parity stripe.
   2722 */
   2723static int get_raid56_logic_offset(u64 physical, int num,
   2724				   struct map_lookup *map, u64 *offset,
   2725				   u64 *stripe_start)
   2726{
   2727	int i;
   2728	int j = 0;
   2729	u64 stripe_nr;
   2730	u64 last_offset;
   2731	u32 stripe_index;
   2732	u32 rot;
   2733	const int data_stripes = nr_data_stripes(map);
   2734
   2735	last_offset = (physical - map->stripes[num].physical) * data_stripes;
   2736	if (stripe_start)
   2737		*stripe_start = last_offset;
   2738
   2739	*offset = last_offset;
   2740	for (i = 0; i < data_stripes; i++) {
   2741		*offset = last_offset + i * map->stripe_len;
   2742
   2743		stripe_nr = div64_u64(*offset, map->stripe_len);
   2744		stripe_nr = div_u64(stripe_nr, data_stripes);
   2745
   2746		/* Work out the disk rotation on this stripe-set */
   2747		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
   2748		/* calculate which stripe this data locates */
   2749		rot += i;
   2750		stripe_index = rot % map->num_stripes;
   2751		if (stripe_index == num)
   2752			return 0;
   2753		if (stripe_index < num)
   2754			j++;
   2755	}
   2756	*offset = last_offset + j * map->stripe_len;
   2757	return 1;
   2758}
   2759
   2760static void scrub_free_parity(struct scrub_parity *sparity)
   2761{
   2762	struct scrub_ctx *sctx = sparity->sctx;
   2763	struct scrub_sector *curr, *next;
   2764	int nbits;
   2765
   2766	nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
   2767	if (nbits) {
   2768		spin_lock(&sctx->stat_lock);
   2769		sctx->stat.read_errors += nbits;
   2770		sctx->stat.uncorrectable_errors += nbits;
   2771		spin_unlock(&sctx->stat_lock);
   2772	}
   2773
   2774	list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
   2775		list_del_init(&curr->list);
   2776		scrub_sector_put(curr);
   2777	}
   2778
   2779	kfree(sparity);
   2780}
   2781
   2782static void scrub_parity_bio_endio_worker(struct work_struct *work)
   2783{
   2784	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
   2785						    work);
   2786	struct scrub_ctx *sctx = sparity->sctx;
   2787
   2788	scrub_free_parity(sparity);
   2789	scrub_pending_bio_dec(sctx);
   2790}
   2791
   2792static void scrub_parity_bio_endio(struct bio *bio)
   2793{
   2794	struct scrub_parity *sparity = bio->bi_private;
   2795	struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
   2796
   2797	if (bio->bi_status)
   2798		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
   2799			  sparity->nsectors);
   2800
   2801	bio_put(bio);
   2802
   2803	INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
   2804	queue_work(fs_info->scrub_parity_workers, &sparity->work);
   2805}
   2806
   2807static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
   2808{
   2809	struct scrub_ctx *sctx = sparity->sctx;
   2810	struct btrfs_fs_info *fs_info = sctx->fs_info;
   2811	struct bio *bio;
   2812	struct btrfs_raid_bio *rbio;
   2813	struct btrfs_io_context *bioc = NULL;
   2814	u64 length;
   2815	int ret;
   2816
   2817	if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
   2818			   sparity->nsectors))
   2819		goto out;
   2820
   2821	length = sparity->logic_end - sparity->logic_start;
   2822
   2823	btrfs_bio_counter_inc_blocked(fs_info);
   2824	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
   2825			       &length, &bioc);
   2826	if (ret || !bioc || !bioc->raid_map)
   2827		goto bioc_out;
   2828
   2829	bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
   2830	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
   2831	bio->bi_private = sparity;
   2832	bio->bi_end_io = scrub_parity_bio_endio;
   2833
   2834	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
   2835					      sparity->scrub_dev,
   2836					      sparity->dbitmap,
   2837					      sparity->nsectors);
   2838	if (!rbio)
   2839		goto rbio_out;
   2840
   2841	scrub_pending_bio_inc(sctx);
   2842	raid56_parity_submit_scrub_rbio(rbio);
   2843	return;
   2844
   2845rbio_out:
   2846	bio_put(bio);
   2847bioc_out:
   2848	btrfs_bio_counter_dec(fs_info);
   2849	btrfs_put_bioc(bioc);
   2850	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
   2851		  sparity->nsectors);
   2852	spin_lock(&sctx->stat_lock);
   2853	sctx->stat.malloc_errors++;
   2854	spin_unlock(&sctx->stat_lock);
   2855out:
   2856	scrub_free_parity(sparity);
   2857}
   2858
   2859static inline int scrub_calc_parity_bitmap_len(int nsectors)
   2860{
   2861	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
   2862}
   2863
   2864static void scrub_parity_get(struct scrub_parity *sparity)
   2865{
   2866	refcount_inc(&sparity->refs);
   2867}
   2868
   2869static void scrub_parity_put(struct scrub_parity *sparity)
   2870{
   2871	if (!refcount_dec_and_test(&sparity->refs))
   2872		return;
   2873
   2874	scrub_parity_check_and_repair(sparity);
   2875}
   2876
   2877/*
   2878 * Return 0 if the extent item range covers any byte of the range.
   2879 * Return <0 if the extent item is before @search_start.
   2880 * Return >0 if the extent item is after @start_start + @search_len.
   2881 */
   2882static int compare_extent_item_range(struct btrfs_path *path,
   2883				     u64 search_start, u64 search_len)
   2884{
   2885	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
   2886	u64 len;
   2887	struct btrfs_key key;
   2888
   2889	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   2890	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
   2891	       key.type == BTRFS_METADATA_ITEM_KEY);
   2892	if (key.type == BTRFS_METADATA_ITEM_KEY)
   2893		len = fs_info->nodesize;
   2894	else
   2895		len = key.offset;
   2896
   2897	if (key.objectid + len <= search_start)
   2898		return -1;
   2899	if (key.objectid >= search_start + search_len)
   2900		return 1;
   2901	return 0;
   2902}
   2903
   2904/*
   2905 * Locate one extent item which covers any byte in range
   2906 * [@search_start, @search_start + @search_length)
   2907 *
   2908 * If the path is not initialized, we will initialize the search by doing
   2909 * a btrfs_search_slot().
   2910 * If the path is already initialized, we will use the path as the initial
   2911 * slot, to avoid duplicated btrfs_search_slot() calls.
   2912 *
   2913 * NOTE: If an extent item starts before @search_start, we will still
   2914 * return the extent item. This is for data extent crossing stripe boundary.
   2915 *
   2916 * Return 0 if we found such extent item, and @path will point to the extent item.
   2917 * Return >0 if no such extent item can be found, and @path will be released.
   2918 * Return <0 if hit fatal error, and @path will be released.
   2919 */
   2920static int find_first_extent_item(struct btrfs_root *extent_root,
   2921				  struct btrfs_path *path,
   2922				  u64 search_start, u64 search_len)
   2923{
   2924	struct btrfs_fs_info *fs_info = extent_root->fs_info;
   2925	struct btrfs_key key;
   2926	int ret;
   2927
   2928	/* Continue using the existing path */
   2929	if (path->nodes[0])
   2930		goto search_forward;
   2931
   2932	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
   2933		key.type = BTRFS_METADATA_ITEM_KEY;
   2934	else
   2935		key.type = BTRFS_EXTENT_ITEM_KEY;
   2936	key.objectid = search_start;
   2937	key.offset = (u64)-1;
   2938
   2939	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
   2940	if (ret < 0)
   2941		return ret;
   2942
   2943	ASSERT(ret > 0);
   2944	/*
   2945	 * Here we intentionally pass 0 as @min_objectid, as there could be
   2946	 * an extent item starting before @search_start.
   2947	 */
   2948	ret = btrfs_previous_extent_item(extent_root, path, 0);
   2949	if (ret < 0)
   2950		return ret;
   2951	/*
   2952	 * No matter whether we have found an extent item, the next loop will
   2953	 * properly do every check on the key.
   2954	 */
   2955search_forward:
   2956	while (true) {
   2957		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   2958		if (key.objectid >= search_start + search_len)
   2959			break;
   2960		if (key.type != BTRFS_METADATA_ITEM_KEY &&
   2961		    key.type != BTRFS_EXTENT_ITEM_KEY)
   2962			goto next;
   2963
   2964		ret = compare_extent_item_range(path, search_start, search_len);
   2965		if (ret == 0)
   2966			return ret;
   2967		if (ret > 0)
   2968			break;
   2969next:
   2970		path->slots[0]++;
   2971		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
   2972			ret = btrfs_next_leaf(extent_root, path);
   2973			if (ret) {
   2974				/* Either no more item or fatal error */
   2975				btrfs_release_path(path);
   2976				return ret;
   2977			}
   2978		}
   2979	}
   2980	btrfs_release_path(path);
   2981	return 1;
   2982}
   2983
   2984static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
   2985			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
   2986{
   2987	struct btrfs_key key;
   2988	struct btrfs_extent_item *ei;
   2989
   2990	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   2991	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
   2992	       key.type == BTRFS_EXTENT_ITEM_KEY);
   2993	*extent_start_ret = key.objectid;
   2994	if (key.type == BTRFS_METADATA_ITEM_KEY)
   2995		*size_ret = path->nodes[0]->fs_info->nodesize;
   2996	else
   2997		*size_ret = key.offset;
   2998	ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
   2999	*flags_ret = btrfs_extent_flags(path->nodes[0], ei);
   3000	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
   3001}
   3002
   3003static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
   3004				      u64 boundary_start, u64 boudary_len)
   3005{
   3006	return (extent_start < boundary_start &&
   3007		extent_start + extent_len > boundary_start) ||
   3008	       (extent_start < boundary_start + boudary_len &&
   3009		extent_start + extent_len > boundary_start + boudary_len);
   3010}
   3011
   3012static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
   3013					       struct scrub_parity *sparity,
   3014					       struct map_lookup *map,
   3015					       struct btrfs_device *sdev,
   3016					       struct btrfs_path *path,
   3017					       u64 logical)
   3018{
   3019	struct btrfs_fs_info *fs_info = sctx->fs_info;
   3020	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
   3021	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
   3022	u64 cur_logical = logical;
   3023	int ret;
   3024
   3025	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
   3026
   3027	/* Path must not be populated */
   3028	ASSERT(!path->nodes[0]);
   3029
   3030	while (cur_logical < logical + map->stripe_len) {
   3031		struct btrfs_io_context *bioc = NULL;
   3032		struct btrfs_device *extent_dev;
   3033		u64 extent_start;
   3034		u64 extent_size;
   3035		u64 mapped_length;
   3036		u64 extent_flags;
   3037		u64 extent_gen;
   3038		u64 extent_physical;
   3039		u64 extent_mirror_num;
   3040
   3041		ret = find_first_extent_item(extent_root, path, cur_logical,
   3042					     logical + map->stripe_len - cur_logical);
   3043		/* No more extent item in this data stripe */
   3044		if (ret > 0) {
   3045			ret = 0;
   3046			break;
   3047		}
   3048		if (ret < 0)
   3049			break;
   3050		get_extent_info(path, &extent_start, &extent_size, &extent_flags,
   3051				&extent_gen);
   3052
   3053		/* Metadata should not cross stripe boundaries */
   3054		if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
   3055		    does_range_cross_boundary(extent_start, extent_size,
   3056					      logical, map->stripe_len)) {
   3057			btrfs_err(fs_info,
   3058	"scrub: tree block %llu spanning stripes, ignored. logical=%llu",
   3059				  extent_start, logical);
   3060			spin_lock(&sctx->stat_lock);
   3061			sctx->stat.uncorrectable_errors++;
   3062			spin_unlock(&sctx->stat_lock);
   3063			cur_logical += extent_size;
   3064			continue;
   3065		}
   3066
   3067		/* Skip hole range which doesn't have any extent */
   3068		cur_logical = max(extent_start, cur_logical);
   3069
   3070		/* Truncate the range inside this data stripe */
   3071		extent_size = min(extent_start + extent_size,
   3072				  logical + map->stripe_len) - cur_logical;
   3073		extent_start = cur_logical;
   3074		ASSERT(extent_size <= U32_MAX);
   3075
   3076		scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
   3077
   3078		mapped_length = extent_size;
   3079		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
   3080				      &mapped_length, &bioc, 0);
   3081		if (!ret && (!bioc || mapped_length < extent_size))
   3082			ret = -EIO;
   3083		if (ret) {
   3084			btrfs_put_bioc(bioc);
   3085			scrub_parity_mark_sectors_error(sparity, extent_start,
   3086							extent_size);
   3087			break;
   3088		}
   3089		extent_physical = bioc->stripes[0].physical;
   3090		extent_mirror_num = bioc->mirror_num;
   3091		extent_dev = bioc->stripes[0].dev;
   3092		btrfs_put_bioc(bioc);
   3093
   3094		ret = btrfs_lookup_csums_range(csum_root, extent_start,
   3095					       extent_start + extent_size - 1,
   3096					       &sctx->csum_list, 1);
   3097		if (ret) {
   3098			scrub_parity_mark_sectors_error(sparity, extent_start,
   3099							extent_size);
   3100			break;
   3101		}
   3102
   3103		ret = scrub_extent_for_parity(sparity, extent_start,
   3104					      extent_size, extent_physical,
   3105					      extent_dev, extent_flags,
   3106					      extent_gen, extent_mirror_num);
   3107		scrub_free_csums(sctx);
   3108
   3109		if (ret) {
   3110			scrub_parity_mark_sectors_error(sparity, extent_start,
   3111							extent_size);
   3112			break;
   3113		}
   3114
   3115		cond_resched();
   3116		cur_logical += extent_size;
   3117	}
   3118	btrfs_release_path(path);
   3119	return ret;
   3120}
   3121
   3122static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
   3123						  struct map_lookup *map,
   3124						  struct btrfs_device *sdev,
   3125						  u64 logic_start,
   3126						  u64 logic_end)
   3127{
   3128	struct btrfs_fs_info *fs_info = sctx->fs_info;
   3129	struct btrfs_path *path;
   3130	u64 cur_logical;
   3131	int ret;
   3132	struct scrub_parity *sparity;
   3133	int nsectors;
   3134	int bitmap_len;
   3135
   3136	path = btrfs_alloc_path();
   3137	if (!path) {
   3138		spin_lock(&sctx->stat_lock);
   3139		sctx->stat.malloc_errors++;
   3140		spin_unlock(&sctx->stat_lock);
   3141		return -ENOMEM;
   3142	}
   3143	path->search_commit_root = 1;
   3144	path->skip_locking = 1;
   3145
   3146	ASSERT(map->stripe_len <= U32_MAX);
   3147	nsectors = map->stripe_len >> fs_info->sectorsize_bits;
   3148	bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
   3149	sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
   3150			  GFP_NOFS);
   3151	if (!sparity) {
   3152		spin_lock(&sctx->stat_lock);
   3153		sctx->stat.malloc_errors++;
   3154		spin_unlock(&sctx->stat_lock);
   3155		btrfs_free_path(path);
   3156		return -ENOMEM;
   3157	}
   3158
   3159	ASSERT(map->stripe_len <= U32_MAX);
   3160	sparity->stripe_len = map->stripe_len;
   3161	sparity->nsectors = nsectors;
   3162	sparity->sctx = sctx;
   3163	sparity->scrub_dev = sdev;
   3164	sparity->logic_start = logic_start;
   3165	sparity->logic_end = logic_end;
   3166	refcount_set(&sparity->refs, 1);
   3167	INIT_LIST_HEAD(&sparity->sectors_list);
   3168	sparity->dbitmap = sparity->bitmap;
   3169	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
   3170
   3171	ret = 0;
   3172	for (cur_logical = logic_start; cur_logical < logic_end;
   3173	     cur_logical += map->stripe_len) {
   3174		ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
   3175							  sdev, path, cur_logical);
   3176		if (ret < 0)
   3177			break;
   3178	}
   3179
   3180	scrub_parity_put(sparity);
   3181	scrub_submit(sctx);
   3182	mutex_lock(&sctx->wr_lock);
   3183	scrub_wr_submit(sctx);
   3184	mutex_unlock(&sctx->wr_lock);
   3185
   3186	btrfs_free_path(path);
   3187	return ret < 0 ? ret : 0;
   3188}
   3189
   3190static void sync_replace_for_zoned(struct scrub_ctx *sctx)
   3191{
   3192	if (!btrfs_is_zoned(sctx->fs_info))
   3193		return;
   3194
   3195	sctx->flush_all_writes = true;
   3196	scrub_submit(sctx);
   3197	mutex_lock(&sctx->wr_lock);
   3198	scrub_wr_submit(sctx);
   3199	mutex_unlock(&sctx->wr_lock);
   3200
   3201	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
   3202}
   3203
   3204static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
   3205					u64 physical, u64 physical_end)
   3206{
   3207	struct btrfs_fs_info *fs_info = sctx->fs_info;
   3208	int ret = 0;
   3209
   3210	if (!btrfs_is_zoned(fs_info))
   3211		return 0;
   3212
   3213	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
   3214
   3215	mutex_lock(&sctx->wr_lock);
   3216	if (sctx->write_pointer < physical_end) {
   3217		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
   3218						    physical,
   3219						    sctx->write_pointer);
   3220		if (ret)
   3221			btrfs_err(fs_info,
   3222				  "zoned: failed to recover write pointer");
   3223	}
   3224	mutex_unlock(&sctx->wr_lock);
   3225	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
   3226
   3227	return ret;
   3228}
   3229
   3230/*
   3231 * Scrub one range which can only has simple mirror based profile.
   3232 * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
   3233 *  RAID0/RAID10).
   3234 *
   3235 * Since we may need to handle a subset of block group, we need @logical_start
   3236 * and @logical_length parameter.
   3237 */
   3238static int scrub_simple_mirror(struct scrub_ctx *sctx,
   3239			       struct btrfs_root *extent_root,
   3240			       struct btrfs_root *csum_root,
   3241			       struct btrfs_block_group *bg,
   3242			       struct map_lookup *map,
   3243			       u64 logical_start, u64 logical_length,
   3244			       struct btrfs_device *device,
   3245			       u64 physical, int mirror_num)
   3246{
   3247	struct btrfs_fs_info *fs_info = sctx->fs_info;
   3248	const u64 logical_end = logical_start + logical_length;
   3249	/* An artificial limit, inherit from old scrub behavior */
   3250	const u32 max_length = SZ_64K;
   3251	struct btrfs_path path = { 0 };
   3252	u64 cur_logical = logical_start;
   3253	int ret;
   3254
   3255	/* The range must be inside the bg */
   3256	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
   3257
   3258	path.search_commit_root = 1;
   3259	path.skip_locking = 1;
   3260	/* Go through each extent items inside the logical range */
   3261	while (cur_logical < logical_end) {
   3262		u64 extent_start;
   3263		u64 extent_len;
   3264		u64 extent_flags;
   3265		u64 extent_gen;
   3266		u64 scrub_len;
   3267
   3268		/* Canceled? */
   3269		if (atomic_read(&fs_info->scrub_cancel_req) ||
   3270		    atomic_read(&sctx->cancel_req)) {
   3271			ret = -ECANCELED;
   3272			break;
   3273		}
   3274		/* Paused? */
   3275		if (atomic_read(&fs_info->scrub_pause_req)) {
   3276			/* Push queued extents */
   3277			sctx->flush_all_writes = true;
   3278			scrub_submit(sctx);
   3279			mutex_lock(&sctx->wr_lock);
   3280			scrub_wr_submit(sctx);
   3281			mutex_unlock(&sctx->wr_lock);
   3282			wait_event(sctx->list_wait,
   3283				   atomic_read(&sctx->bios_in_flight) == 0);
   3284			sctx->flush_all_writes = false;
   3285			scrub_blocked_if_needed(fs_info);
   3286		}
   3287		/* Block group removed? */
   3288		spin_lock(&bg->lock);
   3289		if (bg->removed) {
   3290			spin_unlock(&bg->lock);
   3291			ret = 0;
   3292			break;
   3293		}
   3294		spin_unlock(&bg->lock);
   3295
   3296		ret = find_first_extent_item(extent_root, &path, cur_logical,
   3297					     logical_end - cur_logical);
   3298		if (ret > 0) {
   3299			/* No more extent, just update the accounting */
   3300			sctx->stat.last_physical = physical + logical_length;
   3301			ret = 0;
   3302			break;
   3303		}
   3304		if (ret < 0)
   3305			break;
   3306		get_extent_info(&path, &extent_start, &extent_len,
   3307				&extent_flags, &extent_gen);
   3308		/* Skip hole range which doesn't have any extent */
   3309		cur_logical = max(extent_start, cur_logical);
   3310
   3311		/*
   3312		 * Scrub len has three limits:
   3313		 * - Extent size limit
   3314		 * - Scrub range limit
   3315		 *   This is especially imporatant for RAID0/RAID10 to reuse
   3316		 *   this function
   3317		 * - Max scrub size limit
   3318		 */
   3319		scrub_len = min(min(extent_start + extent_len,
   3320				    logical_end), cur_logical + max_length) -
   3321			    cur_logical;
   3322
   3323		if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
   3324			ret = btrfs_lookup_csums_range(csum_root, cur_logical,
   3325					cur_logical + scrub_len - 1,
   3326					&sctx->csum_list, 1);
   3327			if (ret)
   3328				break;
   3329		}
   3330		if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
   3331		    does_range_cross_boundary(extent_start, extent_len,
   3332					      logical_start, logical_length)) {
   3333			btrfs_err(fs_info,
   3334"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
   3335				  extent_start, logical_start, logical_end);
   3336			spin_lock(&sctx->stat_lock);
   3337			sctx->stat.uncorrectable_errors++;
   3338			spin_unlock(&sctx->stat_lock);
   3339			cur_logical += scrub_len;
   3340			continue;
   3341		}
   3342		ret = scrub_extent(sctx, map, cur_logical, scrub_len,
   3343				   cur_logical - logical_start + physical,
   3344				   device, extent_flags, extent_gen,
   3345				   mirror_num);
   3346		scrub_free_csums(sctx);
   3347		if (ret)
   3348			break;
   3349		if (sctx->is_dev_replace)
   3350			sync_replace_for_zoned(sctx);
   3351		cur_logical += scrub_len;
   3352		/* Don't hold CPU for too long time */
   3353		cond_resched();
   3354	}
   3355	btrfs_release_path(&path);
   3356	return ret;
   3357}
   3358
   3359/* Calculate the full stripe length for simple stripe based profiles */
   3360static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
   3361{
   3362	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
   3363			    BTRFS_BLOCK_GROUP_RAID10));
   3364
   3365	return map->num_stripes / map->sub_stripes * map->stripe_len;
   3366}
   3367
   3368/* Get the logical bytenr for the stripe */
   3369static u64 simple_stripe_get_logical(struct map_lookup *map,
   3370				     struct btrfs_block_group *bg,
   3371				     int stripe_index)
   3372{
   3373	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
   3374			    BTRFS_BLOCK_GROUP_RAID10));
   3375	ASSERT(stripe_index < map->num_stripes);
   3376
   3377	/*
   3378	 * (stripe_index / sub_stripes) gives how many data stripes we need to
   3379	 * skip.
   3380	 */
   3381	return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
   3382}
   3383
   3384/* Get the mirror number for the stripe */
   3385static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
   3386{
   3387	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
   3388			    BTRFS_BLOCK_GROUP_RAID10));
   3389	ASSERT(stripe_index < map->num_stripes);
   3390
   3391	/* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
   3392	return stripe_index % map->sub_stripes + 1;
   3393}
   3394
   3395static int scrub_simple_stripe(struct scrub_ctx *sctx,
   3396			       struct btrfs_root *extent_root,
   3397			       struct btrfs_root *csum_root,
   3398			       struct btrfs_block_group *bg,
   3399			       struct map_lookup *map,
   3400			       struct btrfs_device *device,
   3401			       int stripe_index)
   3402{
   3403	const u64 logical_increment = simple_stripe_full_stripe_len(map);
   3404	const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
   3405	const u64 orig_physical = map->stripes[stripe_index].physical;
   3406	const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
   3407	u64 cur_logical = orig_logical;
   3408	u64 cur_physical = orig_physical;
   3409	int ret = 0;
   3410
   3411	while (cur_logical < bg->start + bg->length) {
   3412		/*
   3413		 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
   3414		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
   3415		 * this stripe.
   3416		 */
   3417		ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
   3418					  cur_logical, map->stripe_len, device,
   3419					  cur_physical, mirror_num);
   3420		if (ret)
   3421			return ret;
   3422		/* Skip to next stripe which belongs to the target device */
   3423		cur_logical += logical_increment;
   3424		/* For physical offset, we just go to next stripe */
   3425		cur_physical += map->stripe_len;
   3426	}
   3427	return ret;
   3428}
   3429
   3430static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
   3431					   struct btrfs_block_group *bg,
   3432					   struct map_lookup *map,
   3433					   struct btrfs_device *scrub_dev,
   3434					   int stripe_index, u64 dev_extent_len)
   3435{
   3436	struct btrfs_path *path;
   3437	struct btrfs_fs_info *fs_info = sctx->fs_info;
   3438	struct btrfs_root *root;
   3439	struct btrfs_root *csum_root;
   3440	struct blk_plug plug;
   3441	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
   3442	const u64 chunk_logical = bg->start;
   3443	int ret;
   3444	u64 physical = map->stripes[stripe_index].physical;
   3445	const u64 physical_end = physical + dev_extent_len;
   3446	u64 logical;
   3447	u64 logic_end;
   3448	/* The logical increment after finishing one stripe */
   3449	u64 increment;
   3450	/* Offset inside the chunk */
   3451	u64 offset;
   3452	u64 stripe_logical;
   3453	u64 stripe_end;
   3454	int stop_loop = 0;
   3455
   3456	path = btrfs_alloc_path();
   3457	if (!path)
   3458		return -ENOMEM;
   3459
   3460	/*
   3461	 * work on commit root. The related disk blocks are static as
   3462	 * long as COW is applied. This means, it is save to rewrite
   3463	 * them to repair disk errors without any race conditions
   3464	 */
   3465	path->search_commit_root = 1;
   3466	path->skip_locking = 1;
   3467	path->reada = READA_FORWARD;
   3468
   3469	wait_event(sctx->list_wait,
   3470		   atomic_read(&sctx->bios_in_flight) == 0);
   3471	scrub_blocked_if_needed(fs_info);
   3472
   3473	root = btrfs_extent_root(fs_info, bg->start);
   3474	csum_root = btrfs_csum_root(fs_info, bg->start);
   3475
   3476	/*
   3477	 * collect all data csums for the stripe to avoid seeking during
   3478	 * the scrub. This might currently (crc32) end up to be about 1MB
   3479	 */
   3480	blk_start_plug(&plug);
   3481
   3482	if (sctx->is_dev_replace &&
   3483	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
   3484		mutex_lock(&sctx->wr_lock);
   3485		sctx->write_pointer = physical;
   3486		mutex_unlock(&sctx->wr_lock);
   3487		sctx->flush_all_writes = true;
   3488	}
   3489
   3490	/*
   3491	 * There used to be a big double loop to handle all profiles using the
   3492	 * same routine, which grows larger and more gross over time.
   3493	 *
   3494	 * So here we handle each profile differently, so simpler profiles
   3495	 * have simpler scrubbing function.
   3496	 */
   3497	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
   3498			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
   3499		/*
   3500		 * Above check rules out all complex profile, the remaining
   3501		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
   3502		 * mirrored duplication without stripe.
   3503		 *
   3504		 * Only @physical and @mirror_num needs to calculated using
   3505		 * @stripe_index.
   3506		 */
   3507		ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
   3508				bg->start, bg->length, scrub_dev,
   3509				map->stripes[stripe_index].physical,
   3510				stripe_index + 1);
   3511		offset = 0;
   3512		goto out;
   3513	}
   3514	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
   3515		ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
   3516					  scrub_dev, stripe_index);
   3517		offset = map->stripe_len * (stripe_index / map->sub_stripes);
   3518		goto out;
   3519	}
   3520
   3521	/* Only RAID56 goes through the old code */
   3522	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
   3523	ret = 0;
   3524
   3525	/* Calculate the logical end of the stripe */
   3526	get_raid56_logic_offset(physical_end, stripe_index,
   3527				map, &logic_end, NULL);
   3528	logic_end += chunk_logical;
   3529
   3530	/* Initialize @offset in case we need to go to out: label */
   3531	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
   3532	increment = map->stripe_len * nr_data_stripes(map);
   3533
   3534	/*
   3535	 * Due to the rotation, for RAID56 it's better to iterate each stripe
   3536	 * using their physical offset.
   3537	 */
   3538	while (physical < physical_end) {
   3539		ret = get_raid56_logic_offset(physical, stripe_index, map,
   3540					      &logical, &stripe_logical);
   3541		logical += chunk_logical;
   3542		if (ret) {
   3543			/* it is parity strip */
   3544			stripe_logical += chunk_logical;
   3545			stripe_end = stripe_logical + increment;
   3546			ret = scrub_raid56_parity(sctx, map, scrub_dev,
   3547						  stripe_logical,
   3548						  stripe_end);
   3549			if (ret)
   3550				goto out;
   3551			goto next;
   3552		}
   3553
   3554		/*
   3555		 * Now we're at a data stripe, scrub each extents in the range.
   3556		 *
   3557		 * At this stage, if we ignore the repair part, inside each data
   3558		 * stripe it is no different than SINGLE profile.
   3559		 * We can reuse scrub_simple_mirror() here, as the repair part
   3560		 * is still based on @mirror_num.
   3561		 */
   3562		ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
   3563					  logical, map->stripe_len,
   3564					  scrub_dev, physical, 1);
   3565		if (ret < 0)
   3566			goto out;
   3567next:
   3568		logical += increment;
   3569		physical += map->stripe_len;
   3570		spin_lock(&sctx->stat_lock);
   3571		if (stop_loop)
   3572			sctx->stat.last_physical = map->stripes[stripe_index].physical +
   3573						   dev_extent_len;
   3574		else
   3575			sctx->stat.last_physical = physical;
   3576		spin_unlock(&sctx->stat_lock);
   3577		if (stop_loop)
   3578			break;
   3579	}
   3580out:
   3581	/* push queued extents */
   3582	scrub_submit(sctx);
   3583	mutex_lock(&sctx->wr_lock);
   3584	scrub_wr_submit(sctx);
   3585	mutex_unlock(&sctx->wr_lock);
   3586
   3587	blk_finish_plug(&plug);
   3588	btrfs_free_path(path);
   3589
   3590	if (sctx->is_dev_replace && ret >= 0) {
   3591		int ret2;
   3592
   3593		ret2 = sync_write_pointer_for_zoned(sctx,
   3594				chunk_logical + offset,
   3595				map->stripes[stripe_index].physical,
   3596				physical_end);
   3597		if (ret2)
   3598			ret = ret2;
   3599	}
   3600
   3601	return ret < 0 ? ret : 0;
   3602}
   3603
   3604static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
   3605					  struct btrfs_block_group *bg,
   3606					  struct btrfs_device *scrub_dev,
   3607					  u64 dev_offset,
   3608					  u64 dev_extent_len)
   3609{
   3610	struct btrfs_fs_info *fs_info = sctx->fs_info;
   3611	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
   3612	struct map_lookup *map;
   3613	struct extent_map *em;
   3614	int i;
   3615	int ret = 0;
   3616
   3617	read_lock(&map_tree->lock);
   3618	em = lookup_extent_mapping(map_tree, bg->start, bg->length);
   3619	read_unlock(&map_tree->lock);
   3620
   3621	if (!em) {
   3622		/*
   3623		 * Might have been an unused block group deleted by the cleaner
   3624		 * kthread or relocation.
   3625		 */
   3626		spin_lock(&bg->lock);
   3627		if (!bg->removed)
   3628			ret = -EINVAL;
   3629		spin_unlock(&bg->lock);
   3630
   3631		return ret;
   3632	}
   3633	if (em->start != bg->start)
   3634		goto out;
   3635	if (em->len < dev_extent_len)
   3636		goto out;
   3637
   3638	map = em->map_lookup;
   3639	for (i = 0; i < map->num_stripes; ++i) {
   3640		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
   3641		    map->stripes[i].physical == dev_offset) {
   3642			ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
   3643					   dev_extent_len);
   3644			if (ret)
   3645				goto out;
   3646		}
   3647	}
   3648out:
   3649	free_extent_map(em);
   3650
   3651	return ret;
   3652}
   3653
   3654static int finish_extent_writes_for_zoned(struct btrfs_root *root,
   3655					  struct btrfs_block_group *cache)
   3656{
   3657	struct btrfs_fs_info *fs_info = cache->fs_info;
   3658	struct btrfs_trans_handle *trans;
   3659
   3660	if (!btrfs_is_zoned(fs_info))
   3661		return 0;
   3662
   3663	btrfs_wait_block_group_reservations(cache);
   3664	btrfs_wait_nocow_writers(cache);
   3665	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
   3666
   3667	trans = btrfs_join_transaction(root);
   3668	if (IS_ERR(trans))
   3669		return PTR_ERR(trans);
   3670	return btrfs_commit_transaction(trans);
   3671}
   3672
   3673static noinline_for_stack
   3674int scrub_enumerate_chunks(struct scrub_ctx *sctx,
   3675			   struct btrfs_device *scrub_dev, u64 start, u64 end)
   3676{
   3677	struct btrfs_dev_extent *dev_extent = NULL;
   3678	struct btrfs_path *path;
   3679	struct btrfs_fs_info *fs_info = sctx->fs_info;
   3680	struct btrfs_root *root = fs_info->dev_root;
   3681	u64 chunk_offset;
   3682	int ret = 0;
   3683	int ro_set;
   3684	int slot;
   3685	struct extent_buffer *l;
   3686	struct btrfs_key key;
   3687	struct btrfs_key found_key;
   3688	struct btrfs_block_group *cache;
   3689	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   3690
   3691	path = btrfs_alloc_path();
   3692	if (!path)
   3693		return -ENOMEM;
   3694
   3695	path->reada = READA_FORWARD;
   3696	path->search_commit_root = 1;
   3697	path->skip_locking = 1;
   3698
   3699	key.objectid = scrub_dev->devid;
   3700	key.offset = 0ull;
   3701	key.type = BTRFS_DEV_EXTENT_KEY;
   3702
   3703	while (1) {
   3704		u64 dev_extent_len;
   3705
   3706		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   3707		if (ret < 0)
   3708			break;
   3709		if (ret > 0) {
   3710			if (path->slots[0] >=
   3711			    btrfs_header_nritems(path->nodes[0])) {
   3712				ret = btrfs_next_leaf(root, path);
   3713				if (ret < 0)
   3714					break;
   3715				if (ret > 0) {
   3716					ret = 0;
   3717					break;
   3718				}
   3719			} else {
   3720				ret = 0;
   3721			}
   3722		}
   3723
   3724		l = path->nodes[0];
   3725		slot = path->slots[0];
   3726
   3727		btrfs_item_key_to_cpu(l, &found_key, slot);
   3728
   3729		if (found_key.objectid != scrub_dev->devid)
   3730			break;
   3731
   3732		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
   3733			break;
   3734
   3735		if (found_key.offset >= end)
   3736			break;
   3737
   3738		if (found_key.offset < key.offset)
   3739			break;
   3740
   3741		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
   3742		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
   3743
   3744		if (found_key.offset + dev_extent_len <= start)
   3745			goto skip;
   3746
   3747		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
   3748
   3749		/*
   3750		 * get a reference on the corresponding block group to prevent
   3751		 * the chunk from going away while we scrub it
   3752		 */
   3753		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
   3754
   3755		/* some chunks are removed but not committed to disk yet,
   3756		 * continue scrubbing */
   3757		if (!cache)
   3758			goto skip;
   3759
   3760		ASSERT(cache->start <= chunk_offset);
   3761		/*
   3762		 * We are using the commit root to search for device extents, so
   3763		 * that means we could have found a device extent item from a
   3764		 * block group that was deleted in the current transaction. The
   3765		 * logical start offset of the deleted block group, stored at
   3766		 * @chunk_offset, might be part of the logical address range of
   3767		 * a new block group (which uses different physical extents).
   3768		 * In this case btrfs_lookup_block_group() has returned the new
   3769		 * block group, and its start address is less than @chunk_offset.
   3770		 *
   3771		 * We skip such new block groups, because it's pointless to
   3772		 * process them, as we won't find their extents because we search
   3773		 * for them using the commit root of the extent tree. For a device
   3774		 * replace it's also fine to skip it, we won't miss copying them
   3775		 * to the target device because we have the write duplication
   3776		 * setup through the regular write path (by btrfs_map_block()),
   3777		 * and we have committed a transaction when we started the device
   3778		 * replace, right after setting up the device replace state.
   3779		 */
   3780		if (cache->start < chunk_offset) {
   3781			btrfs_put_block_group(cache);
   3782			goto skip;
   3783		}
   3784
   3785		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
   3786			spin_lock(&cache->lock);
   3787			if (!cache->to_copy) {
   3788				spin_unlock(&cache->lock);
   3789				btrfs_put_block_group(cache);
   3790				goto skip;
   3791			}
   3792			spin_unlock(&cache->lock);
   3793		}
   3794
   3795		/*
   3796		 * Make sure that while we are scrubbing the corresponding block
   3797		 * group doesn't get its logical address and its device extents
   3798		 * reused for another block group, which can possibly be of a
   3799		 * different type and different profile. We do this to prevent
   3800		 * false error detections and crashes due to bogus attempts to
   3801		 * repair extents.
   3802		 */
   3803		spin_lock(&cache->lock);
   3804		if (cache->removed) {
   3805			spin_unlock(&cache->lock);
   3806			btrfs_put_block_group(cache);
   3807			goto skip;
   3808		}
   3809		btrfs_freeze_block_group(cache);
   3810		spin_unlock(&cache->lock);
   3811
   3812		/*
   3813		 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
   3814		 * to avoid deadlock caused by:
   3815		 * btrfs_inc_block_group_ro()
   3816		 * -> btrfs_wait_for_commit()
   3817		 * -> btrfs_commit_transaction()
   3818		 * -> btrfs_scrub_pause()
   3819		 */
   3820		scrub_pause_on(fs_info);
   3821
   3822		/*
   3823		 * Don't do chunk preallocation for scrub.
   3824		 *
   3825		 * This is especially important for SYSTEM bgs, or we can hit
   3826		 * -EFBIG from btrfs_finish_chunk_alloc() like:
   3827		 * 1. The only SYSTEM bg is marked RO.
   3828		 *    Since SYSTEM bg is small, that's pretty common.
   3829		 * 2. New SYSTEM bg will be allocated
   3830		 *    Due to regular version will allocate new chunk.
   3831		 * 3. New SYSTEM bg is empty and will get cleaned up
   3832		 *    Before cleanup really happens, it's marked RO again.
   3833		 * 4. Empty SYSTEM bg get scrubbed
   3834		 *    We go back to 2.
   3835		 *
   3836		 * This can easily boost the amount of SYSTEM chunks if cleaner
   3837		 * thread can't be triggered fast enough, and use up all space
   3838		 * of btrfs_super_block::sys_chunk_array
   3839		 *
   3840		 * While for dev replace, we need to try our best to mark block
   3841		 * group RO, to prevent race between:
   3842		 * - Write duplication
   3843		 *   Contains latest data
   3844		 * - Scrub copy
   3845		 *   Contains data from commit tree
   3846		 *
   3847		 * If target block group is not marked RO, nocow writes can
   3848		 * be overwritten by scrub copy, causing data corruption.
   3849		 * So for dev-replace, it's not allowed to continue if a block
   3850		 * group is not RO.
   3851		 */
   3852		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
   3853		if (!ret && sctx->is_dev_replace) {
   3854			ret = finish_extent_writes_for_zoned(root, cache);
   3855			if (ret) {
   3856				btrfs_dec_block_group_ro(cache);
   3857				scrub_pause_off(fs_info);
   3858				btrfs_put_block_group(cache);
   3859				break;
   3860			}
   3861		}
   3862
   3863		if (ret == 0) {
   3864			ro_set = 1;
   3865		} else if (ret == -ENOSPC && !sctx->is_dev_replace) {
   3866			/*
   3867			 * btrfs_inc_block_group_ro return -ENOSPC when it
   3868			 * failed in creating new chunk for metadata.
   3869			 * It is not a problem for scrub, because
   3870			 * metadata are always cowed, and our scrub paused
   3871			 * commit_transactions.
   3872			 */
   3873			ro_set = 0;
   3874		} else if (ret == -ETXTBSY) {
   3875			btrfs_warn(fs_info,
   3876		   "skipping scrub of block group %llu due to active swapfile",
   3877				   cache->start);
   3878			scrub_pause_off(fs_info);
   3879			ret = 0;
   3880			goto skip_unfreeze;
   3881		} else {
   3882			btrfs_warn(fs_info,
   3883				   "failed setting block group ro: %d", ret);
   3884			btrfs_unfreeze_block_group(cache);
   3885			btrfs_put_block_group(cache);
   3886			scrub_pause_off(fs_info);
   3887			break;
   3888		}
   3889
   3890		/*
   3891		 * Now the target block is marked RO, wait for nocow writes to
   3892		 * finish before dev-replace.
   3893		 * COW is fine, as COW never overwrites extents in commit tree.
   3894		 */
   3895		if (sctx->is_dev_replace) {
   3896			btrfs_wait_nocow_writers(cache);
   3897			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
   3898					cache->length);
   3899		}
   3900
   3901		scrub_pause_off(fs_info);
   3902		down_write(&dev_replace->rwsem);
   3903		dev_replace->cursor_right = found_key.offset + dev_extent_len;
   3904		dev_replace->cursor_left = found_key.offset;
   3905		dev_replace->item_needs_writeback = 1;
   3906		up_write(&dev_replace->rwsem);
   3907
   3908		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
   3909				  dev_extent_len);
   3910
   3911		/*
   3912		 * flush, submit all pending read and write bios, afterwards
   3913		 * wait for them.
   3914		 * Note that in the dev replace case, a read request causes
   3915		 * write requests that are submitted in the read completion
   3916		 * worker. Therefore in the current situation, it is required
   3917		 * that all write requests are flushed, so that all read and
   3918		 * write requests are really completed when bios_in_flight
   3919		 * changes to 0.
   3920		 */
   3921		sctx->flush_all_writes = true;
   3922		scrub_submit(sctx);
   3923		mutex_lock(&sctx->wr_lock);
   3924		scrub_wr_submit(sctx);
   3925		mutex_unlock(&sctx->wr_lock);
   3926
   3927		wait_event(sctx->list_wait,
   3928			   atomic_read(&sctx->bios_in_flight) == 0);
   3929
   3930		scrub_pause_on(fs_info);
   3931
   3932		/*
   3933		 * must be called before we decrease @scrub_paused.
   3934		 * make sure we don't block transaction commit while
   3935		 * we are waiting pending workers finished.
   3936		 */
   3937		wait_event(sctx->list_wait,
   3938			   atomic_read(&sctx->workers_pending) == 0);
   3939		sctx->flush_all_writes = false;
   3940
   3941		scrub_pause_off(fs_info);
   3942
   3943		if (sctx->is_dev_replace &&
   3944		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
   3945						      cache, found_key.offset))
   3946			ro_set = 0;
   3947
   3948		down_write(&dev_replace->rwsem);
   3949		dev_replace->cursor_left = dev_replace->cursor_right;
   3950		dev_replace->item_needs_writeback = 1;
   3951		up_write(&dev_replace->rwsem);
   3952
   3953		if (ro_set)
   3954			btrfs_dec_block_group_ro(cache);
   3955
   3956		/*
   3957		 * We might have prevented the cleaner kthread from deleting
   3958		 * this block group if it was already unused because we raced
   3959		 * and set it to RO mode first. So add it back to the unused
   3960		 * list, otherwise it might not ever be deleted unless a manual
   3961		 * balance is triggered or it becomes used and unused again.
   3962		 */
   3963		spin_lock(&cache->lock);
   3964		if (!cache->removed && !cache->ro && cache->reserved == 0 &&
   3965		    cache->used == 0) {
   3966			spin_unlock(&cache->lock);
   3967			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
   3968				btrfs_discard_queue_work(&fs_info->discard_ctl,
   3969							 cache);
   3970			else
   3971				btrfs_mark_bg_unused(cache);
   3972		} else {
   3973			spin_unlock(&cache->lock);
   3974		}
   3975skip_unfreeze:
   3976		btrfs_unfreeze_block_group(cache);
   3977		btrfs_put_block_group(cache);
   3978		if (ret)
   3979			break;
   3980		if (sctx->is_dev_replace &&
   3981		    atomic64_read(&dev_replace->num_write_errors) > 0) {
   3982			ret = -EIO;
   3983			break;
   3984		}
   3985		if (sctx->stat.malloc_errors > 0) {
   3986			ret = -ENOMEM;
   3987			break;
   3988		}
   3989skip:
   3990		key.offset = found_key.offset + dev_extent_len;
   3991		btrfs_release_path(path);
   3992	}
   3993
   3994	btrfs_free_path(path);
   3995
   3996	return ret;
   3997}
   3998
   3999static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
   4000					   struct btrfs_device *scrub_dev)
   4001{
   4002	int	i;
   4003	u64	bytenr;
   4004	u64	gen;
   4005	int	ret;
   4006	struct btrfs_fs_info *fs_info = sctx->fs_info;
   4007
   4008	if (BTRFS_FS_ERROR(fs_info))
   4009		return -EROFS;
   4010
   4011	/* Seed devices of a new filesystem has their own generation. */
   4012	if (scrub_dev->fs_devices != fs_info->fs_devices)
   4013		gen = scrub_dev->generation;
   4014	else
   4015		gen = fs_info->last_trans_committed;
   4016
   4017	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
   4018		bytenr = btrfs_sb_offset(i);
   4019		if (bytenr + BTRFS_SUPER_INFO_SIZE >
   4020		    scrub_dev->commit_total_bytes)
   4021			break;
   4022		if (!btrfs_check_super_location(scrub_dev, bytenr))
   4023			continue;
   4024
   4025		ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
   4026				    scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
   4027				    NULL, bytenr);
   4028		if (ret)
   4029			return ret;
   4030	}
   4031	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
   4032
   4033	return 0;
   4034}
   4035
   4036static void scrub_workers_put(struct btrfs_fs_info *fs_info)
   4037{
   4038	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
   4039					&fs_info->scrub_lock)) {
   4040		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
   4041		struct workqueue_struct *scrub_wr_comp =
   4042						fs_info->scrub_wr_completion_workers;
   4043		struct workqueue_struct *scrub_parity =
   4044						fs_info->scrub_parity_workers;
   4045
   4046		fs_info->scrub_workers = NULL;
   4047		fs_info->scrub_wr_completion_workers = NULL;
   4048		fs_info->scrub_parity_workers = NULL;
   4049		mutex_unlock(&fs_info->scrub_lock);
   4050
   4051		if (scrub_workers)
   4052			destroy_workqueue(scrub_workers);
   4053		if (scrub_wr_comp)
   4054			destroy_workqueue(scrub_wr_comp);
   4055		if (scrub_parity)
   4056			destroy_workqueue(scrub_parity);
   4057	}
   4058}
   4059
   4060/*
   4061 * get a reference count on fs_info->scrub_workers. start worker if necessary
   4062 */
   4063static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
   4064						int is_dev_replace)
   4065{
   4066	struct workqueue_struct *scrub_workers = NULL;
   4067	struct workqueue_struct *scrub_wr_comp = NULL;
   4068	struct workqueue_struct *scrub_parity = NULL;
   4069	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
   4070	int max_active = fs_info->thread_pool_size;
   4071	int ret = -ENOMEM;
   4072
   4073	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
   4074		return 0;
   4075
   4076	scrub_workers = alloc_workqueue("btrfs-scrub", flags,
   4077					is_dev_replace ? 1 : max_active);
   4078	if (!scrub_workers)
   4079		goto fail_scrub_workers;
   4080
   4081	scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
   4082	if (!scrub_wr_comp)
   4083		goto fail_scrub_wr_completion_workers;
   4084
   4085	scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
   4086	if (!scrub_parity)
   4087		goto fail_scrub_parity_workers;
   4088
   4089	mutex_lock(&fs_info->scrub_lock);
   4090	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
   4091		ASSERT(fs_info->scrub_workers == NULL &&
   4092		       fs_info->scrub_wr_completion_workers == NULL &&
   4093		       fs_info->scrub_parity_workers == NULL);
   4094		fs_info->scrub_workers = scrub_workers;
   4095		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
   4096		fs_info->scrub_parity_workers = scrub_parity;
   4097		refcount_set(&fs_info->scrub_workers_refcnt, 1);
   4098		mutex_unlock(&fs_info->scrub_lock);
   4099		return 0;
   4100	}
   4101	/* Other thread raced in and created the workers for us */
   4102	refcount_inc(&fs_info->scrub_workers_refcnt);
   4103	mutex_unlock(&fs_info->scrub_lock);
   4104
   4105	ret = 0;
   4106	destroy_workqueue(scrub_parity);
   4107fail_scrub_parity_workers:
   4108	destroy_workqueue(scrub_wr_comp);
   4109fail_scrub_wr_completion_workers:
   4110	destroy_workqueue(scrub_workers);
   4111fail_scrub_workers:
   4112	return ret;
   4113}
   4114
   4115int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
   4116		    u64 end, struct btrfs_scrub_progress *progress,
   4117		    int readonly, int is_dev_replace)
   4118{
   4119	struct btrfs_dev_lookup_args args = { .devid = devid };
   4120	struct scrub_ctx *sctx;
   4121	int ret;
   4122	struct btrfs_device *dev;
   4123	unsigned int nofs_flag;
   4124
   4125	if (btrfs_fs_closing(fs_info))
   4126		return -EAGAIN;
   4127
   4128	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
   4129		/*
   4130		 * in this case scrub is unable to calculate the checksum
   4131		 * the way scrub is implemented. Do not handle this
   4132		 * situation at all because it won't ever happen.
   4133		 */
   4134		btrfs_err(fs_info,
   4135			   "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
   4136		       fs_info->nodesize,
   4137		       BTRFS_STRIPE_LEN);
   4138		return -EINVAL;
   4139	}
   4140
   4141	if (fs_info->nodesize >
   4142	    SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
   4143	    fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
   4144		/*
   4145		 * Would exhaust the array bounds of sectorv member in
   4146		 * struct scrub_block
   4147		 */
   4148		btrfs_err(fs_info,
   4149"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
   4150		       fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
   4151		       fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
   4152		return -EINVAL;
   4153	}
   4154
   4155	/* Allocate outside of device_list_mutex */
   4156	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
   4157	if (IS_ERR(sctx))
   4158		return PTR_ERR(sctx);
   4159
   4160	ret = scrub_workers_get(fs_info, is_dev_replace);
   4161	if (ret)
   4162		goto out_free_ctx;
   4163
   4164	mutex_lock(&fs_info->fs_devices->device_list_mutex);
   4165	dev = btrfs_find_device(fs_info->fs_devices, &args);
   4166	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
   4167		     !is_dev_replace)) {
   4168		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   4169		ret = -ENODEV;
   4170		goto out;
   4171	}
   4172
   4173	if (!is_dev_replace && !readonly &&
   4174	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
   4175		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   4176		btrfs_err_in_rcu(fs_info,
   4177			"scrub on devid %llu: filesystem on %s is not writable",
   4178				 devid, rcu_str_deref(dev->name));
   4179		ret = -EROFS;
   4180		goto out;
   4181	}
   4182
   4183	mutex_lock(&fs_info->scrub_lock);
   4184	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
   4185	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
   4186		mutex_unlock(&fs_info->scrub_lock);
   4187		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   4188		ret = -EIO;
   4189		goto out;
   4190	}
   4191
   4192	down_read(&fs_info->dev_replace.rwsem);
   4193	if (dev->scrub_ctx ||
   4194	    (!is_dev_replace &&
   4195	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
   4196		up_read(&fs_info->dev_replace.rwsem);
   4197		mutex_unlock(&fs_info->scrub_lock);
   4198		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   4199		ret = -EINPROGRESS;
   4200		goto out;
   4201	}
   4202	up_read(&fs_info->dev_replace.rwsem);
   4203
   4204	sctx->readonly = readonly;
   4205	dev->scrub_ctx = sctx;
   4206	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   4207
   4208	/*
   4209	 * checking @scrub_pause_req here, we can avoid
   4210	 * race between committing transaction and scrubbing.
   4211	 */
   4212	__scrub_blocked_if_needed(fs_info);
   4213	atomic_inc(&fs_info->scrubs_running);
   4214	mutex_unlock(&fs_info->scrub_lock);
   4215
   4216	/*
   4217	 * In order to avoid deadlock with reclaim when there is a transaction
   4218	 * trying to pause scrub, make sure we use GFP_NOFS for all the
   4219	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
   4220	 * invoked by our callees. The pausing request is done when the
   4221	 * transaction commit starts, and it blocks the transaction until scrub
   4222	 * is paused (done at specific points at scrub_stripe() or right above
   4223	 * before incrementing fs_info->scrubs_running).
   4224	 */
   4225	nofs_flag = memalloc_nofs_save();
   4226	if (!is_dev_replace) {
   4227		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
   4228		/*
   4229		 * by holding device list mutex, we can
   4230		 * kick off writing super in log tree sync.
   4231		 */
   4232		mutex_lock(&fs_info->fs_devices->device_list_mutex);
   4233		ret = scrub_supers(sctx, dev);
   4234		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   4235	}
   4236
   4237	if (!ret)
   4238		ret = scrub_enumerate_chunks(sctx, dev, start, end);
   4239	memalloc_nofs_restore(nofs_flag);
   4240
   4241	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
   4242	atomic_dec(&fs_info->scrubs_running);
   4243	wake_up(&fs_info->scrub_pause_wait);
   4244
   4245	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
   4246
   4247	if (progress)
   4248		memcpy(progress, &sctx->stat, sizeof(*progress));
   4249
   4250	if (!is_dev_replace)
   4251		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
   4252			ret ? "not finished" : "finished", devid, ret);
   4253
   4254	mutex_lock(&fs_info->scrub_lock);
   4255	dev->scrub_ctx = NULL;
   4256	mutex_unlock(&fs_info->scrub_lock);
   4257
   4258	scrub_workers_put(fs_info);
   4259	scrub_put_ctx(sctx);
   4260
   4261	return ret;
   4262out:
   4263	scrub_workers_put(fs_info);
   4264out_free_ctx:
   4265	scrub_free_ctx(sctx);
   4266
   4267	return ret;
   4268}
   4269
   4270void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
   4271{
   4272	mutex_lock(&fs_info->scrub_lock);
   4273	atomic_inc(&fs_info->scrub_pause_req);
   4274	while (atomic_read(&fs_info->scrubs_paused) !=
   4275	       atomic_read(&fs_info->scrubs_running)) {
   4276		mutex_unlock(&fs_info->scrub_lock);
   4277		wait_event(fs_info->scrub_pause_wait,
   4278			   atomic_read(&fs_info->scrubs_paused) ==
   4279			   atomic_read(&fs_info->scrubs_running));
   4280		mutex_lock(&fs_info->scrub_lock);
   4281	}
   4282	mutex_unlock(&fs_info->scrub_lock);
   4283}
   4284
   4285void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
   4286{
   4287	atomic_dec(&fs_info->scrub_pause_req);
   4288	wake_up(&fs_info->scrub_pause_wait);
   4289}
   4290
   4291int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
   4292{
   4293	mutex_lock(&fs_info->scrub_lock);
   4294	if (!atomic_read(&fs_info->scrubs_running)) {
   4295		mutex_unlock(&fs_info->scrub_lock);
   4296		return -ENOTCONN;
   4297	}
   4298
   4299	atomic_inc(&fs_info->scrub_cancel_req);
   4300	while (atomic_read(&fs_info->scrubs_running)) {
   4301		mutex_unlock(&fs_info->scrub_lock);
   4302		wait_event(fs_info->scrub_pause_wait,
   4303			   atomic_read(&fs_info->scrubs_running) == 0);
   4304		mutex_lock(&fs_info->scrub_lock);
   4305	}
   4306	atomic_dec(&fs_info->scrub_cancel_req);
   4307	mutex_unlock(&fs_info->scrub_lock);
   4308
   4309	return 0;
   4310}
   4311
   4312int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
   4313{
   4314	struct btrfs_fs_info *fs_info = dev->fs_info;
   4315	struct scrub_ctx *sctx;
   4316
   4317	mutex_lock(&fs_info->scrub_lock);
   4318	sctx = dev->scrub_ctx;
   4319	if (!sctx) {
   4320		mutex_unlock(&fs_info->scrub_lock);
   4321		return -ENOTCONN;
   4322	}
   4323	atomic_inc(&sctx->cancel_req);
   4324	while (dev->scrub_ctx) {
   4325		mutex_unlock(&fs_info->scrub_lock);
   4326		wait_event(fs_info->scrub_pause_wait,
   4327			   dev->scrub_ctx == NULL);
   4328		mutex_lock(&fs_info->scrub_lock);
   4329	}
   4330	mutex_unlock(&fs_info->scrub_lock);
   4331
   4332	return 0;
   4333}
   4334
   4335int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
   4336			 struct btrfs_scrub_progress *progress)
   4337{
   4338	struct btrfs_dev_lookup_args args = { .devid = devid };
   4339	struct btrfs_device *dev;
   4340	struct scrub_ctx *sctx = NULL;
   4341
   4342	mutex_lock(&fs_info->fs_devices->device_list_mutex);
   4343	dev = btrfs_find_device(fs_info->fs_devices, &args);
   4344	if (dev)
   4345		sctx = dev->scrub_ctx;
   4346	if (sctx)
   4347		memcpy(progress, &sctx->stat, sizeof(*progress));
   4348	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
   4349
   4350	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
   4351}
   4352
   4353static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
   4354				 u64 extent_logical, u32 extent_len,
   4355				 u64 *extent_physical,
   4356				 struct btrfs_device **extent_dev,
   4357				 int *extent_mirror_num)
   4358{
   4359	u64 mapped_length;
   4360	struct btrfs_io_context *bioc = NULL;
   4361	int ret;
   4362
   4363	mapped_length = extent_len;
   4364	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
   4365			      &mapped_length, &bioc, 0);
   4366	if (ret || !bioc || mapped_length < extent_len ||
   4367	    !bioc->stripes[0].dev->bdev) {
   4368		btrfs_put_bioc(bioc);
   4369		return;
   4370	}
   4371
   4372	*extent_physical = bioc->stripes[0].physical;
   4373	*extent_mirror_num = bioc->mirror_num;
   4374	*extent_dev = bioc->stripes[0].dev;
   4375	btrfs_put_bioc(bioc);
   4376}