cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

raid56.c (73893B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2012 Fusion-io  All rights reserved.
      4 * Copyright (C) 2012 Intel Corp. All rights reserved.
      5 */
      6
      7#include <linux/sched.h>
      8#include <linux/bio.h>
      9#include <linux/slab.h>
     10#include <linux/blkdev.h>
     11#include <linux/raid/pq.h>
     12#include <linux/hash.h>
     13#include <linux/list_sort.h>
     14#include <linux/raid/xor.h>
     15#include <linux/mm.h>
     16#include "misc.h"
     17#include "ctree.h"
     18#include "disk-io.h"
     19#include "volumes.h"
     20#include "raid56.h"
     21#include "async-thread.h"
     22
     23/* set when additional merges to this rbio are not allowed */
     24#define RBIO_RMW_LOCKED_BIT	1
     25
     26/*
     27 * set when this rbio is sitting in the hash, but it is just a cache
     28 * of past RMW
     29 */
     30#define RBIO_CACHE_BIT		2
     31
     32/*
     33 * set when it is safe to trust the stripe_pages for caching
     34 */
     35#define RBIO_CACHE_READY_BIT	3
     36
     37#define RBIO_CACHE_SIZE 1024
     38
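       /* 11 bits of bucket index, i.e. 2048 buckets in the stripe hash table */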
     39#define BTRFS_STRIPE_HASH_TABLE_BITS				11
     40
     41/* Used by the raid56 code to lock stripes for read/modify/write */
     42struct btrfs_stripe_hash {
     43	struct list_head hash_list;
     44	spinlock_t lock;
     45};
     46
     47/* Used by the raid56 code to lock stripes for read/modify/write */
     48struct btrfs_stripe_hash_table {
     49	struct list_head stripe_cache;
     50	spinlock_t cache_lock;
     51	int cache_size;
     52	struct btrfs_stripe_hash table[];
     53};
     54
     55/*
     56 * A bvec like structure to present a sector inside a page.
     57 *
     58 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
     59 */
     60struct sector_ptr {
     61	struct page *page;
     62	unsigned int pgoff:24;
     63	unsigned int uptodate:8;
     64};
     65
     66enum btrfs_rbio_ops {
     67	BTRFS_RBIO_WRITE,
     68	BTRFS_RBIO_READ_REBUILD,
     69	BTRFS_RBIO_PARITY_SCRUB,
     70	BTRFS_RBIO_REBUILD_MISSING,
     71};
     72
     73struct btrfs_raid_bio {
     74	struct btrfs_io_context *bioc;
     75
     76	/* while we're doing rmw on a stripe
     77	 * we put it into a hash table so we can
     78	 * lock the stripe and merge more rbios
     79	 * into it.
     80	 */
     81	struct list_head hash_list;
     82
     83	/*
     84	 * LRU list for the stripe cache
     85	 */
     86	struct list_head stripe_cache;
     87
     88	/*
     89	 * for scheduling work in the helper threads
     90	 */
     91	struct work_struct work;
     92
     93	/*
     94	 * bio list and bio_list_lock are used
     95	 * to add more bios into the stripe
     96	 * in hopes of avoiding the full rmw
     97	 */
     98	struct bio_list bio_list;
     99	spinlock_t bio_list_lock;
    100
    101	/* also protected by the bio_list_lock, the
    102	 * plug list is used by the plugging code
    103	 * to collect partial bios while plugged.  The
    104	 * stripe locking code also uses it to hand off
    105	 * the stripe lock to the next pending IO
    106	 */
    107	struct list_head plug_list;
    108
    109	/*
    110	 * flags that tell us if it is safe to
    111	 * merge with this bio
    112	 */
    113	unsigned long flags;
    114
    115	/*
    116	 * set if we're doing a parity rebuild
    117	 * for a read from higher up, which is handled
    118	 * differently from a parity rebuild as part of
    119	 * rmw
    120	 */
    121	enum btrfs_rbio_ops operation;
    122
    123	/* Size of each individual stripe on disk */
    124	u32 stripe_len;
    125
    126	/* How many pages there are for the full stripe including P/Q */
    127	u16 nr_pages;
    128
    129	/* How many sectors there are for the full stripe including P/Q */
    130	u16 nr_sectors;
    131
    132	/* Number of data stripes (no p/q) */
    133	u8 nr_data;
    134
    135	/* Number of all stripes (including P/Q) */
    136	u8 real_stripes;
    137
    138	/* How many pages there are for each stripe */
    139	u8 stripe_npages;
    140
    141	/* How many sectors there are for each stripe */
    142	u8 stripe_nsectors;
    143
    144	/* First bad stripe, -1 means no corruption */
    145	s8 faila;
    146
    147	/* Second bad stripe (for RAID6 use) */
    148	s8 failb;
    149
    150	/* Stripe number that we're scrubbing  */
    151	u8 scrubp;
    152
    153	/*
    154	 * size of all the bios in the bio_list.  This
    155	 * helps us decide if the rbio maps to a full
    156	 * stripe or not
    157	 */
    158	int bio_list_bytes;
    159
    160	int generic_bio_cnt;
    161
    162	refcount_t refs;
    163
    164	atomic_t stripes_pending;
    165
    166	atomic_t error;
    167	/*
    168	 * These arrays are allocated together with the rbio.  We make the
    169	 * rbio big enough to hold them all and set up their
    170	 * locations when the rbio is allocated
    171	 */
    172
    173	/* pointers to pages that we allocated for
    174	 * reading/writing stripes directly from the disk (including P/Q)
    175	 */
    176	struct page **stripe_pages;
    177
    178	/* Pointers to the sectors in the bio_list, for faster lookup */
    179	struct sector_ptr *bio_sectors;
    180
    181	/*
    182	 * For subpage support, we need to map each sector to above
    183	 * stripe_pages.
    184	 */
    185	struct sector_ptr *stripe_sectors;
    186
    187	/* Bitmap to record which horizontal stripe has data */
    188	unsigned long *dbitmap;
    189
    190	/* allocated with real_stripes-many pointers for finish_*() calls */
    191	void **finish_pointers;
    192
    193	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
    194	unsigned long *finish_pbitmap;
    195};
    196
    197static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
    198static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
    199static void rmw_work(struct work_struct *work);
    200static void read_rebuild_work(struct work_struct *work);
    201static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
    202static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
    203static void __free_raid_bio(struct btrfs_raid_bio *rbio);
    204static void index_rbio_pages(struct btrfs_raid_bio *rbio);
    205static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
    206
    207static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
    208					 int need_check);
    209static void scrub_parity_work(struct work_struct *work);
    210
    211static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
    212{
    213	INIT_WORK(&rbio->work, work_func);
    214	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
    215}
    216
    217/*
    218 * the stripe hash table is used for locking, and to collect
    219 * bios in hopes of making a full stripe
    220 */
    221int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
    222{
    223	struct btrfs_stripe_hash_table *table;
    224	struct btrfs_stripe_hash_table *x;
    225	struct btrfs_stripe_hash *cur;
    226	struct btrfs_stripe_hash *h;
    227	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
    228	int i;
    229
    230	if (info->stripe_hash_table)
    231		return 0;
    232
    233	/*
    234	 * The table is large, starting with order 4 and can go as high as
    235	 * order 7 in case lock debugging is turned on.
    236	 *
    237	 * Try harder to allocate and fallback to vmalloc to lower the chance
    238	 * of a failing mount.
    239	 */
    240	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
    241	if (!table)
    242		return -ENOMEM;
    243
    244	spin_lock_init(&table->cache_lock);
    245	INIT_LIST_HEAD(&table->stripe_cache);
    246
    247	h = table->table;
    248
    249	for (i = 0; i < num_entries; i++) {
    250		cur = h + i;
    251		INIT_LIST_HEAD(&cur->hash_list);
    252		spin_lock_init(&cur->lock);
    253	}
    254
    255	x = cmpxchg(&info->stripe_hash_table, NULL, table);
    256	kvfree(x);
    257	return 0;
    258}
    259
    260/*
    261 * caching an rbio means to copy anything from the
    262 * bio_sectors array into the stripe_pages array.  We
    263 * use the sector uptodate bit in the stripe_sectors array
    264 * to indicate if it has valid data
    265 *
    266 * once the caching is done, we set the cache ready
    267 * bit.
    268 */
    269static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
    270{
    271	int i;
    272	int ret;
    273
    274	ret = alloc_rbio_pages(rbio);
    275	if (ret)
    276		return;
    277
    278	for (i = 0; i < rbio->nr_sectors; i++) {
    279		/* Some range not covered by bio (partial write), skip it */
    280		if (!rbio->bio_sectors[i].page)
    281			continue;
    282
    283		ASSERT(rbio->stripe_sectors[i].page);
    284		memcpy_page(rbio->stripe_sectors[i].page,
    285			    rbio->stripe_sectors[i].pgoff,
    286			    rbio->bio_sectors[i].page,
    287			    rbio->bio_sectors[i].pgoff,
    288			    rbio->bioc->fs_info->sectorsize);
    289		rbio->stripe_sectors[i].uptodate = 1;
    290	}
    291	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
    292}
    293
    294/*
    295 * we hash on the first logical address of the stripe
    296 */
    297static int rbio_bucket(struct btrfs_raid_bio *rbio)
    298{
    299	u64 num = rbio->bioc->raid_map[0];
    300
    301	/*
    302	 * we shift down quite a bit.  We're using byte
    303	 * addressing, and most of the lower bits are zeros.
    304	 * This tends to upset hash_64, and it consistently
    305	 * returns just one or two different values.
    306	 *
    307	 * shifting off the lower bits fixes things.
    308	 */
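       	/* e.g. a full stripe at logical 0x40000000 hashes as hash_64(0x4000, 11) */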
    309	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
    310}
    311
    312static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
    313				       unsigned int page_nr)
    314{
    315	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
    316	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
    317	int i;
    318
    319	ASSERT(page_nr < rbio->nr_pages);
    320
    321	for (i = sectors_per_page * page_nr;
    322	     i < sectors_per_page * page_nr + sectors_per_page;
    323	     i++) {
    324		if (!rbio->stripe_sectors[i].uptodate)
    325			return false;
    326	}
    327	return true;
    328}
    329
    330/*
    331 * Update the stripe_sectors[] array to use correct page and pgoff
    332 *
    333 * Should be called every time any page pointer in stripes_pages[] got modified.
    334 */
    335static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
    336{
    337	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
    338	u32 offset;
    339	int i;
    340
    341	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
    342		int page_index = offset >> PAGE_SHIFT;
    343
    344		ASSERT(page_index < rbio->nr_pages);
    345		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
    346		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
    347	}
    348}
    349
    350/*
    351 * Stealing an rbio means taking all the uptodate pages from the stripe array
    352 * in the source rbio and putting them into the destination rbio.
    353 *
    354 * This will also update the involved stripe_sectors[] which are referring to
    355 * the old pages.
    356 */
    357static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
    358{
    359	int i;
    360	struct page *s;
    361	struct page *d;
    362
    363	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
    364		return;
    365
    366	for (i = 0; i < dest->nr_pages; i++) {
    367		s = src->stripe_pages[i];
    368		if (!s || !full_page_sectors_uptodate(src, i))
    369			continue;
    370
    371		d = dest->stripe_pages[i];
    372		if (d)
    373			__free_page(d);
    374
    375		dest->stripe_pages[i] = s;
    376		src->stripe_pages[i] = NULL;
    377	}
    378	index_stripe_sectors(dest);
    379	index_stripe_sectors(src);
    380}
    381
    382/*
    383 * merging means we take the bio_list from the victim and
    384 * splice it into the destination.  The victim should
    385 * be discarded afterwards.
    386 *
    387 * must be called with dest->bio_list_lock held
    388 */
    389static void merge_rbio(struct btrfs_raid_bio *dest,
    390		       struct btrfs_raid_bio *victim)
    391{
    392	bio_list_merge(&dest->bio_list, &victim->bio_list);
    393	dest->bio_list_bytes += victim->bio_list_bytes;
    394	dest->generic_bio_cnt += victim->generic_bio_cnt;
    395	bio_list_init(&victim->bio_list);
    396}
    397
    398/*
    399 * used to prune items that are in the cache.  The caller
    400 * must hold the hash table lock.
    401 */
    402static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
    403{
    404	int bucket = rbio_bucket(rbio);
    405	struct btrfs_stripe_hash_table *table;
    406	struct btrfs_stripe_hash *h;
    407	int freeit = 0;
    408
    409	/*
    410	 * check the bit again under the hash table lock.
    411	 */
    412	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
    413		return;
    414
    415	table = rbio->bioc->fs_info->stripe_hash_table;
    416	h = table->table + bucket;
    417
    418	/* hold the lock for the bucket because we may be
    419	 * removing it from the hash table
    420	 */
    421	spin_lock(&h->lock);
    422
    423	/*
    424	 * hold the lock for the bio list because we need
    425	 * to make sure the bio list is empty
    426	 */
    427	spin_lock(&rbio->bio_list_lock);
    428
    429	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
    430		list_del_init(&rbio->stripe_cache);
    431		table->cache_size -= 1;
    432		freeit = 1;
    433
    434		/* if the bio list isn't empty, this rbio is
    435		 * still involved in an IO.  We take it out
    436		 * of the cache list, and drop the ref that
    437		 * was held for the list.
    438		 *
    439		 * If the bio_list was empty, we also remove
    440		 * the rbio from the hash_table, and drop
    441		 * the corresponding ref
    442		 */
    443		if (bio_list_empty(&rbio->bio_list)) {
    444			if (!list_empty(&rbio->hash_list)) {
    445				list_del_init(&rbio->hash_list);
    446				refcount_dec(&rbio->refs);
    447				BUG_ON(!list_empty(&rbio->plug_list));
    448			}
    449		}
    450	}
    451
    452	spin_unlock(&rbio->bio_list_lock);
    453	spin_unlock(&h->lock);
    454
    455	if (freeit)
    456		__free_raid_bio(rbio);
    457}
    458
    459/*
    460 * prune a given rbio from the cache
    461 */
    462static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
    463{
    464	struct btrfs_stripe_hash_table *table;
    465	unsigned long flags;
    466
    467	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
    468		return;
    469
    470	table = rbio->bioc->fs_info->stripe_hash_table;
    471
    472	spin_lock_irqsave(&table->cache_lock, flags);
    473	__remove_rbio_from_cache(rbio);
    474	spin_unlock_irqrestore(&table->cache_lock, flags);
    475}
    476
    477/*
    478 * remove everything in the cache
    479 */
    480static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
    481{
    482	struct btrfs_stripe_hash_table *table;
    483	unsigned long flags;
    484	struct btrfs_raid_bio *rbio;
    485
    486	table = info->stripe_hash_table;
    487
    488	spin_lock_irqsave(&table->cache_lock, flags);
    489	while (!list_empty(&table->stripe_cache)) {
    490		rbio = list_entry(table->stripe_cache.next,
    491				  struct btrfs_raid_bio,
    492				  stripe_cache);
    493		__remove_rbio_from_cache(rbio);
    494	}
    495	spin_unlock_irqrestore(&table->cache_lock, flags);
    496}
    497
    498/*
    499 * remove all cached entries and free the hash table
    500 * used by unmount
    501 */
    502void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
    503{
    504	if (!info->stripe_hash_table)
    505		return;
    506	btrfs_clear_rbio_cache(info);
    507	kvfree(info->stripe_hash_table);
    508	info->stripe_hash_table = NULL;
    509}
    510
    511/*
    512 * insert an rbio into the stripe cache.  It
    513 * must have already been prepared by calling
    514 * cache_rbio_pages
    515 *
    516 * If this rbio was already cached, it gets
    517 * moved to the front of the lru.
    518 *
    519 * If the size of the rbio cache is too big, we
    520 * prune an item.
    521 */
    522static void cache_rbio(struct btrfs_raid_bio *rbio)
    523{
    524	struct btrfs_stripe_hash_table *table;
    525	unsigned long flags;
    526
    527	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
    528		return;
    529
    530	table = rbio->bioc->fs_info->stripe_hash_table;
    531
    532	spin_lock_irqsave(&table->cache_lock, flags);
    533	spin_lock(&rbio->bio_list_lock);
    534
    535	/* bump our ref if we were not in the list before */
    536	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
    537		refcount_inc(&rbio->refs);
    538
    539	if (!list_empty(&rbio->stripe_cache)) {
    540		list_move(&rbio->stripe_cache, &table->stripe_cache);
    541	} else {
    542		list_add(&rbio->stripe_cache, &table->stripe_cache);
    543		table->cache_size += 1;
    544	}
    545
    546	spin_unlock(&rbio->bio_list_lock);
    547
    548	if (table->cache_size > RBIO_CACHE_SIZE) {
    549		struct btrfs_raid_bio *found;
    550
    551		found = list_entry(table->stripe_cache.prev,
    552				  struct btrfs_raid_bio,
    553				  stripe_cache);
    554
    555		if (found != rbio)
    556			__remove_rbio_from_cache(found);
    557	}
    558
    559	spin_unlock_irqrestore(&table->cache_lock, flags);
    560}
    561
    562/*
    563 * helper function to run the xor_blocks api.  It is only
    564 * able to do MAX_XOR_BLOCKS at a time, so we need to
    565 * loop through.
    566 */
    567static void run_xor(void **pages, int src_cnt, ssize_t len)
    568{
    569	int src_off = 0;
    570	int xor_src_cnt = 0;
    571	void *dest = pages[src_cnt];
    572
    573	while (src_cnt > 0) {
    574		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
    575		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
    576
    577		src_cnt -= xor_src_cnt;
    578		src_off += xor_src_cnt;
    579	}
    580}
    581
    582/*
    583 * Returns true if the bio list inside this rbio covers an entire stripe (no
    584 * rmw required).
    585 */
    586static int rbio_is_full(struct btrfs_raid_bio *rbio)
    587{
    588	unsigned long flags;
    589	unsigned long size = rbio->bio_list_bytes;
    590	int ret = 1;
    591
    592	spin_lock_irqsave(&rbio->bio_list_lock, flags);
    593	if (size != rbio->nr_data * rbio->stripe_len)
    594		ret = 0;
    595	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
    596	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
    597
    598	return ret;
    599}
    600
    601/*
    602 * returns 1 if it is safe to merge two rbios together.
    603 * The merging is safe if the two rbios correspond to
    604 * the same stripe and if they are both going in the same
    605 * direction (read vs write), and if neither one is
    606 * locked for final IO
    607 *
    608 * The caller is responsible for locking such that
    609 * rmw_locked is safe to test
    610 */
    611static int rbio_can_merge(struct btrfs_raid_bio *last,
    612			  struct btrfs_raid_bio *cur)
    613{
    614	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
    615	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
    616		return 0;
    617
    618	/*
    619	 * we can't merge with cached rbios, since the
    620	 * idea is that when we merge the destination
    621	 * rbio is going to run our IO for us.  We can
    622	 * steal from cached rbios though, other functions
    623	 * handle that.
    624	 */
    625	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
    626	    test_bit(RBIO_CACHE_BIT, &cur->flags))
    627		return 0;
    628
    629	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
    630		return 0;
    631
    632	/* we can't merge with different operations */
    633	if (last->operation != cur->operation)
    634		return 0;
    635	/*
    636	 * A parity scrub must read the full stripe from the drive, then
    637	 * check and repair the parity and write the new results.
    638	 *
    639	 * We're not allowed to add any new bios to the
    640	 * bio list here, anyone else that wants to
    641	 * change this stripe needs to do their own rmw.
    642	 */
    643	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
    644		return 0;
    645
    646	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
    647		return 0;
    648
    649	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
    650		int fa = last->faila;
    651		int fb = last->failb;
    652		int cur_fa = cur->faila;
    653		int cur_fb = cur->failb;
    654
    655		if (last->faila >= last->failb) {
    656			fa = last->failb;
    657			fb = last->faila;
    658		}
    659
    660		if (cur->faila >= cur->failb) {
    661			cur_fa = cur->failb;
    662			cur_fb = cur->faila;
    663		}
    664
    665		if (fa != cur_fa || fb != cur_fb)
    666			return 0;
    667	}
    668	return 1;
    669}
    670
    671static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
    672					     unsigned int stripe_nr,
    673					     unsigned int sector_nr)
    674{
    675	ASSERT(stripe_nr < rbio->real_stripes);
    676	ASSERT(sector_nr < rbio->stripe_nsectors);
    677
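       	/*
       	 * Sectors are stored stripe by stripe.  E.g. with 64K stripes and 4K
       	 * sectors (stripe_nsectors == 16), stripe 2 sector 3 is index 35.
       	 */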
    678	return stripe_nr * rbio->stripe_nsectors + sector_nr;
    679}
    680
    681/* Return a sector from rbio->stripe_sectors, not from the bio list */
    682static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
    683					     unsigned int stripe_nr,
    684					     unsigned int sector_nr)
    685{
    686	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
    687							      sector_nr)];
    688}
    689
    690/* Grab a sector inside P stripe */
    691static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
    692					      unsigned int sector_nr)
    693{
    694	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
    695}
    696
    697/* Grab a sector inside Q stripe, return NULL if not RAID6 */
    698static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
    699					      unsigned int sector_nr)
    700{
    701	if (rbio->nr_data + 1 == rbio->real_stripes)
    702		return NULL;
    703	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
    704}
    705
    706/*
    707 * The first stripe in the table for a logical address
    708 * has the lock.  rbios are added in one of three ways:
    709 *
    710 * 1) Nobody has the stripe locked yet.  The rbio is given
    711 * the lock and 0 is returned.  The caller must start the IO
    712 * themselves.
    713 *
    714 * 2) Someone has the stripe locked, but we're able to merge
    715 * with the lock owner.  The rbio is freed and the IO will
    716 * start automatically along with the existing rbio.  1 is returned.
    717 *
    718 * 3) Someone has the stripe locked, but we're not able to merge.
    719 * The rbio is added to the lock owner's plug list, or merged into
    720 * an rbio already on the plug list.  When the lock owner unlocks,
    721 * the next rbio on the list is run and the IO is started automatically.
    722 * 1 is returned
    723 *
    724 * If we return 0, the caller still owns the rbio and must continue with
    725 * IO submission.  If we return 1, the caller must assume the rbio has
    726 * already been freed.
    727 */
    728static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
    729{
    730	struct btrfs_stripe_hash *h;
    731	struct btrfs_raid_bio *cur;
    732	struct btrfs_raid_bio *pending;
    733	unsigned long flags;
    734	struct btrfs_raid_bio *freeit = NULL;
    735	struct btrfs_raid_bio *cache_drop = NULL;
    736	int ret = 0;
    737
    738	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
    739
    740	spin_lock_irqsave(&h->lock, flags);
    741	list_for_each_entry(cur, &h->hash_list, hash_list) {
    742		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
    743			continue;
    744
    745		spin_lock(&cur->bio_list_lock);
    746
    747		/* Can we steal this cached rbio's pages? */
    748		if (bio_list_empty(&cur->bio_list) &&
    749		    list_empty(&cur->plug_list) &&
    750		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
    751		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
    752			list_del_init(&cur->hash_list);
    753			refcount_dec(&cur->refs);
    754
    755			steal_rbio(cur, rbio);
    756			cache_drop = cur;
    757			spin_unlock(&cur->bio_list_lock);
    758
    759			goto lockit;
    760		}
    761
    762		/* Can we merge into the lock owner? */
    763		if (rbio_can_merge(cur, rbio)) {
    764			merge_rbio(cur, rbio);
    765			spin_unlock(&cur->bio_list_lock);
    766			freeit = rbio;
    767			ret = 1;
    768			goto out;
    769		}
    770
    771
    772		/*
    773		 * We couldn't merge with the running rbio, see if we can merge
    774		 * with the pending ones.  We don't have to check for rmw_locked
    775		 * because there is no way they are inside finish_rmw right now
    776		 */
    777		list_for_each_entry(pending, &cur->plug_list, plug_list) {
    778			if (rbio_can_merge(pending, rbio)) {
    779				merge_rbio(pending, rbio);
    780				spin_unlock(&cur->bio_list_lock);
    781				freeit = rbio;
    782				ret = 1;
    783				goto out;
    784			}
    785		}
    786
    787		/*
    788		 * No merging, put us on the tail of the plug list, our rbio
    789		 * will be started when the currently running rbio unlocks
    790		 */
    791		list_add_tail(&rbio->plug_list, &cur->plug_list);
    792		spin_unlock(&cur->bio_list_lock);
    793		ret = 1;
    794		goto out;
    795	}
    796lockit:
    797	refcount_inc(&rbio->refs);
    798	list_add(&rbio->hash_list, &h->hash_list);
    799out:
    800	spin_unlock_irqrestore(&h->lock, flags);
    801	if (cache_drop)
    802		remove_rbio_from_cache(cache_drop);
    803	if (freeit)
    804		__free_raid_bio(freeit);
    805	return ret;
    806}
    807
    808/*
    809 * called as rmw or parity rebuild is completed.  If the plug list has more
    810 * rbios waiting for this stripe, the next one on the list will be started
    811 */
    812static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
    813{
    814	int bucket;
    815	struct btrfs_stripe_hash *h;
    816	unsigned long flags;
    817	int keep_cache = 0;
    818
    819	bucket = rbio_bucket(rbio);
    820	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
    821
    822	if (list_empty(&rbio->plug_list))
    823		cache_rbio(rbio);
    824
    825	spin_lock_irqsave(&h->lock, flags);
    826	spin_lock(&rbio->bio_list_lock);
    827
    828	if (!list_empty(&rbio->hash_list)) {
    829		/*
    830		 * if we're still cached and there is no other IO
    831		 * to perform, just leave this rbio here for others
    832		 * to steal from later
    833		 */
    834		if (list_empty(&rbio->plug_list) &&
    835		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
    836			keep_cache = 1;
    837			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
    838			BUG_ON(!bio_list_empty(&rbio->bio_list));
    839			goto done;
    840		}
    841
    842		list_del_init(&rbio->hash_list);
    843		refcount_dec(&rbio->refs);
    844
    845		/*
    846		 * we use the plug list to hold all the rbios
    847		 * waiting for the chance to lock this stripe.
    848		 * hand the lock over to one of them.
    849		 */
    850		if (!list_empty(&rbio->plug_list)) {
    851			struct btrfs_raid_bio *next;
    852			struct list_head *head = rbio->plug_list.next;
    853
    854			next = list_entry(head, struct btrfs_raid_bio,
    855					  plug_list);
    856
    857			list_del_init(&rbio->plug_list);
    858
    859			list_add(&next->hash_list, &h->hash_list);
    860			refcount_inc(&next->refs);
    861			spin_unlock(&rbio->bio_list_lock);
    862			spin_unlock_irqrestore(&h->lock, flags);
    863
    864			if (next->operation == BTRFS_RBIO_READ_REBUILD)
    865				start_async_work(next, read_rebuild_work);
    866			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
    867				steal_rbio(rbio, next);
    868				start_async_work(next, read_rebuild_work);
    869			} else if (next->operation == BTRFS_RBIO_WRITE) {
    870				steal_rbio(rbio, next);
    871				start_async_work(next, rmw_work);
    872			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
    873				steal_rbio(rbio, next);
    874				start_async_work(next, scrub_parity_work);
    875			}
    876
    877			goto done_nolock;
    878		}
    879	}
    880done:
    881	spin_unlock(&rbio->bio_list_lock);
    882	spin_unlock_irqrestore(&h->lock, flags);
    883
    884done_nolock:
    885	if (!keep_cache)
    886		remove_rbio_from_cache(rbio);
    887}
    888
    889static void __free_raid_bio(struct btrfs_raid_bio *rbio)
    890{
    891	int i;
    892
    893	if (!refcount_dec_and_test(&rbio->refs))
    894		return;
    895
    896	WARN_ON(!list_empty(&rbio->stripe_cache));
    897	WARN_ON(!list_empty(&rbio->hash_list));
    898	WARN_ON(!bio_list_empty(&rbio->bio_list));
    899
    900	for (i = 0; i < rbio->nr_pages; i++) {
    901		if (rbio->stripe_pages[i]) {
    902			__free_page(rbio->stripe_pages[i]);
    903			rbio->stripe_pages[i] = NULL;
    904		}
    905	}
    906
    907	btrfs_put_bioc(rbio->bioc);
    908	kfree(rbio);
    909}
    910
    911static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
    912{
    913	struct bio *next;
    914
    915	while (cur) {
    916		next = cur->bi_next;
    917		cur->bi_next = NULL;
    918		cur->bi_status = err;
    919		bio_endio(cur);
    920		cur = next;
    921	}
    922}
    923
    924/*
    925 * this frees the rbio and runs through all the bios in the
    926 * bio_list and calls end_io on them
    927 */
    928static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
    929{
    930	struct bio *cur = bio_list_get(&rbio->bio_list);
    931	struct bio *extra;
    932
    933	if (rbio->generic_bio_cnt)
    934		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
    935
    936	/*
    937	 * At this moment, rbio->bio_list is empty, however since rbio does not
    938	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
    939	 * hash list, rbio may be merged with others so that rbio->bio_list
    940	 * becomes non-empty.
    941	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
    942	 * more and we can call bio_endio() on all queued bios.
    943	 */
    944	unlock_stripe(rbio);
    945	extra = bio_list_get(&rbio->bio_list);
    946	__free_raid_bio(rbio);
    947
    948	rbio_endio_bio_list(cur, err);
    949	if (extra)
    950		rbio_endio_bio_list(extra, err);
    951}
    952
    953/*
    954 * end io function used by finish_rmw.  When we finally
    955 * get here, we've written a full stripe
    956 */
    957static void raid_write_end_io(struct bio *bio)
    958{
    959	struct btrfs_raid_bio *rbio = bio->bi_private;
    960	blk_status_t err = bio->bi_status;
    961	int max_errors;
    962
    963	if (err)
    964		fail_bio_stripe(rbio, bio);
    965
    966	bio_put(bio);
    967
    968	if (!atomic_dec_and_test(&rbio->stripes_pending))
    969		return;
    970
    971	err = BLK_STS_OK;
    972
    973	/* OK, we have finished writing all the stripes we need to. */
    974	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
    975		     0 : rbio->bioc->max_errors;
    976	if (atomic_read(&rbio->error) > max_errors)
    977		err = BLK_STS_IOERR;
    978
    979	rbio_orig_end_io(rbio, err);
    980}
    981
    982/**
    983 * Get a sector pointer specified by its @stripe_nr and @sector_nr
    984 *
    985 * @rbio:               The raid bio
    986 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
    987 * @sector_nr:		Sector number inside the stripe,
    988 *			valid range [0, stripe_nsectors)
    989 * @bio_list_only:      Whether to use sectors inside the bio list only.
    990 *
    991 * The read/modify/write code wants to reuse the original bio page as much
    992 * as possible, and only use stripe_sectors as fallback.
    993 */
    994static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
    995					 int stripe_nr, int sector_nr,
    996					 bool bio_list_only)
    997{
    998	struct sector_ptr *sector;
    999	int index;
   1000
   1001	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
   1002	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
   1003
   1004	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
   1005	ASSERT(index >= 0 && index < rbio->nr_sectors);
   1006
   1007	spin_lock_irq(&rbio->bio_list_lock);
   1008	sector = &rbio->bio_sectors[index];
   1009	if (sector->page || bio_list_only) {
   1010		/* Don't return sector without a valid page pointer */
   1011		if (!sector->page)
   1012			sector = NULL;
   1013		spin_unlock_irq(&rbio->bio_list_lock);
   1014		return sector;
   1015	}
   1016	spin_unlock_irq(&rbio->bio_list_lock);
   1017
   1018	return &rbio->stripe_sectors[index];
   1019}
   1020
   1021/*
   1022 * allocation and initial setup for the btrfs_raid_bio.  Note
   1023 * this does not allocate any pages for rbio->stripe_pages.
   1024 */
   1025static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
   1026					 struct btrfs_io_context *bioc,
   1027					 u32 stripe_len)
   1028{
   1029	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
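       	/*
       	 * num_tgtdevs counts extra dev-replace target devices; they are not
       	 * real stripes and are written separately via bioc->tgtdev_map.
       	 */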
   1030	const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
   1031	const unsigned int num_pages = stripe_npages * real_stripes;
   1032	const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
   1033	const unsigned int num_sectors = stripe_nsectors * real_stripes;
   1034	struct btrfs_raid_bio *rbio;
   1035	int nr_data = 0;
   1036	void *p;
   1037
   1038	ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
   1039	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
   1040	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
   1041
   1042	rbio = kzalloc(sizeof(*rbio) +
   1043		       sizeof(*rbio->stripe_pages) * num_pages +
   1044		       sizeof(*rbio->bio_sectors) * num_sectors +
   1045		       sizeof(*rbio->stripe_sectors) * num_sectors +
   1046		       sizeof(*rbio->finish_pointers) * real_stripes +
   1047		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
   1048		       sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
   1049		       GFP_NOFS);
   1050	if (!rbio)
   1051		return ERR_PTR(-ENOMEM);
   1052
   1053	bio_list_init(&rbio->bio_list);
   1054	INIT_LIST_HEAD(&rbio->plug_list);
   1055	spin_lock_init(&rbio->bio_list_lock);
   1056	INIT_LIST_HEAD(&rbio->stripe_cache);
   1057	INIT_LIST_HEAD(&rbio->hash_list);
   1058	rbio->bioc = bioc;
   1059	rbio->stripe_len = stripe_len;
   1060	rbio->nr_pages = num_pages;
   1061	rbio->nr_sectors = num_sectors;
   1062	rbio->real_stripes = real_stripes;
   1063	rbio->stripe_npages = stripe_npages;
   1064	rbio->stripe_nsectors = stripe_nsectors;
   1065	rbio->faila = -1;
   1066	rbio->failb = -1;
   1067	refcount_set(&rbio->refs, 1);
   1068	atomic_set(&rbio->error, 0);
   1069	atomic_set(&rbio->stripes_pending, 0);
   1070
   1071	/*
   1072	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
   1073	 * we allocated past the end of the rbio.
   1074	 */
   1075	p = rbio + 1;
   1076#define CONSUME_ALLOC(ptr, count)	do {				\
   1077		ptr = p;						\
   1078		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
   1079	} while (0)
   1080	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
   1081	CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
   1082	CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
   1083	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
   1084	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
   1085	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
   1086#undef  CONSUME_ALLOC
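       	/*
       	 * Layout of the single allocation, in CONSUME_ALLOC order:
       	 *   [struct btrfs_raid_bio][stripe_pages][bio_sectors][stripe_sectors]
       	 *   [finish_pointers][dbitmap][finish_pbitmap]
       	 */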
   1087
   1088	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
   1089		nr_data = real_stripes - 1;
   1090	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
   1091		nr_data = real_stripes - 2;
   1092	else
   1093		BUG();
   1094
   1095	rbio->nr_data = nr_data;
   1096	return rbio;
   1097}
   1098
   1099/* allocate pages for all the stripes in the bio, including parity */
   1100static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
   1101{
   1102	int ret;
   1103
   1104	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
   1105	if (ret < 0)
   1106		return ret;
   1107	/* Mapping all sectors */
   1108	index_stripe_sectors(rbio);
   1109	return 0;
   1110}
   1111
   1112/* only allocate pages for p/q stripes */
   1113static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
   1114{
   1115	const int data_pages = rbio->nr_data * rbio->stripe_npages;
   1116	int ret;
   1117
   1118	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
   1119				     rbio->stripe_pages + data_pages);
   1120	if (ret < 0)
   1121		return ret;
   1122
   1123	index_stripe_sectors(rbio);
   1124	return 0;
   1125}
   1126
   1127/*
   1128 * Add a single sector @sector into our list of bios for IO.
   1129 *
   1130 * Return 0 if everything went well.
   1131 * Return <0 for error.
   1132 */
   1133static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
   1134			      struct bio_list *bio_list,
   1135			      struct sector_ptr *sector,
   1136			      unsigned int stripe_nr,
   1137			      unsigned int sector_nr,
   1138			      unsigned long bio_max_len,
   1139			      unsigned int opf)
   1140{
   1141	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
   1142	struct bio *last = bio_list->tail;
   1143	int ret;
   1144	struct bio *bio;
   1145	struct btrfs_io_stripe *stripe;
   1146	u64 disk_start;
   1147
   1148	/*
   1149	 * Note: here stripe_nr has taken device replace into consideration,
   1150	 * thus it can be larger than rbio->real_stripes.
   1151	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
   1152	 */
   1153	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
   1154	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
   1155	ASSERT(sector->page);
   1156
   1157	stripe = &rbio->bioc->stripes[stripe_nr];
   1158	disk_start = stripe->physical + sector_nr * sectorsize;
   1159
   1160	/* if the device is missing, just fail this stripe */
   1161	if (!stripe->dev->bdev)
   1162		return fail_rbio_index(rbio, stripe_nr);
   1163
   1164	/* see if we can add this page onto our existing bio */
   1165	if (last) {
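       		/* bi_sector is in 512-byte units, so << 9 converts it to a byte address */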
   1166		u64 last_end = last->bi_iter.bi_sector << 9;
   1167		last_end += last->bi_iter.bi_size;
   1168
   1169		/*
   1170		 * we can't merge these if they are from different
   1171		 * devices or if they are not contiguous
   1172		 */
   1173		if (last_end == disk_start && !last->bi_status &&
   1174		    last->bi_bdev == stripe->dev->bdev) {
   1175			ret = bio_add_page(last, sector->page, sectorsize,
   1176					   sector->pgoff);
   1177			if (ret == sectorsize)
   1178				return 0;
   1179		}
   1180	}
   1181
   1182	/* put a new bio on the list */
   1183	bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
   1184			opf, GFP_NOFS);
   1185	bio->bi_iter.bi_sector = disk_start >> 9;
   1186	bio->bi_private = rbio;
   1187
   1188	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
   1189	bio_list_add(bio_list, bio);
   1190	return 0;
   1191}
   1192
   1193/*
   1194 * while we're doing the read/modify/write cycle, we could
   1195 * have errors in reading pages off the disk.  This checks
   1196 * for errors and if we're not able to read the page it'll
   1197 * trigger parity reconstruction.  The rmw will be finished
   1198 * after we've reconstructed the failed stripes
   1199 */
   1200static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
   1201{
   1202	if (rbio->faila >= 0 || rbio->failb >= 0) {
   1203		BUG_ON(rbio->faila == rbio->real_stripes - 1);
   1204		__raid56_parity_recover(rbio);
   1205	} else {
   1206		finish_rmw(rbio);
   1207	}
   1208}
   1209
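       /*
        * Map every sector covered by @bio into rbio->bio_sectors[].  The array
        * index is the sector's byte offset from the start of the full stripe
        * (bioc->raid_map[0]) divided by sectorsize.
        */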
   1210static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
   1211{
   1212	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
   1213	struct bio_vec bvec;
   1214	struct bvec_iter iter;
   1215	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
   1216		     rbio->bioc->raid_map[0];
   1217
   1218	if (bio_flagged(bio, BIO_CLONED))
   1219		bio->bi_iter = btrfs_bio(bio)->iter;
   1220
   1221	bio_for_each_segment(bvec, bio, iter) {
   1222		u32 bvec_offset;
   1223
   1224		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
   1225		     bvec_offset += sectorsize, offset += sectorsize) {
   1226			int index = offset / sectorsize;
   1227			struct sector_ptr *sector = &rbio->bio_sectors[index];
   1228
   1229			sector->page = bvec.bv_page;
   1230			sector->pgoff = bvec.bv_offset + bvec_offset;
   1231			ASSERT(sector->pgoff < PAGE_SIZE);
   1232		}
   1233	}
   1234}
   1235
   1236/*
   1237 * helper function to walk our bio list and populate the bio_sectors array with
   1238 * the result.  This seems expensive, but it is faster than constantly
   1239 * searching through the bio list as we setup the IO in finish_rmw or stripe
   1240 * reconstruction.
   1241 *
   1242 * This must be called before you trust the answers from sector_in_rbio()
   1243 */
   1244static void index_rbio_pages(struct btrfs_raid_bio *rbio)
   1245{
   1246	struct bio *bio;
   1247
   1248	spin_lock_irq(&rbio->bio_list_lock);
   1249	bio_list_for_each(bio, &rbio->bio_list)
   1250		index_one_bio(rbio, bio);
   1251
   1252	spin_unlock_irq(&rbio->bio_list_lock);
   1253}
   1254
   1255/*
   1256 * this is called from one of two situations.  We either
   1257 * have a full stripe from the higher layers, or we've read all
   1258 * the missing bits off disk.
   1259 *
   1260 * This will calculate the parity and then send down any
   1261 * changed blocks.
   1262 */
   1263static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
   1264{
   1265	struct btrfs_io_context *bioc = rbio->bioc;
   1266	const u32 sectorsize = bioc->fs_info->sectorsize;
   1267	void **pointers = rbio->finish_pointers;
   1268	int nr_data = rbio->nr_data;
   1269	int stripe;
   1270	int sectornr;
   1271	bool has_qstripe;
   1272	struct bio_list bio_list;
   1273	struct bio *bio;
   1274	int ret;
   1275
   1276	bio_list_init(&bio_list);
   1277
   1278	if (rbio->real_stripes - rbio->nr_data == 1)
   1279		has_qstripe = false;
   1280	else if (rbio->real_stripes - rbio->nr_data == 2)
   1281		has_qstripe = true;
   1282	else
   1283		BUG();
   1284
   1285	/* at this point we either have a full stripe,
   1286	 * or we've read the full stripe from the drive.
   1287	 * recalculate the parity and write the new results.
   1288	 *
   1289	 * We're not allowed to add any new bios to the
   1290	 * bio list here, anyone else that wants to
   1291	 * change this stripe needs to do their own rmw.
   1292	 */
   1293	spin_lock_irq(&rbio->bio_list_lock);
   1294	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
   1295	spin_unlock_irq(&rbio->bio_list_lock);
   1296
   1297	atomic_set(&rbio->error, 0);
   1298
   1299	/*
   1300	 * now that we've set rmw_locked, run through the
   1301	 * bio list one last time and map the page pointers
   1302	 *
   1303	 * We don't cache full rbios because we're assuming
   1304	 * the higher layers are unlikely to use this area of
   1305	 * the disk again soon.  If they do use it again,
   1306	 * hopefully they will send another full bio.
   1307	 */
   1308	index_rbio_pages(rbio);
   1309	if (!rbio_is_full(rbio))
   1310		cache_rbio_pages(rbio);
   1311	else
   1312		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
   1313
   1314	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
   1315		struct sector_ptr *sector;
   1316
   1317		/* First collect one sector from each data stripe */
   1318		for (stripe = 0; stripe < nr_data; stripe++) {
   1319			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
   1320			pointers[stripe] = kmap_local_page(sector->page) +
   1321					   sector->pgoff;
   1322		}
   1323
   1324		/* Then add the parity stripe */
   1325		sector = rbio_pstripe_sector(rbio, sectornr);
   1326		sector->uptodate = 1;
   1327		pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
   1328
   1329		if (has_qstripe) {
   1330			/*
   1331			 * RAID6, add the qstripe and call the library function
   1332			 * to fill in our p/q
   1333			 */
   1334			sector = rbio_qstripe_sector(rbio, sectornr);
   1335			sector->uptodate = 1;
   1336			pointers[stripe++] = kmap_local_page(sector->page) +
   1337					     sector->pgoff;
   1338
   1339			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
   1340						pointers);
   1341		} else {
   1342			/* raid5 */
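       			/*
       			 * Seed P with the first data sector, then XOR the
       			 * remaining nr_data - 1 data sectors into it.
       			 */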
   1343			memcpy(pointers[nr_data], pointers[0], sectorsize);
   1344			run_xor(pointers + 1, nr_data - 1, sectorsize);
   1345		}
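       		/* kmap_local mappings must be released in reverse order */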
   1346		for (stripe = stripe - 1; stripe >= 0; stripe--)
   1347			kunmap_local(pointers[stripe]);
   1348	}
   1349
   1350	/*
   1351	 * time to start writing.  Make bios for everything from the
   1352	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
   1353	 * everything else.
   1354	 */
   1355	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
   1356		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
   1357			struct sector_ptr *sector;
   1358
   1359			if (stripe < rbio->nr_data) {
   1360				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
   1361				if (!sector)
   1362					continue;
   1363			} else {
   1364				sector = rbio_stripe_sector(rbio, stripe, sectornr);
   1365			}
   1366
   1367			ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
   1368						 sectornr, rbio->stripe_len,
   1369						 REQ_OP_WRITE);
   1370			if (ret)
   1371				goto cleanup;
   1372		}
   1373	}
   1374
   1375	if (likely(!bioc->num_tgtdevs))
   1376		goto write_data;
   1377
   1378	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
   1379		if (!bioc->tgtdev_map[stripe])
   1380			continue;
   1381
   1382		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
   1383			struct sector_ptr *sector;
   1384
   1385			if (stripe < rbio->nr_data) {
   1386				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
   1387				if (!sector)
   1388					continue;
   1389			} else {
   1390				sector = rbio_stripe_sector(rbio, stripe, sectornr);
   1391			}
   1392
   1393			ret = rbio_add_io_sector(rbio, &bio_list, sector,
   1394					       rbio->bioc->tgtdev_map[stripe],
   1395					       sectornr, rbio->stripe_len,
   1396					       REQ_OP_WRITE);
   1397			if (ret)
   1398				goto cleanup;
   1399		}
   1400	}
   1401
   1402write_data:
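       	/*
       	 * stripes_pending counts the bios we are about to submit;
       	 * raid_write_end_io() drops one per completion and finishes the
       	 * rbio once it reaches zero.
       	 */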
   1403	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
   1404	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
   1405
   1406	while ((bio = bio_list_pop(&bio_list))) {
   1407		bio->bi_end_io = raid_write_end_io;
   1408
   1409		submit_bio(bio);
   1410	}
   1411	return;
   1412
   1413cleanup:
   1414	rbio_orig_end_io(rbio, BLK_STS_IOERR);
   1415
   1416	while ((bio = bio_list_pop(&bio_list)))
   1417		bio_put(bio);
   1418}
   1419
   1420/*
   1421 * helper to find the stripe number for a given bio.  Used to figure out which
   1422 * stripe has failed.  This expects the bio to correspond to a physical disk,
   1423 * so it looks up based on physical sector numbers.
   1424 */
   1425static int find_bio_stripe(struct btrfs_raid_bio *rbio,
   1426			   struct bio *bio)
   1427{
   1428	u64 physical = bio->bi_iter.bi_sector;
   1429	int i;
   1430	struct btrfs_io_stripe *stripe;
   1431
   1432	physical <<= 9;
   1433
   1434	for (i = 0; i < rbio->bioc->num_stripes; i++) {
   1435		stripe = &rbio->bioc->stripes[i];
   1436		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
   1437		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
   1438			return i;
   1439		}
   1440	}
   1441	return -1;
   1442}
   1443
   1444/*
   1445 * helper to find the stripe number for a given
   1446 * bio (before mapping).  Used to figure out which stripe has
   1447 * failed.  This looks up based on logical block numbers.
   1448 */
   1449static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
   1450				   struct bio *bio)
   1451{
   1452	u64 logical = bio->bi_iter.bi_sector << 9;
   1453	int i;
   1454
   1455	for (i = 0; i < rbio->nr_data; i++) {
   1456		u64 stripe_start = rbio->bioc->raid_map[i];
   1457
   1458		if (in_range(logical, stripe_start, rbio->stripe_len))
   1459			return i;
   1460	}
   1461	return -1;
   1462}
   1463
   1464/*
   1465 * returns -EIO if we had too many failures
   1466 */
   1467static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
   1468{
   1469	unsigned long flags;
   1470	int ret = 0;
   1471
   1472	spin_lock_irqsave(&rbio->bio_list_lock, flags);
   1473
   1474	/* we already know this stripe is bad, move on */
   1475	if (rbio->faila == failed || rbio->failb == failed)
   1476		goto out;
   1477
   1478	if (rbio->faila == -1) {
   1479		/* first failure on this rbio */
   1480		rbio->faila = failed;
   1481		atomic_inc(&rbio->error);
   1482	} else if (rbio->failb == -1) {
   1483		/* second failure on this rbio */
   1484		rbio->failb = failed;
   1485		atomic_inc(&rbio->error);
   1486	} else {
   1487		ret = -EIO;
   1488	}
   1489out:
   1490	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
   1491
   1492	return ret;
   1493}
   1494
   1495/*
   1496 * helper to fail a stripe based on a physical disk
   1497 * bio.
   1498 */
   1499static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
   1500			   struct bio *bio)
   1501{
   1502	int failed = find_bio_stripe(rbio, bio);
   1503
   1504	if (failed < 0)
   1505		return -EIO;
   1506
   1507	return fail_rbio_index(rbio, failed);
   1508}
   1509
   1510/*
   1511 * For subpage case, we can no longer set page Uptodate directly for
   1512 * stripe_pages[], thus we need to locate the sector.
   1513 */
   1514static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
   1515					     struct page *page,
   1516					     unsigned int pgoff)
   1517{
   1518	int i;
   1519
   1520	for (i = 0; i < rbio->nr_sectors; i++) {
   1521		struct sector_ptr *sector = &rbio->stripe_sectors[i];
   1522
   1523		if (sector->page == page && sector->pgoff == pgoff)
   1524			return sector;
   1525	}
   1526	return NULL;
   1527}
   1528
   1529/*
   1530 * this sets each sector in the bio uptodate.  It should only be used on private
   1531 * rbio pages, nothing that comes in from the higher layers
   1532 */
   1533static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
   1534{
   1535	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
   1536	struct bio_vec *bvec;
   1537	struct bvec_iter_all iter_all;
   1538
   1539	ASSERT(!bio_flagged(bio, BIO_CLONED));
   1540
   1541	bio_for_each_segment_all(bvec, bio, iter_all) {
   1542		struct sector_ptr *sector;
   1543		int pgoff;
   1544
   1545		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
   1546		     pgoff += sectorsize) {
   1547			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
   1548			ASSERT(sector);
   1549			if (sector)
   1550				sector->uptodate = 1;
   1551		}
   1552	}
   1553}
   1554
   1555/*
   1556 * end io for the read phase of the rmw cycle.  All the bios here are physical
   1557 * stripe bios we've read from the disk so we can recalculate the parity of the
   1558 * stripe.
   1559 *
   1560 * This will usually kick off finish_rmw once all the bios are read in, but it
   1561 * may trigger parity reconstruction if we had any errors along the way
   1562 */
   1563static void raid_rmw_end_io(struct bio *bio)
   1564{
   1565	struct btrfs_raid_bio *rbio = bio->bi_private;
   1566
   1567	if (bio->bi_status)
   1568		fail_bio_stripe(rbio, bio);
   1569	else
   1570		set_bio_pages_uptodate(rbio, bio);
   1571
   1572	bio_put(bio);
   1573
   1574	if (!atomic_dec_and_test(&rbio->stripes_pending))
   1575		return;
   1576
   1577	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
   1578		goto cleanup;
   1579
   1580	/*
   1581	 * this will normally call finish_rmw to start our write
   1582	 * but if there are any failed stripes we'll reconstruct
   1583	 * from parity first
   1584	 */
   1585	validate_rbio_for_rmw(rbio);
   1586	return;
   1587
   1588cleanup:
   1589
   1590	rbio_orig_end_io(rbio, BLK_STS_IOERR);
   1591}
   1592
   1593/*
   1594 * the stripe must be locked by the caller.  It will
   1595 * unlock after all the writes are done
   1596 */
   1597static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
   1598{
   1599	int bios_to_read = 0;
   1600	struct bio_list bio_list;
   1601	int ret;
   1602	int sectornr;
   1603	int stripe;
   1604	struct bio *bio;
   1605
   1606	bio_list_init(&bio_list);
   1607
   1608	ret = alloc_rbio_pages(rbio);
   1609	if (ret)
   1610		goto cleanup;
   1611
   1612	index_rbio_pages(rbio);
   1613
   1614	atomic_set(&rbio->error, 0);
   1615	/*
   1616	 * build a list of bios to read all the missing parts of this
   1617	 * stripe
   1618	 */
   1619	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
   1620		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
   1621			struct sector_ptr *sector;
   1622
   1623			/*
   1624			 * We want to find all the sectors missing from the
   1625			 * rbio and read them from the disk.  If sector_in_rbio()
   1626			 * finds a page in the bio list we don't need to read
   1627			 * it off the stripe.
   1628			 */
   1629			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
   1630			if (sector)
   1631				continue;
   1632
   1633			sector = rbio_stripe_sector(rbio, stripe, sectornr);
   1634			/*
   1635			 * The bio cache may have handed us an uptodate sector.
   1636			 * If so, be happy and use it.
   1637			 */
   1638			if (sector->uptodate)
   1639				continue;
   1640
   1641			ret = rbio_add_io_sector(rbio, &bio_list, sector,
   1642				       stripe, sectornr, rbio->stripe_len,
   1643				       REQ_OP_READ);
   1644			if (ret)
   1645				goto cleanup;
   1646		}
   1647	}
   1648
   1649	bios_to_read = bio_list_size(&bio_list);
   1650	if (!bios_to_read) {
   1651		/*
   1652		 * this can happen if others have merged with
   1653		 * us, it means there is nothing left to read.
   1654		 * But if there are missing devices it may not be
   1655		 * safe to do the full stripe write yet.
   1656		 */
   1657		goto finish;
   1658	}
   1659
   1660	/*
   1661	 * The bioc may be freed once we submit the last bio. Make sure not to
   1662	 * touch it after that.
   1663	 */
   1664	atomic_set(&rbio->stripes_pending, bios_to_read);
   1665	while ((bio = bio_list_pop(&bio_list))) {
   1666		bio->bi_end_io = raid_rmw_end_io;
   1667
   1668		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
   1669
   1670		submit_bio(bio);
   1671	}
   1672	/* the actual write will happen once the reads are done */
   1673	return 0;
   1674
   1675cleanup:
   1676	rbio_orig_end_io(rbio, BLK_STS_IOERR);
   1677
   1678	while ((bio = bio_list_pop(&bio_list)))
   1679		bio_put(bio);
   1680
   1681	return -EIO;
   1682
   1683finish:
   1684	validate_rbio_for_rmw(rbio);
   1685	return 0;
   1686}
   1687
   1688/*
   1689 * if the upper layers pass in a full stripe, we thank them by only allocating
   1690 * enough pages to hold the parity, and sending it all down quickly.
   1691 */
   1692static int full_stripe_write(struct btrfs_raid_bio *rbio)
   1693{
   1694	int ret;
   1695
   1696	ret = alloc_rbio_parity_pages(rbio);
   1697	if (ret) {
   1698		__free_raid_bio(rbio);
   1699		return ret;
   1700	}
   1701
   1702	ret = lock_stripe_add(rbio);
   1703	if (ret == 0)
   1704		finish_rmw(rbio);
   1705	return 0;
   1706}
   1707
   1708/*
   1709 * partial stripe writes get handed over to async helpers.
   1710 * We're really hoping to merge a few more writes into this
   1711 * rbio before calculating new parity
   1712 */
   1713static int partial_stripe_write(struct btrfs_raid_bio *rbio)
   1714{
   1715	int ret;
   1716
   1717	ret = lock_stripe_add(rbio);
   1718	if (ret == 0)
   1719		start_async_work(rbio, rmw_work);
   1720	return 0;
   1721}
   1722
   1723/*
   1724 * sometimes while we were reading from the drive to
   1725 * recalculate parity, enough new bios come in to create
   1726 * a full stripe.  So we do a check here to see if we can
   1727 * go directly to finish_rmw
   1728 */
   1729static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
   1730{
   1731	/* head off into rmw land if we don't have a full stripe */
   1732	if (!rbio_is_full(rbio))
   1733		return partial_stripe_write(rbio);
   1734	return full_stripe_write(rbio);
   1735}
   1736
   1737/*
   1738 * We use plugging callbacks to collect full stripes.
   1739 * Any time we get a partial stripe write while plugged
   1740 * we collect it into a list.  When the unplug comes down,
   1741 * we sort the list by logical block number and merge
   1742 * everything we can into the same rbios
   1743 */
   1744struct btrfs_plug_cb {
   1745	struct blk_plug_cb cb;
   1746	struct btrfs_fs_info *info;
   1747	struct list_head rbio_list;
   1748	struct work_struct work;
   1749};
   1750
   1751/*
   1752 * rbios on the plug list are sorted for easier merging.
   1753 */
   1754static int plug_cmp(void *priv, const struct list_head *a,
   1755		    const struct list_head *b)
   1756{
   1757	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
   1758						       plug_list);
   1759	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
   1760						       plug_list);
   1761	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
   1762	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
   1763
   1764	if (a_sector < b_sector)
   1765		return -1;
   1766	if (a_sector > b_sector)
   1767		return 1;
   1768	return 0;
   1769}
   1770
   1771static void run_plug(struct btrfs_plug_cb *plug)
   1772{
   1773	struct btrfs_raid_bio *cur;
   1774	struct btrfs_raid_bio *last = NULL;
   1775
   1776	/*
   1777	 * sort our plug list then try to merge
   1778	 * everything we can in hopes of creating full
   1779	 * stripes.
   1780	 */
   1781	list_sort(NULL, &plug->rbio_list, plug_cmp);
   1782	while (!list_empty(&plug->rbio_list)) {
   1783		cur = list_entry(plug->rbio_list.next,
   1784				 struct btrfs_raid_bio, plug_list);
   1785		list_del_init(&cur->plug_list);
   1786
   1787		if (rbio_is_full(cur)) {
   1788			int ret;
   1789
   1790			/* we have a full stripe, send it down */
   1791			ret = full_stripe_write(cur);
   1792			BUG_ON(ret);
   1793			continue;
   1794		}
   1795		if (last) {
   1796			if (rbio_can_merge(last, cur)) {
   1797				merge_rbio(last, cur);
   1798				__free_raid_bio(cur);
   1799				continue;
   1800
   1801			}
   1802			__raid56_parity_write(last);
   1803		}
   1804		last = cur;
   1805	}
   1806	if (last) {
   1807		__raid56_parity_write(last);
   1808	}
   1809	kfree(plug);
   1810}
   1811
   1812/*
   1813 * if the unplug comes from schedule, we have to push the
   1814 * work off to a helper thread
   1815 */
   1816static void unplug_work(struct work_struct *work)
   1817{
   1818	struct btrfs_plug_cb *plug;
   1819	plug = container_of(work, struct btrfs_plug_cb, work);
   1820	run_plug(plug);
   1821}
   1822
   1823static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
   1824{
   1825	struct btrfs_plug_cb *plug;
   1826	plug = container_of(cb, struct btrfs_plug_cb, cb);
   1827
   1828	if (from_schedule) {
   1829		INIT_WORK(&plug->work, unplug_work);
   1830		queue_work(plug->info->rmw_workers, &plug->work);
   1831		return;
   1832	}
   1833	run_plug(plug);
   1834}
   1835
   1836/*
   1837 * our main entry point for writes from the rest of the FS.
   1838 */
   1839int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
   1840{
   1841	struct btrfs_fs_info *fs_info = bioc->fs_info;
   1842	struct btrfs_raid_bio *rbio;
   1843	struct btrfs_plug_cb *plug = NULL;
   1844	struct blk_plug_cb *cb;
   1845	int ret;
   1846
   1847	rbio = alloc_rbio(fs_info, bioc, stripe_len);
   1848	if (IS_ERR(rbio)) {
   1849		btrfs_put_bioc(bioc);
   1850		return PTR_ERR(rbio);
   1851	}
   1852	bio_list_add(&rbio->bio_list, bio);
   1853	rbio->bio_list_bytes = bio->bi_iter.bi_size;
   1854	rbio->operation = BTRFS_RBIO_WRITE;
   1855
   1856	btrfs_bio_counter_inc_noblocked(fs_info);
   1857	rbio->generic_bio_cnt = 1;
   1858
   1859	/*
   1860	 * don't plug on full rbios, just get them out the door
   1861	 * as quickly as we can
   1862	 */
   1863	if (rbio_is_full(rbio)) {
   1864		ret = full_stripe_write(rbio);
   1865		if (ret)
   1866			btrfs_bio_counter_dec(fs_info);
   1867		return ret;
   1868	}
   1869
   1870	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
   1871	if (cb) {
   1872		plug = container_of(cb, struct btrfs_plug_cb, cb);
   1873		if (!plug->info) {
   1874			plug->info = fs_info;
   1875			INIT_LIST_HEAD(&plug->rbio_list);
   1876		}
   1877		list_add_tail(&rbio->plug_list, &plug->rbio_list);
   1878		ret = 0;
   1879	} else {
   1880		ret = __raid56_parity_write(rbio);
   1881		if (ret)
   1882			btrfs_bio_counter_dec(fs_info);
   1883	}
   1884	return ret;
   1885}
   1886
   1887/*
   1888 * all parity reconstruction happens here.  We've read in everything
   1889 * we can find from the drives and this does the heavy lifting of
   1890 * sorting the good from the bad.
   1891 */
   1892static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
   1893{
   1894	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
   1895	int sectornr, stripe;
   1896	void **pointers;
   1897	void **unmap_array;
   1898	int faila = -1, failb = -1;
   1899	blk_status_t err;
   1900	int i;
   1901
   1902	/*
   1903	 * This array stores a pointer for each sector; each pointer already
   1904	 * has the sector's pgoff added in.
   1905	 */
   1906	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
   1907	if (!pointers) {
   1908		err = BLK_STS_RESOURCE;
   1909		goto cleanup_io;
   1910	}
   1911
   1912	/*
   1913	 * Store a copy of the pointers that does not get reordered during
   1914	 * reconstruction, so that kunmap_local() works.
   1915	 */
   1916	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
   1917	if (!unmap_array) {
   1918		err = BLK_STS_RESOURCE;
   1919		goto cleanup_pointers;
   1920	}
   1921
   1922	faila = rbio->faila;
   1923	failb = rbio->failb;
   1924
   1925	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
   1926	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
   1927		spin_lock_irq(&rbio->bio_list_lock);
   1928		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
   1929		spin_unlock_irq(&rbio->bio_list_lock);
   1930	}
   1931
   1932	index_rbio_pages(rbio);
   1933
   1934	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
   1935		struct sector_ptr *sector;
   1936
   1937		/*
   1938		 * Now we just use bitmap to mark the horizontal stripes in
   1939		 * which we have data when doing parity scrub.
   1940		 */
   1941		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
   1942		    !test_bit(sectornr, rbio->dbitmap))
   1943			continue;
   1944
   1945		/*
   1946		 * Setup our array of pointers with sectors from each stripe
   1947		 *
   1948		 * NOTE: store a duplicate array of pointers to preserve the
   1949		 * pointer order
   1950		 */
   1951		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
   1952			/*
   1953			 * If we're rebuilding a read, we have to use
   1954			 * pages from the bio list
   1955			 */
   1956			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
   1957			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
   1958			    (stripe == faila || stripe == failb)) {
   1959				sector = sector_in_rbio(rbio, stripe, sectornr, 0);
   1960			} else {
   1961				sector = rbio_stripe_sector(rbio, stripe, sectornr);
   1962			}
   1963			ASSERT(sector->page);
   1964			pointers[stripe] = kmap_local_page(sector->page) +
   1965					   sector->pgoff;
   1966			unmap_array[stripe] = pointers[stripe];
   1967		}
   1968
   1969		/* All raid6 handling here */
   1970		if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
   1971			/* Single failure, rebuild from parity raid5 style */
   1972			if (failb < 0) {
   1973				if (faila == rbio->nr_data) {
   1974					/*
   1975					 * Just the P stripe has failed, without
   1976					 * a bad data or Q stripe.
   1977					 * TODO, we should redo the xor here.
   1978					 */
   1979					err = BLK_STS_IOERR;
   1980					goto cleanup;
   1981				}
   1982				/*
   1983				 * a single failure in raid6 is rebuilt
   1984				 * in the pstripe code below
   1985				 */
   1986				goto pstripe;
   1987			}
   1988
   1989			/* make sure our ps and qs are in order */
   1990			if (faila > failb)
   1991				swap(faila, failb);
   1992
   1993			/* If the Q stripe failed, do a P-stripe reconstruction
   1994			 * from the xors.
   1995			 * If both the Q stripe and the P stripe failed, we're
   1996			 * here due to a crc mismatch and we can't give them the
   1997			 * data they want.
   1998			 */
   1999			if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
   2000				if (rbio->bioc->raid_map[faila] ==
   2001				    RAID5_P_STRIPE) {
   2002					err = BLK_STS_IOERR;
   2003					goto cleanup;
   2004				}
   2005				/*
   2006				 * otherwise we have one bad data stripe and
   2007				 * a good P stripe.  raid5!
   2008				 */
   2009				goto pstripe;
   2010			}
   2011
   2012			if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
   2013				raid6_datap_recov(rbio->real_stripes,
   2014						  sectorsize, faila, pointers);
   2015			} else {
   2016				raid6_2data_recov(rbio->real_stripes,
   2017						  sectorsize, faila, failb,
   2018						  pointers);
   2019			}
   2020		} else {
   2021			void *p;
   2022
   2023			/* rebuild from P stripe here (raid5 or raid6) */
   2024			BUG_ON(failb != -1);
   2025pstripe:
   2026			/* Copy parity block into failed block to start with */
   2027			memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
   2028
   2029			/* rearrange the pointer array */
   2030			p = pointers[faila];
   2031			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
   2032				pointers[stripe] = pointers[stripe + 1];
   2033			pointers[rbio->nr_data - 1] = p;
   2034
   2035			/* xor in the rest */
   2036			run_xor(pointers, rbio->nr_data - 1, sectorsize);
   2037		}
   2038		/* if we're doing this rebuild as part of an rmw, go through
   2039		 * and set all of our private rbio pages in the
   2040		 * failed stripes as uptodate.  This way finish_rmw will
   2041		 * know they can be trusted.  If this was a read reconstruction,
   2042		 * other endio functions will fiddle the uptodate bits
   2043		 */
   2044		if (rbio->operation == BTRFS_RBIO_WRITE) {
   2045			for (i = 0;  i < rbio->stripe_nsectors; i++) {
   2046				if (faila != -1) {
   2047					sector = rbio_stripe_sector(rbio, faila, i);
   2048					sector->uptodate = 1;
   2049				}
   2050				if (failb != -1) {
   2051					sector = rbio_stripe_sector(rbio, failb, i);
   2052					sector->uptodate = 1;
   2053				}
   2054			}
   2055		}
   2056		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
   2057			kunmap_local(unmap_array[stripe]);
   2058	}
   2059
   2060	err = BLK_STS_OK;
   2061cleanup:
   2062	kfree(unmap_array);
   2063cleanup_pointers:
   2064	kfree(pointers);
   2065
   2066cleanup_io:
   2067	/*
   2068	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
   2069	 * valid rbio which is consistent with on-disk content, thus such a
   2070	 * valid rbio can be cached to avoid further disk reads.
   2071	 */
   2072	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
   2073	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
   2074		/*
   2075		 * - In case of two failures, where rbio->failb != -1:
   2076		 *
   2077		 *   Do not cache this rbio since the above read reconstruction
   2078		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
   2079		 *   changed some content of stripes which are not identical to
   2080		 *   on-disk content any more, otherwise, a later write/recover
   2081		 *   may steal stripe_pages from this rbio and end up with
   2082		 *   corruptions or rebuild failures.
   2083		 *
   2084		 * - In case of single failure, where rbio->failb == -1:
   2085		 *
   2086		 *   Cache this rbio iff the above read reconstruction is
   2087		 *   executed without problems.
   2088		 */
   2089		if (err == BLK_STS_OK && rbio->failb < 0)
   2090			cache_rbio_pages(rbio);
   2091		else
   2092			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
   2093
   2094		rbio_orig_end_io(rbio, err);
   2095	} else if (err == BLK_STS_OK) {
   2096		rbio->faila = -1;
   2097		rbio->failb = -1;
   2098
   2099		if (rbio->operation == BTRFS_RBIO_WRITE)
   2100			finish_rmw(rbio);
   2101		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
   2102			finish_parity_scrub(rbio, 0);
   2103		else
   2104			BUG();
   2105	} else {
   2106		rbio_orig_end_io(rbio, err);
   2107	}
   2108}
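
/*
 * Illustrative sketch only, not part of the original file: the P-stripe
 * (RAID5 style) rebuild performed by __raid_recover_end_io() above, written
 * against plain buffers.  A missing data block equals the parity XOR-ed with
 * every surviving data block; the helper name is hypothetical and the pointer
 * rotation used above is replaced by a direct "skip the failed slot" loop.
 */
static inline void raid5_sketch_rebuild_data(void **data, int nr_data,
					     int faila, const u8 *parity,
					     u32 sectorsize)
{
	u8 *out = data[faila];
	int stripe;
	u32 i;

	/* start from the parity block ... */
	memcpy(out, parity, sectorsize);

	/* ... and XOR in every data block that did not fail */
	for (stripe = 0; stripe < nr_data; stripe++) {
		if (stripe == faila)
			continue;
		for (i = 0; i < sectorsize; i++)
			out[i] ^= ((const u8 *)data[stripe])[i];
	}
}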
   2109
   2110/*
   2111 * This is called only for stripes we've read from disk to
   2112 * reconstruct the parity.
   2113 */
   2114static void raid_recover_end_io(struct bio *bio)
   2115{
   2116	struct btrfs_raid_bio *rbio = bio->bi_private;
   2117
   2118	/*
   2119	 * we only read stripe pages off the disk, set them
   2120	 * up to date if there were no errors
   2121	 */
   2122	if (bio->bi_status)
   2123		fail_bio_stripe(rbio, bio);
   2124	else
   2125		set_bio_pages_uptodate(rbio, bio);
   2126	bio_put(bio);
   2127
   2128	if (!atomic_dec_and_test(&rbio->stripes_pending))
   2129		return;
   2130
   2131	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
   2132		rbio_orig_end_io(rbio, BLK_STS_IOERR);
   2133	else
   2134		__raid_recover_end_io(rbio);
   2135}
   2136
   2137/*
   2138 * reads everything we need off the disk to reconstruct
   2139 * the parity. endio handlers trigger final reconstruction
   2140 * when the IO is done.
   2141 *
   2142 * This is used both for reads from the higher layers and for
   2143 * parity construction required to finish an rmw cycle.
   2144 */
   2145static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
   2146{
   2147	int bios_to_read = 0;
   2148	struct bio_list bio_list;
   2149	int ret;
   2150	int sectornr;
   2151	int stripe;
   2152	struct bio *bio;
   2153
   2154	bio_list_init(&bio_list);
   2155
   2156	ret = alloc_rbio_pages(rbio);
   2157	if (ret)
   2158		goto cleanup;
   2159
   2160	atomic_set(&rbio->error, 0);
   2161
   2162	/*
   2163	 * read everything that hasn't failed.  Thanks to the
   2164	 * stripe cache, it is possible that some or all of these
   2165	 * pages are going to be uptodate.
   2166	 */
   2167	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
   2168		if (rbio->faila == stripe || rbio->failb == stripe) {
   2169			atomic_inc(&rbio->error);
   2170			continue;
   2171		}
   2172
   2173		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
   2174			struct sector_ptr *sector;
   2175
   2176			/*
   2177			 * the rmw code may have already read this
   2178			 * page in
   2179			 */
   2180			sector = rbio_stripe_sector(rbio, stripe, sectornr);
   2181			if (sector->uptodate)
   2182				continue;
   2183
   2184			ret = rbio_add_io_sector(rbio, &bio_list, sector,
   2185						 stripe, sectornr, rbio->stripe_len,
   2186						 REQ_OP_READ);
   2187			if (ret < 0)
   2188				goto cleanup;
   2189		}
   2190	}
   2191
   2192	bios_to_read = bio_list_size(&bio_list);
   2193	if (!bios_to_read) {
   2194		/*
   2195		 * we might have no bios to read just because the pages
   2196		 * were up to date, or we might have no bios to read because
   2197		 * the devices were gone.
   2198		 */
   2199		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
   2200			__raid_recover_end_io(rbio);
   2201			return 0;
   2202		} else {
   2203			goto cleanup;
   2204		}
   2205	}
   2206
   2207	/*
   2208	 * The bioc may be freed once we submit the last bio. Make sure not to
   2209	 * touch it after that.
   2210	 */
   2211	atomic_set(&rbio->stripes_pending, bios_to_read);
   2212	while ((bio = bio_list_pop(&bio_list))) {
   2213		bio->bi_end_io = raid_recover_end_io;
   2214
   2215		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
   2216
   2217		submit_bio(bio);
   2218	}
   2219
   2220	return 0;
   2221
   2222cleanup:
   2223	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
   2224	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
   2225		rbio_orig_end_io(rbio, BLK_STS_IOERR);
   2226
   2227	while ((bio = bio_list_pop(&bio_list)))
   2228		bio_put(bio);
   2229
   2230	return -EIO;
   2231}
   2232
   2233/*
   2234 * the main entry point for reads from the higher layers.  This
   2235 * is really only called when the normal read path had a failure,
   2236 * so we assume the bio they send down corresponds to a failed part
   2237 * of the drive.
   2238 */
   2239int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
   2240			  u32 stripe_len, int mirror_num, int generic_io)
   2241{
   2242	struct btrfs_fs_info *fs_info = bioc->fs_info;
   2243	struct btrfs_raid_bio *rbio;
   2244	int ret;
   2245
   2246	if (generic_io) {
   2247		ASSERT(bioc->mirror_num == mirror_num);
   2248		btrfs_bio(bio)->mirror_num = mirror_num;
   2249	}
   2250
   2251	rbio = alloc_rbio(fs_info, bioc, stripe_len);
   2252	if (IS_ERR(rbio)) {
   2253		if (generic_io)
   2254			btrfs_put_bioc(bioc);
   2255		return PTR_ERR(rbio);
   2256	}
   2257
   2258	rbio->operation = BTRFS_RBIO_READ_REBUILD;
   2259	bio_list_add(&rbio->bio_list, bio);
   2260	rbio->bio_list_bytes = bio->bi_iter.bi_size;
   2261
   2262	rbio->faila = find_logical_bio_stripe(rbio, bio);
   2263	if (rbio->faila == -1) {
   2264		btrfs_warn(fs_info,
   2265"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
   2266			   __func__, bio->bi_iter.bi_sector << 9,
   2267			   (u64)bio->bi_iter.bi_size, bioc->map_type);
   2268		if (generic_io)
   2269			btrfs_put_bioc(bioc);
   2270		kfree(rbio);
   2271		return -EIO;
   2272	}
   2273
   2274	if (generic_io) {
   2275		btrfs_bio_counter_inc_noblocked(fs_info);
   2276		rbio->generic_bio_cnt = 1;
   2277	} else {
   2278		btrfs_get_bioc(bioc);
   2279	}
   2280
   2281	/*
   2282	 * Loop retry:
   2283	 * for 'mirror_num == 2', reconstruct from all other stripes.
   2284	 * for 'mirror_num > 2', select a stripe to fail on every retry.
   2285	 */
   2286	if (mirror_num > 2) {
   2287		/*
   2288		 * 'mirror_num == 3' is to fail the P stripe and
   2289		 * reconstruct from the Q stripe.  'mirror_num > 3' is to
   2290		 * fail a data stripe and reconstruct from the P+Q stripes.
   2291		 */
   2292		rbio->failb = rbio->real_stripes - (mirror_num - 1);
   2293		ASSERT(rbio->failb > 0);
   2294		if (rbio->failb <= rbio->faila)
   2295			rbio->failb--;
   2296	}
   2297
   2298	ret = lock_stripe_add(rbio);
   2299
   2300	/*
   2301	 * __raid56_parity_recover will end the bio with
   2302	 * any errors it hits.  We don't want to return
   2303	 * its error value up the stack because our caller
   2304	 * will end up calling bio_endio with any nonzero
   2305	 * return
   2306	 */
   2307	if (ret == 0)
   2308		__raid56_parity_recover(rbio);
   2309	/*
   2310	 * our rbio has been added to the list of
   2311	 * rbios that will be handled after the
   2312	 * current lock owner is done
   2313	 */
   2314	return 0;
   2315
   2316}
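
/*
 * Illustrative sketch only, not part of the original file: the mirror_num to
 * extra-failed-stripe mapping used by raid56_parity_recover() above, pulled
 * out into a plain helper so the arithmetic is easier to follow.  The helper
 * name is hypothetical and it takes plain ints instead of an rbio.
 */
static inline int raid56_sketch_pick_failb(int real_stripes, int faila,
					   int mirror_num)
{
	/* mirror_num == 3 fails the P stripe (index real_stripes - 2) */
	int failb = real_stripes - (mirror_num - 1);

	/* skip over the stripe that has already failed */
	if (failb <= faila)
		failb--;

	return failb;
}

/*
 * Example, assuming a 4 data + P + Q layout (real_stripes == 6) with
 * faila == 1: mirror_num == 3 gives failb == 4 (the P stripe),
 * mirror_num == 4 gives failb == 3 (the last data stripe), and
 * mirror_num == 6 gives failb == 0.
 */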
   2317
   2318static void rmw_work(struct work_struct *work)
   2319{
   2320	struct btrfs_raid_bio *rbio;
   2321
   2322	rbio = container_of(work, struct btrfs_raid_bio, work);
   2323	raid56_rmw_stripe(rbio);
   2324}
   2325
   2326static void read_rebuild_work(struct work_struct *work)
   2327{
   2328	struct btrfs_raid_bio *rbio;
   2329
   2330	rbio = container_of(work, struct btrfs_raid_bio, work);
   2331	__raid56_parity_recover(rbio);
   2332}
   2333
   2334/*
   2335 * The following code is used to scrub/replace the parity stripe
   2336 *
   2337 * Caller must have already increased bio_counter for getting @bioc.
   2338 *
   2339 * Note: We need to make sure all the pages that are added to the scrub/replace
   2340 * raid bio are correct and do not get changed during the scrub/replace, i.e.
   2341 * those pages only hold metadata or file data covered by a checksum.
   2342 */
   2343
   2344struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
   2345				struct btrfs_io_context *bioc,
   2346				u32 stripe_len, struct btrfs_device *scrub_dev,
   2347				unsigned long *dbitmap, int stripe_nsectors)
   2348{
   2349	struct btrfs_fs_info *fs_info = bioc->fs_info;
   2350	struct btrfs_raid_bio *rbio;
   2351	int i;
   2352
   2353	rbio = alloc_rbio(fs_info, bioc, stripe_len);
   2354	if (IS_ERR(rbio))
   2355		return NULL;
   2356	bio_list_add(&rbio->bio_list, bio);
   2357	/*
   2358	 * This is a special bio which is used to hold the completion handler
   2359	 * and make the scrub rbio similar to the other types
   2360	 */
   2361	ASSERT(!bio->bi_iter.bi_size);
   2362	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
   2363
   2364	/*
   2365	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
   2366	 * to the end position, so this search can start from the first parity
   2367	 * stripe.
   2368	 */
   2369	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
   2370		if (bioc->stripes[i].dev == scrub_dev) {
   2371			rbio->scrubp = i;
   2372			break;
   2373		}
   2374	}
   2375	ASSERT(i < rbio->real_stripes);
   2376
   2377	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
   2378
   2379	/*
   2380	 * We have already increased bio_counter when getting bioc, record it
   2381	 * so we can free it at rbio_orig_end_io().
   2382	 */
   2383	rbio->generic_bio_cnt = 1;
   2384
   2385	return rbio;
   2386}
   2387
   2388/* Used for both parity scrub and missing. */
   2389void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
   2390			    unsigned int pgoff, u64 logical)
   2391{
   2392	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
   2393	int stripe_offset;
   2394	int index;
   2395
   2396	ASSERT(logical >= rbio->bioc->raid_map[0]);
   2397	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
   2398				rbio->stripe_len * rbio->nr_data);
   2399	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
   2400	index = stripe_offset / sectorsize;
   2401	rbio->bio_sectors[index].page = page;
   2402	rbio->bio_sectors[index].pgoff = pgoff;
   2403}
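
/*
 * Illustrative sketch only, not part of the original file: the logical
 * address to bio_sectors[] index computation used by raid56_add_scrub_pages()
 * above, with plain arguments.  'full_stripe_start' stands in for
 * bioc->raid_map[0] and the helper name is hypothetical; like the code above
 * it assumes the offset into the full stripe fits in an int.
 */
static inline int raid56_sketch_scrub_sector_index(u64 logical,
						   u64 full_stripe_start,
						   u32 sectorsize)
{
	/* byte offset into the data area of the full stripe ... */
	int stripe_offset = (int)(logical - full_stripe_start);

	/* ... counted in units of one sector */
	return stripe_offset / (int)sectorsize;
}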
   2404
   2405/*
   2406 * We only scrub the parity for which we have correct data on the same horizontal
   2407 * stripe, so we needn't allocate all pages for all the stripes.
   2408 */
   2409static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
   2410{
   2411	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
   2412	int stripe;
   2413	int sectornr;
   2414
   2415	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
   2416		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
   2417			struct page *page;
   2418			int index = (stripe * rbio->stripe_nsectors + sectornr) *
   2419				    sectorsize >> PAGE_SHIFT;
   2420
   2421			if (rbio->stripe_pages[index])
   2422				continue;
   2423
   2424			page = alloc_page(GFP_NOFS);
   2425			if (!page)
   2426				return -ENOMEM;
   2427			rbio->stripe_pages[index] = page;
   2428		}
   2429	}
   2430	index_stripe_sectors(rbio);
   2431	return 0;
   2432}
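
/*
 * Illustrative sketch only, not part of the original file: the stripe_pages[]
 * index computation used by alloc_rbio_essential_pages() above, with plain
 * integer arguments.  The helper name is hypothetical; PAGE_SHIFT is the
 * usual page size shift (12 for 4K pages).
 */
static inline int raid56_sketch_stripe_page_index(int stripe, int sectornr,
						  int stripe_nsectors,
						  u32 sectorsize)
{
	/*
	 * Sectors are numbered consecutively per stripe; converting the byte
	 * offset into a page index picks the page backing this sector.
	 */
	return (stripe * stripe_nsectors + sectornr) * sectorsize >> PAGE_SHIFT;
}

/*
 * Example, assuming 4K pages and a 64K stripe (sectorsize == 4096,
 * stripe_nsectors == 16): stripe 1, sectornr 3 maps to page index 19.
 */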
   2433
   2434static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
   2435					 int need_check)
   2436{
   2437	struct btrfs_io_context *bioc = rbio->bioc;
   2438	const u32 sectorsize = bioc->fs_info->sectorsize;
   2439	void **pointers = rbio->finish_pointers;
   2440	unsigned long *pbitmap = rbio->finish_pbitmap;
   2441	int nr_data = rbio->nr_data;
   2442	int stripe;
   2443	int sectornr;
   2444	bool has_qstripe;
   2445	struct sector_ptr p_sector = { 0 };
   2446	struct sector_ptr q_sector = { 0 };
   2447	struct bio_list bio_list;
   2448	struct bio *bio;
   2449	int is_replace = 0;
   2450	int ret;
   2451
   2452	bio_list_init(&bio_list);
   2453
   2454	if (rbio->real_stripes - rbio->nr_data == 1)
   2455		has_qstripe = false;
   2456	else if (rbio->real_stripes - rbio->nr_data == 2)
   2457		has_qstripe = true;
   2458	else
   2459		BUG();
   2460
   2461	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
   2462		is_replace = 1;
   2463		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
   2464	}
   2465
   2466	/*
   2467	 * The higher layers (the scrubber) are unlikely to
   2468	 * use this area of the disk again soon, so don't cache
   2469	 * it.
   2470	 */
   2471	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
   2472
   2473	if (!need_check)
   2474		goto writeback;
   2475
   2476	p_sector.page = alloc_page(GFP_NOFS);
   2477	if (!p_sector.page)
   2478		goto cleanup;
   2479	p_sector.pgoff = 0;
   2480	p_sector.uptodate = 1;
   2481
   2482	if (has_qstripe) {
   2483		/* RAID6, allocate and map temp space for the Q stripe */
   2484		q_sector.page = alloc_page(GFP_NOFS);
   2485		if (!q_sector.page) {
   2486			__free_page(p_sector.page);
   2487			p_sector.page = NULL;
   2488			goto cleanup;
   2489		}
   2490		q_sector.pgoff = 0;
   2491		q_sector.uptodate = 1;
   2492		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
   2493	}
   2494
   2495	atomic_set(&rbio->error, 0);
   2496
   2497	/* Map the parity stripe just once */
   2498	pointers[nr_data] = kmap_local_page(p_sector.page);
   2499
   2500	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
   2501		struct sector_ptr *sector;
   2502		void *parity;
   2503
   2504		/* first collect one page from each data stripe */
   2505		for (stripe = 0; stripe < nr_data; stripe++) {
   2506			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
   2507			pointers[stripe] = kmap_local_page(sector->page) +
   2508					   sector->pgoff;
   2509		}
   2510
   2511		if (has_qstripe) {
   2512			/* RAID6, call the library function to fill in our P/Q */
   2513			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
   2514						pointers);
   2515		} else {
   2516			/* raid5 */
   2517			memcpy(pointers[nr_data], pointers[0], sectorsize);
   2518			run_xor(pointers + 1, nr_data - 1, sectorsize);
   2519		}
   2520
   2521		/* Check scrubbing parity and repair it */
   2522		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
   2523		parity = kmap_local_page(sector->page) + sector->pgoff;
   2524		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
   2525			memcpy(parity, pointers[rbio->scrubp], sectorsize);
   2526		else
   2527			/* Parity is right, needn't writeback */
   2528			bitmap_clear(rbio->dbitmap, sectornr, 1);
   2529		kunmap_local(parity);
   2530
   2531		for (stripe = nr_data - 1; stripe >= 0; stripe--)
   2532			kunmap_local(pointers[stripe]);
   2533	}
   2534
   2535	kunmap_local(pointers[nr_data]);
   2536	__free_page(p_sector.page);
   2537	p_sector.page = NULL;
   2538	if (q_sector.page) {
   2539		kunmap_local(pointers[rbio->real_stripes - 1]);
   2540		__free_page(q_sector.page);
   2541		q_sector.page = NULL;
   2542	}
   2543
   2544writeback:
   2545	/*
   2546	 * time to start writing.  Make bios for everything from the
   2547	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
   2548	 * everything else.
   2549	 */
   2550	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
   2551		struct sector_ptr *sector;
   2552
   2553		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
   2554		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
   2555					 sectornr, rbio->stripe_len, REQ_OP_WRITE);
   2556		if (ret)
   2557			goto cleanup;
   2558	}
   2559
   2560	if (!is_replace)
   2561		goto submit_write;
   2562
   2563	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
   2564		struct sector_ptr *sector;
   2565
   2566		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
   2567		ret = rbio_add_io_sector(rbio, &bio_list, sector,
   2568				       bioc->tgtdev_map[rbio->scrubp],
   2569				       sectornr, rbio->stripe_len, REQ_OP_WRITE);
   2570		if (ret)
   2571			goto cleanup;
   2572	}
   2573
   2574submit_write:
   2575	nr_data = bio_list_size(&bio_list);
   2576	if (!nr_data) {
   2577		/* Every parity is right */
   2578		rbio_orig_end_io(rbio, BLK_STS_OK);
   2579		return;
   2580	}
   2581
   2582	atomic_set(&rbio->stripes_pending, nr_data);
   2583
   2584	while ((bio = bio_list_pop(&bio_list))) {
   2585		bio->bi_end_io = raid_write_end_io;
   2586
   2587		submit_bio(bio);
   2588	}
   2589	return;
   2590
   2591cleanup:
   2592	rbio_orig_end_io(rbio, BLK_STS_IOERR);
   2593
   2594	while ((bio = bio_list_pop(&bio_list)))
   2595		bio_put(bio);
   2596}
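
/*
 * Illustrative sketch only, not part of the original file: the per-sector
 * check done by finish_parity_scrub() above for the RAID5 case, using plain
 * buffers.  It recomputes the expected parity and reports whether the on-disk
 * copy is stale and must be rewritten.  The helper name is hypothetical and
 * 'scratch' plays the role of the temporary p_sector page.
 */
static inline bool raid5_sketch_parity_stale(void **data, int nr_data,
					     const u8 *ondisk_parity,
					     u8 *scratch, u32 sectorsize)
{
	int stripe;
	u32 i;

	/* expected parity is the XOR of all data blocks in this horizontal stripe */
	memcpy(scratch, data[0], sectorsize);
	for (stripe = 1; stripe < nr_data; stripe++)
		for (i = 0; i < sectorsize; i++)
			scratch[i] ^= ((const u8 *)data[stripe])[i];

	/* a mismatch means the parity on disk needs to be rewritten */
	return memcmp(scratch, ondisk_parity, sectorsize) != 0;
}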
   2597
   2598static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
   2599{
   2600	if (stripe >= 0 && stripe < rbio->nr_data)
   2601		return 1;
   2602	return 0;
   2603}
   2604
   2605/*
   2606 * While we're doing the parity check and repair, we could have errors
   2607 * in reading pages off the disk.  This checks for errors and if we're
   2608 * not able to read the page it'll trigger parity reconstruction.  The
   2609 * parity scrub will be finished after we've reconstructed the failed
   2610 * stripes
   2611 */
   2612static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
   2613{
   2614	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
   2615		goto cleanup;
   2616
   2617	if (rbio->faila >= 0 || rbio->failb >= 0) {
   2618		int dfail = 0, failp = -1;
   2619
   2620		if (is_data_stripe(rbio, rbio->faila))
   2621			dfail++;
   2622		else if (is_parity_stripe(rbio->faila))
   2623			failp = rbio->faila;
   2624
   2625		if (is_data_stripe(rbio, rbio->failb))
   2626			dfail++;
   2627		else if (is_parity_stripe(rbio->failb))
   2628			failp = rbio->failb;
   2629
   2630		/*
   2631		 * We cannot use the parity being scrubbed to repair
   2632		 * the data, so our repair capability is reduced by one.
   2633		 * (In the case of RAID5, we cannot repair anything.)
   2634		 */
   2635		if (dfail > rbio->bioc->max_errors - 1)
   2636			goto cleanup;
   2637
   2638		/*
   2639		 * If all the data is good and only the parity is bad, just
   2640		 * repair the parity.
   2641		 */
   2642		if (dfail == 0) {
   2643			finish_parity_scrub(rbio, 0);
   2644			return;
   2645		}
   2646
   2647		/*
   2648		 * Here we have one corrupted data stripe and one corrupted
   2649		 * parity on RAID6.  If the corrupted parity is the one being
   2650		 * scrubbed, we can luckily use the other parity to repair
   2651		 * the data; otherwise we cannot repair the data stripe.
   2652		 */
   2653		if (failp != rbio->scrubp)
   2654			goto cleanup;
   2655
   2656		__raid_recover_end_io(rbio);
   2657	} else {
   2658		finish_parity_scrub(rbio, 1);
   2659	}
   2660	return;
   2661
   2662cleanup:
   2663	rbio_orig_end_io(rbio, BLK_STS_IOERR);
   2664}
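
/*
 * Illustrative sketch only, not part of the original file: the decision made
 * by validate_rbio_for_parity_scrub() above, reduced to plain ints.  Failed
 * stripes are given as indices (-1 means "no failure"), parity stripes are
 * assumed to be the indices >= nr_data, and the return convention is
 * hypothetical: -1 unrecoverable, 0 just rewrite the parity, 1 reconstruct
 * the failed data first and finish the scrub afterwards.
 */
static inline int scrub_sketch_classify_failure(int nr_data, int max_errors,
						int faila, int failb,
						int scrubp)
{
	int dfail = 0, failp = -1;

	if (faila >= 0 && faila < nr_data)
		dfail++;
	else if (faila >= nr_data)
		failp = faila;

	if (failb >= 0 && failb < nr_data)
		dfail++;
	else if (failb >= nr_data)
		failp = failb;

	/* the parity being scrubbed cannot be used for repair */
	if (dfail > max_errors - 1)
		return -1;

	/* all data is fine, only the parity needs to be rewritten */
	if (dfail == 0)
		return 0;

	/* a data stripe can only be rebuilt if the failed parity is scrubp */
	return (failp == scrubp) ? 1 : -1;
}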
   2665
   2666/*
   2667 * end io for the read phase of the rmw cycle.  All the bios here are physical
   2668 * stripe bios we've read from the disk so we can recalculate the parity of the
   2669 * stripe.
   2670 *
   2671 * This will usually kick off finish_rmw once all the bios are read in, but it
   2672 * may trigger parity reconstruction if we had any errors along the way
   2673 */
   2674static void raid56_parity_scrub_end_io(struct bio *bio)
   2675{
   2676	struct btrfs_raid_bio *rbio = bio->bi_private;
   2677
   2678	if (bio->bi_status)
   2679		fail_bio_stripe(rbio, bio);
   2680	else
   2681		set_bio_pages_uptodate(rbio, bio);
   2682
   2683	bio_put(bio);
   2684
   2685	if (!atomic_dec_and_test(&rbio->stripes_pending))
   2686		return;
   2687
   2688	/*
   2689	 * this will normally call finish_rmw to start our write
   2690	 * but if there are any failed stripes we'll reconstruct
   2691	 * from parity first
   2692	 */
   2693	validate_rbio_for_parity_scrub(rbio);
   2694}
   2695
   2696static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
   2697{
   2698	int bios_to_read = 0;
   2699	struct bio_list bio_list;
   2700	int ret;
   2701	int sectornr;
   2702	int stripe;
   2703	struct bio *bio;
   2704
   2705	bio_list_init(&bio_list);
   2706
   2707	ret = alloc_rbio_essential_pages(rbio);
   2708	if (ret)
   2709		goto cleanup;
   2710
   2711	atomic_set(&rbio->error, 0);
   2712	/*
   2713	 * build a list of bios to read all the missing parts of this
   2714	 * stripe
   2715	 */
   2716	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
   2717		for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
   2718			struct sector_ptr *sector;
   2719			/*
   2720			 * We want to find all the sectors missing from the
   2721			 * rbio and read them from the disk.  If sector_in_rbio()
   2722			 * finds a sector in the bio list we don't need to read
   2723			 * it off the stripe.
   2724			 */
   2725			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
   2726			if (sector)
   2727				continue;
   2728
   2729			sector = rbio_stripe_sector(rbio, stripe, sectornr);
   2730			/*
   2731			 * The bio cache may have handed us an uptodate sector.
   2732			 * If so, be happy and use it.
   2733			 */
   2734			if (sector->uptodate)
   2735				continue;
   2736
   2737			ret = rbio_add_io_sector(rbio, &bio_list, sector,
   2738						 stripe, sectornr, rbio->stripe_len,
   2739						 REQ_OP_READ);
   2740			if (ret)
   2741				goto cleanup;
   2742		}
   2743	}
   2744
   2745	bios_to_read = bio_list_size(&bio_list);
   2746	if (!bios_to_read) {
   2747		/*
   2748		 * this can happen if others have merged with
   2749		 * us, it means there is nothing left to read.
   2750		 * But if there are missing devices it may not be
   2751		 * safe to do the full stripe write yet.
   2752		 */
   2753		goto finish;
   2754	}
   2755
   2756	/*
   2757	 * The bioc may be freed once we submit the last bio. Make sure not to
   2758	 * touch it after that.
   2759	 */
   2760	atomic_set(&rbio->stripes_pending, bios_to_read);
   2761	while ((bio = bio_list_pop(&bio_list))) {
   2762		bio->bi_end_io = raid56_parity_scrub_end_io;
   2763
   2764		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
   2765
   2766		submit_bio(bio);
   2767	}
   2768	/* the actual write will happen once the reads are done */
   2769	return;
   2770
   2771cleanup:
   2772	rbio_orig_end_io(rbio, BLK_STS_IOERR);
   2773
   2774	while ((bio = bio_list_pop(&bio_list)))
   2775		bio_put(bio);
   2776
   2777	return;
   2778
   2779finish:
   2780	validate_rbio_for_parity_scrub(rbio);
   2781}
   2782
   2783static void scrub_parity_work(struct work_struct *work)
   2784{
   2785	struct btrfs_raid_bio *rbio;
   2786
   2787	rbio = container_of(work, struct btrfs_raid_bio, work);
   2788	raid56_parity_scrub_stripe(rbio);
   2789}
   2790
   2791void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
   2792{
   2793	if (!lock_stripe_add(rbio))
   2794		start_async_work(rbio, scrub_parity_work);
   2795}
   2796
   2797/* The following code is used for dev replace of a missing RAID 5/6 device. */
   2798
   2799struct btrfs_raid_bio *
   2800raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
   2801			  u64 length)
   2802{
   2803	struct btrfs_fs_info *fs_info = bioc->fs_info;
   2804	struct btrfs_raid_bio *rbio;
   2805
   2806	rbio = alloc_rbio(fs_info, bioc, length);
   2807	if (IS_ERR(rbio))
   2808		return NULL;
   2809
   2810	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
   2811	bio_list_add(&rbio->bio_list, bio);
   2812	/*
   2813	 * This is a special bio which is used to hold the completion handler
   2814	 * and make the scrub rbio similar to the other types
   2815	 */
   2816	ASSERT(!bio->bi_iter.bi_size);
   2817
   2818	rbio->faila = find_logical_bio_stripe(rbio, bio);
   2819	if (rbio->faila == -1) {
   2820		BUG();
   2821		kfree(rbio);
   2822		return NULL;
   2823	}
   2824
   2825	/*
   2826	 * When we get bioc, we have already increased bio_counter, record it
   2827	 * so we can free it at rbio_orig_end_io()
   2828	 */
   2829	rbio->generic_bio_cnt = 1;
   2830
   2831	return rbio;
   2832}
   2833
   2834void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
   2835{
   2836	if (!lock_stripe_add(rbio))
   2837		start_async_work(rbio, read_rebuild_work);
   2838}