raid10.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
raid10.c (147861B)
      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * raid10.c : Multiple Devices driver for Linux
      4 *
      5 * Copyright (C) 2000-2004 Neil Brown
      6 *
      7 * RAID-10 support for md.
      8 *
      9 * Based on code in raid1.c.  See raid1.c for further copyright information.
     10 */
     11
     12#include <linux/slab.h>
     13#include <linux/delay.h>
     14#include <linux/blkdev.h>
     15#include <linux/module.h>
     16#include <linux/seq_file.h>
     17#include <linux/ratelimit.h>
     18#include <linux/kthread.h>
     19#include <linux/raid/md_p.h>
     20#include <trace/events/block.h>
     21#include "md.h"
     22#include "raid10.h"
     23#include "raid0.h"
     24#include "md-bitmap.h"
     25
     26/*
     27 * RAID10 provides a combination of RAID0 and RAID1 functionality.
     28 * The layout of data is defined by
     29 *    chunk_size
     30 *    raid_disks
     31 *    near_copies (stored in low byte of layout)
     32 *    far_copies (stored in second byte of layout)
     33 *    far_offset (stored in bit 16 of layout )
     34 *    use_far_sets (stored in bit 17 of layout )
     35 *    use_far_sets_bugfixed (stored in bit 18 of layout )
     36 *
     37 * The data to be stored is divided into chunks using chunksize.  Each device
     38 * is divided into far_copies sections.   In each section, chunks are laid out
     39 * in a style similar to raid0, but near_copies copies of each chunk are stored
     40 * (each on a different drive).  The starting device for each section is offset
     41 * near_copies from the starting device of the previous section.  Thus there
     42 * are (near_copies * far_copies) of each chunk, and each is on a different
     43 * drive.  near_copies and far_copies must be at least one, and their product
     44 * is at most raid_disks.
     45 *
     46 * If far_offset is true, then the far_copies are handled a bit differently.
     47 * The copies are still in different stripes, but instead of being very far
     48 * apart on disk, they are in adjacent stripes.
     49 *
     50 * The far and offset algorithms are handled slightly differently if
     51 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
     52 * sets that are (near_copies * far_copies) in size.  The far copied stripes
     53 * are still shifted by 'near_copies' devices, but this shifting stays confined
     54 * to the set rather than the entire array.  This is done to improve the number
     55 * of device combinations that can fail without causing the array to fail.
     56 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
     57 * on a device):
     58 *    A B C D    A B C D E
     59 *      ...         ...
     60 *    D A B C    E A B C D
     61 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
     62 *    [A B] [C D]    [A B] [C D E]
     63 *    |...| |...|    |...| | ... |
     64 *    [B A] [D C]    [B A] [E C D]
     65 */
     66
     67static void allow_barrier(struct r10conf *conf);
     68static void lower_barrier(struct r10conf *conf);
     69static int _enough(struct r10conf *conf, int previous, int ignore);
     70static int enough(struct r10conf *conf, int ignore);
     71static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
     72				int *skipped);
     73static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
     74static void end_reshape_write(struct bio *bio);
     75static void end_reshape(struct r10conf *conf);
     76
     77#define raid10_log(md, fmt, args...)				\
     78	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
     79
     80#include "raid1-10.c"
     81
     82/*
     83 * for resync bio, r10bio pointer can be retrieved from the per-bio
     84 * 'struct resync_pages'.
     85 */
     86static inline struct r10bio *get_resync_r10bio(struct bio *bio)
     87{
     88	return get_resync_pages(bio)->raid_bio;
     89}
     90
     91static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
     92{
     93	struct r10conf *conf = data;
     94	int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
     95
     96	/* allocate a r10bio with room for raid_disks entries in the
     97	 * bios array */
     98	return kzalloc(size, gfp_flags);
     99}
    100
/* RESYNC_BLOCK_SIZE expressed in 512-byte units (>> 9) */
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
/* 32 times RESYNC_WINDOW, and the same value in 512-byte units (>> 9) */
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
    108
/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 * @gfp_flags: allocation flags used for every allocation made here.
 * @data:      the r10conf this buffer is allocated for.
 *
 * Returns an r10bio whose first 'nalloc' devs[] slots each have a bio
 * (and a repl_bio when conf->have_replacement is set) with a
 * 'struct resync_pages' attached through bi_private, or NULL if any
 * allocation fails (everything allocated so far is released).
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct r10bio *r10_bio;
	struct bio *bio;
	int j;
	int nalloc, nalloc_rp;
	struct resync_pages *rps;

	/* zero-filled, so every devs[].bio / devs[].repl_bio starts NULL */
	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * allocate once for all bios: one resync_pages per bio, and a
	 * second set of 'nalloc' entries when replacement bios exist
	 */
	if (!conf->have_replacement)
		nalloc_rp = nalloc;
	else
		nalloc_rp = nalloc * 2;
	rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
	if (!rps)
		goto out_free_r10bio;

	/*
	 * Allocate bios.
	 *
	 * The loop counts down, so on failure 'j' is the slot being
	 * filled and slots j..nalloc-1 are the only ones that can hold
	 * a bio; out_free_bio relies on that.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		struct resync_pages *rp, *rp_repl;

		rp = &rps[j];
		/* rp_repl is only assigned, and only used, when rbio exists */
		if (rbio)
			rp_repl = &rps[nalloc + j];

		bio = r10_bio->devs[j].bio;

		/*
		 * Slot 0 always gets its own pages.  The other slots get
		 * their own pages only when MD_RECOVERY_SYNC is set;
		 * otherwise they copy slot 0's resync_pages and call
		 * resync_get_all_pages() on the copy.
		 */
		if (!j || test_bit(MD_RECOVERY_SYNC,
				   &conf->mddev->recovery)) {
			if (resync_alloc_pages(rp, gfp_flags))
				goto out_free_pages;
		} else {
			memcpy(rp, &rps[0], sizeof(*rp));
			resync_get_all_pages(rp);
		}

		rp->raid_bio = r10_bio;
		bio->bi_private = rp;
		if (rbio) {
			/* the replacement bio gets a plain copy of rp */
			memcpy(rp_repl, rp, sizeof(*rp));
			rbio->bi_private = rp_repl;
		}
	}

	return r10_bio;

out_free_pages:
	/* release the pages of the slots completed before the failing one */
	while (--j >= 0)
		resync_free_pages(&rps[j]);

	/* every bio was allocated by now, so free all of them */
	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_uninit(r10_bio->devs[j].bio);
		kfree(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_uninit(r10_bio->devs[j].repl_bio);
		kfree(r10_bio->devs[j].repl_bio);
	}
	kfree(rps);
out_free_r10bio:
	rbio_pool_free(r10_bio, conf);
	return NULL;
}
    213
    214static void r10buf_pool_free(void *__r10_bio, void *data)
    215{
    216	struct r10conf *conf = data;
    217	struct r10bio *r10bio = __r10_bio;
    218	int j;
    219	struct resync_pages *rp = NULL;
    220
    221	for (j = conf->copies; j--; ) {
    222		struct bio *bio = r10bio->devs[j].bio;
    223
    224		if (bio) {
    225			rp = get_resync_pages(bio);
    226			resync_free_pages(rp);
    227			bio_uninit(bio);
    228			kfree(bio);
    229		}
    230
    231		bio = r10bio->devs[j].repl_bio;
    232		if (bio) {
    233			bio_uninit(bio);
    234			kfree(bio);
    235		}
    236	}
    237
    238	/* resync pages array stored in the 1st bio's .bi_private */
    239	kfree(rp);
    240
    241	rbio_pool_free(r10bio, conf);
    242}
    243
    244static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
    245{
    246	int i;
    247
    248	for (i = 0; i < conf->geo.raid_disks; i++) {
    249		struct bio **bio = & r10_bio->devs[i].bio;
    250		if (!BIO_SPECIAL(*bio))
    251			bio_put(*bio);
    252		*bio = NULL;
    253		bio = &r10_bio->devs[i].repl_bio;
    254		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
    255			bio_put(*bio);
    256		*bio = NULL;
    257	}
    258}
    259
    260static void free_r10bio(struct r10bio *r10_bio)
    261{
    262	struct r10conf *conf = r10_bio->mddev->private;
    263
    264	put_all_bios(conf, r10_bio);
    265	mempool_free(r10_bio, &conf->r10bio_pool);
    266}
    267
    268static void put_buf(struct r10bio *r10_bio)
    269{
    270	struct r10conf *conf = r10_bio->mddev->private;
    271
    272	mempool_free(r10_bio, &conf->r10buf_pool);
    273
    274	lower_barrier(conf);
    275}
    276
    277static void reschedule_retry(struct r10bio *r10_bio)
    278{
    279	unsigned long flags;
    280	struct mddev *mddev = r10_bio->mddev;
    281	struct r10conf *conf = mddev->private;
    282
    283	spin_lock_irqsave(&conf->device_lock, flags);
    284	list_add(&r10_bio->retry_list, &conf->retry_list);
    285	conf->nr_queued ++;
    286	spin_unlock_irqrestore(&conf->device_lock, flags);
    287
    288	/* wake up frozen array... */
    289	wake_up(&conf->wait_barrier);
    290
    291	md_wakeup_thread(mddev->thread);
    292}
    293
    294/*
    295 * raid_end_bio_io() is called when we have finished servicing a mirrored
    296 * operation and are ready to return a success/failure code to the buffer
    297 * cache layer.
    298 */
    299static void raid_end_bio_io(struct r10bio *r10_bio)
    300{
    301	struct bio *bio = r10_bio->master_bio;
    302	struct r10conf *conf = r10_bio->mddev->private;
    303
    304	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
    305		bio->bi_status = BLK_STS_IOERR;
    306
    307	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
    308		bio_end_io_acct(bio, r10_bio->start_time);
    309	bio_endio(bio);
    310	/*
    311	 * Wake up any possible resync thread that waits for the device
    312	 * to go idle.
    313	 */
    314	allow_barrier(conf);
    315
    316	free_r10bio(r10_bio);
    317}
    318
    319/*
    320 * Update disk head position estimator based on IRQ completion info.
    321 */
    322static inline void update_head_pos(int slot, struct r10bio *r10_bio)
    323{
    324	struct r10conf *conf = r10_bio->mddev->private;
    325
    326	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
    327		r10_bio->devs[slot].addr + (r10_bio->sectors);
    328}
    329
    330/*
    331 * Find the disk number which triggered given bio
    332 */
    333static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
    334			 struct bio *bio, int *slotp, int *replp)
    335{
    336	int slot;
    337	int repl = 0;
    338
    339	for (slot = 0; slot < conf->geo.raid_disks; slot++) {
    340		if (r10_bio->devs[slot].bio == bio)
    341			break;
    342		if (r10_bio->devs[slot].repl_bio == bio) {
    343			repl = 1;
    344			break;
    345		}
    346	}
    347
    348	update_head_pos(slot, r10_bio);
    349
    350	if (slotp)
    351		*slotp = slot;
    352	if (replp)
    353		*replp = repl;
    354	return r10_bio->devs[slot].devnum;
    355}
    356
    357static void raid10_end_read_request(struct bio *bio)
    358{
    359	int uptodate = !bio->bi_status;
    360	struct r10bio *r10_bio = bio->bi_private;
    361	int slot;
    362	struct md_rdev *rdev;
    363	struct r10conf *conf = r10_bio->mddev->private;
    364
    365	slot = r10_bio->read_slot;
    366	rdev = r10_bio->devs[slot].rdev;
    367	/*
    368	 * this branch is our 'one mirror IO has finished' event handler:
    369	 */
    370	update_head_pos(slot, r10_bio);
    371
    372	if (uptodate) {
    373		/*
    374		 * Set R10BIO_Uptodate in our master bio, so that
    375		 * we will return a good error code to the higher
    376		 * levels even if IO on some other mirrored buffer fails.
    377		 *
    378		 * The 'master' represents the composite IO operation to
    379		 * user-side. So if something waits for IO, then it will
    380		 * wait for the 'master' bio.
    381		 */
    382		set_bit(R10BIO_Uptodate, &r10_bio->state);
    383	} else {
    384		/* If all other devices that store this block have
    385		 * failed, we want to return the error upwards rather
    386		 * than fail the last device.  Here we redefine
    387		 * "uptodate" to mean "Don't want to retry"
    388		 */
    389		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
    390			     rdev->raid_disk))
    391			uptodate = 1;
    392	}
    393	if (uptodate) {
    394		raid_end_bio_io(r10_bio);
    395		rdev_dec_pending(rdev, conf->mddev);
    396	} else {
    397		/*
    398		 * oops, read error - keep the refcount on the rdev
    399		 */
    400		pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n",
    401				   mdname(conf->mddev),
    402				   rdev->bdev,
    403				   (unsigned long long)r10_bio->sector);
    404		set_bit(R10BIO_ReadError, &r10_bio->state);
    405		reschedule_retry(r10_bio);
    406	}
    407}
    408
    409static void close_write(struct r10bio *r10_bio)
    410{
    411	/* clear the bitmap if all writes complete successfully */
    412	md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
    413			   r10_bio->sectors,
    414			   !test_bit(R10BIO_Degraded, &r10_bio->state),
    415			   0);
    416	md_write_end(r10_bio->mddev);
    417}
    418
    419static void one_write_done(struct r10bio *r10_bio)
    420{
    421	if (atomic_dec_and_test(&r10_bio->remaining)) {
    422		if (test_bit(R10BIO_WriteError, &r10_bio->state))
    423			reschedule_retry(r10_bio);
    424		else {
    425			close_write(r10_bio);
    426			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
    427				reschedule_retry(r10_bio);
    428			else
    429				raid_end_bio_io(r10_bio);
    430		}
    431	}
    432}
    433
/*
 * Completion handler for a write sent to one device (primary or
 * replacement).  @bio->bi_private is the r10bio.  Records the outcome
 * in the r10bio state bits, then accounts the write via
 * one_write_done().
 */
static void raid10_end_write_request(struct bio *bio)
{
	struct r10bio *r10_bio = bio->bi_private;
	int dev;
	int dec_rdev = 1;	/* drop the rdev's pending count at the end? */
	struct r10conf *conf = r10_bio->mddev->private;
	int slot, repl;
	struct md_rdev *rdev = NULL;
	struct bio *to_put = NULL;	/* bio to release after accounting */
	bool discard_error;

	/* a failed discard is not treated as a write error below */
	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;

	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

	if (repl)
		rdev = conf->mirrors[dev].replacement;
	if (!rdev) {
		/*
		 * Either this was a write to the primary, or the
		 * replacement pointer is now NULL; use the main rdev.
		 * NOTE(review): presumably the replacement was promoted to
		 * the main rdev in the meantime - confirm.
		 */
		smp_rmb();
		repl = 0;
		rdev = conf->mirrors[dev].rdev;
	}
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (bio->bi_status && !discard_error) {
		if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
			md_error(rdev->mddev, rdev);
		else {
			set_bit(WriteErrorSeen,	&rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);

			/*
			 * keep the pending count on the rdev.
			 * NOTE(review): presumably dropped by whoever
			 * handles R10BIO_WriteError - confirm.
			 */
			dec_rdev = 0;
			if (test_bit(FailFast, &rdev->flags) &&
			    (bio->bi_opf & MD_FAILFAST)) {
				md_error(rdev->mddev, rdev);
			}

			/*
			 * When the device is faulty, it is not necessary to
			 * handle write error.
			 */
			if (!test_bit(Faulty, &rdev->flags))
				set_bit(R10BIO_WriteError, &r10_bio->state);
			else {
				/* Fail the request */
				set_bit(R10BIO_Degraded, &r10_bio->state);
				r10_bio->devs[slot].bio = NULL;
				to_put = bio;
				dec_rdev = 1;
			}
		}
	} else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		sector_t first_bad;
		int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially use it, if the current write would have fallen
		 * before rdev->recovery_offset, but for simplicity we don't
		 * check this here).
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (is_badblock(rdev,
				r10_bio->devs[slot].addr,
				r10_bio->sectors,
				&first_bad, &bad_sectors) && !discard_error) {
			/*
			 * Release the bio and leave the IO_MADE_GOOD marker
			 * in its slot; the rdev's pending count is kept.
			 */
			bio_put(bio);
			if (repl)
				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
			else
				r10_bio->devs[slot].bio = IO_MADE_GOOD;
			dec_rdev = 0;
			set_bit(R10BIO_MadeGood, &r10_bio->state);
		}
	}

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	one_write_done(r10_bio);
	if (dec_rdev)
		rdev_dec_pending(rdev, conf->mddev);
	if (to_put)
		bio_put(to_put);
}
    542
    543/*
    544 * RAID10 layout manager
    545 * As well as the chunksize and raid_disks count, there are two
    546 * parameters: near_copies and far_copies.
    547 * near_copies * far_copies must be <= raid_disks.
    548 * Normally one of these will be 1.
    549 * If both are 1, we get raid0.
    550 * If near_copies == raid_disks, we get raid1.
    551 *
    552 * Chunks are laid out in raid0 style with near_copies copies of the
    553 * first chunk, followed by near_copies copies of the next chunk and
    554 * so on.
    555 * If far_copies > 1, then after 1/far_copies of the array has been assigned
    556 * as described above, we start again with a device offset of near_copies.
    557 * So we effectively have another copy of the whole array further down all
    558 * the drives, but with blocks on different drives.
    559 * With this layout, no block is ever stored twice on the same device.
    560 *
    561 * raid10_find_phys finds the sector offset of a given virtual sector
    562 * on each device that it is on.
    563 *
    564 * raid10_find_virt does the reverse mapping, from a device and a
    565 * sector offset to a virtual address
    566 */
    567
    568static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
    569{
    570	int n,f;
    571	sector_t sector;
    572	sector_t chunk;
    573	sector_t stripe;
    574	int dev;
    575	int slot = 0;
    576	int last_far_set_start, last_far_set_size;
    577
    578	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
    579	last_far_set_start *= geo->far_set_size;
    580
    581	last_far_set_size = geo->far_set_size;
    582	last_far_set_size += (geo->raid_disks % geo->far_set_size);
    583
    584	/* now calculate first sector/dev */
    585	chunk = r10bio->sector >> geo->chunk_shift;
    586	sector = r10bio->sector & geo->chunk_mask;
    587
    588	chunk *= geo->near_copies;
    589	stripe = chunk;
    590	dev = sector_div(stripe, geo->raid_disks);
    591	if (geo->far_offset)
    592		stripe *= geo->far_copies;
    593
    594	sector += stripe << geo->chunk_shift;
    595
    596	/* and calculate all the others */
    597	for (n = 0; n < geo->near_copies; n++) {
    598		int d = dev;
    599		int set;
    600		sector_t s = sector;
    601		r10bio->devs[slot].devnum = d;
    602		r10bio->devs[slot].addr = s;
    603		slot++;
    604
    605		for (f = 1; f < geo->far_copies; f++) {
    606			set = d / geo->far_set_size;
    607			d += geo->near_copies;
    608
    609			if ((geo->raid_disks % geo->far_set_size) &&
    610			    (d > last_far_set_start)) {
    611				d -= last_far_set_start;
    612				d %= last_far_set_size;
    613				d += last_far_set_start;
    614			} else {
    615				d %= geo->far_set_size;
    616				d += geo->far_set_size * set;
    617			}
    618			s += geo->stride;
    619			r10bio->devs[slot].devnum = d;
    620			r10bio->devs[slot].addr = s;
    621			slot++;
    622		}
    623		dev++;
    624		if (dev >= geo->raid_disks) {
    625			dev = 0;
    626			sector += (geo->chunk_mask + 1);
    627		}
    628	}
    629}
    630
    631static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
    632{
    633	struct geom *geo = &conf->geo;
    634
    635	if (conf->reshape_progress != MaxSector &&
    636	    ((r10bio->sector >= conf->reshape_progress) !=
    637	     conf->mddev->reshape_backwards)) {
    638		set_bit(R10BIO_Previous, &r10bio->state);
    639		geo = &conf->prev;
    640	} else
    641		clear_bit(R10BIO_Previous, &r10bio->state);
    642
    643	__raid10_find_phys(geo, r10bio);
    644}
    645
    646static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
    647{
    648	sector_t offset, chunk, vchunk;
    649	/* Never use conf->prev as this is only called during resync
    650	 * or recovery, so reshape isn't happening
    651	 */
    652	struct geom *geo = &conf->geo;
    653	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
    654	int far_set_size = geo->far_set_size;
    655	int last_far_set_start;
    656
    657	if (geo->raid_disks % geo->far_set_size) {
    658		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
    659		last_far_set_start *= geo->far_set_size;
    660
    661		if (dev >= last_far_set_start) {
    662			far_set_size = geo->far_set_size;
    663			far_set_size += (geo->raid_disks % geo->far_set_size);
    664			far_set_start = last_far_set_start;
    665		}
    666	}
    667
    668	offset = sector & geo->chunk_mask;
    669	if (geo->far_offset) {
    670		int fc;
    671		chunk = sector >> geo->chunk_shift;
    672		fc = sector_div(chunk, geo->far_copies);
    673		dev -= fc * geo->near_copies;
    674		if (dev < far_set_start)
    675			dev += far_set_size;
    676	} else {
    677		while (sector >= geo->stride) {
    678			sector -= geo->stride;
    679			if (dev < (geo->near_copies + far_set_start))
    680				dev += far_set_size - geo->near_copies;
    681			else
    682				dev -= geo->near_copies;
    683		}
    684		chunk = sector >> geo->chunk_shift;
    685	}
    686	vchunk = chunk * geo->raid_disks + dev;
    687	sector_div(vchunk, geo->near_copies);
    688	return (vchunk << geo->chunk_shift) + offset;
    689}
    690
    691/*
    692 * This routine returns the disk from which the requested read should
    693 * be done. There is a per-array 'next expected sequential IO' sector
    694 * number - if this matches on the next IO then we use the last disk.
    695 * There is also a per-disk 'last known head position' sector that is
    696 * maintained from IRQ contexts, both the normal and the resync IO
    697 * completion handlers update this position correctly. If there is no
    698 * perfect sequential match then we pick the disk whose head is closest.
    699 *
    700 * If there are 2 mirrors in the same 2 devices, performance degrades
    701 * because position is mirror, not device based.
    702 *
    703 * The rdev for the device selected will have nr_pending incremented.
    704 */
    705
    706/*
    707 * FIXME: possibly should rethink readbalancing and do it differently
    708 * depending on near_copies / far_copies geometry.
    709 */
/*
 * @conf:        array configuration.
 * @r10_bio:     request to read; its devs[] are filled in by
 *               raid10_find_phys() and read_slot is set on success.
 * @max_sectors: out: number of sectors that can be read from the
 *               returned device (best_good_sectors).
 *
 * Returns the chosen rdev with nr_pending incremented, or NULL when no
 * slot is usable.
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
	int do_balance;
	/*
	 * NOTE(review): best_pending_slot has no initial value; it is only
	 * read when has_nonrot_disk is set, which normally implies it was
	 * assigned in the loop below - confirm the pending == UINT_MAX case.
	 */
	int best_dist_slot, best_pending_slot;
	bool has_nonrot_disk = false;
	unsigned int min_pending;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
	best_dist_slot = -1;
	min_pending = UINT_MAX;
	best_dist_rdev = NULL;
	best_pending_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;
	clear_bit(R10BIO_FailFast, &r10_bio->state);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
	if ((conf->mddev->recovery_cp < MaxSector
	     && (this_sector + sectors >= conf->next_resync)) ||
	    (mddev_is_clustered(conf->mddev) &&
	     md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
					    this_sector + sectors)))
		do_balance = 0;

	for (slot = 0; slot < conf->copies ; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;
		unsigned int pending;
		bool nonrot;

		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		/*
		 * Prefer the replacement; fall back to the main rdev when
		 * there is none, it is Faulty, or the range reaches past
		 * its recovery_offset.
		 */
		rdev = rcu_dereference(conf->mirrors[disk].replacement);
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			rdev = rcu_dereference(conf->mirrors[disk].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags))
			continue;
		/* a device that is not In_sync is usable only below recovery_offset */
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* Already have a better slot */
				continue;
			if (first_bad <= dev_sector) {
				/* Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				/* the sectors before the first bad one are readable */
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_dist_slot = slot;
					best_dist_rdev = rdev;
				}
				if (!do_balance)
					/* Must read from here */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		if (!do_balance)
			/* not balancing: take this first fully readable slot */
			break;

		/* track the non-rotational device with the fewest pending requests */
		nonrot = bdev_nonrot(rdev->bdev);
		has_nonrot_disk |= nonrot;
		pending = atomic_read(&rdev->nr_pending);
		if (min_pending > pending && nonrot) {
			min_pending = pending;
			best_pending_slot = slot;
			best_pending_rdev = rdev;
		}

		if (best_dist_slot >= 0)
			/* At least 2 disks to choose from so failfast is OK */
			set_bit(R10BIO_FailFast, &r10_bio->state);
		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (geo->near_copies > 1 && !pending)
			new_distance = 0;

		/* for far > 1 always use the lowest address */
		else if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);

		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_dist_slot = slot;
			best_dist_rdev = rdev;
		}
	}
	/*
	 * The loop ran to completion (no 'break'): use the recorded
	 * candidate - least pending if a non-rotational disk was seen,
	 * otherwise the best distance one (slot -1 when there is none).
	 */
	if (slot >= conf->copies) {
		if (has_nonrot_disk) {
			slot = best_pending_slot;
			rdev = best_pending_rdev;
		} else {
			slot = best_dist_slot;
			rdev = best_dist_rdev;
		}
	}

	if (slot >= 0) {
		atomic_inc(&rdev->nr_pending);
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	rcu_read_unlock();
	*max_sectors = best_good_sectors;

	return rdev;
}
    857
    858static void flush_pending_writes(struct r10conf *conf)
    859{
    860	/* Any writes that have been queued but are awaiting
    861	 * bitmap updates get flushed here.
    862	 */
    863	spin_lock_irq(&conf->device_lock);
    864
    865	if (conf->pending_bio_list.head) {
    866		struct blk_plug plug;
    867		struct bio *bio;
    868
    869		bio = bio_list_get(&conf->pending_bio_list);
    870		spin_unlock_irq(&conf->device_lock);
    871
    872		/*
    873		 * As this is called in a wait_event() loop (see freeze_array),
    874		 * current->state might be TASK_UNINTERRUPTIBLE which will
    875		 * cause a warning when we prepare to wait again.  As it is
    876		 * rare that this path is taken, it is perfectly safe to force
    877		 * us to go around the wait_event() loop again, so the warning
    878		 * is a false-positive. Silence the warning by resetting
    879		 * thread state
    880		 */
    881		__set_current_state(TASK_RUNNING);
    882
    883		blk_start_plug(&plug);
    884		/* flush any pending bitmap writes to disk
    885		 * before proceeding w/ I/O */
    886		md_bitmap_unplug(conf->mddev->bitmap);
    887		wake_up(&conf->wait_barrier);
    888
    889		while (bio) { /* submit pending writes */
    890			struct bio *next = bio->bi_next;
    891			struct md_rdev *rdev = (void*)bio->bi_bdev;
    892			bio->bi_next = NULL;
    893			bio_set_dev(bio, rdev->bdev);
    894			if (test_bit(Faulty, &rdev->flags)) {
    895				bio_io_error(bio);
    896			} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
    897					    !bdev_max_discard_sectors(bio->bi_bdev)))
    898				/* Just ignore it */
    899				bio_endio(bio);
    900			else
    901				submit_bio_noacct(bio);
    902			bio = next;
    903		}
    904		blk_finish_plug(&plug);
    905	} else
    906		spin_unlock_irq(&conf->device_lock);
    907}
    908
    909/* Barriers....
    910 * Sometimes we need to suspend IO while we do something else,
    911 * either some resync/recovery, or reconfigure the array.
    912 * To do this we raise a 'barrier'.
    913 * The 'barrier' is a counter that can be raised multiple times
    914 * to count how many activities are happening which preclude
    915 * normal IO.
    916 * We can only raise the barrier if there is no pending IO.
    917 * i.e. if nr_pending == 0.
    918 * We choose only to raise the barrier if no-one is waiting for the
    919 * barrier to go down.  This means that as soon as an IO request
    920 * is ready, no other operations which require a barrier will start
    921 * until the IO request has had a chance.
    922 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
    929 */
    930
    931static void raise_barrier(struct r10conf *conf, int force)
    932{
    933	BUG_ON(force && !conf->barrier);
    934	spin_lock_irq(&conf->resync_lock);
    935
    936	/* Wait until no block IO is waiting (unless 'force') */
    937	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
    938			    conf->resync_lock);
    939
    940	/* block any new IO from starting */
    941	conf->barrier++;
    942
    943	/* Now wait for all pending IO to complete */
    944	wait_event_lock_irq(conf->wait_barrier,
    945			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
    946			    conf->resync_lock);
    947
    948	spin_unlock_irq(&conf->resync_lock);
    949}
    950
    951static void lower_barrier(struct r10conf *conf)
    952{
    953	unsigned long flags;
    954	spin_lock_irqsave(&conf->resync_lock, flags);
    955	conf->barrier--;
    956	spin_unlock_irqrestore(&conf->resync_lock, flags);
    957	wake_up(&conf->wait_barrier);
    958}
    959
    960static bool wait_barrier(struct r10conf *conf, bool nowait)
    961{
    962	bool ret = true;
    963
    964	spin_lock_irq(&conf->resync_lock);
    965	if (conf->barrier) {
    966		struct bio_list *bio_list = current->bio_list;
    967		conf->nr_waiting++;
    968		/* Wait for the barrier to drop.
    969		 * However if there are already pending
    970		 * requests (preventing the barrier from
    971		 * rising completely), and the
    972		 * pre-process bio queue isn't empty,
    973		 * then don't wait, as we need to empty
    974		 * that queue to get the nr_pending
    975		 * count down.
    976		 */
    977		/* Return false when nowait flag is set */
    978		if (nowait) {
    979			ret = false;
    980		} else {
    981			raid10_log(conf->mddev, "wait barrier");
    982			wait_event_lock_irq(conf->wait_barrier,
    983					    !conf->barrier ||
    984					    (atomic_read(&conf->nr_pending) &&
    985					     bio_list &&
    986					     (!bio_list_empty(&bio_list[0]) ||
    987					      !bio_list_empty(&bio_list[1]))) ||
    988					     /* move on if recovery thread is
    989					      * blocked by us
    990					      */
    991					     (conf->mddev->thread->tsk == current &&
    992					      test_bit(MD_RECOVERY_RUNNING,
    993						       &conf->mddev->recovery) &&
    994					      conf->nr_queued > 0),
    995					    conf->resync_lock);
    996		}
    997		conf->nr_waiting--;
    998		if (!conf->nr_waiting)
    999			wake_up(&conf->wait_barrier);
   1000	}
   1001	/* Only increment nr_pending when we wait */
   1002	if (ret)
   1003		atomic_inc(&conf->nr_pending);
   1004	spin_unlock_irq(&conf->resync_lock);
   1005	return ret;
   1006}
   1007
   1008static void allow_barrier(struct r10conf *conf)
   1009{
   1010	if ((atomic_dec_and_test(&conf->nr_pending)) ||
   1011			(conf->array_freeze_pending))
   1012		wake_up(&conf->wait_barrier);
   1013}
   1014
   1015static void freeze_array(struct r10conf *conf, int extra)
   1016{
   1017	/* stop syncio and normal IO and wait for everything to
   1018	 * go quiet.
   1019	 * We increment barrier and nr_waiting, and then
   1020	 * wait until nr_pending match nr_queued+extra
   1021	 * This is called in the context of one normal IO request
   1022	 * that has failed. Thus any sync request that might be pending
   1023	 * will be blocked by nr_pending, and we need to wait for
   1024	 * pending IO requests to complete or be queued for re-try.
   1025	 * Thus the number queued (nr_queued) plus this request (extra)
   1026	 * must match the number of pending IOs (nr_pending) before
   1027	 * we continue.
   1028	 */
   1029	spin_lock_irq(&conf->resync_lock);
   1030	conf->array_freeze_pending++;
   1031	conf->barrier++;
   1032	conf->nr_waiting++;
   1033	wait_event_lock_irq_cmd(conf->wait_barrier,
   1034				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
   1035				conf->resync_lock,
   1036				flush_pending_writes(conf));
   1037
   1038	conf->array_freeze_pending--;
   1039	spin_unlock_irq(&conf->resync_lock);
   1040}
   1041
   1042static void unfreeze_array(struct r10conf *conf)
   1043{
   1044	/* reverse the effect of the freeze */
   1045	spin_lock_irq(&conf->resync_lock);
   1046	conf->barrier--;
   1047	conf->nr_waiting--;
   1048	wake_up(&conf->wait_barrier);
   1049	spin_unlock_irq(&conf->resync_lock);
   1050}
   1051
   1052static sector_t choose_data_offset(struct r10bio *r10_bio,
   1053				   struct md_rdev *rdev)
   1054{
   1055	if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
   1056	    test_bit(R10BIO_Previous, &r10_bio->state))
   1057		return rdev->data_offset;
   1058	else
   1059		return rdev->new_data_offset;
   1060}
   1061
   1062static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
   1063{
   1064	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb);
   1065	struct mddev *mddev = plug->cb.data;
   1066	struct r10conf *conf = mddev->private;
   1067	struct bio *bio;
   1068
   1069	if (from_schedule || current->bio_list) {
   1070		spin_lock_irq(&conf->device_lock);
   1071		bio_list_merge(&conf->pending_bio_list, &plug->pending);
   1072		spin_unlock_irq(&conf->device_lock);
   1073		wake_up(&conf->wait_barrier);
   1074		md_wakeup_thread(mddev->thread);
   1075		kfree(plug);
   1076		return;
   1077	}
   1078
   1079	/* we aren't scheduling, so we can do the write-out directly. */
   1080	bio = bio_list_get(&plug->pending);
   1081	md_bitmap_unplug(mddev->bitmap);
   1082	wake_up(&conf->wait_barrier);
   1083
   1084	while (bio) { /* submit pending writes */
   1085		struct bio *next = bio->bi_next;
   1086		struct md_rdev *rdev = (void*)bio->bi_bdev;
   1087		bio->bi_next = NULL;
   1088		bio_set_dev(bio, rdev->bdev);
   1089		if (test_bit(Faulty, &rdev->flags)) {
   1090			bio_io_error(bio);
   1091		} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
   1092				    !bdev_max_discard_sectors(bio->bi_bdev)))
   1093			/* Just ignore it */
   1094			bio_endio(bio);
   1095		else
   1096			submit_bio_noacct(bio);
   1097		bio = next;
   1098	}
   1099	kfree(plug);
   1100}
   1101
   1102/*
   1103 * 1. Register the new request and wait if the reconstruction thread has put
   1104 * up a bar for new requests. Continue immediately if no resync is active
   1105 * currently.
   1106 * 2. If IO spans the reshape position.  Need to wait for reshape to pass.
   1107 */
   1108static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
   1109				 struct bio *bio, sector_t sectors)
   1110{
   1111	/* Bail out if REQ_NOWAIT is set for the bio */
   1112	if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
   1113		bio_wouldblock_error(bio);
   1114		return false;
   1115	}
   1116	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
   1117	    bio->bi_iter.bi_sector < conf->reshape_progress &&
   1118	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
   1119		allow_barrier(conf);
   1120		if (bio->bi_opf & REQ_NOWAIT) {
   1121			bio_wouldblock_error(bio);
   1122			return false;
   1123		}
   1124		raid10_log(conf->mddev, "wait reshape");
   1125		wait_event(conf->wait_barrier,
   1126			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
   1127			   conf->reshape_progress >= bio->bi_iter.bi_sector +
   1128			   sectors);
   1129		wait_barrier(conf, false);
   1130	}
   1131	return true;
   1132}
   1133
   1134static void raid10_read_request(struct mddev *mddev, struct bio *bio,
   1135				struct r10bio *r10_bio)
   1136{
   1137	struct r10conf *conf = mddev->private;
   1138	struct bio *read_bio;
   1139	const int op = bio_op(bio);
   1140	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
   1141	int max_sectors;
   1142	struct md_rdev *rdev;
   1143	char b[BDEVNAME_SIZE];
   1144	int slot = r10_bio->read_slot;
   1145	struct md_rdev *err_rdev = NULL;
   1146	gfp_t gfp = GFP_NOIO;
   1147
   1148	if (slot >= 0 && r10_bio->devs[slot].rdev) {
   1149		/*
   1150		 * This is an error retry, but we cannot
   1151		 * safely dereference the rdev in the r10_bio,
   1152		 * we must use the one in conf.
   1153		 * If it has already been disconnected (unlikely)
   1154		 * we lose the device name in error messages.
   1155		 */
   1156		int disk;
   1157		/*
   1158		 * As we are blocking raid10, it is a little safer to
   1159		 * use __GFP_HIGH.
   1160		 */
   1161		gfp = GFP_NOIO | __GFP_HIGH;
   1162
   1163		rcu_read_lock();
   1164		disk = r10_bio->devs[slot].devnum;
   1165		err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
   1166		if (err_rdev)
   1167			bdevname(err_rdev->bdev, b);
   1168		else {
   1169			strcpy(b, "???");
   1170			/* This never gets dereferenced */
   1171			err_rdev = r10_bio->devs[slot].rdev;
   1172		}
   1173		rcu_read_unlock();
   1174	}
   1175
   1176	if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors))
   1177		return;
   1178	rdev = read_balance(conf, r10_bio, &max_sectors);
   1179	if (!rdev) {
   1180		if (err_rdev) {
   1181			pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
   1182					    mdname(mddev), b,
   1183					    (unsigned long long)r10_bio->sector);
   1184		}
   1185		raid_end_bio_io(r10_bio);
   1186		return;
   1187	}
   1188	if (err_rdev)
   1189		pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n",
   1190				   mdname(mddev),
   1191				   rdev->bdev,
   1192				   (unsigned long long)r10_bio->sector);
   1193	if (max_sectors < bio_sectors(bio)) {
   1194		struct bio *split = bio_split(bio, max_sectors,
   1195					      gfp, &conf->bio_split);
   1196		bio_chain(split, bio);
   1197		allow_barrier(conf);
   1198		submit_bio_noacct(bio);
   1199		wait_barrier(conf, false);
   1200		bio = split;
   1201		r10_bio->master_bio = bio;
   1202		r10_bio->sectors = max_sectors;
   1203	}
   1204	slot = r10_bio->read_slot;
   1205
   1206	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
   1207		r10_bio->start_time = bio_start_io_acct(bio);
   1208	read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
   1209
   1210	r10_bio->devs[slot].bio = read_bio;
   1211	r10_bio->devs[slot].rdev = rdev;
   1212
   1213	read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
   1214		choose_data_offset(r10_bio, rdev);
   1215	read_bio->bi_end_io = raid10_end_read_request;
   1216	bio_set_op_attrs(read_bio, op, do_sync);
   1217	if (test_bit(FailFast, &rdev->flags) &&
   1218	    test_bit(R10BIO_FailFast, &r10_bio->state))
   1219	        read_bio->bi_opf |= MD_FAILFAST;
   1220	read_bio->bi_private = r10_bio;
   1221
   1222	if (mddev->gendisk)
   1223	        trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
   1224	                              r10_bio->sector);
   1225	submit_bio_noacct(read_bio);
   1226	return;
   1227}
   1228
   1229static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
   1230				  struct bio *bio, bool replacement,
   1231				  int n_copy)
   1232{
   1233	const int op = bio_op(bio);
   1234	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
   1235	const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
   1236	unsigned long flags;
   1237	struct blk_plug_cb *cb;
   1238	struct raid1_plug_cb *plug = NULL;
   1239	struct r10conf *conf = mddev->private;
   1240	struct md_rdev *rdev;
   1241	int devnum = r10_bio->devs[n_copy].devnum;
   1242	struct bio *mbio;
   1243
   1244	if (replacement) {
   1245		rdev = conf->mirrors[devnum].replacement;
   1246		if (rdev == NULL) {
   1247			/* Replacement just got moved to main 'rdev' */
   1248			smp_mb();
   1249			rdev = conf->mirrors[devnum].rdev;
   1250		}
   1251	} else
   1252		rdev = conf->mirrors[devnum].rdev;
   1253
   1254	mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set);
   1255	if (replacement)
   1256		r10_bio->devs[n_copy].repl_bio = mbio;
   1257	else
   1258		r10_bio->devs[n_copy].bio = mbio;
   1259
   1260	mbio->bi_iter.bi_sector	= (r10_bio->devs[n_copy].addr +
   1261				   choose_data_offset(r10_bio, rdev));
   1262	mbio->bi_end_io	= raid10_end_write_request;
   1263	bio_set_op_attrs(mbio, op, do_sync | do_fua);
   1264	if (!replacement && test_bit(FailFast,
   1265				     &conf->mirrors[devnum].rdev->flags)
   1266			 && enough(conf, devnum))
   1267		mbio->bi_opf |= MD_FAILFAST;
   1268	mbio->bi_private = r10_bio;
   1269
   1270	if (conf->mddev->gendisk)
   1271		trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
   1272				      r10_bio->sector);
   1273	/* flush_pending_writes() needs access to the rdev so...*/
   1274	mbio->bi_bdev = (void *)rdev;
   1275
   1276	atomic_inc(&r10_bio->remaining);
   1277
   1278	cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
   1279	if (cb)
   1280		plug = container_of(cb, struct raid1_plug_cb, cb);
   1281	else
   1282		plug = NULL;
   1283	if (plug) {
   1284		bio_list_add(&plug->pending, mbio);
   1285	} else {
   1286		spin_lock_irqsave(&conf->device_lock, flags);
   1287		bio_list_add(&conf->pending_bio_list, mbio);
   1288		spin_unlock_irqrestore(&conf->device_lock, flags);
   1289		md_wakeup_thread(mddev->thread);
   1290	}
   1291}
   1292
   1293static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
   1294{
   1295	int i;
   1296	struct r10conf *conf = mddev->private;
   1297	struct md_rdev *blocked_rdev;
   1298
   1299retry_wait:
   1300	blocked_rdev = NULL;
   1301	rcu_read_lock();
   1302	for (i = 0; i < conf->copies; i++) {
   1303		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
   1304		struct md_rdev *rrdev = rcu_dereference(
   1305			conf->mirrors[i].replacement);
   1306		if (rdev == rrdev)
   1307			rrdev = NULL;
   1308		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
   1309			atomic_inc(&rdev->nr_pending);
   1310			blocked_rdev = rdev;
   1311			break;
   1312		}
   1313		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
   1314			atomic_inc(&rrdev->nr_pending);
   1315			blocked_rdev = rrdev;
   1316			break;
   1317		}
   1318
   1319		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
   1320			sector_t first_bad;
   1321			sector_t dev_sector = r10_bio->devs[i].addr;
   1322			int bad_sectors;
   1323			int is_bad;
   1324
   1325			/*
   1326			 * Discard request doesn't care the write result
   1327			 * so it doesn't need to wait blocked disk here.
   1328			 */
   1329			if (!r10_bio->sectors)
   1330				continue;
   1331
   1332			is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
   1333					     &first_bad, &bad_sectors);
   1334			if (is_bad < 0) {
   1335				/*
   1336				 * Mustn't write here until the bad block
   1337				 * is acknowledged
   1338				 */
   1339				atomic_inc(&rdev->nr_pending);
   1340				set_bit(BlockedBadBlocks, &rdev->flags);
   1341				blocked_rdev = rdev;
   1342				break;
   1343			}
   1344		}
   1345	}
   1346	rcu_read_unlock();
   1347
   1348	if (unlikely(blocked_rdev)) {
   1349		/* Have to wait for this device to get unblocked, then retry */
   1350		allow_barrier(conf);
   1351		raid10_log(conf->mddev, "%s wait rdev %d blocked",
   1352				__func__, blocked_rdev->raid_disk);
   1353		md_wait_for_blocked_rdev(blocked_rdev, mddev);
   1354		wait_barrier(conf, false);
   1355		goto retry_wait;
   1356	}
   1357}
   1358
   1359static void raid10_write_request(struct mddev *mddev, struct bio *bio,
   1360				 struct r10bio *r10_bio)
   1361{
   1362	struct r10conf *conf = mddev->private;
   1363	int i;
   1364	sector_t sectors;
   1365	int max_sectors;
   1366
   1367	if ((mddev_is_clustered(mddev) &&
   1368	     md_cluster_ops->area_resyncing(mddev, WRITE,
   1369					    bio->bi_iter.bi_sector,
   1370					    bio_end_sector(bio)))) {
   1371		DEFINE_WAIT(w);
   1372		/* Bail out if REQ_NOWAIT is set for the bio */
   1373		if (bio->bi_opf & REQ_NOWAIT) {
   1374			bio_wouldblock_error(bio);
   1375			return;
   1376		}
   1377		for (;;) {
   1378			prepare_to_wait(&conf->wait_barrier,
   1379					&w, TASK_IDLE);
   1380			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
   1381				 bio->bi_iter.bi_sector, bio_end_sector(bio)))
   1382				break;
   1383			schedule();
   1384		}
   1385		finish_wait(&conf->wait_barrier, &w);
   1386	}
   1387
   1388	sectors = r10_bio->sectors;
   1389	if (!regular_request_wait(mddev, conf, bio, sectors))
   1390		return;
   1391	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
   1392	    (mddev->reshape_backwards
   1393	     ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
   1394		bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
   1395	     : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
   1396		bio->bi_iter.bi_sector < conf->reshape_progress))) {
   1397		/* Need to update reshape_position in metadata */
   1398		mddev->reshape_position = conf->reshape_progress;
   1399		set_mask_bits(&mddev->sb_flags, 0,
   1400			      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
   1401		md_wakeup_thread(mddev->thread);
   1402		if (bio->bi_opf & REQ_NOWAIT) {
   1403			allow_barrier(conf);
   1404			bio_wouldblock_error(bio);
   1405			return;
   1406		}
   1407		raid10_log(conf->mddev, "wait reshape metadata");
   1408		wait_event(mddev->sb_wait,
   1409			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
   1410
   1411		conf->reshape_safe = mddev->reshape_position;
   1412	}
   1413
   1414	/* first select target devices under rcu_lock and
   1415	 * inc refcount on their rdev.  Record them by setting
   1416	 * bios[x] to bio
   1417	 * If there are known/acknowledged bad blocks on any device
   1418	 * on which we have seen a write error, we want to avoid
   1419	 * writing to those blocks.  This potentially requires several
   1420	 * writes to write around the bad blocks.  Each set of writes
   1421	 * gets its own r10_bio with a set of bios attached.
   1422	 */
   1423
   1424	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
   1425	raid10_find_phys(conf, r10_bio);
   1426
   1427	wait_blocked_dev(mddev, r10_bio);
   1428
   1429	rcu_read_lock();
   1430	max_sectors = r10_bio->sectors;
   1431
   1432	for (i = 0;  i < conf->copies; i++) {
   1433		int d = r10_bio->devs[i].devnum;
   1434		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
   1435		struct md_rdev *rrdev = rcu_dereference(
   1436			conf->mirrors[d].replacement);
   1437		if (rdev == rrdev)
   1438			rrdev = NULL;
   1439		if (rdev && (test_bit(Faulty, &rdev->flags)))
   1440			rdev = NULL;
   1441		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
   1442			rrdev = NULL;
   1443
   1444		r10_bio->devs[i].bio = NULL;
   1445		r10_bio->devs[i].repl_bio = NULL;
   1446
   1447		if (!rdev && !rrdev) {
   1448			set_bit(R10BIO_Degraded, &r10_bio->state);
   1449			continue;
   1450		}
   1451		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
   1452			sector_t first_bad;
   1453			sector_t dev_sector = r10_bio->devs[i].addr;
   1454			int bad_sectors;
   1455			int is_bad;
   1456
   1457			is_bad = is_badblock(rdev, dev_sector, max_sectors,
   1458					     &first_bad, &bad_sectors);
   1459			if (is_bad && first_bad <= dev_sector) {
   1460				/* Cannot write here at all */
   1461				bad_sectors -= (dev_sector - first_bad);
   1462				if (bad_sectors < max_sectors)
   1463					/* Mustn't write more than bad_sectors
   1464					 * to other devices yet
   1465					 */
   1466					max_sectors = bad_sectors;
   1467				/* We don't set R10BIO_Degraded as that
   1468				 * only applies if the disk is missing,
   1469				 * so it might be re-added, and we want to
   1470				 * know to recover this chunk.
   1471				 * In this case the device is here, and the
   1472				 * fact that this chunk is not in-sync is
   1473				 * recorded in the bad block log.
   1474				 */
   1475				continue;
   1476			}
   1477			if (is_bad) {
   1478				int good_sectors = first_bad - dev_sector;
   1479				if (good_sectors < max_sectors)
   1480					max_sectors = good_sectors;
   1481			}
   1482		}
   1483		if (rdev) {
   1484			r10_bio->devs[i].bio = bio;
   1485			atomic_inc(&rdev->nr_pending);
   1486		}
   1487		if (rrdev) {
   1488			r10_bio->devs[i].repl_bio = bio;
   1489			atomic_inc(&rrdev->nr_pending);
   1490		}
   1491	}
   1492	rcu_read_unlock();
   1493
   1494	if (max_sectors < r10_bio->sectors)
   1495		r10_bio->sectors = max_sectors;
   1496
   1497	if (r10_bio->sectors < bio_sectors(bio)) {
   1498		struct bio *split = bio_split(bio, r10_bio->sectors,
   1499					      GFP_NOIO, &conf->bio_split);
   1500		bio_chain(split, bio);
   1501		allow_barrier(conf);
   1502		submit_bio_noacct(bio);
   1503		wait_barrier(conf, false);
   1504		bio = split;
   1505		r10_bio->master_bio = bio;
   1506	}
   1507
   1508	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
   1509		r10_bio->start_time = bio_start_io_acct(bio);
   1510	atomic_set(&r10_bio->remaining, 1);
   1511	md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
   1512
   1513	for (i = 0; i < conf->copies; i++) {
   1514		if (r10_bio->devs[i].bio)
   1515			raid10_write_one_disk(mddev, r10_bio, bio, false, i);
   1516		if (r10_bio->devs[i].repl_bio)
   1517			raid10_write_one_disk(mddev, r10_bio, bio, true, i);
   1518	}
   1519	one_write_done(r10_bio);
   1520}
   1521
   1522static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
   1523{
   1524	struct r10conf *conf = mddev->private;
   1525	struct r10bio *r10_bio;
   1526
   1527	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
   1528
   1529	r10_bio->master_bio = bio;
   1530	r10_bio->sectors = sectors;
   1531
   1532	r10_bio->mddev = mddev;
   1533	r10_bio->sector = bio->bi_iter.bi_sector;
   1534	r10_bio->state = 0;
   1535	r10_bio->read_slot = -1;
   1536	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
   1537			conf->geo.raid_disks);
   1538
   1539	if (bio_data_dir(bio) == READ)
   1540		raid10_read_request(mddev, bio, r10_bio);
   1541	else
   1542		raid10_write_request(mddev, bio, r10_bio);
   1543}
   1544
   1545static void raid_end_discard_bio(struct r10bio *r10bio)
   1546{
   1547	struct r10conf *conf = r10bio->mddev->private;
   1548	struct r10bio *first_r10bio;
   1549
   1550	while (atomic_dec_and_test(&r10bio->remaining)) {
   1551
   1552		allow_barrier(conf);
   1553
   1554		if (!test_bit(R10BIO_Discard, &r10bio->state)) {
   1555			first_r10bio = (struct r10bio *)r10bio->master_bio;
   1556			free_r10bio(r10bio);
   1557			r10bio = first_r10bio;
   1558		} else {
   1559			md_write_end(r10bio->mddev);
   1560			bio_endio(r10bio->master_bio);
   1561			free_r10bio(r10bio);
   1562			break;
   1563		}
   1564	}
   1565}
   1566
   1567static void raid10_end_discard_request(struct bio *bio)
   1568{
   1569	struct r10bio *r10_bio = bio->bi_private;
   1570	struct r10conf *conf = r10_bio->mddev->private;
   1571	struct md_rdev *rdev = NULL;
   1572	int dev;
   1573	int slot, repl;
   1574
   1575	/*
   1576	 * We don't care the return value of discard bio
   1577	 */
   1578	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
   1579		set_bit(R10BIO_Uptodate, &r10_bio->state);
   1580
   1581	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
   1582	if (repl)
   1583		rdev = conf->mirrors[dev].replacement;
   1584	if (!rdev) {
   1585		/*
   1586		 * raid10_remove_disk uses smp_mb to make sure rdev is set to
   1587		 * replacement before setting replacement to NULL. It can read
   1588		 * rdev first without barrier protect even replacment is NULL
   1589		 */
   1590		smp_rmb();
   1591		rdev = conf->mirrors[dev].rdev;
   1592	}
   1593
   1594	raid_end_discard_bio(r10_bio);
   1595	rdev_dec_pending(rdev, conf->mddev);
   1596}
   1597
/*
 * There are some limitations to handle discard bio
 * 1st, the discard size is bigger than stripe_size*2.
 * 2nd, if the discard bio spans reshape progress, we use the old way to
 * handle discard bio
 */
   1604static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
   1605{
   1606	struct r10conf *conf = mddev->private;
   1607	struct geom *geo = &conf->geo;
   1608	int far_copies = geo->far_copies;
   1609	bool first_copy = true;
   1610	struct r10bio *r10_bio, *first_r10bio;
   1611	struct bio *split;
   1612	int disk;
   1613	sector_t chunk;
   1614	unsigned int stripe_size;
   1615	unsigned int stripe_data_disks;
   1616	sector_t split_size;
   1617	sector_t bio_start, bio_end;
   1618	sector_t first_stripe_index, last_stripe_index;
   1619	sector_t start_disk_offset;
   1620	unsigned int start_disk_index;
   1621	sector_t end_disk_offset;
   1622	unsigned int end_disk_index;
   1623	unsigned int remainder;
   1624
   1625	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
   1626		return -EAGAIN;
   1627
   1628	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) {
   1629		bio_wouldblock_error(bio);
   1630		return 0;
   1631	}
   1632	wait_barrier(conf, false);
   1633
   1634	/*
   1635	 * Check reshape again to avoid reshape happens after checking
   1636	 * MD_RECOVERY_RESHAPE and before wait_barrier
   1637	 */
   1638	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
   1639		goto out;
   1640
   1641	if (geo->near_copies)
   1642		stripe_data_disks = geo->raid_disks / geo->near_copies +
   1643					geo->raid_disks % geo->near_copies;
   1644	else
   1645		stripe_data_disks = geo->raid_disks;
   1646
   1647	stripe_size = stripe_data_disks << geo->chunk_shift;
   1648
   1649	bio_start = bio->bi_iter.bi_sector;
   1650	bio_end = bio_end_sector(bio);
   1651
   1652	/*
   1653	 * Maybe one discard bio is smaller than strip size or across one
   1654	 * stripe and discard region is larger than one stripe size. For far
   1655	 * offset layout, if the discard region is not aligned with stripe
   1656	 * size, there is hole when we submit discard bio to member disk.
   1657	 * For simplicity, we only handle discard bio which discard region
   1658	 * is bigger than stripe_size * 2
   1659	 */
   1660	if (bio_sectors(bio) < stripe_size*2)
   1661		goto out;
   1662
   1663	/*
   1664	 * Keep bio aligned with strip size.
   1665	 */
   1666	div_u64_rem(bio_start, stripe_size, &remainder);
   1667	if (remainder) {
   1668		split_size = stripe_size - remainder;
   1669		split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
   1670		bio_chain(split, bio);
   1671		allow_barrier(conf);
   1672		/* Resend the fist split part */
   1673		submit_bio_noacct(split);
   1674		wait_barrier(conf, false);
   1675	}
   1676	div_u64_rem(bio_end, stripe_size, &remainder);
   1677	if (remainder) {
   1678		split_size = bio_sectors(bio) - remainder;
   1679		split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
   1680		bio_chain(split, bio);
   1681		allow_barrier(conf);
   1682		/* Resend the second split part */
   1683		submit_bio_noacct(bio);
   1684		bio = split;
   1685		wait_barrier(conf, false);
   1686	}
   1687
   1688	bio_start = bio->bi_iter.bi_sector;
   1689	bio_end = bio_end_sector(bio);
   1690
   1691	/*
   1692	 * Raid10 uses chunk as the unit to store data. It's similar like raid0.
   1693	 * One stripe contains the chunks from all member disk (one chunk from
   1694	 * one disk at the same HBA address). For layout detail, see 'man md 4'
   1695	 */
   1696	chunk = bio_start >> geo->chunk_shift;
   1697	chunk *= geo->near_copies;
   1698	first_stripe_index = chunk;
   1699	start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
   1700	if (geo->far_offset)
   1701		first_stripe_index *= geo->far_copies;
   1702	start_disk_offset = (bio_start & geo->chunk_mask) +
   1703				(first_stripe_index << geo->chunk_shift);
   1704
   1705	chunk = bio_end >> geo->chunk_shift;
   1706	chunk *= geo->near_copies;
   1707	last_stripe_index = chunk;
   1708	end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
   1709	if (geo->far_offset)
   1710		last_stripe_index *= geo->far_copies;
   1711	end_disk_offset = (bio_end & geo->chunk_mask) +
   1712				(last_stripe_index << geo->chunk_shift);
   1713
   1714retry_discard:
   1715	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
   1716	r10_bio->mddev = mddev;
   1717	r10_bio->state = 0;
   1718	r10_bio->sectors = 0;
   1719	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
   1720	wait_blocked_dev(mddev, r10_bio);
   1721
   1722	/*
   1723	 * For far layout it needs more than one r10bio to cover all regions.
   1724	 * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
   1725	 * to record the discard bio. Other r10bio->master_bio record the first
   1726	 * r10bio. The first r10bio only release after all other r10bios finish.
   1727	 * The discard bio returns only first r10bio finishes
   1728	 */
   1729	if (first_copy) {
   1730		r10_bio->master_bio = bio;
   1731		set_bit(R10BIO_Discard, &r10_bio->state);
   1732		first_copy = false;
   1733		first_r10bio = r10_bio;
   1734	} else
   1735		r10_bio->master_bio = (struct bio *)first_r10bio;
   1736
   1737	/*
   1738	 * first select target devices under rcu_lock and
   1739	 * inc refcount on their rdev.  Record them by setting
   1740	 * bios[x] to bio
   1741	 */
   1742	rcu_read_lock();
   1743	for (disk = 0; disk < geo->raid_disks; disk++) {
   1744		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
   1745		struct md_rdev *rrdev = rcu_dereference(
   1746			conf->mirrors[disk].replacement);
   1747
   1748		r10_bio->devs[disk].bio = NULL;
   1749		r10_bio->devs[disk].repl_bio = NULL;
   1750
   1751		if (rdev && (test_bit(Faulty, &rdev->flags)))
   1752			rdev = NULL;
   1753		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
   1754			rrdev = NULL;
   1755		if (!rdev && !rrdev)
   1756			continue;
   1757
   1758		if (rdev) {
   1759			r10_bio->devs[disk].bio = bio;
   1760			atomic_inc(&rdev->nr_pending);
   1761		}
   1762		if (rrdev) {
   1763			r10_bio->devs[disk].repl_bio = bio;
   1764			atomic_inc(&rrdev->nr_pending);
   1765		}
   1766	}
   1767	rcu_read_unlock();
   1768
   1769	atomic_set(&r10_bio->remaining, 1);
   1770	for (disk = 0; disk < geo->raid_disks; disk++) {
   1771		sector_t dev_start, dev_end;
   1772		struct bio *mbio, *rbio = NULL;
   1773
   1774		/*
   1775		 * Now start to calculate the start and end address for each disk.
   1776		 * The space between dev_start and dev_end is the discard region.
   1777		 *
   1778		 * For dev_start, it needs to consider three conditions:
   1779		 * 1st, the disk is before start_disk, you can imagine the disk in
   1780		 * the next stripe. So the dev_start is the start address of next
   1781		 * stripe.
   1782		 * 2st, the disk is after start_disk, it means the disk is at the
   1783		 * same stripe of first disk
   1784		 * 3st, the first disk itself, we can use start_disk_offset directly
   1785		 */
   1786		if (disk < start_disk_index)
   1787			dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
   1788		else if (disk > start_disk_index)
   1789			dev_start = first_stripe_index * mddev->chunk_sectors;
   1790		else
   1791			dev_start = start_disk_offset;
   1792
   1793		if (disk < end_disk_index)
   1794			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
   1795		else if (disk > end_disk_index)
   1796			dev_end = last_stripe_index * mddev->chunk_sectors;
   1797		else
   1798			dev_end = end_disk_offset;
   1799
   1800		/*
   1801		 * It only handles discard bio which size is >= stripe size, so
   1802		 * dev_end > dev_start all the time.
   1803		 * It doesn't need to use rcu lock to get rdev here. We already
   1804		 * add rdev->nr_pending in the first loop.
   1805		 */
   1806		if (r10_bio->devs[disk].bio) {
   1807			struct md_rdev *rdev = conf->mirrors[disk].rdev;
   1808			mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
   1809					       &mddev->bio_set);
   1810			mbio->bi_end_io = raid10_end_discard_request;
   1811			mbio->bi_private = r10_bio;
   1812			r10_bio->devs[disk].bio = mbio;
   1813			r10_bio->devs[disk].devnum = disk;
   1814			atomic_inc(&r10_bio->remaining);
   1815			md_submit_discard_bio(mddev, rdev, mbio,
   1816					dev_start + choose_data_offset(r10_bio, rdev),
   1817					dev_end - dev_start);
   1818			bio_endio(mbio);
   1819		}
   1820		if (r10_bio->devs[disk].repl_bio) {
   1821			struct md_rdev *rrdev = conf->mirrors[disk].replacement;
   1822			rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
   1823					       &mddev->bio_set);
   1824			rbio->bi_end_io = raid10_end_discard_request;
   1825			rbio->bi_private = r10_bio;
   1826			r10_bio->devs[disk].repl_bio = rbio;
   1827			r10_bio->devs[disk].devnum = disk;
   1828			atomic_inc(&r10_bio->remaining);
   1829			md_submit_discard_bio(mddev, rrdev, rbio,
   1830					dev_start + choose_data_offset(r10_bio, rrdev),
   1831					dev_end - dev_start);
   1832			bio_endio(rbio);
   1833		}
   1834	}
   1835
   1836	if (!geo->far_offset && --far_copies) {
   1837		first_stripe_index += geo->stride >> geo->chunk_shift;
   1838		start_disk_offset += geo->stride;
   1839		last_stripe_index += geo->stride >> geo->chunk_shift;
   1840		end_disk_offset += geo->stride;
   1841		atomic_inc(&first_r10bio->remaining);
   1842		raid_end_discard_bio(r10_bio);
   1843		wait_barrier(conf, false);
   1844		goto retry_discard;
   1845	}
   1846
   1847	raid_end_discard_bio(r10_bio);
   1848
   1849	return 0;
   1850out:
   1851	allow_barrier(conf);
   1852	return -EAGAIN;
   1853}
   1854
   1855static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
   1856{
   1857	struct r10conf *conf = mddev->private;
   1858	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
   1859	int chunk_sects = chunk_mask + 1;
   1860	int sectors = bio_sectors(bio);
   1861
   1862	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
   1863	    && md_flush_request(mddev, bio))
   1864		return true;
   1865
   1866	if (!md_write_start(mddev, bio))
   1867		return false;
   1868
   1869	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
   1870		if (!raid10_handle_discard(mddev, bio))
   1871			return true;
   1872
   1873	/*
   1874	 * If this request crosses a chunk boundary, we need to split
   1875	 * it.
   1876	 */
   1877	if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
   1878		     sectors > chunk_sects
   1879		     && (conf->geo.near_copies < conf->geo.raid_disks
   1880			 || conf->prev.near_copies <
   1881			 conf->prev.raid_disks)))
   1882		sectors = chunk_sects -
   1883			(bio->bi_iter.bi_sector &
   1884			 (chunk_sects - 1));
   1885	__make_request(mddev, bio, sectors);
   1886
   1887	/* In case raid10d snuck in to freeze_array */
   1888	wake_up(&conf->wait_barrier);
   1889	return true;
   1890}
   1891
   1892static void raid10_status(struct seq_file *seq, struct mddev *mddev)
   1893{
   1894	struct r10conf *conf = mddev->private;
   1895	int i;
   1896
   1897	if (conf->geo.near_copies < conf->geo.raid_disks)
   1898		seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
   1899	if (conf->geo.near_copies > 1)
   1900		seq_printf(seq, " %d near-copies", conf->geo.near_copies);
   1901	if (conf->geo.far_copies > 1) {
   1902		if (conf->geo.far_offset)
   1903			seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
   1904		else
   1905			seq_printf(seq, " %d far-copies", conf->geo.far_copies);
   1906		if (conf->geo.far_set_size != conf->geo.raid_disks)
   1907			seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
   1908	}
   1909	seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
   1910					conf->geo.raid_disks - mddev->degraded);
   1911	rcu_read_lock();
   1912	for (i = 0; i < conf->geo.raid_disks; i++) {
   1913		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
   1914		seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
   1915	}
   1916	rcu_read_unlock();
   1917	seq_printf(seq, "]");
   1918}
   1919
   1920/* check if there are enough drives for
   1921 * every block to appear on atleast one.
   1922 * Don't consider the device numbered 'ignore'
   1923 * as we might be about to remove it.
   1924 */
   1925static int _enough(struct r10conf *conf, int previous, int ignore)
   1926{
   1927	int first = 0;
   1928	int has_enough = 0;
   1929	int disks, ncopies;
   1930	if (previous) {
   1931		disks = conf->prev.raid_disks;
   1932		ncopies = conf->prev.near_copies;
   1933	} else {
   1934		disks = conf->geo.raid_disks;
   1935		ncopies = conf->geo.near_copies;
   1936	}
   1937
   1938	rcu_read_lock();
   1939	do {
   1940		int n = conf->copies;
   1941		int cnt = 0;
   1942		int this = first;
   1943		while (n--) {
   1944			struct md_rdev *rdev;
   1945			if (this != ignore &&
   1946			    (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
   1947			    test_bit(In_sync, &rdev->flags))
   1948				cnt++;
   1949			this = (this+1) % disks;
   1950		}
   1951		if (cnt == 0)
   1952			goto out;
   1953		first = (first + ncopies) % disks;
   1954	} while (first != 0);
   1955	has_enough = 1;
   1956out:
   1957	rcu_read_unlock();
   1958	return has_enough;
   1959}
   1960
   1961static int enough(struct r10conf *conf, int ignore)
   1962{
   1963	/* when calling 'enough', both 'prev' and 'geo' must
   1964	 * be stable.
   1965	 * This is ensured if ->reconfig_mutex or ->device_lock
   1966	 * is held.
   1967	 */
   1968	return _enough(conf, 0, ignore) &&
   1969		_enough(conf, 1, ignore);
   1970}
   1971
   1972/**
   1973 * raid10_error() - RAID10 error handler.
   1974 * @mddev: affected md device.
   1975 * @rdev: member device to fail.
   1976 *
   1977 * The routine acknowledges &rdev failure and determines new @mddev state.
   1978 * If it failed, then:
   1979 *	- &MD_BROKEN flag is set in &mddev->flags.
   1980 * Otherwise, it must be degraded:
   1981 *	- recovery is interrupted.
   1982 *	- &mddev->degraded is bumped.
   1983
   1984 * @rdev is marked as &Faulty excluding case when array is failed and
   1985 * &mddev->fail_last_dev is off.
   1986 */
   1987static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
   1988{
   1989	struct r10conf *conf = mddev->private;
   1990	unsigned long flags;
   1991
   1992	spin_lock_irqsave(&conf->device_lock, flags);
   1993
   1994	if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
   1995		set_bit(MD_BROKEN, &mddev->flags);
   1996
   1997		if (!mddev->fail_last_dev) {
   1998			spin_unlock_irqrestore(&conf->device_lock, flags);
   1999			return;
   2000		}
   2001	}
   2002	if (test_and_clear_bit(In_sync, &rdev->flags))
   2003		mddev->degraded++;
   2004
   2005	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
   2006	set_bit(Blocked, &rdev->flags);
   2007	set_bit(Faulty, &rdev->flags);
   2008	set_mask_bits(&mddev->sb_flags, 0,
   2009		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
   2010	spin_unlock_irqrestore(&conf->device_lock, flags);
   2011	pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n"
   2012		"md/raid10:%s: Operation continuing on %d devices.\n",
   2013		mdname(mddev), rdev->bdev,
   2014		mdname(mddev), conf->geo.raid_disks - mddev->degraded);
   2015}
   2016
   2017static void print_conf(struct r10conf *conf)
   2018{
   2019	int i;
   2020	struct md_rdev *rdev;
   2021
   2022	pr_debug("RAID10 conf printout:\n");
   2023	if (!conf) {
   2024		pr_debug("(!conf)\n");
   2025		return;
   2026	}
   2027	pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
   2028		 conf->geo.raid_disks);
   2029
   2030	/* This is only called with ->reconfix_mutex held, so
   2031	 * rcu protection of rdev is not needed */
   2032	for (i = 0; i < conf->geo.raid_disks; i++) {
   2033		rdev = conf->mirrors[i].rdev;
   2034		if (rdev)
   2035			pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
   2036				 i, !test_bit(In_sync, &rdev->flags),
   2037				 !test_bit(Faulty, &rdev->flags),
   2038				 rdev->bdev);
   2039	}
   2040}
   2041
   2042static void close_sync(struct r10conf *conf)
   2043{
   2044	wait_barrier(conf, false);
   2045	allow_barrier(conf);
   2046
   2047	mempool_exit(&conf->r10buf_pool);
   2048}
   2049
   2050static int raid10_spare_active(struct mddev *mddev)
   2051{
   2052	int i;
   2053	struct r10conf *conf = mddev->private;
   2054	struct raid10_info *tmp;
   2055	int count = 0;
   2056	unsigned long flags;
   2057
   2058	/*
   2059	 * Find all non-in_sync disks within the RAID10 configuration
   2060	 * and mark them in_sync
   2061	 */
   2062	for (i = 0; i < conf->geo.raid_disks; i++) {
   2063		tmp = conf->mirrors + i;
   2064		if (tmp->replacement
   2065		    && tmp->replacement->recovery_offset == MaxSector
   2066		    && !test_bit(Faulty, &tmp->replacement->flags)
   2067		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
   2068			/* Replacement has just become active */
   2069			if (!tmp->rdev
   2070			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
   2071				count++;
   2072			if (tmp->rdev) {
   2073				/* Replaced device not technically faulty,
   2074				 * but we need to be sure it gets removed
   2075				 * and never re-added.
   2076				 */
   2077				set_bit(Faulty, &tmp->rdev->flags);
   2078				sysfs_notify_dirent_safe(
   2079					tmp->rdev->sysfs_state);
   2080			}
   2081			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
   2082		} else if (tmp->rdev
   2083			   && tmp->rdev->recovery_offset == MaxSector
   2084			   && !test_bit(Faulty, &tmp->rdev->flags)
   2085			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
   2086			count++;
   2087			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
   2088		}
   2089	}
   2090	spin_lock_irqsave(&conf->device_lock, flags);
   2091	mddev->degraded -= count;
   2092	spin_unlock_irqrestore(&conf->device_lock, flags);
   2093
   2094	print_conf(conf);
   2095	return count;
   2096}
   2097
   2098static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
   2099{
   2100	struct r10conf *conf = mddev->private;
   2101	int err = -EEXIST;
   2102	int mirror;
   2103	int first = 0;
   2104	int last = conf->geo.raid_disks - 1;
   2105
   2106	if (mddev->recovery_cp < MaxSector)
   2107		/* only hot-add to in-sync arrays, as recovery is
   2108		 * very different from resync
   2109		 */
   2110		return -EBUSY;
   2111	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
   2112		return -EINVAL;
   2113
   2114	if (md_integrity_add_rdev(rdev, mddev))
   2115		return -ENXIO;
   2116
   2117	if (rdev->raid_disk >= 0)
   2118		first = last = rdev->raid_disk;
   2119
   2120	if (rdev->saved_raid_disk >= first &&
   2121	    rdev->saved_raid_disk < conf->geo.raid_disks &&
   2122	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
   2123		mirror = rdev->saved_raid_disk;
   2124	else
   2125		mirror = first;
   2126	for ( ; mirror <= last ; mirror++) {
   2127		struct raid10_info *p = &conf->mirrors[mirror];
   2128		if (p->recovery_disabled == mddev->recovery_disabled)
   2129			continue;
   2130		if (p->rdev) {
   2131			if (!test_bit(WantReplacement, &p->rdev->flags) ||
   2132			    p->replacement != NULL)
   2133				continue;
   2134			clear_bit(In_sync, &rdev->flags);
   2135			set_bit(Replacement, &rdev->flags);
   2136			rdev->raid_disk = mirror;
   2137			err = 0;
   2138			if (mddev->gendisk)
   2139				disk_stack_limits(mddev->gendisk, rdev->bdev,
   2140						  rdev->data_offset << 9);
   2141			conf->fullsync = 1;
   2142			rcu_assign_pointer(p->replacement, rdev);
   2143			break;
   2144		}
   2145
   2146		if (mddev->gendisk)
   2147			disk_stack_limits(mddev->gendisk, rdev->bdev,
   2148					  rdev->data_offset << 9);
   2149
   2150		p->head_position = 0;
   2151		p->recovery_disabled = mddev->recovery_disabled - 1;
   2152		rdev->raid_disk = mirror;
   2153		err = 0;
   2154		if (rdev->saved_raid_disk != mirror)
   2155			conf->fullsync = 1;
   2156		rcu_assign_pointer(p->rdev, rdev);
   2157		break;
   2158	}
   2159
   2160	print_conf(conf);
   2161	return err;
   2162}
   2163
   2164static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
   2165{
   2166	struct r10conf *conf = mddev->private;
   2167	int err = 0;
   2168	int number = rdev->raid_disk;
   2169	struct md_rdev **rdevp;
   2170	struct raid10_info *p = conf->mirrors + number;
   2171
   2172	print_conf(conf);
   2173	if (rdev == p->rdev)
   2174		rdevp = &p->rdev;
   2175	else if (rdev == p->replacement)
   2176		rdevp = &p->replacement;
   2177	else
   2178		return 0;
   2179
   2180	if (test_bit(In_sync, &rdev->flags) ||
   2181	    atomic_read(&rdev->nr_pending)) {
   2182		err = -EBUSY;
   2183		goto abort;
   2184	}
   2185	/* Only remove non-faulty devices if recovery
   2186	 * is not possible.
   2187	 */
   2188	if (!test_bit(Faulty, &rdev->flags) &&
   2189	    mddev->recovery_disabled != p->recovery_disabled &&
   2190	    (!p->replacement || p->replacement == rdev) &&
   2191	    number < conf->geo.raid_disks &&
   2192	    enough(conf, -1)) {
   2193		err = -EBUSY;
   2194		goto abort;
   2195	}
   2196	*rdevp = NULL;
   2197	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
   2198		synchronize_rcu();
   2199		if (atomic_read(&rdev->nr_pending)) {
   2200			/* lost the race, try later */
   2201			err = -EBUSY;
   2202			*rdevp = rdev;
   2203			goto abort;
   2204		}
   2205	}
   2206	if (p->replacement) {
   2207		/* We must have just cleared 'rdev' */
   2208		p->rdev = p->replacement;
   2209		clear_bit(Replacement, &p->replacement->flags);
   2210		smp_mb(); /* Make sure other CPUs may see both as identical
   2211			   * but will never see neither -- if they are careful.
   2212			   */
   2213		p->replacement = NULL;
   2214	}
   2215
   2216	clear_bit(WantReplacement, &rdev->flags);
   2217	err = md_integrity_register(mddev);
   2218
   2219abort:
   2220
   2221	print_conf(conf);
   2222	return err;
   2223}
   2224
   2225static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
   2226{
   2227	struct r10conf *conf = r10_bio->mddev->private;
   2228
   2229	if (!bio->bi_status)
   2230		set_bit(R10BIO_Uptodate, &r10_bio->state);
   2231	else
   2232		/* The write handler will notice the lack of
   2233		 * R10BIO_Uptodate and record any errors etc
   2234		 */
   2235		atomic_add(r10_bio->sectors,
   2236			   &conf->mirrors[d].rdev->corrected_errors);
   2237
   2238	/* for reconstruct, we always reschedule after a read.
   2239	 * for resync, only after all reads
   2240	 */
   2241	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
   2242	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
   2243	    atomic_dec_and_test(&r10_bio->remaining)) {
   2244		/* we have read all the blocks,
   2245		 * do the comparison in process context in raid10d
   2246		 */
   2247		reschedule_retry(r10_bio);
   2248	}
   2249}
   2250
   2251static void end_sync_read(struct bio *bio)
   2252{
   2253	struct r10bio *r10_bio = get_resync_r10bio(bio);
   2254	struct r10conf *conf = r10_bio->mddev->private;
   2255	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
   2256
   2257	__end_sync_read(r10_bio, bio, d);
   2258}
   2259
   2260static void end_reshape_read(struct bio *bio)
   2261{
   2262	/* reshape read bio isn't allocated from r10buf_pool */
   2263	struct r10bio *r10_bio = bio->bi_private;
   2264
   2265	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
   2266}
   2267
   2268static void end_sync_request(struct r10bio *r10_bio)
   2269{
   2270	struct mddev *mddev = r10_bio->mddev;
   2271
   2272	while (atomic_dec_and_test(&r10_bio->remaining)) {
   2273		if (r10_bio->master_bio == NULL) {
   2274			/* the primary of several recovery bios */
   2275			sector_t s = r10_bio->sectors;
   2276			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
   2277			    test_bit(R10BIO_WriteError, &r10_bio->state))
   2278				reschedule_retry(r10_bio);
   2279			else
   2280				put_buf(r10_bio);
   2281			md_done_sync(mddev, s, 1);
   2282			break;
   2283		} else {
   2284			struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
   2285			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
   2286			    test_bit(R10BIO_WriteError, &r10_bio->state))
   2287				reschedule_retry(r10_bio);
   2288			else
   2289				put_buf(r10_bio);
   2290			r10_bio = r10_bio2;
   2291		}
   2292	}
   2293}
   2294
   2295static void end_sync_write(struct bio *bio)
   2296{
   2297	struct r10bio *r10_bio = get_resync_r10bio(bio);
   2298	struct mddev *mddev = r10_bio->mddev;
   2299	struct r10conf *conf = mddev->private;
   2300	int d;
   2301	sector_t first_bad;
   2302	int bad_sectors;
   2303	int slot;
   2304	int repl;
   2305	struct md_rdev *rdev = NULL;
   2306
   2307	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
   2308	if (repl)
   2309		rdev = conf->mirrors[d].replacement;
   2310	else
   2311		rdev = conf->mirrors[d].rdev;
   2312
   2313	if (bio->bi_status) {
   2314		if (repl)
   2315			md_error(mddev, rdev);
   2316		else {
   2317			set_bit(WriteErrorSeen, &rdev->flags);
   2318			if (!test_and_set_bit(WantReplacement, &rdev->flags))
   2319				set_bit(MD_RECOVERY_NEEDED,
   2320					&rdev->mddev->recovery);
   2321			set_bit(R10BIO_WriteError, &r10_bio->state);
   2322		}
   2323	} else if (is_badblock(rdev,
   2324			     r10_bio->devs[slot].addr,
   2325			     r10_bio->sectors,
   2326			     &first_bad, &bad_sectors))
   2327		set_bit(R10BIO_MadeGood, &r10_bio->state);
   2328
   2329	rdev_dec_pending(rdev, mddev);
   2330
   2331	end_sync_request(r10_bio);
   2332}
   2333
/*
 * Note: sync and recover are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical addresses, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
/*
 * sync_request_write() - write phase of a resync request.
 * @mddev: the md device being resynced.
 * @r10_bio: the r10bio whose per-copy read bios are to be examined.
 *
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync.  The first copy whose bio carries no error status is
 * the reference; every other copy that was read is compared against it
 * and rewritten from it when it differs or could not be read.
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int i, first;
	struct bio *tbio, *fbio;
	int vcnt;
	struct page **tpages, **fpages;

	/*
	 * Hold one count on 'remaining' while writes are being submitted;
	 * it is dropped at 'done' below.
	 */
	atomic_set(&r10_bio->remaining, 1);

	/* find the first device with a block */
	for (i=0; i<conf->copies; i++)
		if (!r10_bio->devs[i].bio->bi_status)
			break;

	/* no copy has a clean status: nothing can serve as the reference */
	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;
	/* reset the reference bio's iterator to cover the whole request */
	fbio->bi_iter.bi_size = r10_bio->sectors << 9;
	fbio->bi_iter.bi_idx = 0;
	fpages = get_resync_pages(fbio)->pages;

	/* number of pages needed to hold r10_bio->sectors, rounded up */
	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
	/* now find blocks with errors */
	for (i=0 ; i < conf->copies ; i++) {
		int  j, d;
		struct md_rdev *rdev;
		struct resync_pages *rp;

		tbio = r10_bio->devs[i].bio;

		/* only copies that were set up as resync reads are candidates */
		if (tbio->bi_end_io != end_sync_read)
			continue;
		/* the reference copy needs no fixing */
		if (i == first)
			continue;

		tpages = get_resync_pages(tbio)->pages;
		d = r10_bio->devs[i].devnum;
		rdev = conf->mirrors[d].rdev;
		if (!r10_bio->devs[i].bio->bi_status) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE;
			 */
			int sectors = r10_bio->sectors;
			for (j = 0; j < vcnt; j++) {
				int len = PAGE_SIZE;
				/* the last page may be only partially used */
				if (sectors < (len / 512))
					len = sectors * 512;
				if (memcmp(page_address(fpages[j]),
					   page_address(tpages[j]),
					   len))
					break;
				sectors -= len/512;
			}
			/* every page matched: this copy is already in sync */
			if (j == vcnt)
				continue;
			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				/* Don't fix anything. */
				continue;
		} else if (test_bit(FailFast, &rdev->flags)) {
			/* Just give up on this device */
			md_error(rdev->mddev, rdev);
			continue;
		}
		/* Ok, we need to write this bio, either to correct an
		 * inconsistency or to correct an unreadable block.
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these
		 */
		rp = get_resync_pages(tbio);
		bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE);

		md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);

		/* bio_reset() wiped the bio; restore the fields set up for it */
		rp->raid_bio = r10_bio;
		tbio->bi_private = rp;
		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
		tbio->bi_end_io = end_sync_write;

		/* overwrite this copy with the reference data */
		bio_copy_data(tbio, fbio);

		/* pin the rdev; end_sync_write() drops it via rdev_dec_pending() */
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));

		if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
			tbio->bi_opf |= MD_FAILFAST;
		/* devs[i].addr is relative to the device's data area */
		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
		submit_bio_noacct(tbio);
	}

	/* Now write out to any replacement devices
	 * that are active
	 */
	for (i = 0; i < conf->copies; i++) {
		int d;

		tbio = r10_bio->devs[i].repl_bio;
		/* no replacement bio, or none scheduled, for this copy */
		if (!tbio || !tbio->bi_end_io)
			continue;
		/*
		 * Copy the reference data unless this slot's primary bio is
		 * the reference itself or was turned into a write above.
		 * NOTE(review): presumably the replacement bio already holds
		 * the right data in those two cases - confirm where the
		 * resync bios are set up.
		 */
		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
		    && r10_bio->devs[i].bio != fbio)
			bio_copy_data(tbio, fbio);
		d = r10_bio->devs[i].devnum;
		/*
		 * NOTE(review): no nr_pending increment here, unlike the loop
		 * above - presumably taken when the request was built; confirm.
		 */
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     bio_sectors(tbio));
		submit_bio_noacct(tbio);
	}

done:
	/* drop our own count; if no write is in flight the request is done */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}
   2470
/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virtual
 * address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use: the first for reading,
 * the second for writing.
 */
   2481static void fix_recovery_read_error(struct r10bio *r10_bio)
   2482{
   2483	/* We got a read error during recovery.
   2484	 * We repeat the read in smaller page-sized sections.
   2485	 * If a read succeeds, write it to the new device or record
   2486	 * a bad block if we cannot.
   2487	 * If a read fails, record a bad block on both old and
   2488	 * new devices.
   2489	 */
   2490	struct mddev *mddev = r10_bio->mddev;
   2491	struct r10conf *conf = mddev->private;
   2492	struct bio *bio = r10_bio->devs[0].bio;
   2493	sector_t sect = 0;
   2494	int sectors = r10_bio->sectors;
   2495	int idx = 0;
   2496	int dr = r10_bio->devs[0].devnum;
   2497	int dw = r10_bio->devs[1].devnum;
   2498	struct page **pages = get_resync_pages(bio)->pages;
   2499
   2500	while (sectors) {
   2501		int s = sectors;
   2502		struct md_rdev *rdev;
   2503		sector_t addr;
   2504		int ok;
   2505
   2506		if (s > (PAGE_SIZE>>9))
   2507			s = PAGE_SIZE >> 9;
   2508
   2509		rdev = conf->mirrors[dr].rdev;
   2510		addr = r10_bio->devs[0].addr + sect,
   2511		ok = sync_page_io(rdev,
   2512				  addr,
   2513				  s << 9,
   2514				  pages[idx],
   2515				  REQ_OP_READ, 0, false);
   2516		if (ok) {
   2517			rdev = conf->mirrors[dw].rdev;
   2518			addr = r10_bio->devs[1].addr + sect;
   2519			ok = sync_page_io(rdev,
   2520					  addr,
   2521					  s << 9,
   2522					  pages[idx],
   2523					  REQ_OP_WRITE, 0, false);
   2524			if (!ok) {
   2525				set_bit(WriteErrorSeen, &rdev->flags);
   2526				if (!test_and_set_bit(WantReplacement,
   2527						      &rdev->flags))
   2528					set_bit(MD_RECOVERY_NEEDED,
   2529						&rdev->mddev->recovery);
   2530			}
   2531		}
   2532		if (!ok) {
   2533			/* We don't worry if we cannot set a bad block -
   2534			 * it really is bad so there is no loss in not
   2535			 * recording it yet
   2536			 */
   2537			rdev_set_badblocks(rdev, addr, s, 0);
   2538
   2539			if (rdev != conf->mirrors[dw].rdev) {
   2540				/* need bad block on destination too */
   2541				struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
   2542				addr = r10_bio->devs[1].addr + sect;
   2543				ok = rdev_set_badblocks(rdev2, addr, s, 0);
   2544				if (!ok) {
   2545					/* just abort the recovery */
   2546					pr_notice("md/raid10:%s: recovery aborted due to read error\n",
   2547						  mdname(mddev));
   2548
   2549					conf->mirrors[dw].recovery_disabled
   2550						= mddev->recovery_disabled;
   2551					set_bit(MD_RECOVERY_INTR,
   2552						&mddev->recovery);
   2553					break;
   2554				}
   2555			}
   2556		}
   2557
   2558		sectors -= s;
   2559		sect += s;
   2560		idx++;
   2561	}
   2562}
   2563
   2564static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
   2565{
   2566	struct r10conf *conf = mddev->private;
   2567	int d;
   2568	struct bio *wbio, *wbio2;
   2569
   2570	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
   2571		fix_recovery_read_error(r10_bio);
   2572		end_sync_request(r10_bio);
   2573		return;
   2574	}
   2575
   2576	/*
   2577	 * share the pages with the first bio
   2578	 * and submit the write request
   2579	 */
   2580	d = r10_bio->devs[1].devnum;
   2581	wbio = r10_bio->devs[1].bio;
   2582	wbio2 = r10_bio->devs[1].repl_bio;
   2583	/* Need to test wbio2->bi_end_io before we call
   2584	 * submit_bio_noacct as if the former is NULL,
   2585	 * the latter is free to free wbio2.
   2586	 */
   2587	if (wbio2 && !wbio2->bi_end_io)
   2588		wbio2 = NULL;
   2589	if (wbio->bi_end_io) {
   2590		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
   2591		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
   2592		submit_bio_noacct(wbio);
   2593	}
   2594	if (wbio2) {
   2595		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
   2596		md_sync_acct(conf->mirrors[d].replacement->bdev,
   2597			     bio_sectors(wbio2));
   2598		submit_bio_noacct(wbio2);
   2599	}
   2600}
   2601
   2602/*
   2603 * Used by fix_read_error() to decay the per rdev read_errors.
   2604 * We halve the read error count for every hour that has elapsed
   2605 * since the last recorded read error.
   2606 *
   2607 */
   2608static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
   2609{
   2610	long cur_time_mon;
   2611	unsigned long hours_since_last;
   2612	unsigned int read_errors = atomic_read(&rdev->read_errors);
   2613
   2614	cur_time_mon = ktime_get_seconds();
   2615
   2616	if (rdev->last_read_error == 0) {
   2617		/* first time we've seen a read error */
   2618		rdev->last_read_error = cur_time_mon;
   2619		return;
   2620	}
   2621
   2622	hours_since_last = (long)(cur_time_mon -
   2623			    rdev->last_read_error) / 3600;
   2624
   2625	rdev->last_read_error = cur_time_mon;
   2626
   2627	/*
   2628	 * if hours_since_last is > the number of bits in read_errors
   2629	 * just set read errors to 0. We do this to avoid
   2630	 * overflowing the shift of read_errors by hours_since_last.
   2631	 */
   2632	if (hours_since_last >= 8 * sizeof(read_errors))
   2633		atomic_set(&rdev->read_errors, 0);
   2634	else
   2635		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
   2636}
   2637
   2638static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
   2639			    int sectors, struct page *page, int rw)
   2640{
   2641	sector_t first_bad;
   2642	int bad_sectors;
   2643
   2644	if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
   2645	    && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
   2646		return -1;
   2647	if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
   2648		/* success */
   2649		return 1;
   2650	if (rw == WRITE) {
   2651		set_bit(WriteErrorSeen, &rdev->flags);
   2652		if (!test_and_set_bit(WantReplacement, &rdev->flags))
   2653			set_bit(MD_RECOVERY_NEEDED,
   2654				&rdev->mddev->recovery);
   2655	}
   2656	/* need to record an error - either for the block or the device */
   2657	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
   2658		md_error(rdev->mddev, rdev);
   2659	return 0;
   2660}
   2661
/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
   2669
   2670static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
   2671{
   2672	int sect = 0; /* Offset from r10_bio->sector */
   2673	int sectors = r10_bio->sectors;
   2674	struct md_rdev *rdev;
   2675	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
   2676	int d = r10_bio->devs[r10_bio->read_slot].devnum;
   2677
   2678	/* still own a reference to this rdev, so it cannot
   2679	 * have been cleared recently.
   2680	 */
   2681	rdev = conf->mirrors[d].rdev;
   2682
   2683	if (test_bit(Faulty, &rdev->flags))
   2684		/* drive has already been failed, just ignore any
   2685		   more fix_read_error() attempts */
   2686		return;
   2687
   2688	check_decay_read_errors(mddev, rdev);
   2689	atomic_inc(&rdev->read_errors);
   2690	if (atomic_read(&rdev->read_errors) > max_read_errors) {
   2691		pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
   2692			  mdname(mddev), rdev->bdev,
   2693			  atomic_read(&rdev->read_errors), max_read_errors);
   2694		pr_notice("md/raid10:%s: %pg: Failing raid device\n",
   2695			  mdname(mddev), rdev->bdev);
   2696		md_error(mddev, rdev);
   2697		r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
   2698		return;
   2699	}
   2700
   2701	while(sectors) {
   2702		int s = sectors;
   2703		int sl = r10_bio->read_slot;
   2704		int success = 0;
   2705		int start;
   2706
   2707		if (s > (PAGE_SIZE>>9))
   2708			s = PAGE_SIZE >> 9;
   2709
   2710		rcu_read_lock();
   2711		do {
   2712			sector_t first_bad;
   2713			int bad_sectors;
   2714
   2715			d = r10_bio->devs[sl].devnum;
   2716			rdev = rcu_dereference(conf->mirrors[d].rdev);
   2717			if (rdev &&
   2718			    test_bit(In_sync, &rdev->flags) &&
   2719			    !test_bit(Faulty, &rdev->flags) &&
   2720			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
   2721					&first_bad, &bad_sectors) == 0) {
   2722				atomic_inc(&rdev->nr_pending);
   2723				rcu_read_unlock();
   2724				success = sync_page_io(rdev,
   2725						       r10_bio->devs[sl].addr +
   2726						       sect,
   2727						       s<<9,
   2728						       conf->tmppage,
   2729						       REQ_OP_READ, 0, false);
   2730				rdev_dec_pending(rdev, mddev);
   2731				rcu_read_lock();
   2732				if (success)
   2733					break;
   2734			}
   2735			sl++;
   2736			if (sl == conf->copies)
   2737				sl = 0;
   2738		} while (!success && sl != r10_bio->read_slot);
   2739		rcu_read_unlock();
   2740
   2741		if (!success) {
   2742			/* Cannot read from anywhere, just mark the block
   2743			 * as bad on the first device to discourage future
   2744			 * reads.
   2745			 */
   2746			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
   2747			rdev = conf->mirrors[dn].rdev;
   2748
   2749			if (!rdev_set_badblocks(
   2750				    rdev,
   2751				    r10_bio->devs[r10_bio->read_slot].addr
   2752				    + sect,
   2753				    s, 0)) {
   2754				md_error(mddev, rdev);
   2755				r10_bio->devs[r10_bio->read_slot].bio
   2756					= IO_BLOCKED;
   2757			}
   2758			break;
   2759		}
   2760
   2761		start = sl;
   2762		/* write it back and re-read */
   2763		rcu_read_lock();
   2764		while (sl != r10_bio->read_slot) {
   2765			if (sl==0)
   2766				sl = conf->copies;
   2767			sl--;
   2768			d = r10_bio->devs[sl].devnum;
   2769			rdev = rcu_dereference(conf->mirrors[d].rdev);
   2770			if (!rdev ||
   2771			    test_bit(Faulty, &rdev->flags) ||
   2772			    !test_bit(In_sync, &rdev->flags))
   2773				continue;
   2774
   2775			atomic_inc(&rdev->nr_pending);
   2776			rcu_read_unlock();
   2777			if (r10_sync_page_io(rdev,
   2778					     r10_bio->devs[sl].addr +
   2779					     sect,
   2780					     s, conf->tmppage, WRITE)
   2781			    == 0) {
   2782				/* Well, this device is dead */
   2783				pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n",
   2784					  mdname(mddev), s,
   2785					  (unsigned long long)(
   2786						  sect +
   2787						  choose_data_offset(r10_bio,
   2788								     rdev)),
   2789					  rdev->bdev);
   2790				pr_notice("md/raid10:%s: %pg: failing drive\n",
   2791					  mdname(mddev),
   2792					  rdev->bdev);
   2793			}
   2794			rdev_dec_pending(rdev, mddev);
   2795			rcu_read_lock();
   2796		}
   2797		sl = start;
   2798		while (sl != r10_bio->read_slot) {
   2799			if (sl==0)
   2800				sl = conf->copies;
   2801			sl--;
   2802			d = r10_bio->devs[sl].devnum;
   2803			rdev = rcu_dereference(conf->mirrors[d].rdev);
   2804			if (!rdev ||
   2805			    test_bit(Faulty, &rdev->flags) ||
   2806			    !test_bit(In_sync, &rdev->flags))
   2807				continue;
   2808
   2809			atomic_inc(&rdev->nr_pending);
   2810			rcu_read_unlock();
   2811			switch (r10_sync_page_io(rdev,
   2812					     r10_bio->devs[sl].addr +
   2813					     sect,
   2814					     s, conf->tmppage,
   2815						 READ)) {
   2816			case 0:
   2817				/* Well, this device is dead */
   2818				pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n",
   2819				       mdname(mddev), s,
   2820				       (unsigned long long)(
   2821					       sect +
   2822					       choose_data_offset(r10_bio, rdev)),
   2823				       rdev->bdev);
   2824				pr_notice("md/raid10:%s: %pg: failing drive\n",
   2825				       mdname(mddev),
   2826				       rdev->bdev);
   2827				break;
   2828			case 1:
   2829				pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n",
   2830				       mdname(mddev), s,
   2831				       (unsigned long long)(
   2832					       sect +
   2833					       choose_data_offset(r10_bio, rdev)),
   2834				       rdev->bdev);
   2835				atomic_add(s, &rdev->corrected_errors);
   2836			}
   2837
   2838			rdev_dec_pending(rdev, mddev);
   2839			rcu_read_lock();
   2840		}
   2841		rcu_read_unlock();
   2842
   2843		sectors -= s;
   2844		sect += s;
   2845	}
   2846}
   2847
   2848static int narrow_write_error(struct r10bio *r10_bio, int i)
   2849{
   2850	struct bio *bio = r10_bio->master_bio;
   2851	struct mddev *mddev = r10_bio->mddev;
   2852	struct r10conf *conf = mddev->private;
   2853	struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
   2854	/* bio has the data to be written to slot 'i' where
   2855	 * we just recently had a write error.
   2856	 * We repeatedly clone the bio and trim down to one block,
   2857	 * then try the write.  Where the write fails we record
   2858	 * a bad block.
   2859	 * It is conceivable that the bio doesn't exactly align with
   2860	 * blocks.  We must handle this.
   2861	 *
   2862	 * We currently own a reference to the rdev.
   2863	 */
   2864
   2865	int block_sectors;
   2866	sector_t sector;
   2867	int sectors;
   2868	int sect_to_write = r10_bio->sectors;
   2869	int ok = 1;
   2870
   2871	if (rdev->badblocks.shift < 0)
   2872		return 0;
   2873
   2874	block_sectors = roundup(1 << rdev->badblocks.shift,
   2875				bdev_logical_block_size(rdev->bdev) >> 9);
   2876	sector = r10_bio->sector;
   2877	sectors = ((r10_bio->sector + block_sectors)
   2878		   & ~(sector_t)(block_sectors - 1))
   2879		- sector;
   2880
   2881	while (sect_to_write) {
   2882		struct bio *wbio;
   2883		sector_t wsector;
   2884		if (sectors > sect_to_write)
   2885			sectors = sect_to_write;
   2886		/* Write at 'sector' for 'sectors' */
   2887		wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO,
   2888				       &mddev->bio_set);
   2889		bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
   2890		wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
   2891		wbio->bi_iter.bi_sector = wsector +
   2892				   choose_data_offset(r10_bio, rdev);
   2893		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
   2894
   2895		if (submit_bio_wait(wbio) < 0)
   2896			/* Failure! */
   2897			ok = rdev_set_badblocks(rdev, wsector,
   2898						sectors, 0)
   2899				&& ok;
   2900
   2901		bio_put(wbio);
   2902		sect_to_write -= sectors;
   2903		sector += sectors;
   2904		sectors = block_sectors;
   2905	}
   2906	return ok;
   2907}
   2908
   2909static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
   2910{
   2911	int slot = r10_bio->read_slot;
   2912	struct bio *bio;
   2913	struct r10conf *conf = mddev->private;
   2914	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
   2915
   2916	/* we got a read error. Maybe the drive is bad.  Maybe just
   2917	 * the block and we can fix it.
   2918	 * We freeze all other IO, and try reading the block from
   2919	 * other devices.  When we find one, we re-write
   2920	 * and check it that fixes the read error.
   2921	 * This is all done synchronously while the array is
   2922	 * frozen.
   2923	 */
   2924	bio = r10_bio->devs[slot].bio;
   2925	bio_put(bio);
   2926	r10_bio->devs[slot].bio = NULL;
   2927
   2928	if (mddev->ro)
   2929		r10_bio->devs[slot].bio = IO_BLOCKED;
   2930	else if (!test_bit(FailFast, &rdev->flags)) {
   2931		freeze_array(conf, 1);
   2932		fix_read_error(conf, mddev, r10_bio);
   2933		unfreeze_array(conf);
   2934	} else
   2935		md_error(mddev, rdev);
   2936
   2937	rdev_dec_pending(rdev, mddev);
   2938	allow_barrier(conf);
   2939	r10_bio->state = 0;
   2940	raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
   2941}
   2942
   2943static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
   2944{
   2945	/* Some sort of write request has finished and it
   2946	 * succeeded in writing where we thought there was a
   2947	 * bad block.  So forget the bad block.
   2948	 * Or possibly if failed and we need to record
   2949	 * a bad block.
   2950	 */
   2951	int m;
   2952	struct md_rdev *rdev;
   2953
   2954	if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
   2955	    test_bit(R10BIO_IsRecover, &r10_bio->state)) {
   2956		for (m = 0; m < conf->copies; m++) {
   2957			int dev = r10_bio->devs[m].devnum;
   2958			rdev = conf->mirrors[dev].rdev;
   2959			if (r10_bio->devs[m].bio == NULL ||
   2960				r10_bio->devs[m].bio->bi_end_io == NULL)
   2961				continue;
   2962			if (!r10_bio->devs[m].bio->bi_status) {
   2963				rdev_clear_badblocks(
   2964					rdev,
   2965					r10_bio->devs[m].addr,
   2966					r10_bio->sectors, 0);
   2967			} else {
   2968				if (!rdev_set_badblocks(
   2969					    rdev,
   2970					    r10_bio->devs[m].addr,
   2971					    r10_bio->sectors, 0))
   2972					md_error(conf->mddev, rdev);
   2973			}
   2974			rdev = conf->mirrors[dev].replacement;
   2975			if (r10_bio->devs[m].repl_bio == NULL ||
   2976				r10_bio->devs[m].repl_bio->bi_end_io == NULL)
   2977				continue;
   2978
   2979			if (!r10_bio->devs[m].repl_bio->bi_status) {
   2980				rdev_clear_badblocks(
   2981					rdev,
   2982					r10_bio->devs[m].addr,
   2983					r10_bio->sectors, 0);
   2984			} else {
   2985				if (!rdev_set_badblocks(
   2986					    rdev,
   2987					    r10_bio->devs[m].addr,
   2988					    r10_bio->sectors, 0))
   2989					md_error(conf->mddev, rdev);
   2990			}
   2991		}
   2992		put_buf(r10_bio);
   2993	} else {
   2994		bool fail = false;
   2995		for (m = 0; m < conf->copies; m++) {
   2996			int dev = r10_bio->devs[m].devnum;
   2997			struct bio *bio = r10_bio->devs[m].bio;
   2998			rdev = conf->mirrors[dev].rdev;
   2999			if (bio == IO_MADE_GOOD) {
   3000				rdev_clear_badblocks(
   3001					rdev,
   3002					r10_bio->devs[m].addr,
   3003					r10_bio->sectors, 0);
   3004				rdev_dec_pending(rdev, conf->mddev);
   3005			} else if (bio != NULL && bio->bi_status) {
   3006				fail = true;
   3007				if (!narrow_write_error(r10_bio, m)) {
   3008					md_error(conf->mddev, rdev);
   3009					set_bit(R10BIO_Degraded,
   3010						&r10_bio->state);
   3011				}
   3012				rdev_dec_pending(rdev, conf->mddev);
   3013			}
   3014			bio = r10_bio->devs[m].repl_bio;
   3015			rdev = conf->mirrors[dev].replacement;
   3016			if (rdev && bio == IO_MADE_GOOD) {
   3017				rdev_clear_badblocks(
   3018					rdev,
   3019					r10_bio->devs[m].addr,
   3020					r10_bio->sectors, 0);
   3021				rdev_dec_pending(rdev, conf->mddev);
   3022			}
   3023		}
   3024		if (fail) {
   3025			spin_lock_irq(&conf->device_lock);
   3026			list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
   3027			conf->nr_queued++;
   3028			spin_unlock_irq(&conf->device_lock);
   3029			/*
   3030			 * In case freeze_array() is waiting for condition
   3031			 * nr_pending == nr_queued + extra to be true.
   3032			 */
   3033			wake_up(&conf->wait_barrier);
   3034			md_wakeup_thread(conf->mddev->thread);
   3035		} else {
   3036			if (test_bit(R10BIO_WriteError,
   3037				     &r10_bio->state))
   3038				close_write(r10_bio);
   3039			raid_end_bio_io(r10_bio);
   3040		}
   3041	}
   3042}
   3043
   3044static void raid10d(struct md_thread *thread)
   3045{
   3046	struct mddev *mddev = thread->mddev;
   3047	struct r10bio *r10_bio;
   3048	unsigned long flags;
   3049	struct r10conf *conf = mddev->private;
   3050	struct list_head *head = &conf->retry_list;
   3051	struct blk_plug plug;
   3052
   3053	md_check_recovery(mddev);
   3054
   3055	if (!list_empty_careful(&conf->bio_end_io_list) &&
   3056	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
   3057		LIST_HEAD(tmp);
   3058		spin_lock_irqsave(&conf->device_lock, flags);
   3059		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
   3060			while (!list_empty(&conf->bio_end_io_list)) {
   3061				list_move(conf->bio_end_io_list.prev, &tmp);
   3062				conf->nr_queued--;
   3063			}
   3064		}
   3065		spin_unlock_irqrestore(&conf->device_lock, flags);
   3066		while (!list_empty(&tmp)) {
   3067			r10_bio = list_first_entry(&tmp, struct r10bio,
   3068						   retry_list);
   3069			list_del(&r10_bio->retry_list);
   3070			if (mddev->degraded)
   3071				set_bit(R10BIO_Degraded, &r10_bio->state);
   3072
   3073			if (test_bit(R10BIO_WriteError,
   3074				     &r10_bio->state))
   3075				close_write(r10_bio);
   3076			raid_end_bio_io(r10_bio);
   3077		}
   3078	}
   3079
   3080	blk_start_plug(&plug);
   3081	for (;;) {
   3082
   3083		flush_pending_writes(conf);
   3084
   3085		spin_lock_irqsave(&conf->device_lock, flags);
   3086		if (list_empty(head)) {
   3087			spin_unlock_irqrestore(&conf->device_lock, flags);
   3088			break;
   3089		}
   3090		r10_bio = list_entry(head->prev, struct r10bio, retry_list);
   3091		list_del(head->prev);
   3092		conf->nr_queued--;
   3093		spin_unlock_irqrestore(&conf->device_lock, flags);
   3094
   3095		mddev = r10_bio->mddev;
   3096		conf = mddev->private;
   3097		if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
   3098		    test_bit(R10BIO_WriteError, &r10_bio->state))
   3099			handle_write_completed(conf, r10_bio);
   3100		else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
   3101			reshape_request_write(mddev, r10_bio);
   3102		else if (test_bit(R10BIO_IsSync, &r10_bio->state))
   3103			sync_request_write(mddev, r10_bio);
   3104		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
   3105			recovery_request_write(mddev, r10_bio);
   3106		else if (test_bit(R10BIO_ReadError, &r10_bio->state))
   3107			handle_read_error(mddev, r10_bio);
   3108		else
   3109			WARN_ON_ONCE(1);
   3110
   3111		cond_resched();
   3112		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
   3113			md_check_recovery(mddev);
   3114	}
   3115	blk_finish_plug(&plug);
   3116}
   3117
   3118static int init_resync(struct r10conf *conf)
   3119{
   3120	int ret, buffs, i;
   3121
   3122	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
   3123	BUG_ON(mempool_initialized(&conf->r10buf_pool));
   3124	conf->have_replacement = 0;
   3125	for (i = 0; i < conf->geo.raid_disks; i++)
   3126		if (conf->mirrors[i].replacement)
   3127			conf->have_replacement = 1;
   3128	ret = mempool_init(&conf->r10buf_pool, buffs,
   3129			   r10buf_pool_alloc, r10buf_pool_free, conf);
   3130	if (ret)
   3131		return ret;
   3132	conf->next_resync = 0;
   3133	return 0;
   3134}
   3135
   3136static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
   3137{
   3138	struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
   3139	struct rsync_pages *rp;
   3140	struct bio *bio;
   3141	int nalloc;
   3142	int i;
   3143
   3144	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
   3145	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
   3146		nalloc = conf->copies; /* resync */
   3147	else
   3148		nalloc = 2; /* recovery */
   3149
   3150	for (i = 0; i < nalloc; i++) {
   3151		bio = r10bio->devs[i].bio;
   3152		rp = bio->bi_private;
   3153		bio_reset(bio, NULL, 0);
   3154		bio->bi_private = rp;
   3155		bio = r10bio->devs[i].repl_bio;
   3156		if (bio) {
   3157			rp = bio->bi_private;
   3158			bio_reset(bio, NULL, 0);
   3159			bio->bi_private = rp;
   3160		}
   3161	}
   3162	return r10bio;
   3163}
   3164
   3165/*
   3166 * Set cluster_sync_high since we need other nodes to add the
   3167 * range [cluster_sync_low, cluster_sync_high] to suspend list.
   3168 */
   3169static void raid10_set_cluster_sync_high(struct r10conf *conf)
   3170{
   3171	sector_t window_size;
   3172	int extra_chunk, chunks;
   3173
   3174	/*
   3175	 * First, here we define "stripe" as a unit which across
   3176	 * all member devices one time, so we get chunks by use
   3177	 * raid_disks / near_copies. Otherwise, if near_copies is
   3178	 * close to raid_disks, then resync window could increases
   3179	 * linearly with the increase of raid_disks, which means
   3180	 * we will suspend a really large IO window while it is not
   3181	 * necessary. If raid_disks is not divisible by near_copies,
   3182	 * an extra chunk is needed to ensure the whole "stripe" is
   3183	 * covered.
   3184	 */
   3185
   3186	chunks = conf->geo.raid_disks / conf->geo.near_copies;
   3187	if (conf->geo.raid_disks % conf->geo.near_copies == 0)
   3188		extra_chunk = 0;
   3189	else
   3190		extra_chunk = 1;
   3191	window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
   3192
   3193	/*
   3194	 * At least use a 32M window to align with raid1's resync window
   3195	 */
   3196	window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
   3197			CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
   3198
   3199	conf->cluster_sync_high = conf->cluster_sync_low + window_size;
   3200}
   3201
   3202/*
   3203 * perform a "sync" on one "block"
   3204 *
   3205 * We need to make sure that no normal I/O request - particularly write
   3206 * requests - conflict with active sync requests.
   3207 *
   3208 * This is achieved by tracking pending requests and a 'barrier' concept
   3209 * that can be installed to exclude normal IO requests.
   3210 *
   3211 * Resync and recovery are handled very differently.
   3212 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
   3213 *
   3214 * For resync, we iterate over virtual addresses, read all copies,
   3215 * and update if there are differences.  If only one copy is live,
   3216 * skip it.
   3217 * For recovery, we iterate over physical addresses, read a good
   3218 * value for each non-in_sync drive, and over-write.
   3219 *
   3220 * So, for recovery we may have several outstanding complex requests for a
   3221 * given address, one for each out-of-sync device.  We model this by allocating
   3222 * a number of r10_bio structures, one for each out-of-sync device.
 * As we set up these structures, we collect all the bios together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to submit_bio_noacct.
   3226 *
   3227 * The r10_bio structures are linked using a borrowed master_bio pointer.
   3228 * This link is counted in ->remaining.  When the r10_bio that points to NULL
   3229 * has its remaining count decremented to 0, the whole complex operation
   3230 * is complete.
   3231 *
   3232 */
   3233
   3234static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
   3235			     int *skipped)
   3236{
   3237	struct r10conf *conf = mddev->private;
   3238	struct r10bio *r10_bio;
   3239	struct bio *biolist = NULL, *bio;
   3240	sector_t max_sector, nr_sectors;
   3241	int i;
   3242	int max_sync;
   3243	sector_t sync_blocks;
   3244	sector_t sectors_skipped = 0;
   3245	int chunks_skipped = 0;
   3246	sector_t chunk_mask = conf->geo.chunk_mask;
   3247	int page_idx = 0;
   3248
   3249	if (!mempool_initialized(&conf->r10buf_pool))
   3250		if (init_resync(conf))
   3251			return 0;
   3252
   3253	/*
   3254	 * Allow skipping a full rebuild for incremental assembly
   3255	 * of a clean array, like RAID1 does.
   3256	 */
   3257	if (mddev->bitmap == NULL &&
   3258	    mddev->recovery_cp == MaxSector &&
   3259	    mddev->reshape_position == MaxSector &&
   3260	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
   3261	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
   3262	    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
   3263	    conf->fullsync == 0) {
   3264		*skipped = 1;
   3265		return mddev->dev_sectors - sector_nr;
   3266	}
   3267
   3268 skipped:
   3269	max_sector = mddev->dev_sectors;
   3270	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
   3271	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
   3272		max_sector = mddev->resync_max_sectors;
   3273	if (sector_nr >= max_sector) {
   3274		conf->cluster_sync_low = 0;
   3275		conf->cluster_sync_high = 0;
   3276
   3277		/* If we aborted, we need to abort the
   3278		 * sync on the 'current' bitmap chucks (there can
   3279		 * be several when recovering multiple devices).
   3280		 * as we may have started syncing it but not finished.
   3281		 * We can find the current address in
   3282		 * mddev->curr_resync, but for recovery,
   3283		 * we need to convert that to several
   3284		 * virtual addresses.
   3285		 */
   3286		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
   3287			end_reshape(conf);
   3288			close_sync(conf);
   3289			return 0;
   3290		}
   3291
   3292		if (mddev->curr_resync < max_sector) { /* aborted */
   3293			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
   3294				md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
   3295						   &sync_blocks, 1);
   3296			else for (i = 0; i < conf->geo.raid_disks; i++) {
   3297				sector_t sect =
   3298					raid10_find_virt(conf, mddev->curr_resync, i);
   3299				md_bitmap_end_sync(mddev->bitmap, sect,
   3300						   &sync_blocks, 1);
   3301			}
   3302		} else {
   3303			/* completed sync */
   3304			if ((!mddev->bitmap || conf->fullsync)
   3305			    && conf->have_replacement
   3306			    && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
   3307				/* Completed a full sync so the replacements
   3308				 * are now fully recovered.
   3309				 */
   3310				rcu_read_lock();
   3311				for (i = 0; i < conf->geo.raid_disks; i++) {
   3312					struct md_rdev *rdev =
   3313						rcu_dereference(conf->mirrors[i].replacement);
   3314					if (rdev)
   3315						rdev->recovery_offset = MaxSector;
   3316				}
   3317				rcu_read_unlock();
   3318			}
   3319			conf->fullsync = 0;
   3320		}
   3321		md_bitmap_close_sync(mddev->bitmap);
   3322		close_sync(conf);
   3323		*skipped = 1;
   3324		return sectors_skipped;
   3325	}
   3326
   3327	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
   3328		return reshape_request(mddev, sector_nr, skipped);
   3329
   3330	if (chunks_skipped >= conf->geo.raid_disks) {
   3331		/* if there has been nothing to do on any drive,
   3332		 * then there is nothing to do at all..
   3333		 */
   3334		*skipped = 1;
   3335		return (max_sector - sector_nr) + sectors_skipped;
   3336	}
   3337
   3338	if (max_sector > mddev->resync_max)
   3339		max_sector = mddev->resync_max; /* Don't do IO beyond here */
   3340
   3341	/* make sure whole request will fit in a chunk - if chunks
   3342	 * are meaningful
   3343	 */
   3344	if (conf->geo.near_copies < conf->geo.raid_disks &&
   3345	    max_sector > (sector_nr | chunk_mask))
   3346		max_sector = (sector_nr | chunk_mask) + 1;
   3347
   3348	/*
   3349	 * If there is non-resync activity waiting for a turn, then let it
   3350	 * though before starting on this new sync request.
   3351	 */
   3352	if (conf->nr_waiting)
   3353		schedule_timeout_uninterruptible(1);
   3354
   3355	/* Again, very different code for resync and recovery.
   3356	 * Both must result in an r10bio with a list of bios that
   3357	 * have bi_end_io, bi_sector, bi_bdev set,
   3358	 * and bi_private set to the r10bio.
   3359	 * For recovery, we may actually create several r10bios
   3360	 * with 2 bios in each, that correspond to the bios in the main one.
   3361	 * In this case, the subordinate r10bios link back through a
   3362	 * borrowed master_bio pointer, and the counter in the master
   3363	 * includes a ref from each subordinate.
   3364	 */
   3365	/* First, we decide what to do and set ->bi_end_io
   3366	 * To end_sync_read if we want to read, and
   3367	 * end_sync_write if we will want to write.
   3368	 */
   3369
   3370	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
   3371	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
   3372		/* recovery... the complicated one */
   3373		int j;
   3374		r10_bio = NULL;
   3375
   3376		for (i = 0 ; i < conf->geo.raid_disks; i++) {
   3377			int still_degraded;
   3378			struct r10bio *rb2;
   3379			sector_t sect;
   3380			int must_sync;
   3381			int any_working;
   3382			int need_recover = 0;
   3383			int need_replace = 0;
   3384			struct raid10_info *mirror = &conf->mirrors[i];
   3385			struct md_rdev *mrdev, *mreplace;
   3386
   3387			rcu_read_lock();
   3388			mrdev = rcu_dereference(mirror->rdev);
   3389			mreplace = rcu_dereference(mirror->replacement);
   3390
   3391			if (mrdev != NULL &&
   3392			    !test_bit(Faulty, &mrdev->flags) &&
   3393			    !test_bit(In_sync, &mrdev->flags))
   3394				need_recover = 1;
   3395			if (mreplace != NULL &&
   3396			    !test_bit(Faulty, &mreplace->flags))
   3397				need_replace = 1;
   3398
   3399			if (!need_recover && !need_replace) {
   3400				rcu_read_unlock();
   3401				continue;
   3402			}
   3403
   3404			still_degraded = 0;
   3405			/* want to reconstruct this device */
   3406			rb2 = r10_bio;
   3407			sect = raid10_find_virt(conf, sector_nr, i);
   3408			if (sect >= mddev->resync_max_sectors) {
   3409				/* last stripe is not complete - don't
   3410				 * try to recover this sector.
   3411				 */
   3412				rcu_read_unlock();
   3413				continue;
   3414			}
   3415			if (mreplace && test_bit(Faulty, &mreplace->flags))
   3416				mreplace = NULL;
   3417			/* Unless we are doing a full sync, or a replacement
   3418			 * we only need to recover the block if it is set in
   3419			 * the bitmap
   3420			 */
   3421			must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
   3422							 &sync_blocks, 1);
   3423			if (sync_blocks < max_sync)
   3424				max_sync = sync_blocks;
   3425			if (!must_sync &&
   3426			    mreplace == NULL &&
   3427			    !conf->fullsync) {
   3428				/* yep, skip the sync_blocks here, but don't assume
   3429				 * that there will never be anything to do here
   3430				 */
   3431				chunks_skipped = -1;
   3432				rcu_read_unlock();
   3433				continue;
   3434			}
   3435			atomic_inc(&mrdev->nr_pending);
   3436			if (mreplace)
   3437				atomic_inc(&mreplace->nr_pending);
   3438			rcu_read_unlock();
   3439
   3440			r10_bio = raid10_alloc_init_r10buf(conf);
   3441			r10_bio->state = 0;
   3442			raise_barrier(conf, rb2 != NULL);
   3443			atomic_set(&r10_bio->remaining, 0);
   3444
   3445			r10_bio->master_bio = (struct bio*)rb2;
   3446			if (rb2)
   3447				atomic_inc(&rb2->remaining);
   3448			r10_bio->mddev = mddev;
   3449			set_bit(R10BIO_IsRecover, &r10_bio->state);
   3450			r10_bio->sector = sect;
   3451
   3452			raid10_find_phys(conf, r10_bio);
   3453
   3454			/* Need to check if the array will still be
   3455			 * degraded
   3456			 */
   3457			rcu_read_lock();
   3458			for (j = 0; j < conf->geo.raid_disks; j++) {
   3459				struct md_rdev *rdev = rcu_dereference(
   3460					conf->mirrors[j].rdev);
   3461				if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
   3462					still_degraded = 1;
   3463					break;
   3464				}
   3465			}
   3466
   3467			must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
   3468							 &sync_blocks, still_degraded);
   3469
   3470			any_working = 0;
   3471			for (j=0; j<conf->copies;j++) {
   3472				int k;
   3473				int d = r10_bio->devs[j].devnum;
   3474				sector_t from_addr, to_addr;
   3475				struct md_rdev *rdev =
   3476					rcu_dereference(conf->mirrors[d].rdev);
   3477				sector_t sector, first_bad;
   3478				int bad_sectors;
   3479				if (!rdev ||
   3480				    !test_bit(In_sync, &rdev->flags))
   3481					continue;
   3482				/* This is where we read from */
   3483				any_working = 1;
   3484				sector = r10_bio->devs[j].addr;
   3485
   3486				if (is_badblock(rdev, sector, max_sync,
   3487						&first_bad, &bad_sectors)) {
   3488					if (first_bad > sector)
   3489						max_sync = first_bad - sector;
   3490					else {
   3491						bad_sectors -= (sector
   3492								- first_bad);
   3493						if (max_sync > bad_sectors)
   3494							max_sync = bad_sectors;
   3495						continue;
   3496					}
   3497				}
   3498				bio = r10_bio->devs[0].bio;
   3499				bio->bi_next = biolist;
   3500				biolist = bio;
   3501				bio->bi_end_io = end_sync_read;
   3502				bio_set_op_attrs(bio, REQ_OP_READ, 0);
   3503				if (test_bit(FailFast, &rdev->flags))
   3504					bio->bi_opf |= MD_FAILFAST;
   3505				from_addr = r10_bio->devs[j].addr;
   3506				bio->bi_iter.bi_sector = from_addr +
   3507					rdev->data_offset;
   3508				bio_set_dev(bio, rdev->bdev);
   3509				atomic_inc(&rdev->nr_pending);
   3510				/* and we write to 'i' (if not in_sync) */
   3511
   3512				for (k=0; k<conf->copies; k++)
   3513					if (r10_bio->devs[k].devnum == i)
   3514						break;
   3515				BUG_ON(k == conf->copies);
   3516				to_addr = r10_bio->devs[k].addr;
   3517				r10_bio->devs[0].devnum = d;
   3518				r10_bio->devs[0].addr = from_addr;
   3519				r10_bio->devs[1].devnum = i;
   3520				r10_bio->devs[1].addr = to_addr;
   3521
   3522				if (need_recover) {
   3523					bio = r10_bio->devs[1].bio;
   3524					bio->bi_next = biolist;
   3525					biolist = bio;
   3526					bio->bi_end_io = end_sync_write;
   3527					bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
   3528					bio->bi_iter.bi_sector = to_addr
   3529						+ mrdev->data_offset;
   3530					bio_set_dev(bio, mrdev->bdev);
   3531					atomic_inc(&r10_bio->remaining);
   3532				} else
   3533					r10_bio->devs[1].bio->bi_end_io = NULL;
   3534
   3535				/* and maybe write to replacement */
   3536				bio = r10_bio->devs[1].repl_bio;
   3537				if (bio)
   3538					bio->bi_end_io = NULL;
   3539				/* Note: if need_replace, then bio
   3540				 * cannot be NULL as r10buf_pool_alloc will
   3541				 * have allocated it.
   3542				 */
   3543				if (!need_replace)
   3544					break;
   3545				bio->bi_next = biolist;
   3546				biolist = bio;
   3547				bio->bi_end_io = end_sync_write;
   3548				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
   3549				bio->bi_iter.bi_sector = to_addr +
   3550					mreplace->data_offset;
   3551				bio_set_dev(bio, mreplace->bdev);
   3552				atomic_inc(&r10_bio->remaining);
   3553				break;
   3554			}
   3555			rcu_read_unlock();
   3556			if (j == conf->copies) {
   3557				/* Cannot recover, so abort the recovery or
   3558				 * record a bad block */
   3559				if (any_working) {
   3560					/* problem is that there are bad blocks
   3561					 * on other device(s)
   3562					 */
   3563					int k;
   3564					for (k = 0; k < conf->copies; k++)
   3565						if (r10_bio->devs[k].devnum == i)
   3566							break;
   3567					if (!test_bit(In_sync,
   3568						      &mrdev->flags)
   3569					    && !rdev_set_badblocks(
   3570						    mrdev,
   3571						    r10_bio->devs[k].addr,
   3572						    max_sync, 0))
   3573						any_working = 0;
   3574					if (mreplace &&
   3575					    !rdev_set_badblocks(
   3576						    mreplace,
   3577						    r10_bio->devs[k].addr,
   3578						    max_sync, 0))
   3579						any_working = 0;
   3580				}
   3581				if (!any_working)  {
   3582					if (!test_and_set_bit(MD_RECOVERY_INTR,
   3583							      &mddev->recovery))
   3584						pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
   3585						       mdname(mddev));
   3586					mirror->recovery_disabled
   3587						= mddev->recovery_disabled;
   3588				}
   3589				put_buf(r10_bio);
   3590				if (rb2)
   3591					atomic_dec(&rb2->remaining);
   3592				r10_bio = rb2;
   3593				rdev_dec_pending(mrdev, mddev);
   3594				if (mreplace)
   3595					rdev_dec_pending(mreplace, mddev);
   3596				break;
   3597			}
   3598			rdev_dec_pending(mrdev, mddev);
   3599			if (mreplace)
   3600				rdev_dec_pending(mreplace, mddev);
   3601			if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
   3602				/* Only want this if there is elsewhere to
   3603				 * read from. 'j' is currently the first
   3604				 * readable copy.
   3605				 */
   3606				int targets = 1;
   3607				for (; j < conf->copies; j++) {
   3608					int d = r10_bio->devs[j].devnum;
   3609					if (conf->mirrors[d].rdev &&
   3610					    test_bit(In_sync,
   3611						      &conf->mirrors[d].rdev->flags))
   3612						targets++;
   3613				}
   3614				if (targets == 1)
   3615					r10_bio->devs[0].bio->bi_opf
   3616						&= ~MD_FAILFAST;
   3617			}
   3618		}
   3619		if (biolist == NULL) {
   3620			while (r10_bio) {
   3621				struct r10bio *rb2 = r10_bio;
   3622				r10_bio = (struct r10bio*) rb2->master_bio;
   3623				rb2->master_bio = NULL;
   3624				put_buf(rb2);
   3625			}
   3626			goto giveup;
   3627		}
   3628	} else {
   3629		/* resync. Schedule a read for every block at this virt offset */
   3630		int count = 0;
   3631
   3632		/*
   3633		 * Since curr_resync_completed could probably not update in
   3634		 * time, and we will set cluster_sync_low based on it.
   3635		 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
   3636		 * safety reason, which ensures curr_resync_completed is
   3637		 * updated in bitmap_cond_end_sync.
   3638		 */
   3639		md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
   3640					mddev_is_clustered(mddev) &&
   3641					(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
   3642
   3643		if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
   3644					  &sync_blocks, mddev->degraded) &&
   3645		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
   3646						 &mddev->recovery)) {
   3647			/* We can skip this block */
   3648			*skipped = 1;
   3649			return sync_blocks + sectors_skipped;
   3650		}
   3651		if (sync_blocks < max_sync)
   3652			max_sync = sync_blocks;
   3653		r10_bio = raid10_alloc_init_r10buf(conf);
   3654		r10_bio->state = 0;
   3655
   3656		r10_bio->mddev = mddev;
   3657		atomic_set(&r10_bio->remaining, 0);
   3658		raise_barrier(conf, 0);
   3659		conf->next_resync = sector_nr;
   3660
   3661		r10_bio->master_bio = NULL;
   3662		r10_bio->sector = sector_nr;
   3663		set_bit(R10BIO_IsSync, &r10_bio->state);
   3664		raid10_find_phys(conf, r10_bio);
   3665		r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
   3666
   3667		for (i = 0; i < conf->copies; i++) {
   3668			int d = r10_bio->devs[i].devnum;
   3669			sector_t first_bad, sector;
   3670			int bad_sectors;
   3671			struct md_rdev *rdev;
   3672
   3673			if (r10_bio->devs[i].repl_bio)
   3674				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
   3675
   3676			bio = r10_bio->devs[i].bio;
   3677			bio->bi_status = BLK_STS_IOERR;
   3678			rcu_read_lock();
   3679			rdev = rcu_dereference(conf->mirrors[d].rdev);
   3680			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
   3681				rcu_read_unlock();
   3682				continue;
   3683			}
   3684			sector = r10_bio->devs[i].addr;
   3685			if (is_badblock(rdev, sector, max_sync,
   3686					&first_bad, &bad_sectors)) {
   3687				if (first_bad > sector)
   3688					max_sync = first_bad - sector;
   3689				else {
   3690					bad_sectors -= (sector - first_bad);
   3691					if (max_sync > bad_sectors)
   3692						max_sync = bad_sectors;
   3693					rcu_read_unlock();
   3694					continue;
   3695				}
   3696			}
   3697			atomic_inc(&rdev->nr_pending);
   3698			atomic_inc(&r10_bio->remaining);
   3699			bio->bi_next = biolist;
   3700			biolist = bio;
   3701			bio->bi_end_io = end_sync_read;
   3702			bio_set_op_attrs(bio, REQ_OP_READ, 0);
   3703			if (test_bit(FailFast, &rdev->flags))
   3704				bio->bi_opf |= MD_FAILFAST;
   3705			bio->bi_iter.bi_sector = sector + rdev->data_offset;
   3706			bio_set_dev(bio, rdev->bdev);
   3707			count++;
   3708
   3709			rdev = rcu_dereference(conf->mirrors[d].replacement);
   3710			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
   3711				rcu_read_unlock();
   3712				continue;
   3713			}
   3714			atomic_inc(&rdev->nr_pending);
   3715
   3716			/* Need to set up for writing to the replacement */
   3717			bio = r10_bio->devs[i].repl_bio;
   3718			bio->bi_status = BLK_STS_IOERR;
   3719
   3720			sector = r10_bio->devs[i].addr;
   3721			bio->bi_next = biolist;
   3722			biolist = bio;
   3723			bio->bi_end_io = end_sync_write;
   3724			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
   3725			if (test_bit(FailFast, &rdev->flags))
   3726				bio->bi_opf |= MD_FAILFAST;
   3727			bio->bi_iter.bi_sector = sector + rdev->data_offset;
   3728			bio_set_dev(bio, rdev->bdev);
   3729			count++;
   3730			rcu_read_unlock();
   3731		}
   3732
   3733		if (count < 2) {
   3734			for (i=0; i<conf->copies; i++) {
   3735				int d = r10_bio->devs[i].devnum;
   3736				if (r10_bio->devs[i].bio->bi_end_io)
   3737					rdev_dec_pending(conf->mirrors[d].rdev,
   3738							 mddev);
   3739				if (r10_bio->devs[i].repl_bio &&
   3740				    r10_bio->devs[i].repl_bio->bi_end_io)
   3741					rdev_dec_pending(
   3742						conf->mirrors[d].replacement,
   3743						mddev);
   3744			}
   3745			put_buf(r10_bio);
   3746			biolist = NULL;
   3747			goto giveup;
   3748		}
   3749	}
   3750
   3751	nr_sectors = 0;
   3752	if (sector_nr + max_sync < max_sector)
   3753		max_sector = sector_nr + max_sync;
   3754	do {
   3755		struct page *page;
   3756		int len = PAGE_SIZE;
   3757		if (sector_nr + (len>>9) > max_sector)
   3758			len = (max_sector - sector_nr) << 9;
   3759		if (len == 0)
   3760			break;
   3761		for (bio= biolist ; bio ; bio=bio->bi_next) {
   3762			struct resync_pages *rp = get_resync_pages(bio);
   3763			page = resync_fetch_page(rp, page_idx);
   3764			/*
   3765			 * won't fail because the vec table is big enough
   3766			 * to hold all these pages
   3767			 */
   3768			bio_add_page(bio, page, len, 0);
   3769		}
   3770		nr_sectors += len>>9;
   3771		sector_nr += len>>9;
   3772	} while (++page_idx < RESYNC_PAGES);
   3773	r10_bio->sectors = nr_sectors;
   3774
   3775	if (mddev_is_clustered(mddev) &&
   3776	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
   3777		/* It is resync not recovery */
   3778		if (conf->cluster_sync_high < sector_nr + nr_sectors) {
   3779			conf->cluster_sync_low = mddev->curr_resync_completed;
   3780			raid10_set_cluster_sync_high(conf);
   3781			/* Send resync message */
   3782			md_cluster_ops->resync_info_update(mddev,
   3783						conf->cluster_sync_low,
   3784						conf->cluster_sync_high);
   3785		}
   3786	} else if (mddev_is_clustered(mddev)) {
   3787		/* This is recovery not resync */
   3788		sector_t sect_va1, sect_va2;
   3789		bool broadcast_msg = false;
   3790
   3791		for (i = 0; i < conf->geo.raid_disks; i++) {
   3792			/*
   3793			 * sector_nr is a device address for recovery, so we
   3794			 * need translate it to array address before compare
   3795			 * with cluster_sync_high.
   3796			 */
   3797			sect_va1 = raid10_find_virt(conf, sector_nr, i);
   3798
   3799			if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
   3800				broadcast_msg = true;
   3801				/*
   3802				 * curr_resync_completed is similar as
   3803				 * sector_nr, so make the translation too.
   3804				 */
   3805				sect_va2 = raid10_find_virt(conf,
   3806					mddev->curr_resync_completed, i);
   3807
   3808				if (conf->cluster_sync_low == 0 ||
   3809				    conf->cluster_sync_low > sect_va2)
   3810					conf->cluster_sync_low = sect_va2;
   3811			}
   3812		}
   3813		if (broadcast_msg) {
   3814			raid10_set_cluster_sync_high(conf);
   3815			md_cluster_ops->resync_info_update(mddev,
   3816						conf->cluster_sync_low,
   3817						conf->cluster_sync_high);
   3818		}
   3819	}
   3820
   3821	while (biolist) {
   3822		bio = biolist;
   3823		biolist = biolist->bi_next;
   3824
   3825		bio->bi_next = NULL;
   3826		r10_bio = get_resync_r10bio(bio);
   3827		r10_bio->sectors = nr_sectors;
   3828
   3829		if (bio->bi_end_io == end_sync_read) {
   3830			md_sync_acct_bio(bio, nr_sectors);
   3831			bio->bi_status = 0;
   3832			submit_bio_noacct(bio);
   3833		}
   3834	}
   3835
   3836	if (sectors_skipped)
   3837		/* pretend they weren't skipped, it makes
   3838		 * no important difference in this case
   3839		 */
   3840		md_done_sync(mddev, sectors_skipped, 1);
   3841
   3842	return sectors_skipped + nr_sectors;
   3843 giveup:
   3844	/* There is nowhere to write, so all non-sync
   3845	 * drives must be failed or in resync, all drives
   3846	 * have a bad block, so try the next chunk...
   3847	 */
   3848	if (sector_nr + max_sync < max_sector)
   3849		max_sector = sector_nr + max_sync;
   3850
   3851	sectors_skipped += (max_sector - sector_nr);
   3852	chunks_skipped ++;
   3853	sector_nr = max_sector;
   3854	goto skipped;
   3855}
   3856
   3857static sector_t
   3858raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
   3859{
   3860	sector_t size;
   3861	struct r10conf *conf = mddev->private;
   3862
   3863	if (!raid_disks)
   3864		raid_disks = min(conf->geo.raid_disks,
   3865				 conf->prev.raid_disks);
   3866	if (!sectors)
   3867		sectors = conf->dev_sectors;
   3868
   3869	size = sectors >> conf->geo.chunk_shift;
   3870	sector_div(size, conf->geo.far_copies);
   3871	size = size * raid_disks;
   3872	sector_div(size, conf->geo.near_copies);
   3873
   3874	return size << conf->geo.chunk_shift;
   3875}
   3876
   3877static void calc_sectors(struct r10conf *conf, sector_t size)
   3878{
   3879	/* Calculate the number of sectors-per-device that will
   3880	 * actually be used, and set conf->dev_sectors and
   3881	 * conf->stride
   3882	 */
   3883
   3884	size = size >> conf->geo.chunk_shift;
   3885	sector_div(size, conf->geo.far_copies);
   3886	size = size * conf->geo.raid_disks;
   3887	sector_div(size, conf->geo.near_copies);
   3888	/* 'size' is now the number of chunks in the array */
   3889	/* calculate "used chunks per device" */
   3890	size = size * conf->copies;
   3891
   3892	/* We need to round up when dividing by raid_disks to
   3893	 * get the stride size.
   3894	 */
   3895	size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
   3896
   3897	conf->dev_sectors = size << conf->geo.chunk_shift;
   3898
   3899	if (conf->geo.far_offset)
   3900		conf->geo.stride = 1 << conf->geo.chunk_shift;
   3901	else {
   3902		sector_div(size, conf->geo.far_copies);
   3903		conf->geo.stride = size << conf->geo.chunk_shift;
   3904	}
   3905}
   3906
   3907enum geo_type {geo_new, geo_old, geo_start};
   3908static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
   3909{
   3910	int nc, fc, fo;
   3911	int layout, chunk, disks;
   3912	switch (new) {
   3913	case geo_old:
   3914		layout = mddev->layout;
   3915		chunk = mddev->chunk_sectors;
   3916		disks = mddev->raid_disks - mddev->delta_disks;
   3917		break;
   3918	case geo_new:
   3919		layout = mddev->new_layout;
   3920		chunk = mddev->new_chunk_sectors;
   3921		disks = mddev->raid_disks;
   3922		break;
   3923	default: /* avoid 'may be unused' warnings */
   3924	case geo_start: /* new when starting reshape - raid_disks not
   3925			 * updated yet. */
   3926		layout = mddev->new_layout;
   3927		chunk = mddev->new_chunk_sectors;
   3928		disks = mddev->raid_disks + mddev->delta_disks;
   3929		break;
   3930	}
   3931	if (layout >> 19)
   3932		return -1;
   3933	if (chunk < (PAGE_SIZE >> 9) ||
   3934	    !is_power_of_2(chunk))
   3935		return -2;
   3936	nc = layout & 255;
   3937	fc = (layout >> 8) & 255;
   3938	fo = layout & (1<<16);
   3939	geo->raid_disks = disks;
   3940	geo->near_copies = nc;
   3941	geo->far_copies = fc;
   3942	geo->far_offset = fo;
   3943	switch (layout >> 17) {
   3944	case 0:	/* original layout.  simple but not always optimal */
   3945		geo->far_set_size = disks;
   3946		break;
   3947	case 1: /* "improved" layout which was buggy.  Hopefully no-one is
   3948		 * actually using this, but leave code here just in case.*/
   3949		geo->far_set_size = disks/fc;
   3950		WARN(geo->far_set_size < fc,
   3951		     "This RAID10 layout does not provide data safety - please backup and create new array\n");
   3952		break;
   3953	case 2: /* "improved" layout fixed to match documentation */
   3954		geo->far_set_size = fc * nc;
   3955		break;
   3956	default: /* Not a valid layout */
   3957		return -1;
   3958	}
   3959	geo->chunk_mask = chunk - 1;
   3960	geo->chunk_shift = ffz(~chunk);
   3961	return nc*fc;
   3962}
   3963
   3964static struct r10conf *setup_conf(struct mddev *mddev)
   3965{
   3966	struct r10conf *conf = NULL;
   3967	int err = -EINVAL;
   3968	struct geom geo;
   3969	int copies;
   3970
   3971	copies = setup_geo(&geo, mddev, geo_new);
   3972
   3973	if (copies == -2) {
   3974		pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
   3975			mdname(mddev), PAGE_SIZE);
   3976		goto out;
   3977	}
   3978
   3979	if (copies < 2 || copies > mddev->raid_disks) {
   3980		pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
   3981			mdname(mddev), mddev->new_layout);
   3982		goto out;
   3983	}
   3984
   3985	err = -ENOMEM;
   3986	conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
   3987	if (!conf)
   3988		goto out;
   3989
   3990	/* FIXME calc properly */
   3991	conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
   3992				sizeof(struct raid10_info),
   3993				GFP_KERNEL);
   3994	if (!conf->mirrors)
   3995		goto out;
   3996
   3997	conf->tmppage = alloc_page(GFP_KERNEL);
   3998	if (!conf->tmppage)
   3999		goto out;
   4000
   4001	conf->geo = geo;
   4002	conf->copies = copies;
   4003	err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
   4004			   rbio_pool_free, conf);
   4005	if (err)
   4006		goto out;
   4007
   4008	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
   4009	if (err)
   4010		goto out;
   4011
   4012	calc_sectors(conf, mddev->dev_sectors);
   4013	if (mddev->reshape_position == MaxSector) {
   4014		conf->prev = conf->geo;
   4015		conf->reshape_progress = MaxSector;
   4016	} else {
   4017		if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
   4018			err = -EINVAL;
   4019			goto out;
   4020		}
   4021		conf->reshape_progress = mddev->reshape_position;
   4022		if (conf->prev.far_offset)
   4023			conf->prev.stride = 1 << conf->prev.chunk_shift;
   4024		else
   4025			/* far_copies must be 1 */
   4026			conf->prev.stride = conf->dev_sectors;
   4027	}
   4028	conf->reshape_safe = conf->reshape_progress;
   4029	spin_lock_init(&conf->device_lock);
   4030	INIT_LIST_HEAD(&conf->retry_list);
   4031	INIT_LIST_HEAD(&conf->bio_end_io_list);
   4032
   4033	spin_lock_init(&conf->resync_lock);
   4034	init_waitqueue_head(&conf->wait_barrier);
   4035	atomic_set(&conf->nr_pending, 0);
   4036
   4037	err = -ENOMEM;
   4038	conf->thread = md_register_thread(raid10d, mddev, "raid10");
   4039	if (!conf->thread)
   4040		goto out;
   4041
   4042	conf->mddev = mddev;
   4043	return conf;
   4044
   4045 out:
   4046	if (conf) {
   4047		mempool_exit(&conf->r10bio_pool);
   4048		kfree(conf->mirrors);
   4049		safe_put_page(conf->tmppage);
   4050		bioset_exit(&conf->bio_split);
   4051		kfree(conf);
   4052	}
   4053	return ERR_PTR(err);
   4054}
   4055
   4056static void raid10_set_io_opt(struct r10conf *conf)
   4057{
   4058	int raid_disks = conf->geo.raid_disks;
   4059
   4060	if (!(conf->geo.raid_disks % conf->geo.near_copies))
   4061		raid_disks /= conf->geo.near_copies;
   4062	blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
   4063			 raid_disks);
   4064}
   4065
   4066static int raid10_run(struct mddev *mddev)
   4067{
   4068	struct r10conf *conf;
   4069	int i, disk_idx;
   4070	struct raid10_info *disk;
   4071	struct md_rdev *rdev;
   4072	sector_t size;
   4073	sector_t min_offset_diff = 0;
   4074	int first = 1;
   4075
   4076	if (mddev_init_writes_pending(mddev) < 0)
   4077		return -ENOMEM;
   4078
   4079	if (mddev->private == NULL) {
   4080		conf = setup_conf(mddev);
   4081		if (IS_ERR(conf))
   4082			return PTR_ERR(conf);
   4083		mddev->private = conf;
   4084	}
   4085	conf = mddev->private;
   4086	if (!conf)
   4087		goto out;
   4088
   4089	if (mddev_is_clustered(conf->mddev)) {
   4090		int fc, fo;
   4091
   4092		fc = (mddev->layout >> 8) & 255;
   4093		fo = mddev->layout & (1<<16);
   4094		if (fc > 1 || fo > 0) {
   4095			pr_err("only near layout is supported by clustered"
   4096				" raid10\n");
   4097			goto out_free_conf;
   4098		}
   4099	}
   4100
   4101	mddev->thread = conf->thread;
   4102	conf->thread = NULL;
   4103
   4104	if (mddev->queue) {
   4105		blk_queue_max_discard_sectors(mddev->queue,
   4106					      UINT_MAX);
   4107		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
   4108		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
   4109		raid10_set_io_opt(conf);
   4110	}
   4111
   4112	rdev_for_each(rdev, mddev) {
   4113		long long diff;
   4114
   4115		disk_idx = rdev->raid_disk;
   4116		if (disk_idx < 0)
   4117			continue;
   4118		if (disk_idx >= conf->geo.raid_disks &&
   4119		    disk_idx >= conf->prev.raid_disks)
   4120			continue;
   4121		disk = conf->mirrors + disk_idx;
   4122
   4123		if (test_bit(Replacement, &rdev->flags)) {
   4124			if (disk->replacement)
   4125				goto out_free_conf;
   4126			disk->replacement = rdev;
   4127		} else {
   4128			if (disk->rdev)
   4129				goto out_free_conf;
   4130			disk->rdev = rdev;
   4131		}
   4132		diff = (rdev->new_data_offset - rdev->data_offset);
   4133		if (!mddev->reshape_backwards)
   4134			diff = -diff;
   4135		if (diff < 0)
   4136			diff = 0;
   4137		if (first || diff < min_offset_diff)
   4138			min_offset_diff = diff;
   4139
   4140		if (mddev->gendisk)
   4141			disk_stack_limits(mddev->gendisk, rdev->bdev,
   4142					  rdev->data_offset << 9);
   4143
   4144		disk->head_position = 0;
   4145		first = 0;
   4146	}
   4147
   4148	/* need to check that every block has at least one working mirror */
   4149	if (!enough(conf, -1)) {
   4150		pr_err("md/raid10:%s: not enough operational mirrors.\n",
   4151		       mdname(mddev));
   4152		goto out_free_conf;
   4153	}
   4154
   4155	if (conf->reshape_progress != MaxSector) {
   4156		/* must ensure that shape change is supported */
   4157		if (conf->geo.far_copies != 1 &&
   4158		    conf->geo.far_offset == 0)
   4159			goto out_free_conf;
   4160		if (conf->prev.far_copies != 1 &&
   4161		    conf->prev.far_offset == 0)
   4162			goto out_free_conf;
   4163	}
   4164
   4165	mddev->degraded = 0;
   4166	for (i = 0;
   4167	     i < conf->geo.raid_disks
   4168		     || i < conf->prev.raid_disks;
   4169	     i++) {
   4170
   4171		disk = conf->mirrors + i;
   4172
   4173		if (!disk->rdev && disk->replacement) {
   4174			/* The replacement is all we have - use it */
   4175			disk->rdev = disk->replacement;
   4176			disk->replacement = NULL;
   4177			clear_bit(Replacement, &disk->rdev->flags);
   4178		}
   4179
   4180		if (!disk->rdev ||
   4181		    !test_bit(In_sync, &disk->rdev->flags)) {
   4182			disk->head_position = 0;
   4183			mddev->degraded++;
   4184			if (disk->rdev &&
   4185			    disk->rdev->saved_raid_disk < 0)
   4186				conf->fullsync = 1;
   4187		}
   4188
   4189		if (disk->replacement &&
   4190		    !test_bit(In_sync, &disk->replacement->flags) &&
   4191		    disk->replacement->saved_raid_disk < 0) {
   4192			conf->fullsync = 1;
   4193		}
   4194
   4195		disk->recovery_disabled = mddev->recovery_disabled - 1;
   4196	}
   4197
   4198	if (mddev->recovery_cp != MaxSector)
   4199		pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
   4200			  mdname(mddev));
   4201	pr_info("md/raid10:%s: active with %d out of %d devices\n",
   4202		mdname(mddev), conf->geo.raid_disks - mddev->degraded,
   4203		conf->geo.raid_disks);
   4204	/*
   4205	 * Ok, everything is just fine now
   4206	 */
   4207	mddev->dev_sectors = conf->dev_sectors;
   4208	size = raid10_size(mddev, 0, 0);
   4209	md_set_array_sectors(mddev, size);
   4210	mddev->resync_max_sectors = size;
   4211	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
   4212
   4213	if (md_integrity_register(mddev))
   4214		goto out_free_conf;
   4215
   4216	if (conf->reshape_progress != MaxSector) {
   4217		unsigned long before_length, after_length;
   4218
   4219		before_length = ((1 << conf->prev.chunk_shift) *
   4220				 conf->prev.far_copies);
   4221		after_length = ((1 << conf->geo.chunk_shift) *
   4222				conf->geo.far_copies);
   4223
   4224		if (max(before_length, after_length) > min_offset_diff) {
   4225			/* This cannot work */
   4226			pr_warn("md/raid10: offset difference not enough to continue reshape\n");
   4227			goto out_free_conf;
   4228		}
   4229		conf->offset_diff = min_offset_diff;
   4230
   4231		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
   4232		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
   4233		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
   4234		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
   4235		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
   4236							"reshape");
   4237		if (!mddev->sync_thread)
   4238			goto out_free_conf;
   4239	}
   4240
   4241	return 0;
   4242
   4243out_free_conf:
   4244	md_unregister_thread(&mddev->thread);
   4245	mempool_exit(&conf->r10bio_pool);
   4246	safe_put_page(conf->tmppage);
   4247	kfree(conf->mirrors);
   4248	kfree(conf);
   4249	mddev->private = NULL;
   4250out:
   4251	return -EIO;
   4252}
   4253
   4254static void raid10_free(struct mddev *mddev, void *priv)
   4255{
   4256	struct r10conf *conf = priv;
   4257
   4258	mempool_exit(&conf->r10bio_pool);
   4259	safe_put_page(conf->tmppage);
   4260	kfree(conf->mirrors);
   4261	kfree(conf->mirrors_old);
   4262	kfree(conf->mirrors_new);
   4263	bioset_exit(&conf->bio_split);
   4264	kfree(conf);
   4265}
   4266
   4267static void raid10_quiesce(struct mddev *mddev, int quiesce)
   4268{
   4269	struct r10conf *conf = mddev->private;
   4270
   4271	if (quiesce)
   4272		raise_barrier(conf, 0);
   4273	else
   4274		lower_barrier(conf);
   4275}
   4276
   4277static int raid10_resize(struct mddev *mddev, sector_t sectors)
   4278{
   4279	/* Resize of 'far' arrays is not supported.
   4280	 * For 'near' and 'offset' arrays we can set the
   4281	 * number of sectors used to be an appropriate multiple
   4282	 * of the chunk size.
   4283	 * For 'offset', this is far_copies*chunksize.
   4284	 * For 'near' the multiplier is the LCM of
   4285	 * near_copies and raid_disks.
   4286	 * So if far_copies > 1 && !far_offset, fail.
   4287	 * Else find LCM(raid_disks, near_copy)*far_copies and
   4288	 * multiply by chunk_size.  Then round to this number.
   4289	 * This is mostly done by raid10_size()
   4290	 */
   4291	struct r10conf *conf = mddev->private;
   4292	sector_t oldsize, size;
   4293
   4294	if (mddev->reshape_position != MaxSector)
   4295		return -EBUSY;
   4296
   4297	if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
   4298		return -EINVAL;
   4299
   4300	oldsize = raid10_size(mddev, 0, 0);
   4301	size = raid10_size(mddev, sectors, 0);
   4302	if (mddev->external_size &&
   4303	    mddev->array_sectors > size)
   4304		return -EINVAL;
   4305	if (mddev->bitmap) {
   4306		int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
   4307		if (ret)
   4308			return ret;
   4309	}
   4310	md_set_array_sectors(mddev, size);
   4311	if (sectors > mddev->dev_sectors &&
   4312	    mddev->recovery_cp > oldsize) {
   4313		mddev->recovery_cp = oldsize;
   4314		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
   4315	}
   4316	calc_sectors(conf, sectors);
   4317	mddev->dev_sectors = conf->dev_sectors;
   4318	mddev->resync_max_sectors = size;
   4319	return 0;
   4320}
   4321
   4322static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
   4323{
   4324	struct md_rdev *rdev;
   4325	struct r10conf *conf;
   4326
   4327	if (mddev->degraded > 0) {
   4328		pr_warn("md/raid10:%s: Error: degraded raid0!\n",
   4329			mdname(mddev));
   4330		return ERR_PTR(-EINVAL);
   4331	}
   4332	sector_div(size, devs);
   4333
   4334	/* Set new parameters */
   4335	mddev->new_level = 10;
   4336	/* new layout: far_copies = 1, near_copies = 2 */
   4337	mddev->new_layout = (1<<8) + 2;
   4338	mddev->new_chunk_sectors = mddev->chunk_sectors;
   4339	mddev->delta_disks = mddev->raid_disks;
   4340	mddev->raid_disks *= 2;
   4341	/* make sure it will be not marked as dirty */
   4342	mddev->recovery_cp = MaxSector;
   4343	mddev->dev_sectors = size;
   4344
   4345	conf = setup_conf(mddev);
   4346	if (!IS_ERR(conf)) {
   4347		rdev_for_each(rdev, mddev)
   4348			if (rdev->raid_disk >= 0) {
   4349				rdev->new_raid_disk = rdev->raid_disk * 2;
   4350				rdev->sectors = size;
   4351			}
   4352		conf->barrier = 1;
   4353	}
   4354
   4355	return conf;
   4356}
   4357
   4358static void *raid10_takeover(struct mddev *mddev)
   4359{
   4360	struct r0conf *raid0_conf;
   4361
   4362	/* raid10 can take over:
   4363	 *  raid0 - providing it has only two drives
   4364	 */
   4365	if (mddev->level == 0) {
   4366		/* for raid0 takeover only one zone is supported */
   4367		raid0_conf = mddev->private;
   4368		if (raid0_conf->nr_strip_zones > 1) {
   4369			pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
   4370				mdname(mddev));
   4371			return ERR_PTR(-EINVAL);
   4372		}
   4373		return raid10_takeover_raid0(mddev,
   4374			raid0_conf->strip_zone->zone_end,
   4375			raid0_conf->strip_zone->nb_dev);
   4376	}
   4377	return ERR_PTR(-EINVAL);
   4378}
   4379
   4380static int raid10_check_reshape(struct mddev *mddev)
   4381{
   4382	/* Called when there is a request to change
   4383	 * - layout (to ->new_layout)
   4384	 * - chunk size (to ->new_chunk_sectors)
   4385	 * - raid_disks (by delta_disks)
   4386	 * or when trying to restart a reshape that was ongoing.
   4387	 *
   4388	 * We need to validate the request and possibly allocate
   4389	 * space if that might be an issue later.
   4390	 *
   4391	 * Currently we reject any reshape of a 'far' mode array,
   4392	 * allow chunk size to change if new is generally acceptable,
   4393	 * allow raid_disks to increase, and allow
   4394	 * a switch between 'near' mode and 'offset' mode.
   4395	 */
   4396	struct r10conf *conf = mddev->private;
   4397	struct geom geo;
   4398
   4399	if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
   4400		return -EINVAL;
   4401
   4402	if (setup_geo(&geo, mddev, geo_start) != conf->copies)
   4403		/* mustn't change number of copies */
   4404		return -EINVAL;
   4405	if (geo.far_copies > 1 && !geo.far_offset)
   4406		/* Cannot switch to 'far' mode */
   4407		return -EINVAL;
   4408
   4409	if (mddev->array_sectors & geo.chunk_mask)
   4410			/* not factor of array size */
   4411			return -EINVAL;
   4412
   4413	if (!enough(conf, -1))
   4414		return -EINVAL;
   4415
   4416	kfree(conf->mirrors_new);
   4417	conf->mirrors_new = NULL;
   4418	if (mddev->delta_disks > 0) {
   4419		/* allocate new 'mirrors' list */
   4420		conf->mirrors_new =
   4421			kcalloc(mddev->raid_disks + mddev->delta_disks,
   4422				sizeof(struct raid10_info),
   4423				GFP_KERNEL);
   4424		if (!conf->mirrors_new)
   4425			return -ENOMEM;
   4426	}
   4427	return 0;
   4428}
   4429
   4430/*
   4431 * Need to check if array has failed when deciding whether to:
   4432 *  - start an array
   4433 *  - remove non-faulty devices
   4434 *  - add a spare
   4435 *  - allow a reshape
   4436 * This determination is simple when no reshape is happening.
   4437 * However if there is a reshape, we need to carefully check
   4438 * both the before and after sections.
   4439 * This is because some failed devices may only affect one
   4440 * of the two sections, and some non-in_sync devices may
   4441 * be insync in the section most affected by failed devices.
   4442 */
   4443static int calc_degraded(struct r10conf *conf)
   4444{
   4445	int degraded, degraded2;
   4446	int i;
   4447
   4448	rcu_read_lock();
   4449	degraded = 0;
   4450	/* 'prev' section first */
   4451	for (i = 0; i < conf->prev.raid_disks; i++) {
   4452		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
   4453		if (!rdev || test_bit(Faulty, &rdev->flags))
   4454			degraded++;
   4455		else if (!test_bit(In_sync, &rdev->flags))
   4456			/* When we can reduce the number of devices in
   4457			 * an array, this might not contribute to
   4458			 * 'degraded'.  It does now.
   4459			 */
   4460			degraded++;
   4461	}
   4462	rcu_read_unlock();
   4463	if (conf->geo.raid_disks == conf->prev.raid_disks)
   4464		return degraded;
   4465	rcu_read_lock();
   4466	degraded2 = 0;
   4467	for (i = 0; i < conf->geo.raid_disks; i++) {
   4468		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
   4469		if (!rdev || test_bit(Faulty, &rdev->flags))
   4470			degraded2++;
   4471		else if (!test_bit(In_sync, &rdev->flags)) {
   4472			/* If reshape is increasing the number of devices,
   4473			 * this section has already been recovered, so
   4474			 * it doesn't contribute to degraded.
   4475			 * else it does.
   4476			 */
   4477			if (conf->geo.raid_disks <= conf->prev.raid_disks)
   4478				degraded2++;
   4479		}
   4480	}
   4481	rcu_read_unlock();
   4482	if (degraded2 > degraded)
   4483		return degraded2;
   4484	return degraded;
   4485}
   4486
   4487static int raid10_start_reshape(struct mddev *mddev)
   4488{
   4489	/* A 'reshape' has been requested. This commits
   4490	 * the various 'new' fields and sets MD_RECOVER_RESHAPE
   4491	 * This also checks if there are enough spares and adds them
   4492	 * to the array.
   4493	 * We currently require enough spares to make the final
   4494	 * array non-degraded.  We also require that the difference
   4495	 * between old and new data_offset - on each device - is
   4496	 * enough that we never risk over-writing.
   4497	 */
   4498
   4499	unsigned long before_length, after_length;
   4500	sector_t min_offset_diff = 0;
   4501	int first = 1;
   4502	struct geom new;
   4503	struct r10conf *conf = mddev->private;
   4504	struct md_rdev *rdev;
   4505	int spares = 0;
   4506	int ret;
   4507
   4508	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
   4509		return -EBUSY;
   4510
   4511	if (setup_geo(&new, mddev, geo_start) != conf->copies)
   4512		return -EINVAL;
   4513
   4514	before_length = ((1 << conf->prev.chunk_shift) *
   4515			 conf->prev.far_copies);
   4516	after_length = ((1 << conf->geo.chunk_shift) *
   4517			conf->geo.far_copies);
   4518
   4519	rdev_for_each(rdev, mddev) {
   4520		if (!test_bit(In_sync, &rdev->flags)
   4521		    && !test_bit(Faulty, &rdev->flags))
   4522			spares++;
   4523		if (rdev->raid_disk >= 0) {
   4524			long long diff = (rdev->new_data_offset
   4525					  - rdev->data_offset);
   4526			if (!mddev->reshape_backwards)
   4527				diff = -diff;
   4528			if (diff < 0)
   4529				diff = 0;
   4530			if (first || diff < min_offset_diff)
   4531				min_offset_diff = diff;
   4532			first = 0;
   4533		}
   4534	}
   4535
   4536	if (max(before_length, after_length) > min_offset_diff)
   4537		return -EINVAL;
   4538
   4539	if (spares < mddev->delta_disks)
   4540		return -EINVAL;
   4541
   4542	conf->offset_diff = min_offset_diff;
   4543	spin_lock_irq(&conf->device_lock);
   4544	if (conf->mirrors_new) {
   4545		memcpy(conf->mirrors_new, conf->mirrors,
   4546		       sizeof(struct raid10_info)*conf->prev.raid_disks);
   4547		smp_mb();
   4548		kfree(conf->mirrors_old);
   4549		conf->mirrors_old = conf->mirrors;
   4550		conf->mirrors = conf->mirrors_new;
   4551		conf->mirrors_new = NULL;
   4552	}
   4553	setup_geo(&conf->geo, mddev, geo_start);
   4554	smp_mb();
   4555	if (mddev->reshape_backwards) {
   4556		sector_t size = raid10_size(mddev, 0, 0);
   4557		if (size < mddev->array_sectors) {
   4558			spin_unlock_irq(&conf->device_lock);
   4559			pr_warn("md/raid10:%s: array size must be reduce before number of disks\n",
   4560				mdname(mddev));
   4561			return -EINVAL;
   4562		}
   4563		mddev->resync_max_sectors = size;
   4564		conf->reshape_progress = size;
   4565	} else
   4566		conf->reshape_progress = 0;
   4567	conf->reshape_safe = conf->reshape_progress;
   4568	spin_unlock_irq(&conf->device_lock);
   4569
   4570	if (mddev->delta_disks && mddev->bitmap) {
   4571		struct mdp_superblock_1 *sb = NULL;
   4572		sector_t oldsize, newsize;
   4573
   4574		oldsize = raid10_size(mddev, 0, 0);
   4575		newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
   4576
   4577		if (!mddev_is_clustered(mddev)) {
   4578			ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
   4579			if (ret)
   4580				goto abort;
   4581			else
   4582				goto out;
   4583		}
   4584
   4585		rdev_for_each(rdev, mddev) {
   4586			if (rdev->raid_disk > -1 &&
   4587			    !test_bit(Faulty, &rdev->flags))
   4588				sb = page_address(rdev->sb_page);
   4589		}
   4590
   4591		/*
   4592		 * some node is already performing reshape, and no need to
   4593		 * call md_bitmap_resize again since it should be called when
   4594		 * receiving BITMAP_RESIZE msg
   4595		 */
   4596		if ((sb && (le32_to_cpu(sb->feature_map) &
   4597			    MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
   4598			goto out;
   4599
   4600		ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
   4601		if (ret)
   4602			goto abort;
   4603
   4604		ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
   4605		if (ret) {
   4606			md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
   4607			goto abort;
   4608		}
   4609	}
   4610out:
   4611	if (mddev->delta_disks > 0) {
   4612		rdev_for_each(rdev, mddev)
   4613			if (rdev->raid_disk < 0 &&
   4614			    !test_bit(Faulty, &rdev->flags)) {
   4615				if (raid10_add_disk(mddev, rdev) == 0) {
   4616					if (rdev->raid_disk >=
   4617					    conf->prev.raid_disks)
   4618						set_bit(In_sync, &rdev->flags);
   4619					else
   4620						rdev->recovery_offset = 0;
   4621
   4622					/* Failure here is OK */
   4623					sysfs_link_rdev(mddev, rdev);
   4624				}
   4625			} else if (rdev->raid_disk >= conf->prev.raid_disks
   4626				   && !test_bit(Faulty, &rdev->flags)) {
   4627				/* This is a spare that was manually added */
   4628				set_bit(In_sync, &rdev->flags);
   4629			}
   4630	}
   4631	/* When a reshape changes the number of devices,
   4632	 * ->degraded is measured against the larger of the
   4633	 * pre and  post numbers.
   4634	 */
   4635	spin_lock_irq(&conf->device_lock);
   4636	mddev->degraded = calc_degraded(conf);
   4637	spin_unlock_irq(&conf->device_lock);
   4638	mddev->raid_disks = conf->geo.raid_disks;
   4639	mddev->reshape_position = conf->reshape_progress;
   4640	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
   4641
   4642	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
   4643	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
   4644	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
   4645	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
   4646	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
   4647
   4648	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
   4649						"reshape");
   4650	if (!mddev->sync_thread) {
   4651		ret = -EAGAIN;
   4652		goto abort;
   4653	}
   4654	conf->reshape_checkpoint = jiffies;
   4655	md_wakeup_thread(mddev->sync_thread);
   4656	md_new_event();
   4657	return 0;
   4658
   4659abort:
   4660	mddev->recovery = 0;
   4661	spin_lock_irq(&conf->device_lock);
   4662	conf->geo = conf->prev;
   4663	mddev->raid_disks = conf->geo.raid_disks;
   4664	rdev_for_each(rdev, mddev)
   4665		rdev->new_data_offset = rdev->data_offset;
   4666	smp_wmb();
   4667	conf->reshape_progress = MaxSector;
   4668	conf->reshape_safe = MaxSector;
   4669	mddev->reshape_position = MaxSector;
   4670	spin_unlock_irq(&conf->device_lock);
   4671	return ret;
   4672}
   4673
   4674/* Calculate the last device-address that could contain
   4675 * any block from the chunk that includes the array-address 's'
   4676 * and report the next address.
   4677 * i.e. the address returned will be chunk-aligned and after
   4678 * any data that is in the chunk containing 's'.
   4679 */
   4680static sector_t last_dev_address(sector_t s, struct geom *geo)
   4681{
   4682	s = (s | geo->chunk_mask) + 1;
   4683	s >>= geo->chunk_shift;
   4684	s *= geo->near_copies;
   4685	s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
   4686	s *= geo->far_copies;
   4687	s <<= geo->chunk_shift;
   4688	return s;
   4689}
   4690
   4691/* Calculate the first device-address that could contain
   4692 * any block from the chunk that includes the array-address 's'.
   4693 * This too will be the start of a chunk
   4694 */
   4695static sector_t first_dev_address(sector_t s, struct geom *geo)
   4696{
   4697	s >>= geo->chunk_shift;
   4698	s *= geo->near_copies;
   4699	sector_div(s, geo->raid_disks);
   4700	s *= geo->far_copies;
   4701	s <<= geo->chunk_shift;
   4702	return s;
   4703}
   4704
   4705static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
   4706				int *skipped)
   4707{
   4708	/* We simply copy at most one chunk (smallest of old and new)
   4709	 * at a time, possibly less if that exceeds RESYNC_PAGES,
   4710	 * or we hit a bad block or something.
   4711	 * This might mean we pause for normal IO in the middle of
   4712	 * a chunk, but that is not a problem as mddev->reshape_position
   4713	 * can record any location.
   4714	 *
   4715	 * If we will want to write to a location that isn't
   4716	 * yet recorded as 'safe' (i.e. in metadata on disk) then
   4717	 * we need to flush all reshape requests and update the metadata.
   4718	 *
   4719	 * When reshaping forwards (e.g. to more devices), we interpret
   4720	 * 'safe' as the earliest block which might not have been copied
   4721	 * down yet.  We divide this by previous stripe size and multiply
   4722	 * by previous stripe length to get lowest device offset that we
   4723	 * cannot write to yet.
   4724	 * We interpret 'sector_nr' as an address that we want to write to.
   4725	 * From this we use last_device_address() to find where we might
   4726	 * write to, and first_device_address on the  'safe' position.
   4727	 * If this 'next' write position is after the 'safe' position,
   4728	 * we must update the metadata to increase the 'safe' position.
   4729	 *
   4730	 * When reshaping backwards, we round in the opposite direction
   4731	 * and perform the reverse test:  next write position must not be
   4732	 * less than current safe position.
   4733	 *
   4734	 * In all this the minimum difference in data offsets
   4735	 * (conf->offset_diff - always positive) allows a bit of slack,
   4736	 * so next can be after 'safe', but not by more than offset_diff
   4737	 *
   4738	 * We need to prepare all the bios here before we start any IO
   4739	 * to ensure the size we choose is acceptable to all devices.
   4740	 * The means one for each copy for write-out and an extra one for
   4741	 * read-in.
   4742	 * We store the read-in bio in ->master_bio and the others in
   4743	 * ->devs[x].bio and ->devs[x].repl_bio.
   4744	 */
   4745	struct r10conf *conf = mddev->private;
   4746	struct r10bio *r10_bio;
   4747	sector_t next, safe, last;
   4748	int max_sectors;
   4749	int nr_sectors;
   4750	int s;
   4751	struct md_rdev *rdev;
   4752	int need_flush = 0;
   4753	struct bio *blist;
   4754	struct bio *bio, *read_bio;
   4755	int sectors_done = 0;
   4756	struct page **pages;
   4757
   4758	if (sector_nr == 0) {
   4759		/* If restarting in the middle, skip the initial sectors */
   4760		if (mddev->reshape_backwards &&
   4761		    conf->reshape_progress < raid10_size(mddev, 0, 0)) {
   4762			sector_nr = (raid10_size(mddev, 0, 0)
   4763				     - conf->reshape_progress);
   4764		} else if (!mddev->reshape_backwards &&
   4765			   conf->reshape_progress > 0)
   4766			sector_nr = conf->reshape_progress;
   4767		if (sector_nr) {
   4768			mddev->curr_resync_completed = sector_nr;
   4769			sysfs_notify_dirent_safe(mddev->sysfs_completed);
   4770			*skipped = 1;
   4771			return sector_nr;
   4772		}
   4773	}
   4774
   4775	/* We don't use sector_nr to track where we are up to
   4776	 * as that doesn't work well for ->reshape_backwards.
   4777	 * So just use ->reshape_progress.
   4778	 */
   4779	if (mddev->reshape_backwards) {
   4780		/* 'next' is the earliest device address that we might
   4781		 * write to for this chunk in the new layout
   4782		 */
   4783		next = first_dev_address(conf->reshape_progress - 1,
   4784					 &conf->geo);
   4785
   4786		/* 'safe' is the last device address that we might read from
   4787		 * in the old layout after a restart
   4788		 */
   4789		safe = last_dev_address(conf->reshape_safe - 1,
   4790					&conf->prev);
   4791
   4792		if (next + conf->offset_diff < safe)
   4793			need_flush = 1;
   4794
   4795		last = conf->reshape_progress - 1;
   4796		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
   4797					       & conf->prev.chunk_mask);
   4798		if (sector_nr + RESYNC_SECTORS < last)
   4799			sector_nr = last + 1 - RESYNC_SECTORS;
   4800	} else {
   4801		/* 'next' is after the last device address that we
   4802		 * might write to for this chunk in the new layout
   4803		 */
   4804		next = last_dev_address(conf->reshape_progress, &conf->geo);
   4805
   4806		/* 'safe' is the earliest device address that we might
   4807		 * read from in the old layout after a restart
   4808		 */
   4809		safe = first_dev_address(conf->reshape_safe, &conf->prev);
   4810
   4811		/* Need to update metadata if 'next' might be beyond 'safe'
   4812		 * as that would possibly corrupt data
   4813		 */
   4814		if (next > safe + conf->offset_diff)
   4815			need_flush = 1;
   4816
   4817		sector_nr = conf->reshape_progress;
   4818		last  = sector_nr | (conf->geo.chunk_mask
   4819				     & conf->prev.chunk_mask);
   4820
   4821		if (sector_nr + RESYNC_SECTORS <= last)
   4822			last = sector_nr + RESYNC_SECTORS - 1;
   4823	}
   4824
   4825	if (need_flush ||
   4826	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
   4827		/* Need to update reshape_position in metadata */
   4828		wait_barrier(conf, false);
   4829		mddev->reshape_position = conf->reshape_progress;
   4830		if (mddev->reshape_backwards)
   4831			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
   4832				- conf->reshape_progress;
   4833		else
   4834			mddev->curr_resync_completed = conf->reshape_progress;
   4835		conf->reshape_checkpoint = jiffies;
   4836		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
   4837		md_wakeup_thread(mddev->thread);
   4838		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
   4839			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
   4840		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
   4841			allow_barrier(conf);
   4842			return sectors_done;
   4843		}
   4844		conf->reshape_safe = mddev->reshape_position;
   4845		allow_barrier(conf);
   4846	}
   4847
   4848	raise_barrier(conf, 0);
   4849read_more:
   4850	/* Now schedule reads for blocks from sector_nr to last */
   4851	r10_bio = raid10_alloc_init_r10buf(conf);
   4852	r10_bio->state = 0;
   4853	raise_barrier(conf, 1);
   4854	atomic_set(&r10_bio->remaining, 0);
   4855	r10_bio->mddev = mddev;
   4856	r10_bio->sector = sector_nr;
   4857	set_bit(R10BIO_IsReshape, &r10_bio->state);
   4858	r10_bio->sectors = last - sector_nr + 1;
   4859	rdev = read_balance(conf, r10_bio, &max_sectors);
   4860	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
   4861
   4862	if (!rdev) {
   4863		/* Cannot read from here, so need to record bad blocks
   4864		 * on all the target devices.
   4865		 */
   4866		// FIXME
   4867		mempool_free(r10_bio, &conf->r10buf_pool);
   4868		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
   4869		return sectors_done;
   4870	}
   4871
   4872	read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ,
   4873				    GFP_KERNEL, &mddev->bio_set);
   4874	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
   4875			       + rdev->data_offset);
   4876	read_bio->bi_private = r10_bio;
   4877	read_bio->bi_end_io = end_reshape_read;
   4878	r10_bio->master_bio = read_bio;
   4879	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
   4880
   4881	/*
   4882	 * Broadcast RESYNC message to other nodes, so all nodes would not
   4883	 * write to the region to avoid conflict.
   4884	*/
   4885	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
   4886		struct mdp_superblock_1 *sb = NULL;
   4887		int sb_reshape_pos = 0;
   4888
   4889		conf->cluster_sync_low = sector_nr;
   4890		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
   4891		sb = page_address(rdev->sb_page);
   4892		if (sb) {
   4893			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
   4894			/*
   4895			 * Set cluster_sync_low again if next address for array
   4896			 * reshape is less than cluster_sync_low. Since we can't
   4897			 * update cluster_sync_low until it has finished reshape.
   4898			 */
   4899			if (sb_reshape_pos < conf->cluster_sync_low)
   4900				conf->cluster_sync_low = sb_reshape_pos;
   4901		}
   4902
   4903		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
   4904							  conf->cluster_sync_high);
   4905	}
   4906
   4907	/* Now find the locations in the new layout */
   4908	__raid10_find_phys(&conf->geo, r10_bio);
   4909
   4910	blist = read_bio;
   4911	read_bio->bi_next = NULL;
   4912
   4913	rcu_read_lock();
   4914	for (s = 0; s < conf->copies*2; s++) {
   4915		struct bio *b;
   4916		int d = r10_bio->devs[s/2].devnum;
   4917		struct md_rdev *rdev2;
   4918		if (s&1) {
   4919			rdev2 = rcu_dereference(conf->mirrors[d].replacement);
   4920			b = r10_bio->devs[s/2].repl_bio;
   4921		} else {
   4922			rdev2 = rcu_dereference(conf->mirrors[d].rdev);
   4923			b = r10_bio->devs[s/2].bio;
   4924		}
   4925		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
   4926			continue;
   4927
   4928		bio_set_dev(b, rdev2->bdev);
   4929		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
   4930			rdev2->new_data_offset;
   4931		b->bi_end_io = end_reshape_write;
   4932		bio_set_op_attrs(b, REQ_OP_WRITE, 0);
   4933		b->bi_next = blist;
   4934		blist = b;
   4935	}
   4936
   4937	/* Now add as many pages as possible to all of these bios. */
   4938
   4939	nr_sectors = 0;
   4940	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
   4941	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
   4942		struct page *page = pages[s / (PAGE_SIZE >> 9)];
   4943		int len = (max_sectors - s) << 9;
   4944		if (len > PAGE_SIZE)
   4945			len = PAGE_SIZE;
   4946		for (bio = blist; bio ; bio = bio->bi_next) {
   4947			/*
   4948			 * won't fail because the vec table is big enough
   4949			 * to hold all these pages
   4950			 */
   4951			bio_add_page(bio, page, len, 0);
   4952		}
   4953		sector_nr += len >> 9;
   4954		nr_sectors += len >> 9;
   4955	}
   4956	rcu_read_unlock();
   4957	r10_bio->sectors = nr_sectors;
   4958
   4959	/* Now submit the read */
   4960	md_sync_acct_bio(read_bio, r10_bio->sectors);
   4961	atomic_inc(&r10_bio->remaining);
   4962	read_bio->bi_next = NULL;
   4963	submit_bio_noacct(read_bio);
   4964	sectors_done += nr_sectors;
   4965	if (sector_nr <= last)
   4966		goto read_more;
   4967
   4968	lower_barrier(conf);
   4969
   4970	/* Now that we have done the whole section we can
   4971	 * update reshape_progress
   4972	 */
   4973	if (mddev->reshape_backwards)
   4974		conf->reshape_progress -= sectors_done;
   4975	else
   4976		conf->reshape_progress += sectors_done;
   4977
   4978	return sectors_done;
   4979}
   4980
   4981static void end_reshape_request(struct r10bio *r10_bio);
   4982static int handle_reshape_read_error(struct mddev *mddev,
   4983				     struct r10bio *r10_bio);
   4984static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
   4985{
   4986	/* Reshape read completed.  Hopefully we have a block
   4987	 * to write out.
   4988	 * If we got a read error then we do sync 1-page reads from
   4989	 * elsewhere until we find the data - or give up.
   4990	 */
   4991	struct r10conf *conf = mddev->private;
   4992	int s;
   4993
   4994	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
   4995		if (handle_reshape_read_error(mddev, r10_bio) < 0) {
   4996			/* Reshape has been aborted */
   4997			md_done_sync(mddev, r10_bio->sectors, 0);
   4998			return;
   4999		}
   5000
   5001	/* We definitely have the data in the pages, schedule the
   5002	 * writes.
   5003	 */
   5004	atomic_set(&r10_bio->remaining, 1);
   5005	for (s = 0; s < conf->copies*2; s++) {
   5006		struct bio *b;
   5007		int d = r10_bio->devs[s/2].devnum;
   5008		struct md_rdev *rdev;
   5009		rcu_read_lock();
   5010		if (s&1) {
   5011			rdev = rcu_dereference(conf->mirrors[d].replacement);
   5012			b = r10_bio->devs[s/2].repl_bio;
   5013		} else {
   5014			rdev = rcu_dereference(conf->mirrors[d].rdev);
   5015			b = r10_bio->devs[s/2].bio;
   5016		}
   5017		if (!rdev || test_bit(Faulty, &rdev->flags)) {
   5018			rcu_read_unlock();
   5019			continue;
   5020		}
   5021		atomic_inc(&rdev->nr_pending);
   5022		rcu_read_unlock();
   5023		md_sync_acct_bio(b, r10_bio->sectors);
   5024		atomic_inc(&r10_bio->remaining);
   5025		b->bi_next = NULL;
   5026		submit_bio_noacct(b);
   5027	}
   5028	end_reshape_request(r10_bio);
   5029}
   5030
   5031static void end_reshape(struct r10conf *conf)
   5032{
   5033	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
   5034		return;
   5035
   5036	spin_lock_irq(&conf->device_lock);
   5037	conf->prev = conf->geo;
   5038	md_finish_reshape(conf->mddev);
   5039	smp_wmb();
   5040	conf->reshape_progress = MaxSector;
   5041	conf->reshape_safe = MaxSector;
   5042	spin_unlock_irq(&conf->device_lock);
   5043
   5044	if (conf->mddev->queue)
   5045		raid10_set_io_opt(conf);
   5046	conf->fullsync = 0;
   5047}
   5048
   5049static void raid10_update_reshape_pos(struct mddev *mddev)
   5050{
   5051	struct r10conf *conf = mddev->private;
   5052	sector_t lo, hi;
   5053
   5054	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
   5055	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
   5056	    || mddev->reshape_position == MaxSector)
   5057		conf->reshape_progress = mddev->reshape_position;
   5058	else
   5059		WARN_ON_ONCE(1);
   5060}
   5061
   5062static int handle_reshape_read_error(struct mddev *mddev,
   5063				     struct r10bio *r10_bio)
   5064{
   5065	/* Use sync reads to get the blocks from somewhere else */
   5066	int sectors = r10_bio->sectors;
   5067	struct r10conf *conf = mddev->private;
   5068	struct r10bio *r10b;
   5069	int slot = 0;
   5070	int idx = 0;
   5071	struct page **pages;
   5072
   5073	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
   5074	if (!r10b) {
   5075		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
   5076		return -ENOMEM;
   5077	}
   5078
   5079	/* reshape IOs share pages from .devs[0].bio */
   5080	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
   5081
   5082	r10b->sector = r10_bio->sector;
   5083	__raid10_find_phys(&conf->prev, r10b);
   5084
   5085	while (sectors) {
   5086		int s = sectors;
   5087		int success = 0;
   5088		int first_slot = slot;
   5089
   5090		if (s > (PAGE_SIZE >> 9))
   5091			s = PAGE_SIZE >> 9;
   5092
   5093		rcu_read_lock();
   5094		while (!success) {
   5095			int d = r10b->devs[slot].devnum;
   5096			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
   5097			sector_t addr;
   5098			if (rdev == NULL ||
   5099			    test_bit(Faulty, &rdev->flags) ||
   5100			    !test_bit(In_sync, &rdev->flags))
   5101				goto failed;
   5102
   5103			addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
   5104			atomic_inc(&rdev->nr_pending);
   5105			rcu_read_unlock();
   5106			success = sync_page_io(rdev,
   5107					       addr,
   5108					       s << 9,
   5109					       pages[idx],
   5110					       REQ_OP_READ, 0, false);
   5111			rdev_dec_pending(rdev, mddev);
   5112			rcu_read_lock();
   5113			if (success)
   5114				break;
   5115		failed:
   5116			slot++;
   5117			if (slot >= conf->copies)
   5118				slot = 0;
   5119			if (slot == first_slot)
   5120				break;
   5121		}
   5122		rcu_read_unlock();
   5123		if (!success) {
   5124			/* couldn't read this block, must give up */
   5125			set_bit(MD_RECOVERY_INTR,
   5126				&mddev->recovery);
   5127			kfree(r10b);
   5128			return -EIO;
   5129		}
   5130		sectors -= s;
   5131		idx++;
   5132	}
   5133	kfree(r10b);
   5134	return 0;
   5135}
   5136
   5137static void end_reshape_write(struct bio *bio)
   5138{
   5139	struct r10bio *r10_bio = get_resync_r10bio(bio);
   5140	struct mddev *mddev = r10_bio->mddev;
   5141	struct r10conf *conf = mddev->private;
   5142	int d;
   5143	int slot;
   5144	int repl;
   5145	struct md_rdev *rdev = NULL;
   5146
   5147	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
   5148	if (repl)
   5149		rdev = conf->mirrors[d].replacement;
   5150	if (!rdev) {
   5151		smp_mb();
   5152		rdev = conf->mirrors[d].rdev;
   5153	}
   5154
   5155	if (bio->bi_status) {
   5156		/* FIXME should record badblock */
   5157		md_error(mddev, rdev);
   5158	}
   5159
   5160	rdev_dec_pending(rdev, mddev);
   5161	end_reshape_request(r10_bio);
   5162}
   5163
   5164static void end_reshape_request(struct r10bio *r10_bio)
   5165{
   5166	if (!atomic_dec_and_test(&r10_bio->remaining))
   5167		return;
   5168	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
   5169	bio_put(r10_bio->master_bio);
   5170	put_buf(r10_bio);
   5171}
   5172
   5173static void raid10_finish_reshape(struct mddev *mddev)
   5174{
   5175	struct r10conf *conf = mddev->private;
   5176
   5177	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
   5178		return;
   5179
   5180	if (mddev->delta_disks > 0) {
   5181		if (mddev->recovery_cp > mddev->resync_max_sectors) {
   5182			mddev->recovery_cp = mddev->resync_max_sectors;
   5183			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
   5184		}
   5185		mddev->resync_max_sectors = mddev->array_sectors;
   5186	} else {
   5187		int d;
   5188		rcu_read_lock();
   5189		for (d = conf->geo.raid_disks ;
   5190		     d < conf->geo.raid_disks - mddev->delta_disks;
   5191		     d++) {
   5192			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
   5193			if (rdev)
   5194				clear_bit(In_sync, &rdev->flags);
   5195			rdev = rcu_dereference(conf->mirrors[d].replacement);
   5196			if (rdev)
   5197				clear_bit(In_sync, &rdev->flags);
   5198		}
   5199		rcu_read_unlock();
   5200	}
   5201	mddev->layout = mddev->new_layout;
   5202	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
   5203	mddev->reshape_position = MaxSector;
   5204	mddev->delta_disks = 0;
   5205	mddev->reshape_backwards = 0;
   5206}
   5207
   5208static struct md_personality raid10_personality =
   5209{
   5210	.name		= "raid10",
   5211	.level		= 10,
   5212	.owner		= THIS_MODULE,
   5213	.make_request	= raid10_make_request,
   5214	.run		= raid10_run,
   5215	.free		= raid10_free,
   5216	.status		= raid10_status,
   5217	.error_handler	= raid10_error,
   5218	.hot_add_disk	= raid10_add_disk,
   5219	.hot_remove_disk= raid10_remove_disk,
   5220	.spare_active	= raid10_spare_active,
   5221	.sync_request	= raid10_sync_request,
   5222	.quiesce	= raid10_quiesce,
   5223	.size		= raid10_size,
   5224	.resize		= raid10_resize,
   5225	.takeover	= raid10_takeover,
   5226	.check_reshape	= raid10_check_reshape,
   5227	.start_reshape	= raid10_start_reshape,
   5228	.finish_reshape	= raid10_finish_reshape,
   5229	.update_reshape_pos = raid10_update_reshape_pos,
   5230};
   5231
   5232static int __init raid_init(void)
   5233{
   5234	return register_md_personality(&raid10_personality);
   5235}
   5236
/* Module exit point: undo the registration performed by raid_init(). */
static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}
   5241
/* Hook up the module entry/exit points and declare module metadata. */
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
/* Aliases under which this module can be requested. */
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");