cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

raid1.c (94731B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * raid1.c : Multiple Devices driver for Linux
      4 *
      5 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
      6 *
      7 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
      8 *
      9 * RAID-1 management functions.
     10 *
     11 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
     12 *
     13 * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
     14 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
     15 *
     16 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
     17 * bitmapped intelligence in resync:
     18 *
     19 *      - bitmap marked during normal i/o
     20 *      - bitmap used to skip nondirty blocks during sync
     21 *
     22 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
     23 * - persistent bitmap code
     24 */
     25
     26#include <linux/slab.h>
     27#include <linux/delay.h>
     28#include <linux/blkdev.h>
     29#include <linux/module.h>
     30#include <linux/seq_file.h>
     31#include <linux/ratelimit.h>
     32#include <linux/interval_tree_generic.h>
     33
     34#include <trace/events/block.h>
     35
     36#include "md.h"
     37#include "raid1.h"
     38#include "md-bitmap.h"
     39
     40#define UNSUPPORTED_MDDEV_FLAGS		\
     41	((1L << MD_HAS_JOURNAL) |	\
     42	 (1L << MD_JOURNAL_CLEAN) |	\
     43	 (1L << MD_HAS_PPL) |		\
     44	 (1L << MD_HAS_MULTIPLE_PPLS))
     45
     46static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
     47static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
     48
     49#define raid1_log(md, fmt, args...)				\
     50	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
     51
     52#include "raid1-10.c"
     53
     54#define START(node) ((node)->start)
     55#define LAST(node) ((node)->last)
     56INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
     57		     START, LAST, static inline, raid1_rb);
     58
     59static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
     60				struct serial_info *si, int idx)
     61{
     62	unsigned long flags;
     63	int ret = 0;
     64	sector_t lo = r1_bio->sector;
     65	sector_t hi = lo + r1_bio->sectors;
     66	struct serial_in_rdev *serial = &rdev->serial[idx];
     67
     68	spin_lock_irqsave(&serial->serial_lock, flags);
     69	/* collision happened */
     70	if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
     71		ret = -EBUSY;
     72	else {
     73		si->start = lo;
     74		si->last = hi;
     75		raid1_rb_insert(si, &serial->serial_rb);
     76	}
     77	spin_unlock_irqrestore(&serial->serial_lock, flags);
     78
     79	return ret;
     80}
     81
     82static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
     83{
     84	struct mddev *mddev = rdev->mddev;
     85	struct serial_info *si;
     86	int idx = sector_to_idx(r1_bio->sector);
     87	struct serial_in_rdev *serial = &rdev->serial[idx];
     88
     89	if (WARN_ON(!mddev->serial_info_pool))
     90		return;
     91	si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
     92	wait_event(serial->serial_io_wait,
     93		   check_and_add_serial(rdev, r1_bio, si, idx) == 0);
     94}
     95
     96static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
     97{
     98	struct serial_info *si;
     99	unsigned long flags;
    100	int found = 0;
    101	struct mddev *mddev = rdev->mddev;
    102	int idx = sector_to_idx(lo);
    103	struct serial_in_rdev *serial = &rdev->serial[idx];
    104
    105	spin_lock_irqsave(&serial->serial_lock, flags);
    106	for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
    107	     si; si = raid1_rb_iter_next(si, lo, hi)) {
    108		if (si->start == lo && si->last == hi) {
    109			raid1_rb_remove(si, &serial->serial_rb);
    110			mempool_free(si, mddev->serial_info_pool);
    111			found = 1;
    112			break;
    113		}
    114	}
    115	if (!found)
    116		WARN(1, "The write IO is not recorded for serialization\n");
    117	spin_unlock_irqrestore(&serial->serial_lock, flags);
    118	wake_up(&serial->serial_io_wait);
    119}
    120
    121/*
    122 * for resync bio, r1bio pointer can be retrieved from the per-bio
    123 * 'struct resync_pages'.
    124 */
    125static inline struct r1bio *get_resync_r1bio(struct bio *bio)
    126{
    127	return get_resync_pages(bio)->raid_bio;
    128}
    129
    130static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
    131{
    132	struct pool_info *pi = data;
    133	int size = offsetof(struct r1bio, bios[pi->raid_disks]);
    134
    135	/* allocate a r1bio with room for raid_disks entries in the bios array */
    136	return kzalloc(size, gfp_flags);
    137}
    138
    139#define RESYNC_DEPTH 32
    140#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
    141#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
    142#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
    143#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
    144#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
    145
    146static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
    147{
    148	struct pool_info *pi = data;
    149	struct r1bio *r1_bio;
    150	struct bio *bio;
    151	int need_pages;
    152	int j;
    153	struct resync_pages *rps;
    154
    155	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
    156	if (!r1_bio)
    157		return NULL;
    158
    159	rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages),
    160			    gfp_flags);
    161	if (!rps)
    162		goto out_free_r1bio;
    163
    164	/*
    165	 * Allocate bios : 1 for reading, n-1 for writing
    166	 */
    167	for (j = pi->raid_disks ; j-- ; ) {
    168		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
    169		if (!bio)
    170			goto out_free_bio;
    171		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
    172		r1_bio->bios[j] = bio;
    173	}
    174	/*
    175	 * Allocate RESYNC_PAGES data pages and attach them to
    176	 * the first bio.
    177	 * If this is a user-requested check/repair, allocate
    178	 * RESYNC_PAGES for each bio.
    179	 */
    180	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
    181		need_pages = pi->raid_disks;
    182	else
    183		need_pages = 1;
    184	for (j = 0; j < pi->raid_disks; j++) {
    185		struct resync_pages *rp = &rps[j];
    186
    187		bio = r1_bio->bios[j];
    188
    189		if (j < need_pages) {
    190			if (resync_alloc_pages(rp, gfp_flags))
    191				goto out_free_pages;
    192		} else {
    193			memcpy(rp, &rps[0], sizeof(*rp));
    194			resync_get_all_pages(rp);
    195		}
    196
    197		rp->raid_bio = r1_bio;
    198		bio->bi_private = rp;
    199	}
    200
    201	r1_bio->master_bio = NULL;
    202
    203	return r1_bio;
    204
    205out_free_pages:
    206	while (--j >= 0)
    207		resync_free_pages(&rps[j]);
    208
    209out_free_bio:
    210	while (++j < pi->raid_disks) {
    211		bio_uninit(r1_bio->bios[j]);
    212		kfree(r1_bio->bios[j]);
    213	}
    214	kfree(rps);
    215
    216out_free_r1bio:
    217	rbio_pool_free(r1_bio, data);
    218	return NULL;
    219}
    220
    221static void r1buf_pool_free(void *__r1_bio, void *data)
    222{
    223	struct pool_info *pi = data;
    224	int i;
    225	struct r1bio *r1bio = __r1_bio;
    226	struct resync_pages *rp = NULL;
    227
    228	for (i = pi->raid_disks; i--; ) {
    229		rp = get_resync_pages(r1bio->bios[i]);
    230		resync_free_pages(rp);
    231		bio_uninit(r1bio->bios[i]);
    232		kfree(r1bio->bios[i]);
    233	}
    234
    235	/* resync pages array stored in the 1st bio's .bi_private */
    236	kfree(rp);
    237
    238	rbio_pool_free(r1bio, data);
    239}
    240
    241static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
    242{
    243	int i;
    244
    245	for (i = 0; i < conf->raid_disks * 2; i++) {
    246		struct bio **bio = r1_bio->bios + i;
    247		if (!BIO_SPECIAL(*bio))
    248			bio_put(*bio);
    249		*bio = NULL;
    250	}
    251}
    252
    253static void free_r1bio(struct r1bio *r1_bio)
    254{
    255	struct r1conf *conf = r1_bio->mddev->private;
    256
    257	put_all_bios(conf, r1_bio);
    258	mempool_free(r1_bio, &conf->r1bio_pool);
    259}
    260
    261static void put_buf(struct r1bio *r1_bio)
    262{
    263	struct r1conf *conf = r1_bio->mddev->private;
    264	sector_t sect = r1_bio->sector;
    265	int i;
    266
    267	for (i = 0; i < conf->raid_disks * 2; i++) {
    268		struct bio *bio = r1_bio->bios[i];
    269		if (bio->bi_end_io)
    270			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
    271	}
    272
    273	mempool_free(r1_bio, &conf->r1buf_pool);
    274
    275	lower_barrier(conf, sect);
    276}
    277
    278static void reschedule_retry(struct r1bio *r1_bio)
    279{
    280	unsigned long flags;
    281	struct mddev *mddev = r1_bio->mddev;
    282	struct r1conf *conf = mddev->private;
    283	int idx;
    284
    285	idx = sector_to_idx(r1_bio->sector);
    286	spin_lock_irqsave(&conf->device_lock, flags);
    287	list_add(&r1_bio->retry_list, &conf->retry_list);
    288	atomic_inc(&conf->nr_queued[idx]);
    289	spin_unlock_irqrestore(&conf->device_lock, flags);
    290
    291	wake_up(&conf->wait_barrier);
    292	md_wakeup_thread(mddev->thread);
    293}
    294
    295/*
    296 * raid_end_bio_io() is called when we have finished servicing a mirrored
    297 * operation and are ready to return a success/failure code to the buffer
    298 * cache layer.
    299 */
    300static void call_bio_endio(struct r1bio *r1_bio)
    301{
    302	struct bio *bio = r1_bio->master_bio;
    303
    304	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
    305		bio->bi_status = BLK_STS_IOERR;
    306
    307	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
    308		bio_end_io_acct(bio, r1_bio->start_time);
    309	bio_endio(bio);
    310}
    311
    312static void raid_end_bio_io(struct r1bio *r1_bio)
    313{
    314	struct bio *bio = r1_bio->master_bio;
    315	struct r1conf *conf = r1_bio->mddev->private;
    316
    317	/* if nobody has done the final endio yet, do it now */
    318	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
    319		pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
    320			 (bio_data_dir(bio) == WRITE) ? "write" : "read",
    321			 (unsigned long long) bio->bi_iter.bi_sector,
    322			 (unsigned long long) bio_end_sector(bio) - 1);
    323
    324		call_bio_endio(r1_bio);
    325	}
    326	/*
    327	 * Wake up any possible resync thread that waits for the device
    328	 * to go idle.  All I/Os, even write-behind writes, are done.
    329	 */
    330	allow_barrier(conf, r1_bio->sector);
    331
    332	free_r1bio(r1_bio);
    333}
    334
    335/*
    336 * Update disk head position estimator based on IRQ completion info.
    337 */
    338static inline void update_head_pos(int disk, struct r1bio *r1_bio)
    339{
    340	struct r1conf *conf = r1_bio->mddev->private;
    341
    342	conf->mirrors[disk].head_position =
    343		r1_bio->sector + (r1_bio->sectors);
    344}
    345
    346/*
    347 * Find the disk number which triggered given bio
    348 */
    349static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
    350{
    351	int mirror;
    352	struct r1conf *conf = r1_bio->mddev->private;
    353	int raid_disks = conf->raid_disks;
    354
    355	for (mirror = 0; mirror < raid_disks * 2; mirror++)
    356		if (r1_bio->bios[mirror] == bio)
    357			break;
    358
    359	BUG_ON(mirror == raid_disks * 2);
    360	update_head_pos(mirror, r1_bio);
    361
    362	return mirror;
    363}
    364
    365static void raid1_end_read_request(struct bio *bio)
    366{
    367	int uptodate = !bio->bi_status;
    368	struct r1bio *r1_bio = bio->bi_private;
    369	struct r1conf *conf = r1_bio->mddev->private;
    370	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
    371
    372	/*
    373	 * this branch is our 'one mirror IO has finished' event handler:
    374	 */
    375	update_head_pos(r1_bio->read_disk, r1_bio);
    376
    377	if (uptodate)
    378		set_bit(R1BIO_Uptodate, &r1_bio->state);
    379	else if (test_bit(FailFast, &rdev->flags) &&
    380		 test_bit(R1BIO_FailFast, &r1_bio->state))
    381		/* This was a fail-fast read so we definitely
    382		 * want to retry */
    383		;
    384	else {
    385		/* If all other devices have failed, we want to return
    386		 * the error upwards rather than fail the last device.
    387		 * Here we redefine "uptodate" to mean "Don't want to retry"
    388		 */
    389		unsigned long flags;
    390		spin_lock_irqsave(&conf->device_lock, flags);
    391		if (r1_bio->mddev->degraded == conf->raid_disks ||
    392		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
    393		     test_bit(In_sync, &rdev->flags)))
    394			uptodate = 1;
    395		spin_unlock_irqrestore(&conf->device_lock, flags);
    396	}
    397
    398	if (uptodate) {
    399		raid_end_bio_io(r1_bio);
    400		rdev_dec_pending(rdev, conf->mddev);
    401	} else {
    402		/*
    403		 * oops, read error:
    404		 */
    405		pr_err_ratelimited("md/raid1:%s: %pg: rescheduling sector %llu\n",
    406				   mdname(conf->mddev),
    407				   rdev->bdev,
    408				   (unsigned long long)r1_bio->sector);
    409		set_bit(R1BIO_ReadError, &r1_bio->state);
    410		reschedule_retry(r1_bio);
    411		/* don't drop the reference on read_disk yet */
    412	}
    413}
    414
    415static void close_write(struct r1bio *r1_bio)
    416{
    417	/* it really is the end of this request */
    418	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
    419		bio_free_pages(r1_bio->behind_master_bio);
    420		bio_put(r1_bio->behind_master_bio);
    421		r1_bio->behind_master_bio = NULL;
    422	}
    423	/* clear the bitmap if all writes complete successfully */
    424	md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
    425			   r1_bio->sectors,
    426			   !test_bit(R1BIO_Degraded, &r1_bio->state),
    427			   test_bit(R1BIO_BehindIO, &r1_bio->state));
    428	md_write_end(r1_bio->mddev);
    429}
    430
    431static void r1_bio_write_done(struct r1bio *r1_bio)
    432{
    433	if (!atomic_dec_and_test(&r1_bio->remaining))
    434		return;
    435
    436	if (test_bit(R1BIO_WriteError, &r1_bio->state))
    437		reschedule_retry(r1_bio);
    438	else {
    439		close_write(r1_bio);
    440		if (test_bit(R1BIO_MadeGood, &r1_bio->state))
    441			reschedule_retry(r1_bio);
    442		else
    443			raid_end_bio_io(r1_bio);
    444	}
    445}
    446
    447static void raid1_end_write_request(struct bio *bio)
    448{
    449	struct r1bio *r1_bio = bio->bi_private;
    450	int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
    451	struct r1conf *conf = r1_bio->mddev->private;
    452	struct bio *to_put = NULL;
    453	int mirror = find_bio_disk(r1_bio, bio);
    454	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
    455	bool discard_error;
    456	sector_t lo = r1_bio->sector;
    457	sector_t hi = r1_bio->sector + r1_bio->sectors;
    458
    459	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
    460
    461	/*
    462	 * 'one mirror IO has finished' event handler:
    463	 */
    464	if (bio->bi_status && !discard_error) {
    465		set_bit(WriteErrorSeen,	&rdev->flags);
    466		if (!test_and_set_bit(WantReplacement, &rdev->flags))
    467			set_bit(MD_RECOVERY_NEEDED, &
    468				conf->mddev->recovery);
    469
    470		if (test_bit(FailFast, &rdev->flags) &&
    471		    (bio->bi_opf & MD_FAILFAST) &&
    472		    /* We never try FailFast to WriteMostly devices */
    473		    !test_bit(WriteMostly, &rdev->flags)) {
    474			md_error(r1_bio->mddev, rdev);
    475		}
    476
    477		/*
    478		 * When the device is faulty, it is not necessary to
    479		 * handle write error.
    480		 */
    481		if (!test_bit(Faulty, &rdev->flags))
    482			set_bit(R1BIO_WriteError, &r1_bio->state);
    483		else {
    484			/* Fail the request */
    485			set_bit(R1BIO_Degraded, &r1_bio->state);
    486			/* Finished with this branch */
    487			r1_bio->bios[mirror] = NULL;
    488			to_put = bio;
    489		}
    490	} else {
    491		/*
    492		 * Set R1BIO_Uptodate in our master bio, so that we
    493		 * will return a good error code for to the higher
    494		 * levels even if IO on some other mirrored buffer
    495		 * fails.
    496		 *
    497		 * The 'master' represents the composite IO operation
    498		 * to user-side. So if something waits for IO, then it
    499		 * will wait for the 'master' bio.
    500		 */
    501		sector_t first_bad;
    502		int bad_sectors;
    503
    504		r1_bio->bios[mirror] = NULL;
    505		to_put = bio;
    506		/*
    507		 * Do not set R1BIO_Uptodate if the current device is
    508		 * rebuilding or Faulty. This is because we cannot use
    509		 * such device for properly reading the data back (we could
    510		 * potentially use it, if the current write would have felt
    511		 * before rdev->recovery_offset, but for simplicity we don't
    512		 * check this here.
    513		 */
    514		if (test_bit(In_sync, &rdev->flags) &&
    515		    !test_bit(Faulty, &rdev->flags))
    516			set_bit(R1BIO_Uptodate, &r1_bio->state);
    517
    518		/* Maybe we can clear some bad blocks. */
    519		if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
    520				&first_bad, &bad_sectors) && !discard_error) {
    521			r1_bio->bios[mirror] = IO_MADE_GOOD;
    522			set_bit(R1BIO_MadeGood, &r1_bio->state);
    523		}
    524	}
    525
    526	if (behind) {
    527		if (test_bit(CollisionCheck, &rdev->flags))
    528			remove_serial(rdev, lo, hi);
    529		if (test_bit(WriteMostly, &rdev->flags))
    530			atomic_dec(&r1_bio->behind_remaining);
    531
    532		/*
    533		 * In behind mode, we ACK the master bio once the I/O
    534		 * has safely reached all non-writemostly
    535		 * disks. Setting the Returned bit ensures that this
    536		 * gets done only once -- we don't ever want to return
    537		 * -EIO here, instead we'll wait
    538		 */
    539		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
    540		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
    541			/* Maybe we can return now */
    542			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
    543				struct bio *mbio = r1_bio->master_bio;
    544				pr_debug("raid1: behind end write sectors"
    545					 " %llu-%llu\n",
    546					 (unsigned long long) mbio->bi_iter.bi_sector,
    547					 (unsigned long long) bio_end_sector(mbio) - 1);
    548				call_bio_endio(r1_bio);
    549			}
    550		}
    551	} else if (rdev->mddev->serialize_policy)
    552		remove_serial(rdev, lo, hi);
    553	if (r1_bio->bios[mirror] == NULL)
    554		rdev_dec_pending(rdev, conf->mddev);
    555
    556	/*
    557	 * Let's see if all mirrored write operations have finished
    558	 * already.
    559	 */
    560	r1_bio_write_done(r1_bio);
    561
    562	if (to_put)
    563		bio_put(to_put);
    564}
    565
    566static sector_t align_to_barrier_unit_end(sector_t start_sector,
    567					  sector_t sectors)
    568{
    569	sector_t len;
    570
    571	WARN_ON(sectors == 0);
    572	/*
    573	 * len is the number of sectors from start_sector to end of the
    574	 * barrier unit which start_sector belongs to.
    575	 */
    576	len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
    577	      start_sector;
    578
    579	if (len > sectors)
    580		len = sectors;
    581
    582	return len;
    583}
    584
    585/*
    586 * This routine returns the disk from which the requested read should
    587 * be done. There is a per-array 'next expected sequential IO' sector
    588 * number - if this matches on the next IO then we use the last disk.
    589 * There is also a per-disk 'last know head position' sector that is
    590 * maintained from IRQ contexts, both the normal and the resync IO
    591 * completion handlers update this position correctly. If there is no
    592 * perfect sequential match then we pick the disk whose head is closest.
    593 *
    594 * If there are 2 mirrors in the same 2 devices, performance degrades
    595 * because position is mirror, not device based.
    596 *
    597 * The rdev for the device selected will have nr_pending incremented.
    598 */
    599static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
    600{
    601	const sector_t this_sector = r1_bio->sector;
    602	int sectors;
    603	int best_good_sectors;
    604	int best_disk, best_dist_disk, best_pending_disk;
    605	int has_nonrot_disk;
    606	int disk;
    607	sector_t best_dist;
    608	unsigned int min_pending;
    609	struct md_rdev *rdev;
    610	int choose_first;
    611	int choose_next_idle;
    612
    613	rcu_read_lock();
    614	/*
    615	 * Check if we can balance. We can balance on the whole
    616	 * device if no resync is going on, or below the resync window.
    617	 * We take the first readable disk when above the resync window.
    618	 */
    619 retry:
    620	sectors = r1_bio->sectors;
    621	best_disk = -1;
    622	best_dist_disk = -1;
    623	best_dist = MaxSector;
    624	best_pending_disk = -1;
    625	min_pending = UINT_MAX;
    626	best_good_sectors = 0;
    627	has_nonrot_disk = 0;
    628	choose_next_idle = 0;
    629	clear_bit(R1BIO_FailFast, &r1_bio->state);
    630
    631	if ((conf->mddev->recovery_cp < this_sector + sectors) ||
    632	    (mddev_is_clustered(conf->mddev) &&
    633	    md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
    634		    this_sector + sectors)))
    635		choose_first = 1;
    636	else
    637		choose_first = 0;
    638
    639	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
    640		sector_t dist;
    641		sector_t first_bad;
    642		int bad_sectors;
    643		unsigned int pending;
    644		bool nonrot;
    645
    646		rdev = rcu_dereference(conf->mirrors[disk].rdev);
    647		if (r1_bio->bios[disk] == IO_BLOCKED
    648		    || rdev == NULL
    649		    || test_bit(Faulty, &rdev->flags))
    650			continue;
    651		if (!test_bit(In_sync, &rdev->flags) &&
    652		    rdev->recovery_offset < this_sector + sectors)
    653			continue;
    654		if (test_bit(WriteMostly, &rdev->flags)) {
    655			/* Don't balance among write-mostly, just
    656			 * use the first as a last resort */
    657			if (best_dist_disk < 0) {
    658				if (is_badblock(rdev, this_sector, sectors,
    659						&first_bad, &bad_sectors)) {
    660					if (first_bad <= this_sector)
    661						/* Cannot use this */
    662						continue;
    663					best_good_sectors = first_bad - this_sector;
    664				} else
    665					best_good_sectors = sectors;
    666				best_dist_disk = disk;
    667				best_pending_disk = disk;
    668			}
    669			continue;
    670		}
    671		/* This is a reasonable device to use.  It might
    672		 * even be best.
    673		 */
    674		if (is_badblock(rdev, this_sector, sectors,
    675				&first_bad, &bad_sectors)) {
    676			if (best_dist < MaxSector)
    677				/* already have a better device */
    678				continue;
    679			if (first_bad <= this_sector) {
    680				/* cannot read here. If this is the 'primary'
    681				 * device, then we must not read beyond
    682				 * bad_sectors from another device..
    683				 */
    684				bad_sectors -= (this_sector - first_bad);
    685				if (choose_first && sectors > bad_sectors)
    686					sectors = bad_sectors;
    687				if (best_good_sectors > sectors)
    688					best_good_sectors = sectors;
    689
    690			} else {
    691				sector_t good_sectors = first_bad - this_sector;
    692				if (good_sectors > best_good_sectors) {
    693					best_good_sectors = good_sectors;
    694					best_disk = disk;
    695				}
    696				if (choose_first)
    697					break;
    698			}
    699			continue;
    700		} else {
    701			if ((sectors > best_good_sectors) && (best_disk >= 0))
    702				best_disk = -1;
    703			best_good_sectors = sectors;
    704		}
    705
    706		if (best_disk >= 0)
    707			/* At least two disks to choose from so failfast is OK */
    708			set_bit(R1BIO_FailFast, &r1_bio->state);
    709
    710		nonrot = bdev_nonrot(rdev->bdev);
    711		has_nonrot_disk |= nonrot;
    712		pending = atomic_read(&rdev->nr_pending);
    713		dist = abs(this_sector - conf->mirrors[disk].head_position);
    714		if (choose_first) {
    715			best_disk = disk;
    716			break;
    717		}
    718		/* Don't change to another disk for sequential reads */
    719		if (conf->mirrors[disk].next_seq_sect == this_sector
    720		    || dist == 0) {
    721			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
    722			struct raid1_info *mirror = &conf->mirrors[disk];
    723
    724			best_disk = disk;
    725			/*
    726			 * If buffered sequential IO size exceeds optimal
    727			 * iosize, check if there is idle disk. If yes, choose
    728			 * the idle disk. read_balance could already choose an
    729			 * idle disk before noticing it's a sequential IO in
    730			 * this disk. This doesn't matter because this disk
    731			 * will idle, next time it will be utilized after the
    732			 * first disk has IO size exceeds optimal iosize. In
    733			 * this way, iosize of the first disk will be optimal
    734			 * iosize at least. iosize of the second disk might be
    735			 * small, but not a big deal since when the second disk
    736			 * starts IO, the first disk is likely still busy.
    737			 */
    738			if (nonrot && opt_iosize > 0 &&
    739			    mirror->seq_start != MaxSector &&
    740			    mirror->next_seq_sect > opt_iosize &&
    741			    mirror->next_seq_sect - opt_iosize >=
    742			    mirror->seq_start) {
    743				choose_next_idle = 1;
    744				continue;
    745			}
    746			break;
    747		}
    748
    749		if (choose_next_idle)
    750			continue;
    751
    752		if (min_pending > pending) {
    753			min_pending = pending;
    754			best_pending_disk = disk;
    755		}
    756
    757		if (dist < best_dist) {
    758			best_dist = dist;
    759			best_dist_disk = disk;
    760		}
    761	}
    762
    763	/*
    764	 * If all disks are rotational, choose the closest disk. If any disk is
    765	 * non-rotational, choose the disk with less pending request even the
    766	 * disk is rotational, which might/might not be optimal for raids with
    767	 * mixed ratation/non-rotational disks depending on workload.
    768	 */
    769	if (best_disk == -1) {
    770		if (has_nonrot_disk || min_pending == 0)
    771			best_disk = best_pending_disk;
    772		else
    773			best_disk = best_dist_disk;
    774	}
    775
    776	if (best_disk >= 0) {
    777		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
    778		if (!rdev)
    779			goto retry;
    780		atomic_inc(&rdev->nr_pending);
    781		sectors = best_good_sectors;
    782
    783		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
    784			conf->mirrors[best_disk].seq_start = this_sector;
    785
    786		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
    787	}
    788	rcu_read_unlock();
    789	*max_sectors = sectors;
    790
    791	return best_disk;
    792}
    793
    794static void flush_bio_list(struct r1conf *conf, struct bio *bio)
    795{
    796	/* flush any pending bitmap writes to disk before proceeding w/ I/O */
    797	md_bitmap_unplug(conf->mddev->bitmap);
    798	wake_up(&conf->wait_barrier);
    799
    800	while (bio) { /* submit pending writes */
    801		struct bio *next = bio->bi_next;
    802		struct md_rdev *rdev = (void *)bio->bi_bdev;
    803		bio->bi_next = NULL;
    804		bio_set_dev(bio, rdev->bdev);
    805		if (test_bit(Faulty, &rdev->flags)) {
    806			bio_io_error(bio);
    807		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
    808				    !bdev_max_discard_sectors(bio->bi_bdev)))
    809			/* Just ignore it */
    810			bio_endio(bio);
    811		else
    812			submit_bio_noacct(bio);
    813		bio = next;
    814		cond_resched();
    815	}
    816}
    817
    818static void flush_pending_writes(struct r1conf *conf)
    819{
    820	/* Any writes that have been queued but are awaiting
    821	 * bitmap updates get flushed here.
    822	 */
    823	spin_lock_irq(&conf->device_lock);
    824
    825	if (conf->pending_bio_list.head) {
    826		struct blk_plug plug;
    827		struct bio *bio;
    828
    829		bio = bio_list_get(&conf->pending_bio_list);
    830		spin_unlock_irq(&conf->device_lock);
    831
    832		/*
    833		 * As this is called in a wait_event() loop (see freeze_array),
    834		 * current->state might be TASK_UNINTERRUPTIBLE which will
    835		 * cause a warning when we prepare to wait again.  As it is
    836		 * rare that this path is taken, it is perfectly safe to force
    837		 * us to go around the wait_event() loop again, so the warning
    838		 * is a false-positive.  Silence the warning by resetting
    839		 * thread state
    840		 */
    841		__set_current_state(TASK_RUNNING);
    842		blk_start_plug(&plug);
    843		flush_bio_list(conf, bio);
    844		blk_finish_plug(&plug);
    845	} else
    846		spin_unlock_irq(&conf->device_lock);
    847}
    848
    849/* Barriers....
    850 * Sometimes we need to suspend IO while we do something else,
    851 * either some resync/recovery, or reconfigure the array.
    852 * To do this we raise a 'barrier'.
    853 * The 'barrier' is a counter that can be raised multiple times
    854 * to count how many activities are happening which preclude
    855 * normal IO.
    856 * We can only raise the barrier if there is no pending IO.
    857 * i.e. if nr_pending == 0.
    858 * We choose only to raise the barrier if no-one is waiting for the
    859 * barrier to go down.  This means that as soon as an IO request
    860 * is ready, no other operations which require a barrier will start
    861 * until the IO request has had a chance.
    862 *
    863 * So: regular IO calls 'wait_barrier'.  When that returns there
    864 *    is no backgroup IO happening,  It must arrange to call
    865 *    allow_barrier when it has finished its IO.
    866 * backgroup IO calls must call raise_barrier.  Once that returns
    867 *    there is no normal IO happeing.  It must arrange to call
    868 *    lower_barrier when the particular background IO completes.
    869 *
    870 * If resync/recovery is interrupted, returns -EINTR;
    871 * Otherwise, returns 0.
    872 */
    873static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
    874{
    875	int idx = sector_to_idx(sector_nr);
    876
    877	spin_lock_irq(&conf->resync_lock);
    878
    879	/* Wait until no block IO is waiting */
    880	wait_event_lock_irq(conf->wait_barrier,
    881			    !atomic_read(&conf->nr_waiting[idx]),
    882			    conf->resync_lock);
    883
    884	/* block any new IO from starting */
    885	atomic_inc(&conf->barrier[idx]);
    886	/*
    887	 * In raise_barrier() we firstly increase conf->barrier[idx] then
    888	 * check conf->nr_pending[idx]. In _wait_barrier() we firstly
    889	 * increase conf->nr_pending[idx] then check conf->barrier[idx].
    890	 * A memory barrier here to make sure conf->nr_pending[idx] won't
    891	 * be fetched before conf->barrier[idx] is increased. Otherwise
    892	 * there will be a race between raise_barrier() and _wait_barrier().
    893	 */
    894	smp_mb__after_atomic();
    895
    896	/* For these conditions we must wait:
    897	 * A: while the array is in frozen state
    898	 * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
    899	 *    existing in corresponding I/O barrier bucket.
    900	 * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
    901	 *    max resync count which allowed on current I/O barrier bucket.
    902	 */
    903	wait_event_lock_irq(conf->wait_barrier,
    904			    (!conf->array_frozen &&
    905			     !atomic_read(&conf->nr_pending[idx]) &&
    906			     atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) ||
    907				test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
    908			    conf->resync_lock);
    909
    910	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
    911		atomic_dec(&conf->barrier[idx]);
    912		spin_unlock_irq(&conf->resync_lock);
    913		wake_up(&conf->wait_barrier);
    914		return -EINTR;
    915	}
    916
    917	atomic_inc(&conf->nr_sync_pending);
    918	spin_unlock_irq(&conf->resync_lock);
    919
    920	return 0;
    921}
    922
    923static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
    924{
    925	int idx = sector_to_idx(sector_nr);
    926
    927	BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
    928
    929	atomic_dec(&conf->barrier[idx]);
    930	atomic_dec(&conf->nr_sync_pending);
    931	wake_up(&conf->wait_barrier);
    932}
    933
    934static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
    935{
    936	bool ret = true;
    937
    938	/*
    939	 * We need to increase conf->nr_pending[idx] very early here,
    940	 * then raise_barrier() can be blocked when it waits for
    941	 * conf->nr_pending[idx] to be 0. Then we can avoid holding
    942	 * conf->resync_lock when there is no barrier raised in same
    943	 * barrier unit bucket. Also if the array is frozen, I/O
    944	 * should be blocked until array is unfrozen.
    945	 */
    946	atomic_inc(&conf->nr_pending[idx]);
    947	/*
    948	 * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
    949	 * check conf->barrier[idx]. In raise_barrier() we firstly increase
    950	 * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
    951	 * barrier is necessary here to make sure conf->barrier[idx] won't be
    952	 * fetched before conf->nr_pending[idx] is increased. Otherwise there
    953	 * will be a race between _wait_barrier() and raise_barrier().
    954	 */
    955	smp_mb__after_atomic();
    956
    957	/*
    958	 * Don't worry about checking two atomic_t variables at same time
    959	 * here. If during we check conf->barrier[idx], the array is
    960	 * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
    961	 * 0, it is safe to return and make the I/O continue. Because the
    962	 * array is frozen, all I/O returned here will eventually complete
    963	 * or be queued, no race will happen. See code comment in
    964	 * frozen_array().
    965	 */
    966	if (!READ_ONCE(conf->array_frozen) &&
    967	    !atomic_read(&conf->barrier[idx]))
    968		return ret;
    969
    970	/*
    971	 * After holding conf->resync_lock, conf->nr_pending[idx]
    972	 * should be decreased before waiting for barrier to drop.
    973	 * Otherwise, we may encounter a race condition because
    974	 * raise_barrer() might be waiting for conf->nr_pending[idx]
    975	 * to be 0 at same time.
    976	 */
    977	spin_lock_irq(&conf->resync_lock);
    978	atomic_inc(&conf->nr_waiting[idx]);
    979	atomic_dec(&conf->nr_pending[idx]);
    980	/*
    981	 * In case freeze_array() is waiting for
    982	 * get_unqueued_pending() == extra
    983	 */
    984	wake_up(&conf->wait_barrier);
    985	/* Wait for the barrier in same barrier unit bucket to drop. */
    986
    987	/* Return false when nowait flag is set */
    988	if (nowait) {
    989		ret = false;
    990	} else {
    991		wait_event_lock_irq(conf->wait_barrier,
    992				!conf->array_frozen &&
    993				!atomic_read(&conf->barrier[idx]),
    994				conf->resync_lock);
    995		atomic_inc(&conf->nr_pending[idx]);
    996	}
    997
    998	atomic_dec(&conf->nr_waiting[idx]);
    999	spin_unlock_irq(&conf->resync_lock);
   1000	return ret;
   1001}
   1002
   1003static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
   1004{
   1005	int idx = sector_to_idx(sector_nr);
   1006	bool ret = true;
   1007
   1008	/*
   1009	 * Very similar to _wait_barrier(). The difference is, for read
   1010	 * I/O we don't need wait for sync I/O, but if the whole array
   1011	 * is frozen, the read I/O still has to wait until the array is
   1012	 * unfrozen. Since there is no ordering requirement with
   1013	 * conf->barrier[idx] here, memory barrier is unnecessary as well.
   1014	 */
   1015	atomic_inc(&conf->nr_pending[idx]);
   1016
   1017	if (!READ_ONCE(conf->array_frozen))
   1018		return ret;
   1019
   1020	spin_lock_irq(&conf->resync_lock);
   1021	atomic_inc(&conf->nr_waiting[idx]);
   1022	atomic_dec(&conf->nr_pending[idx]);
   1023	/*
   1024	 * In case freeze_array() is waiting for
   1025	 * get_unqueued_pending() == extra
   1026	 */
   1027	wake_up(&conf->wait_barrier);
   1028	/* Wait for array to be unfrozen */
   1029
   1030	/* Return false when nowait flag is set */
   1031	if (nowait) {
   1032		/* Return false when nowait flag is set */
   1033		ret = false;
   1034	} else {
   1035		wait_event_lock_irq(conf->wait_barrier,
   1036				!conf->array_frozen,
   1037				conf->resync_lock);
   1038		atomic_inc(&conf->nr_pending[idx]);
   1039	}
   1040
   1041	atomic_dec(&conf->nr_waiting[idx]);
   1042	spin_unlock_irq(&conf->resync_lock);
   1043	return ret;
   1044}
   1045
   1046static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
   1047{
   1048	int idx = sector_to_idx(sector_nr);
   1049
   1050	return _wait_barrier(conf, idx, nowait);
   1051}
   1052
   1053static void _allow_barrier(struct r1conf *conf, int idx)
   1054{
   1055	atomic_dec(&conf->nr_pending[idx]);
   1056	wake_up(&conf->wait_barrier);
   1057}
   1058
   1059static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
   1060{
   1061	int idx = sector_to_idx(sector_nr);
   1062
   1063	_allow_barrier(conf, idx);
   1064}
   1065
   1066/* conf->resync_lock should be held */
   1067static int get_unqueued_pending(struct r1conf *conf)
   1068{
   1069	int idx, ret;
   1070
   1071	ret = atomic_read(&conf->nr_sync_pending);
   1072	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
   1073		ret += atomic_read(&conf->nr_pending[idx]) -
   1074			atomic_read(&conf->nr_queued[idx]);
   1075
   1076	return ret;
   1077}
   1078
   1079static void freeze_array(struct r1conf *conf, int extra)
   1080{
   1081	/* Stop sync I/O and normal I/O and wait for everything to
   1082	 * go quiet.
   1083	 * This is called in two situations:
   1084	 * 1) management command handlers (reshape, remove disk, quiesce).
   1085	 * 2) one normal I/O request failed.
   1086
   1087	 * After array_frozen is set to 1, new sync IO will be blocked at
   1088	 * raise_barrier(), and new normal I/O will blocked at _wait_barrier()
   1089	 * or wait_read_barrier(). The flying I/Os will either complete or be
   1090	 * queued. When everything goes quite, there are only queued I/Os left.
   1091
   1092	 * Every flying I/O contributes to a conf->nr_pending[idx], idx is the
   1093	 * barrier bucket index which this I/O request hits. When all sync and
   1094	 * normal I/O are queued, sum of all conf->nr_pending[] will match sum
   1095	 * of all conf->nr_queued[]. But normal I/O failure is an exception,
   1096	 * in handle_read_error(), we may call freeze_array() before trying to
   1097	 * fix the read error. In this case, the error read I/O is not queued,
   1098	 * so get_unqueued_pending() == 1.
   1099	 *
   1100	 * Therefore before this function returns, we need to wait until
   1101	 * get_unqueued_pendings(conf) gets equal to extra. For
   1102	 * normal I/O context, extra is 1, in rested situations extra is 0.
   1103	 */
   1104	spin_lock_irq(&conf->resync_lock);
   1105	conf->array_frozen = 1;
   1106	raid1_log(conf->mddev, "wait freeze");
   1107	wait_event_lock_irq_cmd(
   1108		conf->wait_barrier,
   1109		get_unqueued_pending(conf) == extra,
   1110		conf->resync_lock,
   1111		flush_pending_writes(conf));
   1112	spin_unlock_irq(&conf->resync_lock);
   1113}
   1114static void unfreeze_array(struct r1conf *conf)
   1115{
   1116	/* reverse the effect of the freeze */
   1117	spin_lock_irq(&conf->resync_lock);
   1118	conf->array_frozen = 0;
   1119	spin_unlock_irq(&conf->resync_lock);
   1120	wake_up(&conf->wait_barrier);
   1121}
   1122
   1123static void alloc_behind_master_bio(struct r1bio *r1_bio,
   1124					   struct bio *bio)
   1125{
   1126	int size = bio->bi_iter.bi_size;
   1127	unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
   1128	int i = 0;
   1129	struct bio *behind_bio = NULL;
   1130
   1131	behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
   1132				      &r1_bio->mddev->bio_set);
   1133	if (!behind_bio)
   1134		return;
   1135
   1136	/* discard op, we don't support writezero/writesame yet */
   1137	if (!bio_has_data(bio)) {
   1138		behind_bio->bi_iter.bi_size = size;
   1139		goto skip_copy;
   1140	}
   1141
   1142	while (i < vcnt && size) {
   1143		struct page *page;
   1144		int len = min_t(int, PAGE_SIZE, size);
   1145
   1146		page = alloc_page(GFP_NOIO);
   1147		if (unlikely(!page))
   1148			goto free_pages;
   1149
   1150		bio_add_page(behind_bio, page, len, 0);
   1151
   1152		size -= len;
   1153		i++;
   1154	}
   1155
   1156	bio_copy_data(behind_bio, bio);
   1157skip_copy:
   1158	r1_bio->behind_master_bio = behind_bio;
   1159	set_bit(R1BIO_BehindIO, &r1_bio->state);
   1160
   1161	return;
   1162
   1163free_pages:
   1164	pr_debug("%dB behind alloc failed, doing sync I/O\n",
   1165		 bio->bi_iter.bi_size);
   1166	bio_free_pages(behind_bio);
   1167	bio_put(behind_bio);
   1168}
   1169
   1170static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
   1171{
   1172	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
   1173						  cb);
   1174	struct mddev *mddev = plug->cb.data;
   1175	struct r1conf *conf = mddev->private;
   1176	struct bio *bio;
   1177
   1178	if (from_schedule || current->bio_list) {
   1179		spin_lock_irq(&conf->device_lock);
   1180		bio_list_merge(&conf->pending_bio_list, &plug->pending);
   1181		spin_unlock_irq(&conf->device_lock);
   1182		wake_up(&conf->wait_barrier);
   1183		md_wakeup_thread(mddev->thread);
   1184		kfree(plug);
   1185		return;
   1186	}
   1187
   1188	/* we aren't scheduling, so we can do the write-out directly. */
   1189	bio = bio_list_get(&plug->pending);
   1190	flush_bio_list(conf, bio);
   1191	kfree(plug);
   1192}
   1193
   1194static void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio)
   1195{
   1196	r1_bio->master_bio = bio;
   1197	r1_bio->sectors = bio_sectors(bio);
   1198	r1_bio->state = 0;
   1199	r1_bio->mddev = mddev;
   1200	r1_bio->sector = bio->bi_iter.bi_sector;
   1201}
   1202
   1203static inline struct r1bio *
   1204alloc_r1bio(struct mddev *mddev, struct bio *bio)
   1205{
   1206	struct r1conf *conf = mddev->private;
   1207	struct r1bio *r1_bio;
   1208
   1209	r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
   1210	/* Ensure no bio records IO_BLOCKED */
   1211	memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
   1212	init_r1bio(r1_bio, mddev, bio);
   1213	return r1_bio;
   1214}
   1215
   1216static void raid1_read_request(struct mddev *mddev, struct bio *bio,
   1217			       int max_read_sectors, struct r1bio *r1_bio)
   1218{
   1219	struct r1conf *conf = mddev->private;
   1220	struct raid1_info *mirror;
   1221	struct bio *read_bio;
   1222	struct bitmap *bitmap = mddev->bitmap;
   1223	const int op = bio_op(bio);
   1224	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
   1225	int max_sectors;
   1226	int rdisk;
   1227	bool r1bio_existed = !!r1_bio;
   1228	char b[BDEVNAME_SIZE];
   1229
   1230	/*
   1231	 * If r1_bio is set, we are blocking the raid1d thread
   1232	 * so there is a tiny risk of deadlock.  So ask for
   1233	 * emergency memory if needed.
   1234	 */
   1235	gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
   1236
   1237	if (r1bio_existed) {
   1238		/* Need to get the block device name carefully */
   1239		struct md_rdev *rdev;
   1240		rcu_read_lock();
   1241		rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
   1242		if (rdev)
   1243			bdevname(rdev->bdev, b);
   1244		else
   1245			strcpy(b, "???");
   1246		rcu_read_unlock();
   1247	}
   1248
   1249	/*
   1250	 * Still need barrier for READ in case that whole
   1251	 * array is frozen.
   1252	 */
   1253	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
   1254				bio->bi_opf & REQ_NOWAIT)) {
   1255		bio_wouldblock_error(bio);
   1256		return;
   1257	}
   1258
   1259	if (!r1_bio)
   1260		r1_bio = alloc_r1bio(mddev, bio);
   1261	else
   1262		init_r1bio(r1_bio, mddev, bio);
   1263	r1_bio->sectors = max_read_sectors;
   1264
   1265	/*
   1266	 * make_request() can abort the operation when read-ahead is being
   1267	 * used and no empty request is available.
   1268	 */
   1269	rdisk = read_balance(conf, r1_bio, &max_sectors);
   1270
   1271	if (rdisk < 0) {
   1272		/* couldn't find anywhere to read from */
   1273		if (r1bio_existed) {
   1274			pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
   1275					    mdname(mddev),
   1276					    b,
   1277					    (unsigned long long)r1_bio->sector);
   1278		}
   1279		raid_end_bio_io(r1_bio);
   1280		return;
   1281	}
   1282	mirror = conf->mirrors + rdisk;
   1283
   1284	if (r1bio_existed)
   1285		pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %pg\n",
   1286				    mdname(mddev),
   1287				    (unsigned long long)r1_bio->sector,
   1288				    mirror->rdev->bdev);
   1289
   1290	if (test_bit(WriteMostly, &mirror->rdev->flags) &&
   1291	    bitmap) {
   1292		/*
   1293		 * Reading from a write-mostly device must take care not to
   1294		 * over-take any writes that are 'behind'
   1295		 */
   1296		raid1_log(mddev, "wait behind writes");
   1297		wait_event(bitmap->behind_wait,
   1298			   atomic_read(&bitmap->behind_writes) == 0);
   1299	}
   1300
   1301	if (max_sectors < bio_sectors(bio)) {
   1302		struct bio *split = bio_split(bio, max_sectors,
   1303					      gfp, &conf->bio_split);
   1304		bio_chain(split, bio);
   1305		submit_bio_noacct(bio);
   1306		bio = split;
   1307		r1_bio->master_bio = bio;
   1308		r1_bio->sectors = max_sectors;
   1309	}
   1310
   1311	r1_bio->read_disk = rdisk;
   1312
   1313	if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
   1314		r1_bio->start_time = bio_start_io_acct(bio);
   1315
   1316	read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
   1317				   &mddev->bio_set);
   1318
   1319	r1_bio->bios[rdisk] = read_bio;
   1320
   1321	read_bio->bi_iter.bi_sector = r1_bio->sector +
   1322		mirror->rdev->data_offset;
   1323	read_bio->bi_end_io = raid1_end_read_request;
   1324	bio_set_op_attrs(read_bio, op, do_sync);
   1325	if (test_bit(FailFast, &mirror->rdev->flags) &&
   1326	    test_bit(R1BIO_FailFast, &r1_bio->state))
   1327	        read_bio->bi_opf |= MD_FAILFAST;
   1328	read_bio->bi_private = r1_bio;
   1329
   1330	if (mddev->gendisk)
   1331	        trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
   1332				      r1_bio->sector);
   1333
   1334	submit_bio_noacct(read_bio);
   1335}
   1336
   1337static void raid1_write_request(struct mddev *mddev, struct bio *bio,
   1338				int max_write_sectors)
   1339{
   1340	struct r1conf *conf = mddev->private;
   1341	struct r1bio *r1_bio;
   1342	int i, disks;
   1343	struct bitmap *bitmap = mddev->bitmap;
   1344	unsigned long flags;
   1345	struct md_rdev *blocked_rdev;
   1346	struct blk_plug_cb *cb;
   1347	struct raid1_plug_cb *plug = NULL;
   1348	int first_clone;
   1349	int max_sectors;
   1350	bool write_behind = false;
   1351
   1352	if (mddev_is_clustered(mddev) &&
   1353	     md_cluster_ops->area_resyncing(mddev, WRITE,
   1354		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {
   1355
   1356		DEFINE_WAIT(w);
   1357		if (bio->bi_opf & REQ_NOWAIT) {
   1358			bio_wouldblock_error(bio);
   1359			return;
   1360		}
   1361		for (;;) {
   1362			prepare_to_wait(&conf->wait_barrier,
   1363					&w, TASK_IDLE);
   1364			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
   1365							bio->bi_iter.bi_sector,
   1366							bio_end_sector(bio)))
   1367				break;
   1368			schedule();
   1369		}
   1370		finish_wait(&conf->wait_barrier, &w);
   1371	}
   1372
   1373	/*
   1374	 * Register the new request and wait if the reconstruction
   1375	 * thread has put up a bar for new requests.
   1376	 * Continue immediately if no resync is active currently.
   1377	 */
   1378	if (!wait_barrier(conf, bio->bi_iter.bi_sector,
   1379				bio->bi_opf & REQ_NOWAIT)) {
   1380		bio_wouldblock_error(bio);
   1381		return;
   1382	}
   1383
   1384	r1_bio = alloc_r1bio(mddev, bio);
   1385	r1_bio->sectors = max_write_sectors;
   1386
   1387	/* first select target devices under rcu_lock and
   1388	 * inc refcount on their rdev.  Record them by setting
   1389	 * bios[x] to bio
   1390	 * If there are known/acknowledged bad blocks on any device on
   1391	 * which we have seen a write error, we want to avoid writing those
   1392	 * blocks.
   1393	 * This potentially requires several writes to write around
   1394	 * the bad blocks.  Each set of writes gets it's own r1bio
   1395	 * with a set of bios attached.
   1396	 */
   1397
   1398	disks = conf->raid_disks * 2;
   1399 retry_write:
   1400	blocked_rdev = NULL;
   1401	rcu_read_lock();
   1402	max_sectors = r1_bio->sectors;
   1403	for (i = 0;  i < disks; i++) {
   1404		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
   1405
   1406		/*
   1407		 * The write-behind io is only attempted on drives marked as
   1408		 * write-mostly, which means we could allocate write behind
   1409		 * bio later.
   1410		 */
   1411		if (rdev && test_bit(WriteMostly, &rdev->flags))
   1412			write_behind = true;
   1413
   1414		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
   1415			atomic_inc(&rdev->nr_pending);
   1416			blocked_rdev = rdev;
   1417			break;
   1418		}
   1419		r1_bio->bios[i] = NULL;
   1420		if (!rdev || test_bit(Faulty, &rdev->flags)) {
   1421			if (i < conf->raid_disks)
   1422				set_bit(R1BIO_Degraded, &r1_bio->state);
   1423			continue;
   1424		}
   1425
   1426		atomic_inc(&rdev->nr_pending);
   1427		if (test_bit(WriteErrorSeen, &rdev->flags)) {
   1428			sector_t first_bad;
   1429			int bad_sectors;
   1430			int is_bad;
   1431
   1432			is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
   1433					     &first_bad, &bad_sectors);
   1434			if (is_bad < 0) {
   1435				/* mustn't write here until the bad block is
   1436				 * acknowledged*/
   1437				set_bit(BlockedBadBlocks, &rdev->flags);
   1438				blocked_rdev = rdev;
   1439				break;
   1440			}
   1441			if (is_bad && first_bad <= r1_bio->sector) {
   1442				/* Cannot write here at all */
   1443				bad_sectors -= (r1_bio->sector - first_bad);
   1444				if (bad_sectors < max_sectors)
   1445					/* mustn't write more than bad_sectors
   1446					 * to other devices yet
   1447					 */
   1448					max_sectors = bad_sectors;
   1449				rdev_dec_pending(rdev, mddev);
   1450				/* We don't set R1BIO_Degraded as that
   1451				 * only applies if the disk is
   1452				 * missing, so it might be re-added,
   1453				 * and we want to know to recover this
   1454				 * chunk.
   1455				 * In this case the device is here,
   1456				 * and the fact that this chunk is not
   1457				 * in-sync is recorded in the bad
   1458				 * block log
   1459				 */
   1460				continue;
   1461			}
   1462			if (is_bad) {
   1463				int good_sectors = first_bad - r1_bio->sector;
   1464				if (good_sectors < max_sectors)
   1465					max_sectors = good_sectors;
   1466			}
   1467		}
   1468		r1_bio->bios[i] = bio;
   1469	}
   1470	rcu_read_unlock();
   1471
   1472	if (unlikely(blocked_rdev)) {
   1473		/* Wait for this device to become unblocked */
   1474		int j;
   1475
   1476		for (j = 0; j < i; j++)
   1477			if (r1_bio->bios[j])
   1478				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
   1479		r1_bio->state = 0;
   1480		allow_barrier(conf, bio->bi_iter.bi_sector);
   1481
   1482		if (bio->bi_opf & REQ_NOWAIT) {
   1483			bio_wouldblock_error(bio);
   1484			return;
   1485		}
   1486		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
   1487		md_wait_for_blocked_rdev(blocked_rdev, mddev);
   1488		wait_barrier(conf, bio->bi_iter.bi_sector, false);
   1489		goto retry_write;
   1490	}
   1491
   1492	/*
   1493	 * When using a bitmap, we may call alloc_behind_master_bio below.
   1494	 * alloc_behind_master_bio allocates a copy of the data payload a page
   1495	 * at a time and thus needs a new bio that can fit the whole payload
   1496	 * this bio in page sized chunks.
   1497	 */
   1498	if (write_behind && bitmap)
   1499		max_sectors = min_t(int, max_sectors,
   1500				    BIO_MAX_VECS * (PAGE_SIZE >> 9));
   1501	if (max_sectors < bio_sectors(bio)) {
   1502		struct bio *split = bio_split(bio, max_sectors,
   1503					      GFP_NOIO, &conf->bio_split);
   1504		bio_chain(split, bio);
   1505		submit_bio_noacct(bio);
   1506		bio = split;
   1507		r1_bio->master_bio = bio;
   1508		r1_bio->sectors = max_sectors;
   1509	}
   1510
   1511	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
   1512		r1_bio->start_time = bio_start_io_acct(bio);
   1513	atomic_set(&r1_bio->remaining, 1);
   1514	atomic_set(&r1_bio->behind_remaining, 0);
   1515
   1516	first_clone = 1;
   1517
   1518	for (i = 0; i < disks; i++) {
   1519		struct bio *mbio = NULL;
   1520		struct md_rdev *rdev = conf->mirrors[i].rdev;
   1521		if (!r1_bio->bios[i])
   1522			continue;
   1523
   1524		if (first_clone) {
   1525			/* do behind I/O ?
   1526			 * Not if there are too many, or cannot
   1527			 * allocate memory, or a reader on WriteMostly
   1528			 * is waiting for behind writes to flush */
   1529			if (bitmap &&
   1530			    test_bit(WriteMostly, &rdev->flags) &&
   1531			    (atomic_read(&bitmap->behind_writes)
   1532			     < mddev->bitmap_info.max_write_behind) &&
   1533			    !waitqueue_active(&bitmap->behind_wait)) {
   1534				alloc_behind_master_bio(r1_bio, bio);
   1535			}
   1536
   1537			md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
   1538					     test_bit(R1BIO_BehindIO, &r1_bio->state));
   1539			first_clone = 0;
   1540		}
   1541
   1542		if (r1_bio->behind_master_bio) {
   1543			mbio = bio_alloc_clone(rdev->bdev,
   1544					       r1_bio->behind_master_bio,
   1545					       GFP_NOIO, &mddev->bio_set);
   1546			if (test_bit(CollisionCheck, &rdev->flags))
   1547				wait_for_serialization(rdev, r1_bio);
   1548			if (test_bit(WriteMostly, &rdev->flags))
   1549				atomic_inc(&r1_bio->behind_remaining);
   1550		} else {
   1551			mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO,
   1552					       &mddev->bio_set);
   1553
   1554			if (mddev->serialize_policy)
   1555				wait_for_serialization(rdev, r1_bio);
   1556		}
   1557
   1558		r1_bio->bios[i] = mbio;
   1559
   1560		mbio->bi_iter.bi_sector	= (r1_bio->sector + rdev->data_offset);
   1561		mbio->bi_end_io	= raid1_end_write_request;
   1562		mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
   1563		if (test_bit(FailFast, &rdev->flags) &&
   1564		    !test_bit(WriteMostly, &rdev->flags) &&
   1565		    conf->raid_disks - mddev->degraded > 1)
   1566			mbio->bi_opf |= MD_FAILFAST;
   1567		mbio->bi_private = r1_bio;
   1568
   1569		atomic_inc(&r1_bio->remaining);
   1570
   1571		if (mddev->gendisk)
   1572			trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
   1573					      r1_bio->sector);
   1574		/* flush_pending_writes() needs access to the rdev so...*/
   1575		mbio->bi_bdev = (void *)rdev;
   1576
   1577		cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
   1578		if (cb)
   1579			plug = container_of(cb, struct raid1_plug_cb, cb);
   1580		else
   1581			plug = NULL;
   1582		if (plug) {
   1583			bio_list_add(&plug->pending, mbio);
   1584		} else {
   1585			spin_lock_irqsave(&conf->device_lock, flags);
   1586			bio_list_add(&conf->pending_bio_list, mbio);
   1587			spin_unlock_irqrestore(&conf->device_lock, flags);
   1588			md_wakeup_thread(mddev->thread);
   1589		}
   1590	}
   1591
   1592	r1_bio_write_done(r1_bio);
   1593
   1594	/* In case raid1d snuck in to freeze_array */
   1595	wake_up(&conf->wait_barrier);
   1596}
   1597
   1598static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
   1599{
   1600	sector_t sectors;
   1601
   1602	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
   1603	    && md_flush_request(mddev, bio))
   1604		return true;
   1605
   1606	/*
   1607	 * There is a limit to the maximum size, but
   1608	 * the read/write handler might find a lower limit
   1609	 * due to bad blocks.  To avoid multiple splits,
   1610	 * we pass the maximum number of sectors down
   1611	 * and let the lower level perform the split.
   1612	 */
   1613	sectors = align_to_barrier_unit_end(
   1614		bio->bi_iter.bi_sector, bio_sectors(bio));
   1615
   1616	if (bio_data_dir(bio) == READ)
   1617		raid1_read_request(mddev, bio, sectors, NULL);
   1618	else {
   1619		if (!md_write_start(mddev,bio))
   1620			return false;
   1621		raid1_write_request(mddev, bio, sectors);
   1622	}
   1623	return true;
   1624}
   1625
   1626static void raid1_status(struct seq_file *seq, struct mddev *mddev)
   1627{
   1628	struct r1conf *conf = mddev->private;
   1629	int i;
   1630
   1631	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
   1632		   conf->raid_disks - mddev->degraded);
   1633	rcu_read_lock();
   1634	for (i = 0; i < conf->raid_disks; i++) {
   1635		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
   1636		seq_printf(seq, "%s",
   1637			   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
   1638	}
   1639	rcu_read_unlock();
   1640	seq_printf(seq, "]");
   1641}
   1642
   1643/**
   1644 * raid1_error() - RAID1 error handler.
   1645 * @mddev: affected md device.
   1646 * @rdev: member device to fail.
   1647 *
   1648 * The routine acknowledges &rdev failure and determines new @mddev state.
   1649 * If it failed, then:
   1650 *	- &MD_BROKEN flag is set in &mddev->flags.
   1651 *	- recovery is disabled.
   1652 * Otherwise, it must be degraded:
   1653 *	- recovery is interrupted.
   1654 *	- &mddev->degraded is bumped.
   1655 *
   1656 * @rdev is marked as &Faulty excluding case when array is failed and
   1657 * &mddev->fail_last_dev is off.
   1658 */
   1659static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
   1660{
   1661	struct r1conf *conf = mddev->private;
   1662	unsigned long flags;
   1663
   1664	spin_lock_irqsave(&conf->device_lock, flags);
   1665
   1666	if (test_bit(In_sync, &rdev->flags) &&
   1667	    (conf->raid_disks - mddev->degraded) == 1) {
   1668		set_bit(MD_BROKEN, &mddev->flags);
   1669
   1670		if (!mddev->fail_last_dev) {
   1671			conf->recovery_disabled = mddev->recovery_disabled;
   1672			spin_unlock_irqrestore(&conf->device_lock, flags);
   1673			return;
   1674		}
   1675	}
   1676	set_bit(Blocked, &rdev->flags);
   1677	if (test_and_clear_bit(In_sync, &rdev->flags))
   1678		mddev->degraded++;
   1679	set_bit(Faulty, &rdev->flags);
   1680	spin_unlock_irqrestore(&conf->device_lock, flags);
   1681	/*
   1682	 * if recovery is running, make sure it aborts.
   1683	 */
   1684	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
   1685	set_mask_bits(&mddev->sb_flags, 0,
   1686		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
   1687	pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n"
   1688		"md/raid1:%s: Operation continuing on %d devices.\n",
   1689		mdname(mddev), rdev->bdev,
   1690		mdname(mddev), conf->raid_disks - mddev->degraded);
   1691}
   1692
   1693static void print_conf(struct r1conf *conf)
   1694{
   1695	int i;
   1696
   1697	pr_debug("RAID1 conf printout:\n");
   1698	if (!conf) {
   1699		pr_debug("(!conf)\n");
   1700		return;
   1701	}
   1702	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
   1703		 conf->raid_disks);
   1704
   1705	rcu_read_lock();
   1706	for (i = 0; i < conf->raid_disks; i++) {
   1707		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
   1708		if (rdev)
   1709			pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
   1710				 i, !test_bit(In_sync, &rdev->flags),
   1711				 !test_bit(Faulty, &rdev->flags),
   1712				 rdev->bdev);
   1713	}
   1714	rcu_read_unlock();
   1715}
   1716
   1717static void close_sync(struct r1conf *conf)
   1718{
   1719	int idx;
   1720
   1721	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
   1722		_wait_barrier(conf, idx, false);
   1723		_allow_barrier(conf, idx);
   1724	}
   1725
   1726	mempool_exit(&conf->r1buf_pool);
   1727}
   1728
   1729static int raid1_spare_active(struct mddev *mddev)
   1730{
   1731	int i;
   1732	struct r1conf *conf = mddev->private;
   1733	int count = 0;
   1734	unsigned long flags;
   1735
   1736	/*
   1737	 * Find all failed disks within the RAID1 configuration
   1738	 * and mark them readable.
   1739	 * Called under mddev lock, so rcu protection not needed.
   1740	 * device_lock used to avoid races with raid1_end_read_request
   1741	 * which expects 'In_sync' flags and ->degraded to be consistent.
   1742	 */
   1743	spin_lock_irqsave(&conf->device_lock, flags);
   1744	for (i = 0; i < conf->raid_disks; i++) {
   1745		struct md_rdev *rdev = conf->mirrors[i].rdev;
   1746		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
   1747		if (repl
   1748		    && !test_bit(Candidate, &repl->flags)
   1749		    && repl->recovery_offset == MaxSector
   1750		    && !test_bit(Faulty, &repl->flags)
   1751		    && !test_and_set_bit(In_sync, &repl->flags)) {
   1752			/* replacement has just become active */
   1753			if (!rdev ||
   1754			    !test_and_clear_bit(In_sync, &rdev->flags))
   1755				count++;
   1756			if (rdev) {
   1757				/* Replaced device not technically
   1758				 * faulty, but we need to be sure
   1759				 * it gets removed and never re-added
   1760				 */
   1761				set_bit(Faulty, &rdev->flags);
   1762				sysfs_notify_dirent_safe(
   1763					rdev->sysfs_state);
   1764			}
   1765		}
   1766		if (rdev
   1767		    && rdev->recovery_offset == MaxSector
   1768		    && !test_bit(Faulty, &rdev->flags)
   1769		    && !test_and_set_bit(In_sync, &rdev->flags)) {
   1770			count++;
   1771			sysfs_notify_dirent_safe(rdev->sysfs_state);
   1772		}
   1773	}
   1774	mddev->degraded -= count;
   1775	spin_unlock_irqrestore(&conf->device_lock, flags);
   1776
   1777	print_conf(conf);
   1778	return count;
   1779}
   1780
   1781static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
   1782{
   1783	struct r1conf *conf = mddev->private;
   1784	int err = -EEXIST;
   1785	int mirror = 0;
   1786	struct raid1_info *p;
   1787	int first = 0;
   1788	int last = conf->raid_disks - 1;
   1789
   1790	if (mddev->recovery_disabled == conf->recovery_disabled)
   1791		return -EBUSY;
   1792
   1793	if (md_integrity_add_rdev(rdev, mddev))
   1794		return -ENXIO;
   1795
   1796	if (rdev->raid_disk >= 0)
   1797		first = last = rdev->raid_disk;
   1798
   1799	/*
   1800	 * find the disk ... but prefer rdev->saved_raid_disk
   1801	 * if possible.
   1802	 */
   1803	if (rdev->saved_raid_disk >= 0 &&
   1804	    rdev->saved_raid_disk >= first &&
   1805	    rdev->saved_raid_disk < conf->raid_disks &&
   1806	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
   1807		first = last = rdev->saved_raid_disk;
   1808
   1809	for (mirror = first; mirror <= last; mirror++) {
   1810		p = conf->mirrors + mirror;
   1811		if (!p->rdev) {
   1812			if (mddev->gendisk)
   1813				disk_stack_limits(mddev->gendisk, rdev->bdev,
   1814						  rdev->data_offset << 9);
   1815
   1816			p->head_position = 0;
   1817			rdev->raid_disk = mirror;
   1818			err = 0;
   1819			/* As all devices are equivalent, we don't need a full recovery
   1820			 * if this was recently any drive of the array
   1821			 */
   1822			if (rdev->saved_raid_disk < 0)
   1823				conf->fullsync = 1;
   1824			rcu_assign_pointer(p->rdev, rdev);
   1825			break;
   1826		}
   1827		if (test_bit(WantReplacement, &p->rdev->flags) &&
   1828		    p[conf->raid_disks].rdev == NULL) {
   1829			/* Add this device as a replacement */
   1830			clear_bit(In_sync, &rdev->flags);
   1831			set_bit(Replacement, &rdev->flags);
   1832			rdev->raid_disk = mirror;
   1833			err = 0;
   1834			conf->fullsync = 1;
   1835			rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
   1836			break;
   1837		}
   1838	}
   1839	print_conf(conf);
   1840	return err;
   1841}
   1842
   1843static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
   1844{
   1845	struct r1conf *conf = mddev->private;
   1846	int err = 0;
   1847	int number = rdev->raid_disk;
   1848	struct raid1_info *p = conf->mirrors + number;
   1849
   1850	if (rdev != p->rdev)
   1851		p = conf->mirrors + conf->raid_disks + number;
   1852
   1853	print_conf(conf);
   1854	if (rdev == p->rdev) {
   1855		if (test_bit(In_sync, &rdev->flags) ||
   1856		    atomic_read(&rdev->nr_pending)) {
   1857			err = -EBUSY;
   1858			goto abort;
   1859		}
   1860		/* Only remove non-faulty devices if recovery
   1861		 * is not possible.
   1862		 */
   1863		if (!test_bit(Faulty, &rdev->flags) &&
   1864		    mddev->recovery_disabled != conf->recovery_disabled &&
   1865		    mddev->degraded < conf->raid_disks) {
   1866			err = -EBUSY;
   1867			goto abort;
   1868		}
   1869		p->rdev = NULL;
   1870		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
   1871			synchronize_rcu();
   1872			if (atomic_read(&rdev->nr_pending)) {
   1873				/* lost the race, try later */
   1874				err = -EBUSY;
   1875				p->rdev = rdev;
   1876				goto abort;
   1877			}
   1878		}
   1879		if (conf->mirrors[conf->raid_disks + number].rdev) {
   1880			/* We just removed a device that is being replaced.
   1881			 * Move down the replacement.  We drain all IO before
   1882			 * doing this to avoid confusion.
   1883			 */
   1884			struct md_rdev *repl =
   1885				conf->mirrors[conf->raid_disks + number].rdev;
   1886			freeze_array(conf, 0);
   1887			if (atomic_read(&repl->nr_pending)) {
   1888				/* It means that some queued IO of retry_list
   1889				 * hold repl. Thus, we cannot set replacement
   1890				 * as NULL, avoiding rdev NULL pointer
   1891				 * dereference in sync_request_write and
   1892				 * handle_write_finished.
   1893				 */
   1894				err = -EBUSY;
   1895				unfreeze_array(conf);
   1896				goto abort;
   1897			}
   1898			clear_bit(Replacement, &repl->flags);
   1899			p->rdev = repl;
   1900			conf->mirrors[conf->raid_disks + number].rdev = NULL;
   1901			unfreeze_array(conf);
   1902		}
   1903
   1904		clear_bit(WantReplacement, &rdev->flags);
   1905		err = md_integrity_register(mddev);
   1906	}
   1907abort:
   1908
   1909	print_conf(conf);
   1910	return err;
   1911}
   1912
   1913static void end_sync_read(struct bio *bio)
   1914{
   1915	struct r1bio *r1_bio = get_resync_r1bio(bio);
   1916
   1917	update_head_pos(r1_bio->read_disk, r1_bio);
   1918
   1919	/*
   1920	 * we have read a block, now it needs to be re-written,
   1921	 * or re-read if the read failed.
   1922	 * We don't do much here, just schedule handling by raid1d
   1923	 */
   1924	if (!bio->bi_status)
   1925		set_bit(R1BIO_Uptodate, &r1_bio->state);
   1926
   1927	if (atomic_dec_and_test(&r1_bio->remaining))
   1928		reschedule_retry(r1_bio);
   1929}
   1930
   1931static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
   1932{
   1933	sector_t sync_blocks = 0;
   1934	sector_t s = r1_bio->sector;
   1935	long sectors_to_go = r1_bio->sectors;
   1936
   1937	/* make sure these bits don't get cleared. */
   1938	do {
   1939		md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
   1940		s += sync_blocks;
   1941		sectors_to_go -= sync_blocks;
   1942	} while (sectors_to_go > 0);
   1943}
   1944
   1945static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
   1946{
   1947	if (atomic_dec_and_test(&r1_bio->remaining)) {
   1948		struct mddev *mddev = r1_bio->mddev;
   1949		int s = r1_bio->sectors;
   1950
   1951		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
   1952		    test_bit(R1BIO_WriteError, &r1_bio->state))
   1953			reschedule_retry(r1_bio);
   1954		else {
   1955			put_buf(r1_bio);
   1956			md_done_sync(mddev, s, uptodate);
   1957		}
   1958	}
   1959}
   1960
   1961static void end_sync_write(struct bio *bio)
   1962{
   1963	int uptodate = !bio->bi_status;
   1964	struct r1bio *r1_bio = get_resync_r1bio(bio);
   1965	struct mddev *mddev = r1_bio->mddev;
   1966	struct r1conf *conf = mddev->private;
   1967	sector_t first_bad;
   1968	int bad_sectors;
   1969	struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
   1970
   1971	if (!uptodate) {
   1972		abort_sync_write(mddev, r1_bio);
   1973		set_bit(WriteErrorSeen, &rdev->flags);
   1974		if (!test_and_set_bit(WantReplacement, &rdev->flags))
   1975			set_bit(MD_RECOVERY_NEEDED, &
   1976				mddev->recovery);
   1977		set_bit(R1BIO_WriteError, &r1_bio->state);
   1978	} else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
   1979			       &first_bad, &bad_sectors) &&
   1980		   !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
   1981				r1_bio->sector,
   1982				r1_bio->sectors,
   1983				&first_bad, &bad_sectors)
   1984		)
   1985		set_bit(R1BIO_MadeGood, &r1_bio->state);
   1986
   1987	put_sync_write_buf(r1_bio, uptodate);
   1988}
   1989
   1990static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
   1991			    int sectors, struct page *page, int rw)
   1992{
   1993	if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
   1994		/* success */
   1995		return 1;
   1996	if (rw == WRITE) {
   1997		set_bit(WriteErrorSeen, &rdev->flags);
   1998		if (!test_and_set_bit(WantReplacement,
   1999				      &rdev->flags))
   2000			set_bit(MD_RECOVERY_NEEDED, &
   2001				rdev->mddev->recovery);
   2002	}
   2003	/* need to record an error - either for the block or the device */
   2004	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
   2005		md_error(rdev->mddev, rdev);
   2006	return 0;
   2007}
   2008
   2009static int fix_sync_read_error(struct r1bio *r1_bio)
   2010{
   2011	/* Try some synchronous reads of other devices to get
   2012	 * good data, much like with normal read errors.  Only
   2013	 * read into the pages we already have so we don't
   2014	 * need to re-issue the read request.
   2015	 * We don't need to freeze the array, because being in an
   2016	 * active sync request, there is no normal IO, and
   2017	 * no overlapping syncs.
   2018	 * We don't need to check is_badblock() again as we
   2019	 * made sure that anything with a bad block in range
   2020	 * will have bi_end_io clear.
   2021	 */
   2022	struct mddev *mddev = r1_bio->mddev;
   2023	struct r1conf *conf = mddev->private;
   2024	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
   2025	struct page **pages = get_resync_pages(bio)->pages;
   2026	sector_t sect = r1_bio->sector;
   2027	int sectors = r1_bio->sectors;
   2028	int idx = 0;
   2029	struct md_rdev *rdev;
   2030
   2031	rdev = conf->mirrors[r1_bio->read_disk].rdev;
   2032	if (test_bit(FailFast, &rdev->flags)) {
   2033		/* Don't try recovering from here - just fail it
   2034		 * ... unless it is the last working device of course */
   2035		md_error(mddev, rdev);
   2036		if (test_bit(Faulty, &rdev->flags))
   2037			/* Don't try to read from here, but make sure
   2038			 * put_buf does it's thing
   2039			 */
   2040			bio->bi_end_io = end_sync_write;
   2041	}
   2042
   2043	while(sectors) {
   2044		int s = sectors;
   2045		int d = r1_bio->read_disk;
   2046		int success = 0;
   2047		int start;
   2048
   2049		if (s > (PAGE_SIZE>>9))
   2050			s = PAGE_SIZE >> 9;
   2051		do {
   2052			if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
   2053				/* No rcu protection needed here devices
   2054				 * can only be removed when no resync is
   2055				 * active, and resync is currently active
   2056				 */
   2057				rdev = conf->mirrors[d].rdev;
   2058				if (sync_page_io(rdev, sect, s<<9,
   2059						 pages[idx],
   2060						 REQ_OP_READ, 0, false)) {
   2061					success = 1;
   2062					break;
   2063				}
   2064			}
   2065			d++;
   2066			if (d == conf->raid_disks * 2)
   2067				d = 0;
   2068		} while (!success && d != r1_bio->read_disk);
   2069
   2070		if (!success) {
   2071			int abort = 0;
   2072			/* Cannot read from anywhere, this block is lost.
   2073			 * Record a bad block on each device.  If that doesn't
   2074			 * work just disable and interrupt the recovery.
   2075			 * Don't fail devices as that won't really help.
   2076			 */
   2077			pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
   2078					    mdname(mddev), bio->bi_bdev,
   2079					    (unsigned long long)r1_bio->sector);
   2080			for (d = 0; d < conf->raid_disks * 2; d++) {
   2081				rdev = conf->mirrors[d].rdev;
   2082				if (!rdev || test_bit(Faulty, &rdev->flags))
   2083					continue;
   2084				if (!rdev_set_badblocks(rdev, sect, s, 0))
   2085					abort = 1;
   2086			}
   2087			if (abort) {
   2088				conf->recovery_disabled =
   2089					mddev->recovery_disabled;
   2090				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
   2091				md_done_sync(mddev, r1_bio->sectors, 0);
   2092				put_buf(r1_bio);
   2093				return 0;
   2094			}
   2095			/* Try next page */
   2096			sectors -= s;
   2097			sect += s;
   2098			idx++;
   2099			continue;
   2100		}
   2101
   2102		start = d;
   2103		/* write it back and re-read */
   2104		while (d != r1_bio->read_disk) {
   2105			if (d == 0)
   2106				d = conf->raid_disks * 2;
   2107			d--;
   2108			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
   2109				continue;
   2110			rdev = conf->mirrors[d].rdev;
   2111			if (r1_sync_page_io(rdev, sect, s,
   2112					    pages[idx],
   2113					    WRITE) == 0) {
   2114				r1_bio->bios[d]->bi_end_io = NULL;
   2115				rdev_dec_pending(rdev, mddev);
   2116			}
   2117		}
   2118		d = start;
   2119		while (d != r1_bio->read_disk) {
   2120			if (d == 0)
   2121				d = conf->raid_disks * 2;
   2122			d--;
   2123			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
   2124				continue;
   2125			rdev = conf->mirrors[d].rdev;
   2126			if (r1_sync_page_io(rdev, sect, s,
   2127					    pages[idx],
   2128					    READ) != 0)
   2129				atomic_add(s, &rdev->corrected_errors);
   2130		}
   2131		sectors -= s;
   2132		sect += s;
   2133		idx ++;
   2134	}
   2135	set_bit(R1BIO_Uptodate, &r1_bio->state);
   2136	bio->bi_status = 0;
   2137	return 1;
   2138}
   2139
   2140static void process_checks(struct r1bio *r1_bio)
   2141{
   2142	/* We have read all readable devices.  If we haven't
   2143	 * got the block, then there is no hope left.
   2144	 * If we have, then we want to do a comparison
   2145	 * and skip the write if everything is the same.
   2146	 * If any blocks failed to read, then we need to
   2147	 * attempt an over-write
   2148	 */
   2149	struct mddev *mddev = r1_bio->mddev;
   2150	struct r1conf *conf = mddev->private;
   2151	int primary;
   2152	int i;
   2153	int vcnt;
   2154
   2155	/* Fix variable parts of all bios */
   2156	vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
   2157	for (i = 0; i < conf->raid_disks * 2; i++) {
   2158		blk_status_t status;
   2159		struct bio *b = r1_bio->bios[i];
   2160		struct resync_pages *rp = get_resync_pages(b);
   2161		if (b->bi_end_io != end_sync_read)
   2162			continue;
   2163		/* fixup the bio for reuse, but preserve errno */
   2164		status = b->bi_status;
   2165		bio_reset(b, conf->mirrors[i].rdev->bdev, REQ_OP_READ);
   2166		b->bi_status = status;
   2167		b->bi_iter.bi_sector = r1_bio->sector +
   2168			conf->mirrors[i].rdev->data_offset;
   2169		b->bi_end_io = end_sync_read;
   2170		rp->raid_bio = r1_bio;
   2171		b->bi_private = rp;
   2172
   2173		/* initialize bvec table again */
   2174		md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9);
   2175	}
   2176	for (primary = 0; primary < conf->raid_disks * 2; primary++)
   2177		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
   2178		    !r1_bio->bios[primary]->bi_status) {
   2179			r1_bio->bios[primary]->bi_end_io = NULL;
   2180			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
   2181			break;
   2182		}
   2183	r1_bio->read_disk = primary;
   2184	for (i = 0; i < conf->raid_disks * 2; i++) {
   2185		int j = 0;
   2186		struct bio *pbio = r1_bio->bios[primary];
   2187		struct bio *sbio = r1_bio->bios[i];
   2188		blk_status_t status = sbio->bi_status;
   2189		struct page **ppages = get_resync_pages(pbio)->pages;
   2190		struct page **spages = get_resync_pages(sbio)->pages;
   2191		struct bio_vec *bi;
   2192		int page_len[RESYNC_PAGES] = { 0 };
   2193		struct bvec_iter_all iter_all;
   2194
   2195		if (sbio->bi_end_io != end_sync_read)
   2196			continue;
   2197		/* Now we can 'fixup' the error value */
   2198		sbio->bi_status = 0;
   2199
   2200		bio_for_each_segment_all(bi, sbio, iter_all)
   2201			page_len[j++] = bi->bv_len;
   2202
   2203		if (!status) {
   2204			for (j = vcnt; j-- ; ) {
   2205				if (memcmp(page_address(ppages[j]),
   2206					   page_address(spages[j]),
   2207					   page_len[j]))
   2208					break;
   2209			}
   2210		} else
   2211			j = 0;
   2212		if (j >= 0)
   2213			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
   2214		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
   2215			      && !status)) {
   2216			/* No need to write to this device. */
   2217			sbio->bi_end_io = NULL;
   2218			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
   2219			continue;
   2220		}
   2221
   2222		bio_copy_data(sbio, pbio);
   2223	}
   2224}
   2225
   2226static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
   2227{
   2228	struct r1conf *conf = mddev->private;
   2229	int i;
   2230	int disks = conf->raid_disks * 2;
   2231	struct bio *wbio;
   2232
   2233	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
   2234		/* ouch - failed to read all of that. */
   2235		if (!fix_sync_read_error(r1_bio))
   2236			return;
   2237
   2238	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
   2239		process_checks(r1_bio);
   2240
   2241	/*
   2242	 * schedule writes
   2243	 */
   2244	atomic_set(&r1_bio->remaining, 1);
   2245	for (i = 0; i < disks ; i++) {
   2246		wbio = r1_bio->bios[i];
   2247		if (wbio->bi_end_io == NULL ||
   2248		    (wbio->bi_end_io == end_sync_read &&
   2249		     (i == r1_bio->read_disk ||
   2250		      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
   2251			continue;
   2252		if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
   2253			abort_sync_write(mddev, r1_bio);
   2254			continue;
   2255		}
   2256
   2257		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
   2258		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
   2259			wbio->bi_opf |= MD_FAILFAST;
   2260
   2261		wbio->bi_end_io = end_sync_write;
   2262		atomic_inc(&r1_bio->remaining);
   2263		md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
   2264
   2265		submit_bio_noacct(wbio);
   2266	}
   2267
   2268	put_sync_write_buf(r1_bio, 1);
   2269}
   2270
   2271/*
   2272 * This is a kernel thread which:
   2273 *
   2274 *	1.	Retries failed read operations on working mirrors.
   2275 *	2.	Updates the raid superblock when problems encounter.
   2276 *	3.	Performs writes following reads for array synchronising.
   2277 */
   2278
   2279static void fix_read_error(struct r1conf *conf, int read_disk,
   2280			   sector_t sect, int sectors)
   2281{
   2282	struct mddev *mddev = conf->mddev;
   2283	while(sectors) {
   2284		int s = sectors;
   2285		int d = read_disk;
   2286		int success = 0;
   2287		int start;
   2288		struct md_rdev *rdev;
   2289
   2290		if (s > (PAGE_SIZE>>9))
   2291			s = PAGE_SIZE >> 9;
   2292
   2293		do {
   2294			sector_t first_bad;
   2295			int bad_sectors;
   2296
   2297			rcu_read_lock();
   2298			rdev = rcu_dereference(conf->mirrors[d].rdev);
   2299			if (rdev &&
   2300			    (test_bit(In_sync, &rdev->flags) ||
   2301			     (!test_bit(Faulty, &rdev->flags) &&
   2302			      rdev->recovery_offset >= sect + s)) &&
   2303			    is_badblock(rdev, sect, s,
   2304					&first_bad, &bad_sectors) == 0) {
   2305				atomic_inc(&rdev->nr_pending);
   2306				rcu_read_unlock();
   2307				if (sync_page_io(rdev, sect, s<<9,
   2308					 conf->tmppage, REQ_OP_READ, 0, false))
   2309					success = 1;
   2310				rdev_dec_pending(rdev, mddev);
   2311				if (success)
   2312					break;
   2313			} else
   2314				rcu_read_unlock();
   2315			d++;
   2316			if (d == conf->raid_disks * 2)
   2317				d = 0;
   2318		} while (!success && d != read_disk);
   2319
   2320		if (!success) {
   2321			/* Cannot read from anywhere - mark it bad */
   2322			struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
   2323			if (!rdev_set_badblocks(rdev, sect, s, 0))
   2324				md_error(mddev, rdev);
   2325			break;
   2326		}
   2327		/* write it back and re-read */
   2328		start = d;
   2329		while (d != read_disk) {
   2330			if (d==0)
   2331				d = conf->raid_disks * 2;
   2332			d--;
   2333			rcu_read_lock();
   2334			rdev = rcu_dereference(conf->mirrors[d].rdev);
   2335			if (rdev &&
   2336			    !test_bit(Faulty, &rdev->flags)) {
   2337				atomic_inc(&rdev->nr_pending);
   2338				rcu_read_unlock();
   2339				r1_sync_page_io(rdev, sect, s,
   2340						conf->tmppage, WRITE);
   2341				rdev_dec_pending(rdev, mddev);
   2342			} else
   2343				rcu_read_unlock();
   2344		}
   2345		d = start;
   2346		while (d != read_disk) {
   2347			if (d==0)
   2348				d = conf->raid_disks * 2;
   2349			d--;
   2350			rcu_read_lock();
   2351			rdev = rcu_dereference(conf->mirrors[d].rdev);
   2352			if (rdev &&
   2353			    !test_bit(Faulty, &rdev->flags)) {
   2354				atomic_inc(&rdev->nr_pending);
   2355				rcu_read_unlock();
   2356				if (r1_sync_page_io(rdev, sect, s,
   2357						    conf->tmppage, READ)) {
   2358					atomic_add(s, &rdev->corrected_errors);
   2359					pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n",
   2360						mdname(mddev), s,
   2361						(unsigned long long)(sect +
   2362								     rdev->data_offset),
   2363						rdev->bdev);
   2364				}
   2365				rdev_dec_pending(rdev, mddev);
   2366			} else
   2367				rcu_read_unlock();
   2368		}
   2369		sectors -= s;
   2370		sect += s;
   2371	}
   2372}
   2373
   2374static int narrow_write_error(struct r1bio *r1_bio, int i)
   2375{
   2376	struct mddev *mddev = r1_bio->mddev;
   2377	struct r1conf *conf = mddev->private;
   2378	struct md_rdev *rdev = conf->mirrors[i].rdev;
   2379
   2380	/* bio has the data to be written to device 'i' where
   2381	 * we just recently had a write error.
   2382	 * We repeatedly clone the bio and trim down to one block,
   2383	 * then try the write.  Where the write fails we record
   2384	 * a bad block.
   2385	 * It is conceivable that the bio doesn't exactly align with
   2386	 * blocks.  We must handle this somehow.
   2387	 *
   2388	 * We currently own a reference on the rdev.
   2389	 */
   2390
   2391	int block_sectors;
   2392	sector_t sector;
   2393	int sectors;
   2394	int sect_to_write = r1_bio->sectors;
   2395	int ok = 1;
   2396
   2397	if (rdev->badblocks.shift < 0)
   2398		return 0;
   2399
   2400	block_sectors = roundup(1 << rdev->badblocks.shift,
   2401				bdev_logical_block_size(rdev->bdev) >> 9);
   2402	sector = r1_bio->sector;
   2403	sectors = ((sector + block_sectors)
   2404		   & ~(sector_t)(block_sectors - 1))
   2405		- sector;
   2406
   2407	while (sect_to_write) {
   2408		struct bio *wbio;
   2409		if (sectors > sect_to_write)
   2410			sectors = sect_to_write;
   2411		/* Write at 'sector' for 'sectors'*/
   2412
   2413		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
   2414			wbio = bio_alloc_clone(rdev->bdev,
   2415					       r1_bio->behind_master_bio,
   2416					       GFP_NOIO, &mddev->bio_set);
   2417		} else {
   2418			wbio = bio_alloc_clone(rdev->bdev, r1_bio->master_bio,
   2419					       GFP_NOIO, &mddev->bio_set);
   2420		}
   2421
   2422		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
   2423		wbio->bi_iter.bi_sector = r1_bio->sector;
   2424		wbio->bi_iter.bi_size = r1_bio->sectors << 9;
   2425
   2426		bio_trim(wbio, sector - r1_bio->sector, sectors);
   2427		wbio->bi_iter.bi_sector += rdev->data_offset;
   2428
   2429		if (submit_bio_wait(wbio) < 0)
   2430			/* failure! */
   2431			ok = rdev_set_badblocks(rdev, sector,
   2432						sectors, 0)
   2433				&& ok;
   2434
   2435		bio_put(wbio);
   2436		sect_to_write -= sectors;
   2437		sector += sectors;
   2438		sectors = block_sectors;
   2439	}
   2440	return ok;
   2441}
   2442
   2443static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
   2444{
   2445	int m;
   2446	int s = r1_bio->sectors;
   2447	for (m = 0; m < conf->raid_disks * 2 ; m++) {
   2448		struct md_rdev *rdev = conf->mirrors[m].rdev;
   2449		struct bio *bio = r1_bio->bios[m];
   2450		if (bio->bi_end_io == NULL)
   2451			continue;
   2452		if (!bio->bi_status &&
   2453		    test_bit(R1BIO_MadeGood, &r1_bio->state)) {
   2454			rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
   2455		}
   2456		if (bio->bi_status &&
   2457		    test_bit(R1BIO_WriteError, &r1_bio->state)) {
   2458			if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
   2459				md_error(conf->mddev, rdev);
   2460		}
   2461	}
   2462	put_buf(r1_bio);
   2463	md_done_sync(conf->mddev, s, 1);
   2464}
   2465
   2466static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
   2467{
   2468	int m, idx;
   2469	bool fail = false;
   2470
   2471	for (m = 0; m < conf->raid_disks * 2 ; m++)
   2472		if (r1_bio->bios[m] == IO_MADE_GOOD) {
   2473			struct md_rdev *rdev = conf->mirrors[m].rdev;
   2474			rdev_clear_badblocks(rdev,
   2475					     r1_bio->sector,
   2476					     r1_bio->sectors, 0);
   2477			rdev_dec_pending(rdev, conf->mddev);
   2478		} else if (r1_bio->bios[m] != NULL) {
   2479			/* This drive got a write error.  We need to
   2480			 * narrow down and record precise write
   2481			 * errors.
   2482			 */
   2483			fail = true;
   2484			if (!narrow_write_error(r1_bio, m)) {
   2485				md_error(conf->mddev,
   2486					 conf->mirrors[m].rdev);
   2487				/* an I/O failed, we can't clear the bitmap */
   2488				set_bit(R1BIO_Degraded, &r1_bio->state);
   2489			}
   2490			rdev_dec_pending(conf->mirrors[m].rdev,
   2491					 conf->mddev);
   2492		}
   2493	if (fail) {
   2494		spin_lock_irq(&conf->device_lock);
   2495		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
   2496		idx = sector_to_idx(r1_bio->sector);
   2497		atomic_inc(&conf->nr_queued[idx]);
   2498		spin_unlock_irq(&conf->device_lock);
   2499		/*
   2500		 * In case freeze_array() is waiting for condition
   2501		 * get_unqueued_pending() == extra to be true.
   2502		 */
   2503		wake_up(&conf->wait_barrier);
   2504		md_wakeup_thread(conf->mddev->thread);
   2505	} else {
   2506		if (test_bit(R1BIO_WriteError, &r1_bio->state))
   2507			close_write(r1_bio);
   2508		raid_end_bio_io(r1_bio);
   2509	}
   2510}
   2511
   2512static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
   2513{
   2514	struct mddev *mddev = conf->mddev;
   2515	struct bio *bio;
   2516	struct md_rdev *rdev;
   2517
   2518	clear_bit(R1BIO_ReadError, &r1_bio->state);
   2519	/* we got a read error. Maybe the drive is bad.  Maybe just
   2520	 * the block and we can fix it.
   2521	 * We freeze all other IO, and try reading the block from
   2522	 * other devices.  When we find one, we re-write
   2523	 * and check it that fixes the read error.
   2524	 * This is all done synchronously while the array is
   2525	 * frozen
   2526	 */
   2527
   2528	bio = r1_bio->bios[r1_bio->read_disk];
   2529	bio_put(bio);
   2530	r1_bio->bios[r1_bio->read_disk] = NULL;
   2531
   2532	rdev = conf->mirrors[r1_bio->read_disk].rdev;
   2533	if (mddev->ro == 0
   2534	    && !test_bit(FailFast, &rdev->flags)) {
   2535		freeze_array(conf, 1);
   2536		fix_read_error(conf, r1_bio->read_disk,
   2537			       r1_bio->sector, r1_bio->sectors);
   2538		unfreeze_array(conf);
   2539	} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
   2540		md_error(mddev, rdev);
   2541	} else {
   2542		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
   2543	}
   2544
   2545	rdev_dec_pending(rdev, conf->mddev);
   2546	allow_barrier(conf, r1_bio->sector);
   2547	bio = r1_bio->master_bio;
   2548
   2549	/* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
   2550	r1_bio->state = 0;
   2551	raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
   2552}
   2553
   2554static void raid1d(struct md_thread *thread)
   2555{
   2556	struct mddev *mddev = thread->mddev;
   2557	struct r1bio *r1_bio;
   2558	unsigned long flags;
   2559	struct r1conf *conf = mddev->private;
   2560	struct list_head *head = &conf->retry_list;
   2561	struct blk_plug plug;
   2562	int idx;
   2563
   2564	md_check_recovery(mddev);
   2565
   2566	if (!list_empty_careful(&conf->bio_end_io_list) &&
   2567	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
   2568		LIST_HEAD(tmp);
   2569		spin_lock_irqsave(&conf->device_lock, flags);
   2570		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
   2571			list_splice_init(&conf->bio_end_io_list, &tmp);
   2572		spin_unlock_irqrestore(&conf->device_lock, flags);
   2573		while (!list_empty(&tmp)) {
   2574			r1_bio = list_first_entry(&tmp, struct r1bio,
   2575						  retry_list);
   2576			list_del(&r1_bio->retry_list);
   2577			idx = sector_to_idx(r1_bio->sector);
   2578			atomic_dec(&conf->nr_queued[idx]);
   2579			if (mddev->degraded)
   2580				set_bit(R1BIO_Degraded, &r1_bio->state);
   2581			if (test_bit(R1BIO_WriteError, &r1_bio->state))
   2582				close_write(r1_bio);
   2583			raid_end_bio_io(r1_bio);
   2584		}
   2585	}
   2586
   2587	blk_start_plug(&plug);
   2588	for (;;) {
   2589
   2590		flush_pending_writes(conf);
   2591
   2592		spin_lock_irqsave(&conf->device_lock, flags);
   2593		if (list_empty(head)) {
   2594			spin_unlock_irqrestore(&conf->device_lock, flags);
   2595			break;
   2596		}
   2597		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
   2598		list_del(head->prev);
   2599		idx = sector_to_idx(r1_bio->sector);
   2600		atomic_dec(&conf->nr_queued[idx]);
   2601		spin_unlock_irqrestore(&conf->device_lock, flags);
   2602
   2603		mddev = r1_bio->mddev;
   2604		conf = mddev->private;
   2605		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
   2606			if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
   2607			    test_bit(R1BIO_WriteError, &r1_bio->state))
   2608				handle_sync_write_finished(conf, r1_bio);
   2609			else
   2610				sync_request_write(mddev, r1_bio);
   2611		} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
   2612			   test_bit(R1BIO_WriteError, &r1_bio->state))
   2613			handle_write_finished(conf, r1_bio);
   2614		else if (test_bit(R1BIO_ReadError, &r1_bio->state))
   2615			handle_read_error(conf, r1_bio);
   2616		else
   2617			WARN_ON_ONCE(1);
   2618
   2619		cond_resched();
   2620		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
   2621			md_check_recovery(mddev);
   2622	}
   2623	blk_finish_plug(&plug);
   2624}
   2625
   2626static int init_resync(struct r1conf *conf)
   2627{
   2628	int buffs;
   2629
   2630	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
   2631	BUG_ON(mempool_initialized(&conf->r1buf_pool));
   2632
   2633	return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
   2634			    r1buf_pool_free, conf->poolinfo);
   2635}
   2636
   2637static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
   2638{
   2639	struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO);
   2640	struct resync_pages *rps;
   2641	struct bio *bio;
   2642	int i;
   2643
   2644	for (i = conf->poolinfo->raid_disks; i--; ) {
   2645		bio = r1bio->bios[i];
   2646		rps = bio->bi_private;
   2647		bio_reset(bio, NULL, 0);
   2648		bio->bi_private = rps;
   2649	}
   2650	r1bio->master_bio = NULL;
   2651	return r1bio;
   2652}
   2653
   2654/*
   2655 * perform a "sync" on one "block"
   2656 *
   2657 * We need to make sure that no normal I/O request - particularly write
   2658 * requests - conflict with active sync requests.
   2659 *
   2660 * This is achieved by tracking pending requests and a 'barrier' concept
   2661 * that can be installed to exclude normal IO requests.
   2662 */
   2663
   2664static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
   2665				   int *skipped)
   2666{
   2667	struct r1conf *conf = mddev->private;
   2668	struct r1bio *r1_bio;
   2669	struct bio *bio;
   2670	sector_t max_sector, nr_sectors;
   2671	int disk = -1;
   2672	int i;
   2673	int wonly = -1;
   2674	int write_targets = 0, read_targets = 0;
   2675	sector_t sync_blocks;
   2676	int still_degraded = 0;
   2677	int good_sectors = RESYNC_SECTORS;
   2678	int min_bad = 0; /* number of sectors that are bad in all devices */
   2679	int idx = sector_to_idx(sector_nr);
   2680	int page_idx = 0;
   2681
   2682	if (!mempool_initialized(&conf->r1buf_pool))
   2683		if (init_resync(conf))
   2684			return 0;
   2685
   2686	max_sector = mddev->dev_sectors;
   2687	if (sector_nr >= max_sector) {
   2688		/* If we aborted, we need to abort the
   2689		 * sync on the 'current' bitmap chunk (there will
   2690		 * only be one in raid1 resync.
   2691		 * We can find the current addess in mddev->curr_resync
   2692		 */
   2693		if (mddev->curr_resync < max_sector) /* aborted */
   2694			md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
   2695					   &sync_blocks, 1);
   2696		else /* completed sync */
   2697			conf->fullsync = 0;
   2698
   2699		md_bitmap_close_sync(mddev->bitmap);
   2700		close_sync(conf);
   2701
   2702		if (mddev_is_clustered(mddev)) {
   2703			conf->cluster_sync_low = 0;
   2704			conf->cluster_sync_high = 0;
   2705		}
   2706		return 0;
   2707	}
   2708
   2709	if (mddev->bitmap == NULL &&
   2710	    mddev->recovery_cp == MaxSector &&
   2711	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
   2712	    conf->fullsync == 0) {
   2713		*skipped = 1;
   2714		return max_sector - sector_nr;
   2715	}
   2716	/* before building a request, check if we can skip these blocks..
   2717	 * This call the bitmap_start_sync doesn't actually record anything
   2718	 */
   2719	if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
   2720	    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
   2721		/* We can skip this block, and probably several more */
   2722		*skipped = 1;
   2723		return sync_blocks;
   2724	}
   2725
   2726	/*
   2727	 * If there is non-resync activity waiting for a turn, then let it
   2728	 * though before starting on this new sync request.
   2729	 */
   2730	if (atomic_read(&conf->nr_waiting[idx]))
   2731		schedule_timeout_uninterruptible(1);
   2732
   2733	/* we are incrementing sector_nr below. To be safe, we check against
   2734	 * sector_nr + two times RESYNC_SECTORS
   2735	 */
   2736
   2737	md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
   2738		mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
   2739
   2740
   2741	if (raise_barrier(conf, sector_nr))
   2742		return 0;
   2743
   2744	r1_bio = raid1_alloc_init_r1buf(conf);
   2745
   2746	rcu_read_lock();
   2747	/*
   2748	 * If we get a correctably read error during resync or recovery,
   2749	 * we might want to read from a different device.  So we
   2750	 * flag all drives that could conceivably be read from for READ,
   2751	 * and any others (which will be non-In_sync devices) for WRITE.
   2752	 * If a read fails, we try reading from something else for which READ
   2753	 * is OK.
   2754	 */
   2755
   2756	r1_bio->mddev = mddev;
   2757	r1_bio->sector = sector_nr;
   2758	r1_bio->state = 0;
   2759	set_bit(R1BIO_IsSync, &r1_bio->state);
   2760	/* make sure good_sectors won't go across barrier unit boundary */
   2761	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
   2762
   2763	for (i = 0; i < conf->raid_disks * 2; i++) {
   2764		struct md_rdev *rdev;
   2765		bio = r1_bio->bios[i];
   2766
   2767		rdev = rcu_dereference(conf->mirrors[i].rdev);
   2768		if (rdev == NULL ||
   2769		    test_bit(Faulty, &rdev->flags)) {
   2770			if (i < conf->raid_disks)
   2771				still_degraded = 1;
   2772		} else if (!test_bit(In_sync, &rdev->flags)) {
   2773			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
   2774			bio->bi_end_io = end_sync_write;
   2775			write_targets ++;
   2776		} else {
   2777			/* may need to read from here */
   2778			sector_t first_bad = MaxSector;
   2779			int bad_sectors;
   2780
   2781			if (is_badblock(rdev, sector_nr, good_sectors,
   2782					&first_bad, &bad_sectors)) {
   2783				if (first_bad > sector_nr)
   2784					good_sectors = first_bad - sector_nr;
   2785				else {
   2786					bad_sectors -= (sector_nr - first_bad);
   2787					if (min_bad == 0 ||
   2788					    min_bad > bad_sectors)
   2789						min_bad = bad_sectors;
   2790				}
   2791			}
   2792			if (sector_nr < first_bad) {
   2793				if (test_bit(WriteMostly, &rdev->flags)) {
   2794					if (wonly < 0)
   2795						wonly = i;
   2796				} else {
   2797					if (disk < 0)
   2798						disk = i;
   2799				}
   2800				bio_set_op_attrs(bio, REQ_OP_READ, 0);
   2801				bio->bi_end_io = end_sync_read;
   2802				read_targets++;
   2803			} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
   2804				test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
   2805				!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
   2806				/*
   2807				 * The device is suitable for reading (InSync),
   2808				 * but has bad block(s) here. Let's try to correct them,
   2809				 * if we are doing resync or repair. Otherwise, leave
   2810				 * this device alone for this sync request.
   2811				 */
   2812				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
   2813				bio->bi_end_io = end_sync_write;
   2814				write_targets++;
   2815			}
   2816		}
   2817		if (rdev && bio->bi_end_io) {
   2818			atomic_inc(&rdev->nr_pending);
   2819			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
   2820			bio_set_dev(bio, rdev->bdev);
   2821			if (test_bit(FailFast, &rdev->flags))
   2822				bio->bi_opf |= MD_FAILFAST;
   2823		}
   2824	}
   2825	rcu_read_unlock();
   2826	if (disk < 0)
   2827		disk = wonly;
   2828	r1_bio->read_disk = disk;
   2829
   2830	if (read_targets == 0 && min_bad > 0) {
   2831		/* These sectors are bad on all InSync devices, so we
   2832		 * need to mark them bad on all write targets
   2833		 */
   2834		int ok = 1;
   2835		for (i = 0 ; i < conf->raid_disks * 2 ; i++)
   2836			if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
   2837				struct md_rdev *rdev = conf->mirrors[i].rdev;
   2838				ok = rdev_set_badblocks(rdev, sector_nr,
   2839							min_bad, 0
   2840					) && ok;
   2841			}
   2842		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
   2843		*skipped = 1;
   2844		put_buf(r1_bio);
   2845
   2846		if (!ok) {
   2847			/* Cannot record the badblocks, so need to
   2848			 * abort the resync.
   2849			 * If there are multiple read targets, could just
   2850			 * fail the really bad ones ???
   2851			 */
   2852			conf->recovery_disabled = mddev->recovery_disabled;
   2853			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
   2854			return 0;
   2855		} else
   2856			return min_bad;
   2857
   2858	}
   2859	if (min_bad > 0 && min_bad < good_sectors) {
   2860		/* only resync enough to reach the next bad->good
   2861		 * transition */
   2862		good_sectors = min_bad;
   2863	}
   2864
   2865	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
   2866		/* extra read targets are also write targets */
   2867		write_targets += read_targets-1;
   2868
   2869	if (write_targets == 0 || read_targets == 0) {
   2870		/* There is nowhere to write, so all non-sync
   2871		 * drives must be failed - so we are finished
   2872		 */
   2873		sector_t rv;
   2874		if (min_bad > 0)
   2875			max_sector = sector_nr + min_bad;
   2876		rv = max_sector - sector_nr;
   2877		*skipped = 1;
   2878		put_buf(r1_bio);
   2879		return rv;
   2880	}
   2881
   2882	if (max_sector > mddev->resync_max)
   2883		max_sector = mddev->resync_max; /* Don't do IO beyond here */
   2884	if (max_sector > sector_nr + good_sectors)
   2885		max_sector = sector_nr + good_sectors;
   2886	nr_sectors = 0;
   2887	sync_blocks = 0;
   2888	do {
   2889		struct page *page;
   2890		int len = PAGE_SIZE;
   2891		if (sector_nr + (len>>9) > max_sector)
   2892			len = (max_sector - sector_nr) << 9;
   2893		if (len == 0)
   2894			break;
   2895		if (sync_blocks == 0) {
   2896			if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
   2897						  &sync_blocks, still_degraded) &&
   2898			    !conf->fullsync &&
   2899			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
   2900				break;
   2901			if ((len >> 9) > sync_blocks)
   2902				len = sync_blocks<<9;
   2903		}
   2904
   2905		for (i = 0 ; i < conf->raid_disks * 2; i++) {
   2906			struct resync_pages *rp;
   2907
   2908			bio = r1_bio->bios[i];
   2909			rp = get_resync_pages(bio);
   2910			if (bio->bi_end_io) {
   2911				page = resync_fetch_page(rp, page_idx);
   2912
   2913				/*
   2914				 * won't fail because the vec table is big
   2915				 * enough to hold all these pages
   2916				 */
   2917				bio_add_page(bio, page, len, 0);
   2918			}
   2919		}
   2920		nr_sectors += len>>9;
   2921		sector_nr += len>>9;
   2922		sync_blocks -= (len>>9);
   2923	} while (++page_idx < RESYNC_PAGES);
   2924
   2925	r1_bio->sectors = nr_sectors;
   2926
   2927	if (mddev_is_clustered(mddev) &&
   2928			conf->cluster_sync_high < sector_nr + nr_sectors) {
   2929		conf->cluster_sync_low = mddev->curr_resync_completed;
   2930		conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
   2931		/* Send resync message */
   2932		md_cluster_ops->resync_info_update(mddev,
   2933				conf->cluster_sync_low,
   2934				conf->cluster_sync_high);
   2935	}
   2936
   2937	/* For a user-requested sync, we read all readable devices and do a
   2938	 * compare
   2939	 */
   2940	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
   2941		atomic_set(&r1_bio->remaining, read_targets);
   2942		for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
   2943			bio = r1_bio->bios[i];
   2944			if (bio->bi_end_io == end_sync_read) {
   2945				read_targets--;
   2946				md_sync_acct_bio(bio, nr_sectors);
   2947				if (read_targets == 1)
   2948					bio->bi_opf &= ~MD_FAILFAST;
   2949				submit_bio_noacct(bio);
   2950			}
   2951		}
   2952	} else {
   2953		atomic_set(&r1_bio->remaining, 1);
   2954		bio = r1_bio->bios[r1_bio->read_disk];
   2955		md_sync_acct_bio(bio, nr_sectors);
   2956		if (read_targets == 1)
   2957			bio->bi_opf &= ~MD_FAILFAST;
   2958		submit_bio_noacct(bio);
   2959	}
   2960	return nr_sectors;
   2961}
   2962
   2963static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
   2964{
   2965	if (sectors)
   2966		return sectors;
   2967
   2968	return mddev->dev_sectors;
   2969}
   2970
   2971static struct r1conf *setup_conf(struct mddev *mddev)
   2972{
   2973	struct r1conf *conf;
   2974	int i;
   2975	struct raid1_info *disk;
   2976	struct md_rdev *rdev;
   2977	int err = -ENOMEM;
   2978
   2979	conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
   2980	if (!conf)
   2981		goto abort;
   2982
   2983	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
   2984				   sizeof(atomic_t), GFP_KERNEL);
   2985	if (!conf->nr_pending)
   2986		goto abort;
   2987
   2988	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
   2989				   sizeof(atomic_t), GFP_KERNEL);
   2990	if (!conf->nr_waiting)
   2991		goto abort;
   2992
   2993	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
   2994				  sizeof(atomic_t), GFP_KERNEL);
   2995	if (!conf->nr_queued)
   2996		goto abort;
   2997
   2998	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
   2999				sizeof(atomic_t), GFP_KERNEL);
   3000	if (!conf->barrier)
   3001		goto abort;
   3002
   3003	conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info),
   3004					    mddev->raid_disks, 2),
   3005				GFP_KERNEL);
   3006	if (!conf->mirrors)
   3007		goto abort;
   3008
   3009	conf->tmppage = alloc_page(GFP_KERNEL);
   3010	if (!conf->tmppage)
   3011		goto abort;
   3012
   3013	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
   3014	if (!conf->poolinfo)
   3015		goto abort;
   3016	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
   3017	err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
   3018			   rbio_pool_free, conf->poolinfo);
   3019	if (err)
   3020		goto abort;
   3021
   3022	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
   3023	if (err)
   3024		goto abort;
   3025
   3026	conf->poolinfo->mddev = mddev;
   3027
   3028	err = -EINVAL;
   3029	spin_lock_init(&conf->device_lock);
   3030	rdev_for_each(rdev, mddev) {
   3031		int disk_idx = rdev->raid_disk;
   3032		if (disk_idx >= mddev->raid_disks
   3033		    || disk_idx < 0)
   3034			continue;
   3035		if (test_bit(Replacement, &rdev->flags))
   3036			disk = conf->mirrors + mddev->raid_disks + disk_idx;
   3037		else
   3038			disk = conf->mirrors + disk_idx;
   3039
   3040		if (disk->rdev)
   3041			goto abort;
   3042		disk->rdev = rdev;
   3043		disk->head_position = 0;
   3044		disk->seq_start = MaxSector;
   3045	}
   3046	conf->raid_disks = mddev->raid_disks;
   3047	conf->mddev = mddev;
   3048	INIT_LIST_HEAD(&conf->retry_list);
   3049	INIT_LIST_HEAD(&conf->bio_end_io_list);
   3050
   3051	spin_lock_init(&conf->resync_lock);
   3052	init_waitqueue_head(&conf->wait_barrier);
   3053
   3054	bio_list_init(&conf->pending_bio_list);
   3055	conf->recovery_disabled = mddev->recovery_disabled - 1;
   3056
   3057	err = -EIO;
   3058	for (i = 0; i < conf->raid_disks * 2; i++) {
   3059
   3060		disk = conf->mirrors + i;
   3061
   3062		if (i < conf->raid_disks &&
   3063		    disk[conf->raid_disks].rdev) {
   3064			/* This slot has a replacement. */
   3065			if (!disk->rdev) {
   3066				/* No original, just make the replacement
   3067				 * a recovering spare
   3068				 */
   3069				disk->rdev =
   3070					disk[conf->raid_disks].rdev;
   3071				disk[conf->raid_disks].rdev = NULL;
   3072			} else if (!test_bit(In_sync, &disk->rdev->flags))
   3073				/* Original is not in_sync - bad */
   3074				goto abort;
   3075		}
   3076
   3077		if (!disk->rdev ||
   3078		    !test_bit(In_sync, &disk->rdev->flags)) {
   3079			disk->head_position = 0;
   3080			if (disk->rdev &&
   3081			    (disk->rdev->saved_raid_disk < 0))
   3082				conf->fullsync = 1;
   3083		}
   3084	}
   3085
   3086	err = -ENOMEM;
   3087	conf->thread = md_register_thread(raid1d, mddev, "raid1");
   3088	if (!conf->thread)
   3089		goto abort;
   3090
   3091	return conf;
   3092
   3093 abort:
   3094	if (conf) {
   3095		mempool_exit(&conf->r1bio_pool);
   3096		kfree(conf->mirrors);
   3097		safe_put_page(conf->tmppage);
   3098		kfree(conf->poolinfo);
   3099		kfree(conf->nr_pending);
   3100		kfree(conf->nr_waiting);
   3101		kfree(conf->nr_queued);
   3102		kfree(conf->barrier);
   3103		bioset_exit(&conf->bio_split);
   3104		kfree(conf);
   3105	}
   3106	return ERR_PTR(err);
   3107}
   3108
   3109static void raid1_free(struct mddev *mddev, void *priv);
   3110static int raid1_run(struct mddev *mddev)
   3111{
   3112	struct r1conf *conf;
   3113	int i;
   3114	struct md_rdev *rdev;
   3115	int ret;
   3116
   3117	if (mddev->level != 1) {
   3118		pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
   3119			mdname(mddev), mddev->level);
   3120		return -EIO;
   3121	}
   3122	if (mddev->reshape_position != MaxSector) {
   3123		pr_warn("md/raid1:%s: reshape_position set but not supported\n",
   3124			mdname(mddev));
   3125		return -EIO;
   3126	}
   3127	if (mddev_init_writes_pending(mddev) < 0)
   3128		return -ENOMEM;
   3129	/*
   3130	 * copy the already verified devices into our private RAID1
   3131	 * bookkeeping area. [whatever we allocate in run(),
   3132	 * should be freed in raid1_free()]
   3133	 */
   3134	if (mddev->private == NULL)
   3135		conf = setup_conf(mddev);
   3136	else
   3137		conf = mddev->private;
   3138
   3139	if (IS_ERR(conf))
   3140		return PTR_ERR(conf);
   3141
   3142	if (mddev->queue)
   3143		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
   3144
   3145	rdev_for_each(rdev, mddev) {
   3146		if (!mddev->gendisk)
   3147			continue;
   3148		disk_stack_limits(mddev->gendisk, rdev->bdev,
   3149				  rdev->data_offset << 9);
   3150	}
   3151
   3152	mddev->degraded = 0;
   3153	for (i = 0; i < conf->raid_disks; i++)
   3154		if (conf->mirrors[i].rdev == NULL ||
   3155		    !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
   3156		    test_bit(Faulty, &conf->mirrors[i].rdev->flags))
   3157			mddev->degraded++;
   3158	/*
   3159	 * RAID1 needs at least one disk in active
   3160	 */
   3161	if (conf->raid_disks - mddev->degraded < 1) {
   3162		ret = -EINVAL;
   3163		goto abort;
   3164	}
   3165
   3166	if (conf->raid_disks - mddev->degraded == 1)
   3167		mddev->recovery_cp = MaxSector;
   3168
   3169	if (mddev->recovery_cp != MaxSector)
   3170		pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
   3171			mdname(mddev));
   3172	pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
   3173		mdname(mddev), mddev->raid_disks - mddev->degraded,
   3174		mddev->raid_disks);
   3175
   3176	/*
   3177	 * Ok, everything is just fine now
   3178	 */
   3179	mddev->thread = conf->thread;
   3180	conf->thread = NULL;
   3181	mddev->private = conf;
   3182	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
   3183
   3184	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
   3185
   3186	ret = md_integrity_register(mddev);
   3187	if (ret) {
   3188		md_unregister_thread(&mddev->thread);
   3189		goto abort;
   3190	}
   3191	return 0;
   3192
   3193abort:
   3194	raid1_free(mddev, conf);
   3195	return ret;
   3196}
   3197
   3198static void raid1_free(struct mddev *mddev, void *priv)
   3199{
   3200	struct r1conf *conf = priv;
   3201
   3202	mempool_exit(&conf->r1bio_pool);
   3203	kfree(conf->mirrors);
   3204	safe_put_page(conf->tmppage);
   3205	kfree(conf->poolinfo);
   3206	kfree(conf->nr_pending);
   3207	kfree(conf->nr_waiting);
   3208	kfree(conf->nr_queued);
   3209	kfree(conf->barrier);
   3210	bioset_exit(&conf->bio_split);
   3211	kfree(conf);
   3212}
   3213
   3214static int raid1_resize(struct mddev *mddev, sector_t sectors)
   3215{
   3216	/* no resync is happening, and there is enough space
   3217	 * on all devices, so we can resize.
   3218	 * We need to make sure resync covers any new space.
   3219	 * If the array is shrinking we should possibly wait until
   3220	 * any io in the removed space completes, but it hardly seems
   3221	 * worth it.
   3222	 */
   3223	sector_t newsize = raid1_size(mddev, sectors, 0);
   3224	if (mddev->external_size &&
   3225	    mddev->array_sectors > newsize)
   3226		return -EINVAL;
   3227	if (mddev->bitmap) {
   3228		int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
   3229		if (ret)
   3230			return ret;
   3231	}
   3232	md_set_array_sectors(mddev, newsize);
   3233	if (sectors > mddev->dev_sectors &&
   3234	    mddev->recovery_cp > mddev->dev_sectors) {
   3235		mddev->recovery_cp = mddev->dev_sectors;
   3236		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
   3237	}
   3238	mddev->dev_sectors = sectors;
   3239	mddev->resync_max_sectors = sectors;
   3240	return 0;
   3241}
   3242
   3243static int raid1_reshape(struct mddev *mddev)
   3244{
   3245	/* We need to:
   3246	 * 1/ resize the r1bio_pool
   3247	 * 2/ resize conf->mirrors
   3248	 *
   3249	 * We allocate a new r1bio_pool if we can.
   3250	 * Then raise a device barrier and wait until all IO stops.
   3251	 * Then resize conf->mirrors and swap in the new r1bio pool.
   3252	 *
   3253	 * At the same time, we "pack" the devices so that all the missing
   3254	 * devices have the higher raid_disk numbers.
   3255	 */
   3256	mempool_t newpool, oldpool;
   3257	struct pool_info *newpoolinfo;
   3258	struct raid1_info *newmirrors;
   3259	struct r1conf *conf = mddev->private;
   3260	int cnt, raid_disks;
   3261	unsigned long flags;
   3262	int d, d2;
   3263	int ret;
   3264
   3265	memset(&newpool, 0, sizeof(newpool));
   3266	memset(&oldpool, 0, sizeof(oldpool));
   3267
   3268	/* Cannot change chunk_size, layout, or level */
   3269	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
   3270	    mddev->layout != mddev->new_layout ||
   3271	    mddev->level != mddev->new_level) {
   3272		mddev->new_chunk_sectors = mddev->chunk_sectors;
   3273		mddev->new_layout = mddev->layout;
   3274		mddev->new_level = mddev->level;
   3275		return -EINVAL;
   3276	}
   3277
   3278	if (!mddev_is_clustered(mddev))
   3279		md_allow_write(mddev);
   3280
   3281	raid_disks = mddev->raid_disks + mddev->delta_disks;
   3282
   3283	if (raid_disks < conf->raid_disks) {
   3284		cnt=0;
   3285		for (d= 0; d < conf->raid_disks; d++)
   3286			if (conf->mirrors[d].rdev)
   3287				cnt++;
   3288		if (cnt > raid_disks)
   3289			return -EBUSY;
   3290	}
   3291
   3292	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
   3293	if (!newpoolinfo)
   3294		return -ENOMEM;
   3295	newpoolinfo->mddev = mddev;
   3296	newpoolinfo->raid_disks = raid_disks * 2;
   3297
   3298	ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
   3299			   rbio_pool_free, newpoolinfo);
   3300	if (ret) {
   3301		kfree(newpoolinfo);
   3302		return ret;
   3303	}
   3304	newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
   3305					 raid_disks, 2),
   3306			     GFP_KERNEL);
   3307	if (!newmirrors) {
   3308		kfree(newpoolinfo);
   3309		mempool_exit(&newpool);
   3310		return -ENOMEM;
   3311	}
   3312
   3313	freeze_array(conf, 0);
   3314
   3315	/* ok, everything is stopped */
   3316	oldpool = conf->r1bio_pool;
   3317	conf->r1bio_pool = newpool;
   3318
   3319	for (d = d2 = 0; d < conf->raid_disks; d++) {
   3320		struct md_rdev *rdev = conf->mirrors[d].rdev;
   3321		if (rdev && rdev->raid_disk != d2) {
   3322			sysfs_unlink_rdev(mddev, rdev);
   3323			rdev->raid_disk = d2;
   3324			sysfs_unlink_rdev(mddev, rdev);
   3325			if (sysfs_link_rdev(mddev, rdev))
   3326				pr_warn("md/raid1:%s: cannot register rd%d\n",
   3327					mdname(mddev), rdev->raid_disk);
   3328		}
   3329		if (rdev)
   3330			newmirrors[d2++].rdev = rdev;
   3331	}
   3332	kfree(conf->mirrors);
   3333	conf->mirrors = newmirrors;
   3334	kfree(conf->poolinfo);
   3335	conf->poolinfo = newpoolinfo;
   3336
   3337	spin_lock_irqsave(&conf->device_lock, flags);
   3338	mddev->degraded += (raid_disks - conf->raid_disks);
   3339	spin_unlock_irqrestore(&conf->device_lock, flags);
   3340	conf->raid_disks = mddev->raid_disks = raid_disks;
   3341	mddev->delta_disks = 0;
   3342
   3343	unfreeze_array(conf);
   3344
   3345	set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
   3346	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
   3347	md_wakeup_thread(mddev->thread);
   3348
   3349	mempool_exit(&oldpool);
   3350	return 0;
   3351}
   3352
   3353static void raid1_quiesce(struct mddev *mddev, int quiesce)
   3354{
   3355	struct r1conf *conf = mddev->private;
   3356
   3357	if (quiesce)
   3358		freeze_array(conf, 0);
   3359	else
   3360		unfreeze_array(conf);
   3361}
   3362
   3363static void *raid1_takeover(struct mddev *mddev)
   3364{
   3365	/* raid1 can take over:
   3366	 *  raid5 with 2 devices, any layout or chunk size
   3367	 */
   3368	if (mddev->level == 5 && mddev->raid_disks == 2) {
   3369		struct r1conf *conf;
   3370		mddev->new_level = 1;
   3371		mddev->new_layout = 0;
   3372		mddev->new_chunk_sectors = 0;
   3373		conf = setup_conf(mddev);
   3374		if (!IS_ERR(conf)) {
   3375			/* Array must appear to be quiesced */
   3376			conf->array_frozen = 1;
   3377			mddev_clear_unsupported_flags(mddev,
   3378				UNSUPPORTED_MDDEV_FLAGS);
   3379		}
   3380		return conf;
   3381	}
   3382	return ERR_PTR(-EINVAL);
   3383}
   3384
   3385static struct md_personality raid1_personality =
   3386{
   3387	.name		= "raid1",
   3388	.level		= 1,
   3389	.owner		= THIS_MODULE,
   3390	.make_request	= raid1_make_request,
   3391	.run		= raid1_run,
   3392	.free		= raid1_free,
   3393	.status		= raid1_status,
   3394	.error_handler	= raid1_error,
   3395	.hot_add_disk	= raid1_add_disk,
   3396	.hot_remove_disk= raid1_remove_disk,
   3397	.spare_active	= raid1_spare_active,
   3398	.sync_request	= raid1_sync_request,
   3399	.resize		= raid1_resize,
   3400	.size		= raid1_size,
   3401	.check_reshape	= raid1_reshape,
   3402	.quiesce	= raid1_quiesce,
   3403	.takeover	= raid1_takeover,
   3404};
   3405
   3406static int __init raid_init(void)
   3407{
   3408	return register_md_personality(&raid1_personality);
   3409}
   3410
   3411static void raid_exit(void)
   3412{
   3413	unregister_md_personality(&raid1_personality);
   3414}
   3415
   3416module_init(raid_init);
   3417module_exit(raid_exit);
   3418MODULE_LICENSE("GPL");
   3419MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
   3420MODULE_ALIAS("md-personality-3"); /* RAID1 */
   3421MODULE_ALIAS("md-raid1");
   3422MODULE_ALIAS("md-level-1");