cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-raid.c (121051B)


      1/*
      2 * Copyright (C) 2010-2011 Neil Brown
      3 * Copyright (C) 2010-2018 Red Hat, Inc. All rights reserved.
      4 *
      5 * This file is released under the GPL.
      6 */
      7
      8#include <linux/slab.h>
      9#include <linux/module.h>
     10
     11#include "md.h"
     12#include "raid1.h"
     13#include "raid5.h"
     14#include "raid10.h"
     15#include "md-bitmap.h"
     16
     17#include <linux/device-mapper.h>
     18
     19#define DM_MSG_PREFIX "raid"
     20#define	MAX_RAID_DEVICES	253 /* md-raid kernel limit */
     21
     22/*
     23 * Minimum sectors of free reshape space per raid device
     24 */
     25#define	MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
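        /*
         * For reference: to_sector() converts bytes to 512-byte sectors,
         * so the define above works out to 4 * 4096 B = 16 KiB = 32 sectors
         * of reshape headroom per raid device.
         */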
     26
     27/*
     28 * Minimum journal space 4 MiB in sectors.
     29 */
     30#define	MIN_RAID456_JOURNAL_SPACE (4*2048)
     31
     32static bool devices_handle_discard_safely = false;
     33
     34/*
     35 * The following flags are used by dm-raid.c to set up the array state.
     36 * They must be cleared before md_run is called.
     37 */
     38#define FirstUse 10		/* rdev flag */
     39
     40struct raid_dev {
     41	/*
     42	 * Two DM devices, one to hold metadata and one to hold the
     43	 * actual data/parity.	The reason for this is to not confuse
     44	 * ti->len and give more flexibility in altering size and
     45	 * characteristics.
     46	 *
     47	 * While it is possible for this device to be associated
     48	 * with a different physical device than the data_dev, it
     49	 * is intended for it to be the same.
     50	 *    |--------- Physical Device ---------|
     51	 *    |- meta_dev -|------ data_dev ------|
     52	 */
     53	struct dm_dev *meta_dev;
     54	struct dm_dev *data_dev;
     55	struct md_rdev rdev;
     56};
     57
     58/*
     59 * Bits for establishing rs->ctr_flags
     60 *
     61 * 1 = no flag value
     62 * 2 = flag with value
     63 */
     64#define __CTR_FLAG_SYNC			0  /* 1 */ /* Not with raid0! */
     65#define __CTR_FLAG_NOSYNC		1  /* 1 */ /* Not with raid0! */
     66#define __CTR_FLAG_REBUILD		2  /* 2 */ /* Not with raid0! */
     67#define __CTR_FLAG_DAEMON_SLEEP		3  /* 2 */ /* Not with raid0! */
     68#define __CTR_FLAG_MIN_RECOVERY_RATE	4  /* 2 */ /* Not with raid0! */
     69#define __CTR_FLAG_MAX_RECOVERY_RATE	5  /* 2 */ /* Not with raid0! */
     70#define __CTR_FLAG_MAX_WRITE_BEHIND	6  /* 2 */ /* Only with raid1! */
     71#define __CTR_FLAG_WRITE_MOSTLY		7  /* 2 */ /* Only with raid1! */
     72#define __CTR_FLAG_STRIPE_CACHE		8  /* 2 */ /* Only with raid4/5/6! */
     73#define __CTR_FLAG_REGION_SIZE		9  /* 2 */ /* Not with raid0! */
     74#define __CTR_FLAG_RAID10_COPIES	10 /* 2 */ /* Only with raid10 */
     75#define __CTR_FLAG_RAID10_FORMAT	11 /* 2 */ /* Only with raid10 */
     76/* New for v1.9.0 */
     77#define __CTR_FLAG_DELTA_DISKS		12 /* 2 */ /* Only with reshapable raid1/4/5/6/10! */
     78#define __CTR_FLAG_DATA_OFFSET		13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
     79#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
     80
     81/* New for v1.10.0 */
     82#define __CTR_FLAG_JOURNAL_DEV		15 /* 2 */ /* Only with raid4/5/6 (journal device)! */
     83
     84/* New for v1.11.1 */
     85#define __CTR_FLAG_JOURNAL_MODE		16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */
     86
     87/*
     88 * Flags for rs->ctr_flags field.
     89 */
     90#define CTR_FLAG_SYNC			(1 << __CTR_FLAG_SYNC)
     91#define CTR_FLAG_NOSYNC			(1 << __CTR_FLAG_NOSYNC)
     92#define CTR_FLAG_REBUILD		(1 << __CTR_FLAG_REBUILD)
     93#define CTR_FLAG_DAEMON_SLEEP		(1 << __CTR_FLAG_DAEMON_SLEEP)
     94#define CTR_FLAG_MIN_RECOVERY_RATE	(1 << __CTR_FLAG_MIN_RECOVERY_RATE)
     95#define CTR_FLAG_MAX_RECOVERY_RATE	(1 << __CTR_FLAG_MAX_RECOVERY_RATE)
     96#define CTR_FLAG_MAX_WRITE_BEHIND	(1 << __CTR_FLAG_MAX_WRITE_BEHIND)
     97#define CTR_FLAG_WRITE_MOSTLY		(1 << __CTR_FLAG_WRITE_MOSTLY)
     98#define CTR_FLAG_STRIPE_CACHE		(1 << __CTR_FLAG_STRIPE_CACHE)
     99#define CTR_FLAG_REGION_SIZE		(1 << __CTR_FLAG_REGION_SIZE)
    100#define CTR_FLAG_RAID10_COPIES		(1 << __CTR_FLAG_RAID10_COPIES)
    101#define CTR_FLAG_RAID10_FORMAT		(1 << __CTR_FLAG_RAID10_FORMAT)
    102#define CTR_FLAG_DELTA_DISKS		(1 << __CTR_FLAG_DELTA_DISKS)
    103#define CTR_FLAG_DATA_OFFSET		(1 << __CTR_FLAG_DATA_OFFSET)
    104#define CTR_FLAG_RAID10_USE_NEAR_SETS	(1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
    105#define CTR_FLAG_JOURNAL_DEV		(1 << __CTR_FLAG_JOURNAL_DEV)
    106#define CTR_FLAG_JOURNAL_MODE		(1 << __CTR_FLAG_JOURNAL_MODE)
    107
    108/*
    109 * Definitions of various constructor flags to
    110 * be used in checks of valid / invalid flags
    111 * per raid level.
    112 */
     113/* Define all sync flags, i.e. 'sync' and 'nosync' */
    114#define	CTR_FLAGS_ANY_SYNC		(CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)
    115
    116/* Define flags for options without argument (e.g. 'nosync') */
    117#define	CTR_FLAG_OPTIONS_NO_ARGS	(CTR_FLAGS_ANY_SYNC | \
    118					 CTR_FLAG_RAID10_USE_NEAR_SETS)
    119
    120/* Define flags for options with one argument (e.g. 'delta_disks +2') */
    121#define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \
    122				  CTR_FLAG_WRITE_MOSTLY | \
    123				  CTR_FLAG_DAEMON_SLEEP | \
    124				  CTR_FLAG_MIN_RECOVERY_RATE | \
    125				  CTR_FLAG_MAX_RECOVERY_RATE | \
    126				  CTR_FLAG_MAX_WRITE_BEHIND | \
    127				  CTR_FLAG_STRIPE_CACHE | \
    128				  CTR_FLAG_REGION_SIZE | \
    129				  CTR_FLAG_RAID10_COPIES | \
    130				  CTR_FLAG_RAID10_FORMAT | \
    131				  CTR_FLAG_DELTA_DISKS | \
    132				  CTR_FLAG_DATA_OFFSET | \
    133				  CTR_FLAG_JOURNAL_DEV | \
    134				  CTR_FLAG_JOURNAL_MODE)
    135
    136/* Valid options definitions per raid level... */
    137
     138/* "raid0" only accepts data offset */
    139#define RAID0_VALID_FLAGS	(CTR_FLAG_DATA_OFFSET)
    140
    141/* "raid1" does not accept stripe cache, data offset, delta_disks or any raid10 options */
    142#define RAID1_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
    143				 CTR_FLAG_REBUILD | \
    144				 CTR_FLAG_WRITE_MOSTLY | \
    145				 CTR_FLAG_DAEMON_SLEEP | \
    146				 CTR_FLAG_MIN_RECOVERY_RATE | \
    147				 CTR_FLAG_MAX_RECOVERY_RATE | \
    148				 CTR_FLAG_MAX_WRITE_BEHIND | \
    149				 CTR_FLAG_REGION_SIZE | \
    150				 CTR_FLAG_DELTA_DISKS | \
    151				 CTR_FLAG_DATA_OFFSET)
    152
    153/* "raid10" does not accept any raid1 or stripe cache options */
    154#define RAID10_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
    155				 CTR_FLAG_REBUILD | \
    156				 CTR_FLAG_DAEMON_SLEEP | \
    157				 CTR_FLAG_MIN_RECOVERY_RATE | \
    158				 CTR_FLAG_MAX_RECOVERY_RATE | \
    159				 CTR_FLAG_REGION_SIZE | \
    160				 CTR_FLAG_RAID10_COPIES | \
    161				 CTR_FLAG_RAID10_FORMAT | \
    162				 CTR_FLAG_DELTA_DISKS | \
    163				 CTR_FLAG_DATA_OFFSET | \
    164				 CTR_FLAG_RAID10_USE_NEAR_SETS)
    165
    166/*
    167 * "raid4/5/6" do not accept any raid1 or raid10 specific options
    168 *
    169 * "raid6" does not accept "nosync", because it is not guaranteed
    170 * that both parity and q-syndrome are being written properly with
    171 * any writes
    172 */
    173#define RAID45_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
    174				 CTR_FLAG_REBUILD | \
    175				 CTR_FLAG_DAEMON_SLEEP | \
    176				 CTR_FLAG_MIN_RECOVERY_RATE | \
    177				 CTR_FLAG_MAX_RECOVERY_RATE | \
    178				 CTR_FLAG_STRIPE_CACHE | \
    179				 CTR_FLAG_REGION_SIZE | \
    180				 CTR_FLAG_DELTA_DISKS | \
    181				 CTR_FLAG_DATA_OFFSET | \
    182				 CTR_FLAG_JOURNAL_DEV | \
    183				 CTR_FLAG_JOURNAL_MODE)
    184
    185#define RAID6_VALID_FLAGS	(CTR_FLAG_SYNC | \
    186				 CTR_FLAG_REBUILD | \
    187				 CTR_FLAG_DAEMON_SLEEP | \
    188				 CTR_FLAG_MIN_RECOVERY_RATE | \
    189				 CTR_FLAG_MAX_RECOVERY_RATE | \
    190				 CTR_FLAG_STRIPE_CACHE | \
    191				 CTR_FLAG_REGION_SIZE | \
    192				 CTR_FLAG_DELTA_DISKS | \
    193				 CTR_FLAG_DATA_OFFSET | \
    194				 CTR_FLAG_JOURNAL_DEV | \
    195				 CTR_FLAG_JOURNAL_MODE)
    196/* ...valid options definitions per raid level */
    197
    198/*
    199 * Flags for rs->runtime_flags field
    200 * (RT_FLAG prefix meaning "runtime flag")
    201 *
    202 * These are all internal and used to define runtime state,
    203 * e.g. to prevent another resume from preresume processing
    204 * the raid set all over again.
    205 */
    206#define RT_FLAG_RS_PRERESUMED		0
    207#define RT_FLAG_RS_RESUMED		1
    208#define RT_FLAG_RS_BITMAP_LOADED	2
    209#define RT_FLAG_UPDATE_SBS		3
    210#define RT_FLAG_RESHAPE_RS		4
    211#define RT_FLAG_RS_SUSPENDED		5
    212#define RT_FLAG_RS_IN_SYNC		6
    213#define RT_FLAG_RS_RESYNCING		7
    214#define RT_FLAG_RS_GROW			8
    215
    216/* Array elements of 64 bit needed for rebuild/failed disk bits */
    217#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
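        /*
         * Worked example: with MAX_RAID_DEVICES = 253 and 64-bit words the
         * expression above evaluates to (253 + 63) / 8 / 8 = 4 elements,
         * i.e. 256 bits, one rebuild/failed bit per possible raid device.
         */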
    218
    219/*
    220 * raid set level, layout and chunk sectors backup/restore
    221 */
    222struct rs_layout {
    223	int new_level;
    224	int new_layout;
    225	int new_chunk_sectors;
    226};
    227
    228struct raid_set {
    229	struct dm_target *ti;
    230
    231	uint32_t stripe_cache_entries;
    232	unsigned long ctr_flags;
    233	unsigned long runtime_flags;
    234
    235	uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];
    236
    237	int raid_disks;
    238	int delta_disks;
    239	int data_offset;
    240	int raid10_copies;
    241	int requested_bitmap_chunk_sectors;
    242
    243	struct mddev md;
    244	struct raid_type *raid_type;
    245
    246	sector_t array_sectors;
    247	sector_t dev_sectors;
    248
    249	/* Optional raid4/5/6 journal device */
    250	struct journal_dev {
    251		struct dm_dev *dev;
    252		struct md_rdev rdev;
    253		int mode;
    254	} journal_dev;
    255
    256	struct raid_dev dev[];
    257};
    258
    259static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
    260{
    261	struct mddev *mddev = &rs->md;
    262
    263	l->new_level = mddev->new_level;
    264	l->new_layout = mddev->new_layout;
    265	l->new_chunk_sectors = mddev->new_chunk_sectors;
    266}
    267
    268static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
    269{
    270	struct mddev *mddev = &rs->md;
    271
    272	mddev->new_level = l->new_level;
    273	mddev->new_layout = l->new_layout;
    274	mddev->new_chunk_sectors = l->new_chunk_sectors;
    275}
    276
    277/* raid10 algorithms (i.e. formats) */
    278#define	ALGORITHM_RAID10_DEFAULT	0
    279#define	ALGORITHM_RAID10_NEAR		1
    280#define	ALGORITHM_RAID10_OFFSET		2
    281#define	ALGORITHM_RAID10_FAR		3
    282
    283/* Supported raid types and properties. */
    284static struct raid_type {
     285	const char *name;		/* RAID type name. */
    286	const char *descr;		/* Descriptor text for logging. */
    287	const unsigned int parity_devs;	/* # of parity devices. */
    288	const unsigned int minimal_devs;/* minimal # of devices in set. */
    289	const unsigned int level;	/* RAID level. */
    290	const unsigned int algorithm;	/* RAID algorithm. */
    291} raid_types[] = {
    292	{"raid0",	  "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
    293	{"raid1",	  "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
    294	{"raid10_far",	  "raid10 far (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_FAR},
    295	{"raid10_offset", "raid10 offset (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_OFFSET},
    296	{"raid10_near",	  "raid10 near (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_NEAR},
    297	{"raid10",	  "raid10 (striped mirrors)",		    0, 2, 10, ALGORITHM_RAID10_DEFAULT},
    298	{"raid4",	  "raid4 (dedicated first parity disk)",    1, 2, 5,  ALGORITHM_PARITY_0}, /* raid4 layout = raid5_0 */
    299	{"raid5_n",	  "raid5 (dedicated last parity disk)",	    1, 2, 5,  ALGORITHM_PARITY_N},
    300	{"raid5_ls",	  "raid5 (left symmetric)",		    1, 2, 5,  ALGORITHM_LEFT_SYMMETRIC},
    301	{"raid5_rs",	  "raid5 (right symmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_SYMMETRIC},
    302	{"raid5_la",	  "raid5 (left asymmetric)",		    1, 2, 5,  ALGORITHM_LEFT_ASYMMETRIC},
    303	{"raid5_ra",	  "raid5 (right asymmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_ASYMMETRIC},
    304	{"raid6_zr",	  "raid6 (zero restart)",		    2, 4, 6,  ALGORITHM_ROTATING_ZERO_RESTART},
    305	{"raid6_nr",	  "raid6 (N restart)",			    2, 4, 6,  ALGORITHM_ROTATING_N_RESTART},
    306	{"raid6_nc",	  "raid6 (N continue)",			    2, 4, 6,  ALGORITHM_ROTATING_N_CONTINUE},
    307	{"raid6_n_6",	  "raid6 (dedicated parity/Q n/6)",	    2, 4, 6,  ALGORITHM_PARITY_N_6},
    308	{"raid6_ls_6",	  "raid6 (left symmetric dedicated Q 6)",   2, 4, 6,  ALGORITHM_LEFT_SYMMETRIC_6},
    309	{"raid6_rs_6",	  "raid6 (right symmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_RIGHT_SYMMETRIC_6},
    310	{"raid6_la_6",	  "raid6 (left asymmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_LEFT_ASYMMETRIC_6},
    311	{"raid6_ra_6",	  "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6,  ALGORITHM_RIGHT_ASYMMETRIC_6}
    312};
    313
    314/* True, if @v is in inclusive range [@min, @max] */
    315static bool __within_range(long v, long min, long max)
    316{
    317	return v >= min && v <= max;
    318}
    319
    320/* All table line arguments are defined here */
    321static struct arg_name_flag {
    322	const unsigned long flag;
    323	const char *name;
    324} __arg_name_flags[] = {
    325	{ CTR_FLAG_SYNC, "sync"},
    326	{ CTR_FLAG_NOSYNC, "nosync"},
    327	{ CTR_FLAG_REBUILD, "rebuild"},
    328	{ CTR_FLAG_DAEMON_SLEEP, "daemon_sleep"},
    329	{ CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"},
    330	{ CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"},
    331	{ CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"},
    332	{ CTR_FLAG_WRITE_MOSTLY, "write_mostly"},
    333	{ CTR_FLAG_STRIPE_CACHE, "stripe_cache"},
    334	{ CTR_FLAG_REGION_SIZE, "region_size"},
    335	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
    336	{ CTR_FLAG_RAID10_FORMAT, "raid10_format"},
    337	{ CTR_FLAG_DATA_OFFSET, "data_offset"},
    338	{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
    339	{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
    340	{ CTR_FLAG_JOURNAL_DEV, "journal_dev" },
    341	{ CTR_FLAG_JOURNAL_MODE, "journal_mode" },
    342};
    343
    344/* Return argument name string for given @flag */
    345static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
    346{
    347	if (hweight32(flag) == 1) {
    348		struct arg_name_flag *anf = __arg_name_flags + ARRAY_SIZE(__arg_name_flags);
    349
    350		while (anf-- > __arg_name_flags)
    351			if (flag & anf->flag)
    352				return anf->name;
    353
    354	} else
    355		DMERR("%s called with more than one flag!", __func__);
    356
    357	return NULL;
    358}
    359
    360/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */
    361static struct {
    362	const int mode;
    363	const char *param;
    364} _raid456_journal_mode[] = {
    365	{ R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" },
    366	{ R5C_JOURNAL_MODE_WRITE_BACK    , "writeback" }
    367};
    368
    369/* Return MD raid4/5/6 journal mode for dm @journal_mode one */
    370static int dm_raid_journal_mode_to_md(const char *mode)
    371{
    372	int m = ARRAY_SIZE(_raid456_journal_mode);
    373
    374	while (m--)
    375		if (!strcasecmp(mode, _raid456_journal_mode[m].param))
    376			return _raid456_journal_mode[m].mode;
    377
    378	return -EINVAL;
    379}
    380
    381/* Return dm-raid raid4/5/6 journal mode string for @mode */
    382static const char *md_journal_mode_to_dm_raid(const int mode)
    383{
    384	int m = ARRAY_SIZE(_raid456_journal_mode);
    385
    386	while (m--)
    387		if (mode == _raid456_journal_mode[m].mode)
    388			return _raid456_journal_mode[m].param;
    389
    390	return "unknown";
    391}
    392
    393/*
    394 * Bool helpers to test for various raid levels of a raid set.
     395 * They test the level as reported by the superblock rather than
    396 * the requested raid_type passed to the constructor.
    397 */
    398/* Return true, if raid set in @rs is raid0 */
    399static bool rs_is_raid0(struct raid_set *rs)
    400{
    401	return !rs->md.level;
    402}
    403
    404/* Return true, if raid set in @rs is raid1 */
    405static bool rs_is_raid1(struct raid_set *rs)
    406{
    407	return rs->md.level == 1;
    408}
    409
    410/* Return true, if raid set in @rs is raid10 */
    411static bool rs_is_raid10(struct raid_set *rs)
    412{
    413	return rs->md.level == 10;
    414}
    415
    416/* Return true, if raid set in @rs is level 6 */
    417static bool rs_is_raid6(struct raid_set *rs)
    418{
    419	return rs->md.level == 6;
    420}
    421
    422/* Return true, if raid set in @rs is level 4, 5 or 6 */
    423static bool rs_is_raid456(struct raid_set *rs)
    424{
    425	return __within_range(rs->md.level, 4, 6);
    426}
    427
    428/* Return true, if raid set in @rs is reshapable */
    429static bool __is_raid10_far(int layout);
    430static bool rs_is_reshapable(struct raid_set *rs)
    431{
    432	return rs_is_raid456(rs) ||
    433	       (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout));
    434}
    435
    436/* Return true, if raid set in @rs is recovering */
    437static bool rs_is_recovering(struct raid_set *rs)
    438{
    439	return rs->md.recovery_cp < rs->md.dev_sectors;
    440}
    441
    442/* Return true, if raid set in @rs is reshaping */
    443static bool rs_is_reshaping(struct raid_set *rs)
    444{
    445	return rs->md.reshape_position != MaxSector;
    446}
    447
    448/*
    449 * bool helpers to test for various raid levels of a raid type @rt
    450 */
    451
    452/* Return true, if raid type in @rt is raid0 */
    453static bool rt_is_raid0(struct raid_type *rt)
    454{
    455	return !rt->level;
    456}
    457
    458/* Return true, if raid type in @rt is raid1 */
    459static bool rt_is_raid1(struct raid_type *rt)
    460{
    461	return rt->level == 1;
    462}
    463
    464/* Return true, if raid type in @rt is raid10 */
    465static bool rt_is_raid10(struct raid_type *rt)
    466{
    467	return rt->level == 10;
    468}
    469
    470/* Return true, if raid type in @rt is raid4/5 */
    471static bool rt_is_raid45(struct raid_type *rt)
    472{
    473	return __within_range(rt->level, 4, 5);
    474}
    475
    476/* Return true, if raid type in @rt is raid6 */
    477static bool rt_is_raid6(struct raid_type *rt)
    478{
    479	return rt->level == 6;
    480}
    481
    482/* Return true, if raid type in @rt is raid4/5/6 */
    483static bool rt_is_raid456(struct raid_type *rt)
    484{
    485	return __within_range(rt->level, 4, 6);
    486}
    487/* END: raid level bools */
    488
    489/* Return valid ctr flags for the raid level of @rs */
    490static unsigned long __valid_flags(struct raid_set *rs)
    491{
    492	if (rt_is_raid0(rs->raid_type))
    493		return RAID0_VALID_FLAGS;
    494	else if (rt_is_raid1(rs->raid_type))
    495		return RAID1_VALID_FLAGS;
    496	else if (rt_is_raid10(rs->raid_type))
    497		return RAID10_VALID_FLAGS;
    498	else if (rt_is_raid45(rs->raid_type))
    499		return RAID45_VALID_FLAGS;
    500	else if (rt_is_raid6(rs->raid_type))
    501		return RAID6_VALID_FLAGS;
    502
    503	return 0;
    504}
    505
    506/*
    507 * Check for valid flags set on @rs
    508 *
    509 * Has to be called after parsing of the ctr flags!
    510 */
    511static int rs_check_for_valid_flags(struct raid_set *rs)
    512{
    513	if (rs->ctr_flags & ~__valid_flags(rs)) {
    514		rs->ti->error = "Invalid flags combination";
    515		return -EINVAL;
    516	}
    517
    518	return 0;
    519}
    520
    521/* MD raid10 bit definitions and helpers */
     522#define RAID10_OFFSET			(1 << 16) /* stripes with data copies are adjacent on devices */
    523#define RAID10_BROCKEN_USE_FAR_SETS	(1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */
    524#define RAID10_USE_FAR_SETS		(1 << 18) /* Use sets instead of whole stripe rotation */
    525#define RAID10_FAR_COPIES_SHIFT		8	  /* raid10 # far copies shift (2nd byte of layout) */
    526
    527/* Return md raid10 near copies for @layout */
    528static unsigned int __raid10_near_copies(int layout)
    529{
    530	return layout & 0xFF;
    531}
    532
    533/* Return md raid10 far copies for @layout */
    534static unsigned int __raid10_far_copies(int layout)
    535{
    536	return __raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
    537}
    538
    539/* Return true if md raid10 offset for @layout */
    540static bool __is_raid10_offset(int layout)
    541{
    542	return !!(layout & RAID10_OFFSET);
    543}
    544
    545/* Return true if md raid10 near for @layout */
    546static bool __is_raid10_near(int layout)
    547{
    548	return !__is_raid10_offset(layout) && __raid10_near_copies(layout) > 1;
    549}
    550
    551/* Return true if md raid10 far for @layout */
    552static bool __is_raid10_far(int layout)
    553{
    554	return !__is_raid10_offset(layout) && __raid10_far_copies(layout) > 1;
    555}
    556
    557/* Return md raid10 layout string for @layout */
    558static const char *raid10_md_layout_to_format(int layout)
    559{
    560	/*
    561	 * Bit 16 stands for "offset"
    562	 * (i.e. adjacent stripes hold copies)
    563	 *
    564	 * Refer to MD's raid10.c for details
    565	 */
    566	if (__is_raid10_offset(layout))
    567		return "offset";
    568
    569	if (__raid10_near_copies(layout) > 1)
    570		return "near";
    571
    572	if (__raid10_far_copies(layout) > 1)
    573		return "far";
    574
    575	return "unknown";
    576}
    577
    578/* Return md raid10 algorithm for @name */
    579static int raid10_name_to_format(const char *name)
    580{
    581	if (!strcasecmp(name, "near"))
    582		return ALGORITHM_RAID10_NEAR;
    583	else if (!strcasecmp(name, "offset"))
    584		return ALGORITHM_RAID10_OFFSET;
    585	else if (!strcasecmp(name, "far"))
    586		return ALGORITHM_RAID10_FAR;
    587
    588	return -EINVAL;
    589}
    590
    591/* Return md raid10 copies for @layout */
    592static unsigned int raid10_md_layout_to_copies(int layout)
    593{
    594	return max(__raid10_near_copies(layout), __raid10_far_copies(layout));
    595}
    596
    597/* Return md raid10 format id for @format string */
    598static int raid10_format_to_md_layout(struct raid_set *rs,
    599				      unsigned int algorithm,
    600				      unsigned int copies)
    601{
    602	unsigned int n = 1, f = 1, r = 0;
    603
    604	/*
     605	 * MD resilience flaw:
    606	 *
    607	 * enabling use_far_sets for far/offset formats causes copies
    608	 * to be colocated on the same devs together with their origins!
    609	 *
    610	 * -> disable it for now in the definition above
    611	 */
    612	if (algorithm == ALGORITHM_RAID10_DEFAULT ||
    613	    algorithm == ALGORITHM_RAID10_NEAR)
    614		n = copies;
    615
    616	else if (algorithm == ALGORITHM_RAID10_OFFSET) {
    617		f = copies;
    618		r = RAID10_OFFSET;
    619		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
    620			r |= RAID10_USE_FAR_SETS;
    621
    622	} else if (algorithm == ALGORITHM_RAID10_FAR) {
    623		f = copies;
    624		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
    625			r |= RAID10_USE_FAR_SETS;
    626
    627	} else
    628		return -EINVAL;
    629
    630	return r | (f << RAID10_FAR_COPIES_SHIFT) | n;
    631}
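        /*
         * Recap of the layout word built above: near copies occupy bits 0-7,
         * far copies bits 8-15, and the RAID10_OFFSET/RAID10_USE_FAR_SETS
         * flags bits 16+.  E.g. "raid10_offset" with 2 copies and without
         * 'raid10_use_near_sets' yields
         * RAID10_OFFSET | RAID10_USE_FAR_SETS | (2 << 8) | 1 = 0x50201.
         */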
    632/* END: MD raid10 bit definitions and helpers */
    633
    634/* Check for any of the raid10 algorithms */
    635static bool __got_raid10(struct raid_type *rtp, const int layout)
    636{
    637	if (rtp->level == 10) {
    638		switch (rtp->algorithm) {
    639		case ALGORITHM_RAID10_DEFAULT:
    640		case ALGORITHM_RAID10_NEAR:
    641			return __is_raid10_near(layout);
    642		case ALGORITHM_RAID10_OFFSET:
    643			return __is_raid10_offset(layout);
    644		case ALGORITHM_RAID10_FAR:
    645			return __is_raid10_far(layout);
    646		default:
    647			break;
    648		}
    649	}
    650
    651	return false;
    652}
    653
    654/* Return raid_type for @name */
    655static struct raid_type *get_raid_type(const char *name)
    656{
    657	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);
    658
    659	while (rtp-- > raid_types)
    660		if (!strcasecmp(rtp->name, name))
    661			return rtp;
    662
    663	return NULL;
    664}
    665
     666/* Return raid_type derived from @level and @layout */
    667static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
    668{
    669	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);
    670
    671	while (rtp-- > raid_types) {
    672		/* RAID10 special checks based on @layout flags/properties */
    673		if (rtp->level == level &&
    674		    (__got_raid10(rtp, layout) || rtp->algorithm == layout))
    675			return rtp;
    676	}
    677
    678	return NULL;
    679}
    680
    681/* Adjust rdev sectors */
    682static void rs_set_rdev_sectors(struct raid_set *rs)
    683{
    684	struct mddev *mddev = &rs->md;
    685	struct md_rdev *rdev;
    686
    687	/*
     688	 * raid10 sets rdev->sectors to the device size, which
    689	 * is unintended in case of out-of-place reshaping
    690	 */
    691	rdev_for_each(rdev, mddev)
    692		if (!test_bit(Journal, &rdev->flags))
    693			rdev->sectors = mddev->dev_sectors;
    694}
    695
    696/*
    697 * Change bdev capacity of @rs in case of a disk add/remove reshape
    698 */
    699static void rs_set_capacity(struct raid_set *rs)
    700{
    701	struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
    702
    703	set_capacity_and_notify(gendisk, rs->md.array_sectors);
    704}
    705
    706/*
    707 * Set the mddev properties in @rs to the current
    708 * ones retrieved from the freshest superblock
    709 */
    710static void rs_set_cur(struct raid_set *rs)
    711{
    712	struct mddev *mddev = &rs->md;
    713
    714	mddev->new_level = mddev->level;
    715	mddev->new_layout = mddev->layout;
    716	mddev->new_chunk_sectors = mddev->chunk_sectors;
    717}
    718
    719/*
    720 * Set the mddev properties in @rs to the new
    721 * ones requested by the ctr
    722 */
    723static void rs_set_new(struct raid_set *rs)
    724{
    725	struct mddev *mddev = &rs->md;
    726
    727	mddev->level = mddev->new_level;
    728	mddev->layout = mddev->new_layout;
    729	mddev->chunk_sectors = mddev->new_chunk_sectors;
    730	mddev->raid_disks = rs->raid_disks;
    731	mddev->delta_disks = 0;
    732}
    733
    734static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *raid_type,
    735				       unsigned int raid_devs)
    736{
    737	unsigned int i;
    738	struct raid_set *rs;
    739
    740	if (raid_devs <= raid_type->parity_devs) {
    741		ti->error = "Insufficient number of devices";
    742		return ERR_PTR(-EINVAL);
    743	}
    744
    745	rs = kzalloc(struct_size(rs, dev, raid_devs), GFP_KERNEL);
    746	if (!rs) {
    747		ti->error = "Cannot allocate raid context";
    748		return ERR_PTR(-ENOMEM);
    749	}
    750
    751	mddev_init(&rs->md);
    752
    753	rs->raid_disks = raid_devs;
    754	rs->delta_disks = 0;
    755
    756	rs->ti = ti;
    757	rs->raid_type = raid_type;
    758	rs->stripe_cache_entries = 256;
    759	rs->md.raid_disks = raid_devs;
    760	rs->md.level = raid_type->level;
    761	rs->md.new_level = rs->md.level;
    762	rs->md.layout = raid_type->algorithm;
    763	rs->md.new_layout = rs->md.layout;
    764	rs->md.delta_disks = 0;
    765	rs->md.recovery_cp = MaxSector;
    766
    767	for (i = 0; i < raid_devs; i++)
    768		md_rdev_init(&rs->dev[i].rdev);
    769
    770	/*
    771	 * Remaining items to be initialized by further RAID params:
    772	 *  rs->md.persistent
    773	 *  rs->md.external
    774	 *  rs->md.chunk_sectors
    775	 *  rs->md.new_chunk_sectors
    776	 *  rs->md.dev_sectors
    777	 */
    778
    779	return rs;
    780}
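        /*
         * Note on the allocation above: struct_size(rs, dev, raid_devs)
         * sizes the flexible dev[] array, i.e. sizeof(struct raid_set) +
         * raid_devs * sizeof(struct raid_dev), with overflow checking.
         */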
    781
    782/* Free all @rs allocations */
    783static void raid_set_free(struct raid_set *rs)
    784{
    785	int i;
    786
    787	if (rs->journal_dev.dev) {
    788		md_rdev_clear(&rs->journal_dev.rdev);
    789		dm_put_device(rs->ti, rs->journal_dev.dev);
    790	}
    791
    792	for (i = 0; i < rs->raid_disks; i++) {
    793		if (rs->dev[i].meta_dev)
    794			dm_put_device(rs->ti, rs->dev[i].meta_dev);
    795		md_rdev_clear(&rs->dev[i].rdev);
    796		if (rs->dev[i].data_dev)
    797			dm_put_device(rs->ti, rs->dev[i].data_dev);
    798	}
    799
    800	kfree(rs);
    801}
    802
    803/*
    804 * For every device we have two words
    805 *  <meta_dev>: meta device name or '-' if missing
    806 *  <data_dev>: data device name or '-' if missing
    807 *
    808 * The following are permitted:
    809 *    - -
    810 *    - <data_dev>
    811 *    <meta_dev> <data_dev>
    812 *
    813 * The following is not allowed:
    814 *    <meta_dev> -
    815 *
    816 * This code parses those words.  If there is a failure,
    817 * the caller must use raid_set_free() to unwind the operations.
    818 */
    819static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
    820{
    821	int i;
    822	int rebuild = 0;
    823	int metadata_available = 0;
    824	int r = 0;
    825	const char *arg;
    826
     827	/* Skip over the number of raid devices argument to get to the dev pairs */
    828	arg = dm_shift_arg(as);
    829	if (!arg)
    830		return -EINVAL;
    831
    832	for (i = 0; i < rs->raid_disks; i++) {
    833		rs->dev[i].rdev.raid_disk = i;
    834
    835		rs->dev[i].meta_dev = NULL;
    836		rs->dev[i].data_dev = NULL;
    837
    838		/*
    839		 * There are no offsets initially.
    840		 * Out of place reshape will set them accordingly.
    841		 */
    842		rs->dev[i].rdev.data_offset = 0;
    843		rs->dev[i].rdev.new_data_offset = 0;
    844		rs->dev[i].rdev.mddev = &rs->md;
    845
    846		arg = dm_shift_arg(as);
    847		if (!arg)
    848			return -EINVAL;
    849
    850		if (strcmp(arg, "-")) {
    851			r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
    852					  &rs->dev[i].meta_dev);
    853			if (r) {
    854				rs->ti->error = "RAID metadata device lookup failure";
    855				return r;
    856			}
    857
    858			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
    859			if (!rs->dev[i].rdev.sb_page) {
    860				rs->ti->error = "Failed to allocate superblock page";
    861				return -ENOMEM;
    862			}
    863		}
    864
    865		arg = dm_shift_arg(as);
    866		if (!arg)
    867			return -EINVAL;
    868
    869		if (!strcmp(arg, "-")) {
    870			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
    871			    (!rs->dev[i].rdev.recovery_offset)) {
    872				rs->ti->error = "Drive designated for rebuild not specified";
    873				return -EINVAL;
    874			}
    875
    876			if (rs->dev[i].meta_dev) {
    877				rs->ti->error = "No data device supplied with metadata device";
    878				return -EINVAL;
    879			}
    880
    881			continue;
    882		}
    883
    884		r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
    885				  &rs->dev[i].data_dev);
    886		if (r) {
    887			rs->ti->error = "RAID device lookup failure";
    888			return r;
    889		}
    890
    891		if (rs->dev[i].meta_dev) {
    892			metadata_available = 1;
    893			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
    894		}
    895		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
    896		list_add_tail(&rs->dev[i].rdev.same_set, &rs->md.disks);
    897		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
    898			rebuild++;
    899	}
    900
    901	if (rs->journal_dev.dev)
    902		list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks);
    903
    904	if (metadata_available) {
    905		rs->md.external = 0;
    906		rs->md.persistent = 1;
    907		rs->md.major_version = 2;
    908	} else if (rebuild && !rs->md.recovery_cp) {
    909		/*
    910		 * Without metadata, we will not be able to tell if the array
    911		 * is in-sync or not - we must assume it is not.  Therefore,
    912		 * it is impossible to rebuild a drive.
    913		 *
    914		 * Even if there is metadata, the on-disk information may
    915		 * indicate that the array is not in-sync and it will then
    916		 * fail at that time.
    917		 *
    918		 * User could specify 'nosync' option if desperate.
    919		 */
    920		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
    921		return -EINVAL;
    922	}
    923
    924	return 0;
    925}
    926
    927/*
    928 * validate_region_size
    929 * @rs
    930 * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
    931 *
    932 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
    933 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
    934 *
    935 * Returns: 0 on success, -EINVAL on failure.
    936 */
    937static int validate_region_size(struct raid_set *rs, unsigned long region_size)
    938{
    939	unsigned long min_region_size = rs->ti->len / (1 << 21);
    940
    941	if (rs_is_raid0(rs))
    942		return 0;
    943
    944	if (!region_size) {
    945		/*
    946		 * Choose a reasonable default.	 All figures in sectors.
    947		 */
    948		if (min_region_size > (1 << 13)) {
    949			/* If not a power of 2, make it the next power of 2 */
    950			region_size = roundup_pow_of_two(min_region_size);
    951			DMINFO("Choosing default region size of %lu sectors",
    952			       region_size);
    953		} else {
    954			DMINFO("Choosing default region size of 4MiB");
    955			region_size = 1 << 13; /* sectors */
    956		}
    957	} else {
    958		/*
    959		 * Validate user-supplied value.
    960		 */
    961		if (region_size > rs->ti->len) {
    962			rs->ti->error = "Supplied region size is too large";
    963			return -EINVAL;
    964		}
    965
    966		if (region_size < min_region_size) {
    967			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
    968			      region_size, min_region_size);
    969			rs->ti->error = "Supplied region size is too small";
    970			return -EINVAL;
    971		}
    972
    973		if (!is_power_of_2(region_size)) {
    974			rs->ti->error = "Region size is not a power of 2";
    975			return -EINVAL;
    976		}
    977
    978		if (region_size < rs->md.chunk_sectors) {
    979			rs->ti->error = "Region size is smaller than the chunk size";
    980			return -EINVAL;
    981		}
    982	}
    983
    984	/*
    985	 * Convert sectors to bytes.
    986	 */
    987	rs->md.bitmap_info.chunksize = to_bytes(region_size);
    988
    989	return 0;
    990}
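        /*
         * Worked example for the default path above: a 1 TiB target
         * (2^31 sectors) gives min_region_size = 2^31 / 2^21 = 1024 sectors,
         * below the 2^13 threshold, so the 8192-sector (4 MiB) default is
         * used; a 16 TiB target would instead get
         * roundup_pow_of_two(16384) = 16384 sectors.
         */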
    991
    992/*
    993 * validate_raid_redundancy
    994 * @rs
    995 *
    996 * Determine if there are enough devices in the array that haven't
    997 * failed (or are being rebuilt) to form a usable array.
    998 *
    999 * Returns: 0 on success, -EINVAL on failure.
   1000 */
   1001static int validate_raid_redundancy(struct raid_set *rs)
   1002{
   1003	unsigned int i, rebuild_cnt = 0;
   1004	unsigned int rebuilds_per_group = 0, copies, raid_disks;
   1005	unsigned int group_size, last_group_start;
   1006
   1007	for (i = 0; i < rs->raid_disks; i++)
   1008		if (!test_bit(FirstUse, &rs->dev[i].rdev.flags) &&
   1009		    ((!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
   1010		      !rs->dev[i].rdev.sb_page)))
   1011			rebuild_cnt++;
   1012
   1013	switch (rs->md.level) {
   1014	case 0:
   1015		break;
   1016	case 1:
   1017		if (rebuild_cnt >= rs->md.raid_disks)
   1018			goto too_many;
   1019		break;
   1020	case 4:
   1021	case 5:
   1022	case 6:
   1023		if (rebuild_cnt > rs->raid_type->parity_devs)
   1024			goto too_many;
   1025		break;
   1026	case 10:
   1027		copies = raid10_md_layout_to_copies(rs->md.new_layout);
   1028		if (copies < 2) {
   1029			DMERR("Bogus raid10 data copies < 2!");
   1030			return -EINVAL;
   1031		}
   1032
   1033		if (rebuild_cnt < copies)
   1034			break;
   1035
   1036		/*
   1037		 * It is possible to have a higher rebuild count for RAID10,
   1038		 * as long as the failed devices occur in different mirror
   1039		 * groups (i.e. different stripes).
   1040		 *
   1041		 * When checking "near" format, make sure no adjacent devices
   1042		 * have failed beyond what can be handled.  In addition to the
   1043		 * simple case where the number of devices is a multiple of the
   1044		 * number of copies, we must also handle cases where the number
   1045		 * of devices is not a multiple of the number of copies.
   1046		 * E.g.	   dev1 dev2 dev3 dev4 dev5
   1047		 *	    A	 A    B	   B	C
   1048		 *	    C	 D    D	   E	E
   1049		 */
   1050		raid_disks = min(rs->raid_disks, rs->md.raid_disks);
   1051		if (__is_raid10_near(rs->md.new_layout)) {
   1052			for (i = 0; i < raid_disks; i++) {
   1053				if (!(i % copies))
   1054					rebuilds_per_group = 0;
   1055				if ((!rs->dev[i].rdev.sb_page ||
   1056				    !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
   1057				    (++rebuilds_per_group >= copies))
   1058					goto too_many;
   1059			}
   1060			break;
   1061		}
   1062
   1063		/*
   1064		 * When checking "far" and "offset" formats, we need to ensure
   1065		 * that the device that holds its copy is not also dead or
   1066		 * being rebuilt.  (Note that "far" and "offset" formats only
   1067		 * support two copies right now.  These formats also only ever
   1068		 * use the 'use_far_sets' variant.)
   1069		 *
   1070		 * This check is somewhat complicated by the need to account
   1071		 * for arrays that are not a multiple of (far) copies.	This
   1072		 * results in the need to treat the last (potentially larger)
   1073		 * set differently.
   1074		 */
   1075		group_size = (raid_disks / copies);
   1076		last_group_start = (raid_disks / group_size) - 1;
   1077		last_group_start *= group_size;
   1078		for (i = 0; i < raid_disks; i++) {
   1079			if (!(i % copies) && !(i > last_group_start))
   1080				rebuilds_per_group = 0;
   1081			if ((!rs->dev[i].rdev.sb_page ||
   1082			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
   1083			    (++rebuilds_per_group >= copies))
   1084					goto too_many;
   1085		}
   1086		break;
   1087	default:
   1088		if (rebuild_cnt)
   1089			return -EINVAL;
   1090	}
   1091
   1092	return 0;
   1093
   1094too_many:
   1095	return -EINVAL;
   1096}
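        /*
         * Worked example for the "far"/"offset" branch above: with 5 devices
         * and 2 copies, group_size = 2 and last_group_start = 2, so devices
         * 0-1 form one set and devices 2-4 the (larger) last set; two failed
         * or rebuilding devices within one set trip the
         * rebuilds_per_group >= copies test and jump to too_many.
         */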
   1097
   1098/*
   1099 * Possible arguments are...
   1100 *	<chunk_size> [optional_args]
   1101 *
   1102 * Argument definitions
   1103 *    <chunk_size>			The number of sectors per disk that
   1104 *					will form the "stripe"
   1105 *    [[no]sync]			Force or prevent recovery of the
   1106 *					entire array
   1107 *    [rebuild <idx>]			Rebuild the drive indicated by the index
   1108 *    [daemon_sleep <ms>]		Time between bitmap daemon work to
   1109 *					clear bits
   1110 *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
   1111 *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
   1112 *    [write_mostly <idx>]		Indicate a write mostly drive via index
    1113 *    [max_write_behind <sectors>]	See '--write-behind=' (man mdadm)
   1114 *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
   1115 *    [region_size <sectors>]		Defines granularity of bitmap
    1116 *    [journal_dev <dev>]		raid4/5/6 journaling device
   1117 *    					(i.e. write hole closing log)
   1118 *
   1119 * RAID10-only options:
   1120 *    [raid10_copies <# copies>]	Number of copies.  (Default: 2)
   1121 *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
   1122 */
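        /*
         * Illustrative target line for the syntax above (made-up device
         * numbers, not taken from this file), as it could appear in a
         * dmsetup table: a raid5_ls set over three "- <data_dev>" pairs
         * with a 64-sector chunk and one optional argument pair:
         *
         *   0 976562176 raid raid5_ls 3 64 region_size 8192 3 - 8:16 - 8:32 - 8:48
         *
         * i.e. <raid_type> <#raid_params> <raid_params...> <#raid_devs>
         * {<meta_dev> <data_dev>}...
         */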
   1123static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
   1124			     unsigned int num_raid_params)
   1125{
   1126	int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
   1127	unsigned int raid10_copies = 2;
   1128	unsigned int i, write_mostly = 0;
   1129	unsigned int region_size = 0;
   1130	sector_t max_io_len;
   1131	const char *arg, *key;
   1132	struct raid_dev *rd;
   1133	struct raid_type *rt = rs->raid_type;
   1134
   1135	arg = dm_shift_arg(as);
   1136	num_raid_params--; /* Account for chunk_size argument */
   1137
   1138	if (kstrtoint(arg, 10, &value) < 0) {
   1139		rs->ti->error = "Bad numerical argument given for chunk_size";
   1140		return -EINVAL;
   1141	}
   1142
   1143	/*
   1144	 * First, parse the in-order required arguments
   1145	 * "chunk_size" is the only argument of this type.
   1146	 */
   1147	if (rt_is_raid1(rt)) {
   1148		if (value)
   1149			DMERR("Ignoring chunk size parameter for RAID 1");
   1150		value = 0;
   1151	} else if (!is_power_of_2(value)) {
   1152		rs->ti->error = "Chunk size must be a power of 2";
   1153		return -EINVAL;
   1154	} else if (value < 8) {
   1155		rs->ti->error = "Chunk size value is too small";
   1156		return -EINVAL;
   1157	}
   1158
   1159	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
   1160
   1161	/*
   1162	 * We set each individual device as In_sync with a completed
   1163	 * 'recovery_offset'.  If there has been a device failure or
   1164	 * replacement then one of the following cases applies:
   1165	 *
   1166	 *   1) User specifies 'rebuild'.
   1167	 *	- Device is reset when param is read.
   1168	 *   2) A new device is supplied.
   1169	 *	- No matching superblock found, resets device.
   1170	 *   3) Device failure was transient and returns on reload.
   1171	 *	- Failure noticed, resets device for bitmap replay.
   1172	 *   4) Device hadn't completed recovery after previous failure.
   1173	 *	- Superblock is read and overrides recovery_offset.
   1174	 *
   1175	 * What is found in the superblocks of the devices is always
   1176	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
   1177	 */
   1178	for (i = 0; i < rs->raid_disks; i++) {
   1179		set_bit(In_sync, &rs->dev[i].rdev.flags);
   1180		rs->dev[i].rdev.recovery_offset = MaxSector;
   1181	}
   1182
   1183	/*
   1184	 * Second, parse the unordered optional arguments
   1185	 */
   1186	for (i = 0; i < num_raid_params; i++) {
   1187		key = dm_shift_arg(as);
   1188		if (!key) {
   1189			rs->ti->error = "Not enough raid parameters given";
   1190			return -EINVAL;
   1191		}
   1192
   1193		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) {
   1194			if (test_and_set_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
   1195				rs->ti->error = "Only one 'nosync' argument allowed";
   1196				return -EINVAL;
   1197			}
   1198			continue;
   1199		}
   1200		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
   1201			if (test_and_set_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) {
   1202				rs->ti->error = "Only one 'sync' argument allowed";
   1203				return -EINVAL;
   1204			}
   1205			continue;
   1206		}
   1207		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
   1208			if (test_and_set_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
    1209				rs->ti->error = "Only one 'raid10_use_near_sets' argument allowed";
   1210				return -EINVAL;
   1211			}
   1212			continue;
   1213		}
   1214
   1215		arg = dm_shift_arg(as);
   1216		i++; /* Account for the argument pairs */
   1217		if (!arg) {
   1218			rs->ti->error = "Wrong number of raid parameters given";
   1219			return -EINVAL;
   1220		}
   1221
   1222		/*
   1223		 * Parameters that take a string value are checked here.
   1224		 */
   1225		/* "raid10_format {near|offset|far} */
   1226		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
   1227			if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
   1228				rs->ti->error = "Only one 'raid10_format' argument pair allowed";
   1229				return -EINVAL;
   1230			}
   1231			if (!rt_is_raid10(rt)) {
   1232				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
   1233				return -EINVAL;
   1234			}
   1235			raid10_format = raid10_name_to_format(arg);
   1236			if (raid10_format < 0) {
   1237				rs->ti->error = "Invalid 'raid10_format' value given";
   1238				return raid10_format;
   1239			}
   1240			continue;
   1241		}
   1242
   1243		/* "journal_dev <dev>" */
   1244		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
   1245			int r;
   1246			struct md_rdev *jdev;
   1247
   1248			if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
   1249				rs->ti->error = "Only one raid4/5/6 set journaling device allowed";
   1250				return -EINVAL;
   1251			}
   1252			if (!rt_is_raid456(rt)) {
   1253				rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type";
   1254				return -EINVAL;
   1255			}
   1256			r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
   1257					  &rs->journal_dev.dev);
   1258			if (r) {
   1259				rs->ti->error = "raid4/5/6 journal device lookup failure";
   1260				return r;
   1261			}
   1262			jdev = &rs->journal_dev.rdev;
   1263			md_rdev_init(jdev);
   1264			jdev->mddev = &rs->md;
   1265			jdev->bdev = rs->journal_dev.dev->bdev;
   1266			jdev->sectors = bdev_nr_sectors(jdev->bdev);
   1267			if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
   1268				rs->ti->error = "No space for raid4/5/6 journal";
   1269				return -ENOSPC;
   1270			}
   1271			rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
   1272			set_bit(Journal, &jdev->flags);
   1273			continue;
   1274		}
   1275
   1276		/* "journal_mode <mode>" ("journal_dev" mandatory!) */
   1277		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) {
   1278			int r;
   1279
   1280			if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
   1281				rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'";
   1282				return -EINVAL;
   1283			}
   1284			if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
   1285				rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed";
   1286				return -EINVAL;
   1287			}
   1288			r = dm_raid_journal_mode_to_md(arg);
   1289			if (r < 0) {
   1290				rs->ti->error = "Invalid 'journal_mode' argument";
   1291				return r;
   1292			}
   1293			rs->journal_dev.mode = r;
   1294			continue;
   1295		}
   1296
   1297		/*
   1298		 * Parameters with number values from here on.
   1299		 */
   1300		if (kstrtoint(arg, 10, &value) < 0) {
   1301			rs->ti->error = "Bad numerical argument given in raid params";
   1302			return -EINVAL;
   1303		}
   1304
   1305		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD))) {
   1306			/*
   1307			 * "rebuild" is being passed in by userspace to provide
   1308			 * indexes of replaced devices and to set up additional
   1309			 * devices on raid level takeover.
   1310			 */
   1311			if (!__within_range(value, 0, rs->raid_disks - 1)) {
   1312				rs->ti->error = "Invalid rebuild index given";
   1313				return -EINVAL;
   1314			}
   1315
   1316			if (test_and_set_bit(value, (void *) rs->rebuild_disks)) {
   1317				rs->ti->error = "rebuild for this index already given";
   1318				return -EINVAL;
   1319			}
   1320
   1321			rd = rs->dev + value;
   1322			clear_bit(In_sync, &rd->rdev.flags);
   1323			clear_bit(Faulty, &rd->rdev.flags);
   1324			rd->rdev.recovery_offset = 0;
   1325			set_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags);
   1326		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
   1327			if (!rt_is_raid1(rt)) {
   1328				rs->ti->error = "write_mostly option is only valid for RAID1";
   1329				return -EINVAL;
   1330			}
   1331
   1332			if (!__within_range(value, 0, rs->md.raid_disks - 1)) {
   1333				rs->ti->error = "Invalid write_mostly index given";
   1334				return -EINVAL;
   1335			}
   1336
   1337			write_mostly++;
   1338			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
   1339			set_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
   1340		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
   1341			if (!rt_is_raid1(rt)) {
   1342				rs->ti->error = "max_write_behind option is only valid for RAID1";
   1343				return -EINVAL;
   1344			}
   1345
   1346			if (test_and_set_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) {
   1347				rs->ti->error = "Only one max_write_behind argument pair allowed";
   1348				return -EINVAL;
   1349			}
   1350
   1351			/*
   1352			 * In device-mapper, we specify things in sectors, but
   1353			 * MD records this value in kB
   1354			 */
   1355			if (value < 0 || value / 2 > COUNTER_MAX) {
   1356				rs->ti->error = "Max write-behind limit out of range";
   1357				return -EINVAL;
   1358			}
   1359
   1360			rs->md.bitmap_info.max_write_behind = value / 2;
   1361		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
   1362			if (test_and_set_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) {
   1363				rs->ti->error = "Only one daemon_sleep argument pair allowed";
   1364				return -EINVAL;
   1365			}
   1366			if (value < 0) {
   1367				rs->ti->error = "daemon sleep period out of range";
   1368				return -EINVAL;
   1369			}
   1370			rs->md.bitmap_info.daemon_sleep = value;
   1371		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) {
    1372			/* Userspace passes new data_offset after having extended the data image LV */
   1373			if (test_and_set_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
   1374				rs->ti->error = "Only one data_offset argument pair allowed";
   1375				return -EINVAL;
   1376			}
   1377			/* Ensure sensible data offset */
   1378			if (value < 0 ||
   1379			    (value && (value < MIN_FREE_RESHAPE_SPACE || value % to_sector(PAGE_SIZE)))) {
   1380				rs->ti->error = "Bogus data_offset value";
   1381				return -EINVAL;
   1382			}
   1383			rs->data_offset = value;
   1384		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS))) {
   1385			/* Define the +/-# of disks to add to/remove from the given raid set */
   1386			if (test_and_set_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
   1387				rs->ti->error = "Only one delta_disks argument pair allowed";
   1388				return -EINVAL;
   1389			}
   1390			/* Ensure MAX_RAID_DEVICES and raid type minimal_devs! */
   1391			if (!__within_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs)) {
   1392				rs->ti->error = "Too many delta_disk requested";
   1393				return -EINVAL;
   1394			}
   1395
   1396			rs->delta_disks = value;
   1397		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE))) {
   1398			if (test_and_set_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) {
   1399				rs->ti->error = "Only one stripe_cache argument pair allowed";
   1400				return -EINVAL;
   1401			}
   1402
   1403			if (!rt_is_raid456(rt)) {
   1404				rs->ti->error = "Inappropriate argument: stripe_cache";
   1405				return -EINVAL;
   1406			}
   1407
   1408			if (value < 0) {
   1409				rs->ti->error = "Bogus stripe cache entries value";
   1410				return -EINVAL;
   1411			}
   1412			rs->stripe_cache_entries = value;
   1413		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
   1414			if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
   1415				rs->ti->error = "Only one min_recovery_rate argument pair allowed";
   1416				return -EINVAL;
   1417			}
   1418
   1419			if (value < 0) {
   1420				rs->ti->error = "min_recovery_rate out of range";
   1421				return -EINVAL;
   1422			}
   1423			rs->md.sync_speed_min = value;
   1424		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
   1425			if (test_and_set_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) {
   1426				rs->ti->error = "Only one max_recovery_rate argument pair allowed";
   1427				return -EINVAL;
   1428			}
   1429
   1430			if (value < 0) {
   1431				rs->ti->error = "max_recovery_rate out of range";
   1432				return -EINVAL;
   1433			}
   1434			rs->md.sync_speed_max = value;
   1435		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) {
   1436			if (test_and_set_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) {
   1437				rs->ti->error = "Only one region_size argument pair allowed";
   1438				return -EINVAL;
   1439			}
   1440
   1441			region_size = value;
   1442			rs->requested_bitmap_chunk_sectors = value;
   1443		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES))) {
   1444			if (test_and_set_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
   1445				rs->ti->error = "Only one raid10_copies argument pair allowed";
   1446				return -EINVAL;
   1447			}
   1448
   1449			if (!__within_range(value, 2, rs->md.raid_disks)) {
   1450				rs->ti->error = "Bad value for 'raid10_copies'";
   1451				return -EINVAL;
   1452			}
   1453
   1454			raid10_copies = value;
   1455		} else {
   1456			DMERR("Unable to parse RAID parameter: %s", key);
   1457			rs->ti->error = "Unable to parse RAID parameter";
   1458			return -EINVAL;
   1459		}
   1460	}
   1461
   1462	if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) &&
   1463	    test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
   1464		rs->ti->error = "sync and nosync are mutually exclusive";
   1465		return -EINVAL;
   1466	}
   1467
   1468	if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) &&
   1469	    (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ||
   1470	     test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))) {
   1471		rs->ti->error = "sync/nosync and rebuild are mutually exclusive";
   1472		return -EINVAL;
   1473	}
   1474
   1475	if (write_mostly >= rs->md.raid_disks) {
   1476		rs->ti->error = "Can't set all raid1 devices to write_mostly";
   1477		return -EINVAL;
   1478	}
   1479
   1480	if (rs->md.sync_speed_max &&
   1481	    rs->md.sync_speed_min > rs->md.sync_speed_max) {
   1482		rs->ti->error = "Bogus recovery rates";
   1483		return -EINVAL;
   1484	}
   1485
   1486	if (validate_region_size(rs, region_size))
   1487		return -EINVAL;
   1488
   1489	if (rs->md.chunk_sectors)
   1490		max_io_len = rs->md.chunk_sectors;
   1491	else
   1492		max_io_len = region_size;
   1493
   1494	if (dm_set_target_max_io_len(rs->ti, max_io_len))
   1495		return -EINVAL;
   1496
   1497	if (rt_is_raid10(rt)) {
   1498		if (raid10_copies > rs->md.raid_disks) {
   1499			rs->ti->error = "Not enough devices to satisfy specification";
   1500			return -EINVAL;
   1501		}
   1502
   1503		rs->md.new_layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
   1504		if (rs->md.new_layout < 0) {
   1505			rs->ti->error = "Error getting raid10 format";
   1506			return rs->md.new_layout;
   1507		}
   1508
   1509		rt = get_raid_type_by_ll(10, rs->md.new_layout);
   1510		if (!rt) {
   1511			rs->ti->error = "Failed to recognize new raid10 layout";
   1512			return -EINVAL;
   1513		}
   1514
   1515		if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT ||
   1516		     rt->algorithm == ALGORITHM_RAID10_NEAR) &&
   1517		    test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
   1518			rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible";
   1519			return -EINVAL;
   1520		}
   1521	}
   1522
   1523	rs->raid10_copies = raid10_copies;
   1524
   1525	/* Assume there are no metadata devices until the drives are parsed */
   1526	rs->md.persistent = 0;
   1527	rs->md.external = 1;
   1528
   1529	/* Check, if any invalid ctr arguments have been passed in for the raid level */
   1530	return rs_check_for_valid_flags(rs);
   1531}
   1532
   1533/* Set raid4/5/6 cache size */
   1534static int rs_set_raid456_stripe_cache(struct raid_set *rs)
   1535{
   1536	int r;
   1537	struct r5conf *conf;
   1538	struct mddev *mddev = &rs->md;
   1539	uint32_t min_stripes = max(mddev->chunk_sectors, mddev->new_chunk_sectors) / 2;
   1540	uint32_t nr_stripes = rs->stripe_cache_entries;
   1541
   1542	if (!rt_is_raid456(rs->raid_type)) {
   1543		rs->ti->error = "Inappropriate raid level; cannot change stripe_cache size";
   1544		return -EINVAL;
   1545	}
   1546
   1547	if (nr_stripes < min_stripes) {
   1548		DMINFO("Adjusting requested %u stripe cache entries to %u to suit stripe size",
   1549		       nr_stripes, min_stripes);
   1550		nr_stripes = min_stripes;
   1551	}
   1552
   1553	conf = mddev->private;
   1554	if (!conf) {
   1555		rs->ti->error = "Cannot change stripe_cache size on inactive RAID set";
   1556		return -EINVAL;
   1557	}
   1558
   1559	/* Try setting number of stripes in raid456 stripe cache */
   1560	if (conf->min_nr_stripes != nr_stripes) {
   1561		r = raid5_set_cache_size(mddev, nr_stripes);
   1562		if (r) {
   1563			rs->ti->error = "Failed to set raid4/5/6 stripe cache size";
   1564			return r;
   1565		}
   1566
   1567		DMINFO("%u stripe cache entries", nr_stripes);
   1568	}
   1569
   1570	return 0;
   1571}
   1572
   1573/* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */
   1574static unsigned int mddev_data_stripes(struct raid_set *rs)
   1575{
   1576	return rs->md.raid_disks - rs->raid_type->parity_devs;
   1577}
   1578
   1579/* Return # of data stripes of @rs (i.e. as of ctr) */
   1580static unsigned int rs_data_stripes(struct raid_set *rs)
   1581{
   1582	return rs->raid_disks - rs->raid_type->parity_devs;
   1583}
   1584
   1585/*
   1586 * Retrieve rdev->sectors from any valid raid device of @rs
   1587 * to allow userspace to pass in arbitrary "- -" device tuples.
   1588 */
   1589static sector_t __rdev_sectors(struct raid_set *rs)
   1590{
   1591	int i;
   1592
   1593	for (i = 0; i < rs->raid_disks; i++) {
   1594		struct md_rdev *rdev = &rs->dev[i].rdev;
   1595
   1596		if (!test_bit(Journal, &rdev->flags) &&
   1597		    rdev->bdev && rdev->sectors)
   1598			return rdev->sectors;
   1599	}
   1600
   1601	return 0;
   1602}
   1603
   1604/* Check that calculated dev_sectors fits all component devices. */
   1605static int _check_data_dev_sectors(struct raid_set *rs)
   1606{
   1607	sector_t ds = ~0;
   1608	struct md_rdev *rdev;
   1609
   1610	rdev_for_each(rdev, &rs->md)
   1611		if (!test_bit(Journal, &rdev->flags) && rdev->bdev) {
   1612			ds = min(ds, bdev_nr_sectors(rdev->bdev));
   1613			if (ds < rs->md.dev_sectors) {
   1614				rs->ti->error = "Component device(s) too small";
   1615				return -EINVAL;
   1616			}
   1617		}
   1618
   1619	return 0;
   1620}
   1621
   1622/* Calculate the sectors per device and per array used for @rs */
   1623static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev)
   1624{
   1625	int delta_disks;
   1626	unsigned int data_stripes;
   1627	sector_t array_sectors = sectors, dev_sectors = sectors;
   1628	struct mddev *mddev = &rs->md;
   1629
   1630	if (use_mddev) {
   1631		delta_disks = mddev->delta_disks;
   1632		data_stripes = mddev_data_stripes(rs);
   1633	} else {
   1634		delta_disks = rs->delta_disks;
   1635		data_stripes = rs_data_stripes(rs);
   1636	}
   1637
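       	/*
       	 * Illustrative example (made-up sizes): a 4-disk raid5 set
       	 * (1 parity -> 3 data stripes) with @sectors = 3000 gives
       	 * dev_sectors = 1000 and array_sectors = 3 * 1000 = 3000;
       	 * a 4-disk raid10 "near 2" set with @sectors = 4000 gives
       	 * dev_sectors = 4000 * 2 / 4 = 2000 and array_sectors = 4 * 2000 / 2 = 4000.
       	 */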
   1638	/* Special raid1 case w/o delta_disks support (yet) */
   1639	if (rt_is_raid1(rs->raid_type))
   1640		;
   1641	else if (rt_is_raid10(rs->raid_type)) {
   1642		if (rs->raid10_copies < 2 ||
   1643		    delta_disks < 0) {
   1644			rs->ti->error = "Bogus raid10 data copies or delta disks";
   1645			return -EINVAL;
   1646		}
   1647
   1648		dev_sectors *= rs->raid10_copies;
   1649		if (sector_div(dev_sectors, data_stripes))
   1650			goto bad;
   1651
   1652		array_sectors = (data_stripes + delta_disks) * dev_sectors;
   1653		if (sector_div(array_sectors, rs->raid10_copies))
   1654			goto bad;
   1655
   1656	} else if (sector_div(dev_sectors, data_stripes))
   1657		goto bad;
   1658
   1659	else
   1660		/* Striped layouts */
   1661		array_sectors = (data_stripes + delta_disks) * dev_sectors;
   1662
   1663	mddev->array_sectors = array_sectors;
   1664	mddev->dev_sectors = dev_sectors;
   1665	rs_set_rdev_sectors(rs);
   1666
   1667	return _check_data_dev_sectors(rs);
   1668bad:
   1669	rs->ti->error = "Target length not divisible by number of data devices";
   1670	return -EINVAL;
   1671}
   1672
   1673/* Setup recovery on @rs */
   1674static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
   1675{
   1676	/* raid0 does not recover */
   1677	if (rs_is_raid0(rs))
   1678		rs->md.recovery_cp = MaxSector;
   1679	/*
   1680	 * A raid6 set has to be recovered either
   1681	 * completely or for the grown part to
   1682	 * ensure proper parity and Q-Syndrome
   1683	 */
   1684	else if (rs_is_raid6(rs))
   1685		rs->md.recovery_cp = dev_sectors;
   1686	/*
   1687	 * Other raid set types may skip recovery
   1688	 * depending on the 'nosync' flag.
   1689	 */
   1690	else
   1691		rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
   1692				     ? MaxSector : dev_sectors;
   1693}
   1694
   1695static void do_table_event(struct work_struct *ws)
   1696{
   1697	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
   1698
   1699	smp_rmb(); /* Make sure we access most recent mddev properties */
   1700	if (!rs_is_reshaping(rs)) {
   1701		if (rs_is_raid10(rs))
   1702			rs_set_rdev_sectors(rs);
   1703		rs_set_capacity(rs);
   1704	}
   1705	dm_table_event(rs->ti->table);
   1706}
   1707
   1708/*
   1709 * Make sure a valid takeover (level switch) is being requested on @rs
   1710 *
   1711 * Conversions of raid sets from one MD personality to another
   1712 * have to conform to restrictions which are enforced here.
   1713 */
   1714static int rs_check_takeover(struct raid_set *rs)
   1715{
   1716	struct mddev *mddev = &rs->md;
   1717	unsigned int near_copies;
   1718
   1719	if (rs->md.degraded) {
   1720		rs->ti->error = "Can't takeover degraded raid set";
   1721		return -EPERM;
   1722	}
   1723
   1724	if (rs_is_reshaping(rs)) {
   1725		rs->ti->error = "Can't takeover reshaping raid set";
   1726		return -EPERM;
   1727	}
   1728
   1729	switch (mddev->level) {
   1730	case 0:
   1731		/* raid0 -> raid1/5 with one disk */
   1732		if ((mddev->new_level == 1 || mddev->new_level == 5) &&
   1733		    mddev->raid_disks == 1)
   1734			return 0;
   1735
   1736		/* raid0 -> raid10 */
   1737		if (mddev->new_level == 10 &&
   1738		    !(rs->raid_disks % mddev->raid_disks))
   1739			return 0;
   1740
   1741		/* raid0 with multiple disks -> raid4/5/6 */
   1742		if (__within_range(mddev->new_level, 4, 6) &&
   1743		    mddev->new_layout == ALGORITHM_PARITY_N &&
   1744		    mddev->raid_disks > 1)
   1745			return 0;
   1746
   1747		break;
   1748
   1749	case 10:
   1750		/* Can't takeover raid10_offset! */
   1751		if (__is_raid10_offset(mddev->layout))
   1752			break;
   1753
   1754		near_copies = __raid10_near_copies(mddev->layout);
   1755
   1756		/* raid10* -> raid0 */
   1757		if (mddev->new_level == 0) {
   1758			/* Can takeover raid10_near with raid disks divisible by data copies! */
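       			/* E.g. (illustrative) a 4-disk raid10 "near 2" set becomes a 2-disk raid0. */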
   1759			if (near_copies > 1 &&
   1760			    !(mddev->raid_disks % near_copies)) {
   1761				mddev->raid_disks /= near_copies;
   1762				mddev->delta_disks = mddev->raid_disks;
   1763				return 0;
   1764			}
   1765
   1766			/* Can takeover raid10_far */
   1767			if (near_copies == 1 &&
   1768			    __raid10_far_copies(mddev->layout) > 1)
   1769				return 0;
   1770
   1771			break;
   1772		}
   1773
   1774		/* raid10_{near,far} -> raid1 */
   1775		if (mddev->new_level == 1 &&
   1776		    max(near_copies, __raid10_far_copies(mddev->layout)) == mddev->raid_disks)
   1777			return 0;
   1778
   1779		/* raid10_{near,far} with 2 disks -> raid4/5 */
   1780		if (__within_range(mddev->new_level, 4, 5) &&
   1781		    mddev->raid_disks == 2)
   1782			return 0;
   1783		break;
   1784
   1785	case 1:
   1786		/* raid1 with 2 disks -> raid4/5 */
   1787		if (__within_range(mddev->new_level, 4, 5) &&
   1788		    mddev->raid_disks == 2) {
   1789			mddev->degraded = 1;
   1790			return 0;
   1791		}
   1792
   1793		/* raid1 -> raid0 */
   1794		if (mddev->new_level == 0 &&
   1795		    mddev->raid_disks == 1)
   1796			return 0;
   1797
   1798		/* raid1 -> raid10 */
   1799		if (mddev->new_level == 10)
   1800			return 0;
   1801		break;
   1802
   1803	case 4:
   1804		/* raid4 -> raid0 */
   1805		if (mddev->new_level == 0)
   1806			return 0;
   1807
   1808		/* raid4 -> raid1/5 with 2 disks */
   1809		if ((mddev->new_level == 1 || mddev->new_level == 5) &&
   1810		    mddev->raid_disks == 2)
   1811			return 0;
   1812
   1813		/* raid4 -> raid5/6 with parity N */
   1814		if (__within_range(mddev->new_level, 5, 6) &&
   1815		    mddev->layout == ALGORITHM_PARITY_N)
   1816			return 0;
   1817		break;
   1818
   1819	case 5:
   1820		/* raid5 with parity N -> raid0 */
   1821		if (mddev->new_level == 0 &&
   1822		    mddev->layout == ALGORITHM_PARITY_N)
   1823			return 0;
   1824
   1825		/* raid5 with parity N -> raid4 */
   1826		if (mddev->new_level == 4 &&
   1827		    mddev->layout == ALGORITHM_PARITY_N)
   1828			return 0;
   1829
   1830		/* raid5 with 2 disks -> raid1/4/10 */
   1831		if ((mddev->new_level == 1 || mddev->new_level == 4 || mddev->new_level == 10) &&
   1832		    mddev->raid_disks == 2)
   1833			return 0;
   1834
   1835		/* raid5_* -> raid6_*_6 with Q-Syndrome N (e.g. raid5_ra -> raid6_ra_6) */
   1836		if (mddev->new_level == 6 &&
   1837		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
   1838		      __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
   1839			return 0;
   1840		break;
   1841
   1842	case 6:
   1843		/* raid6 with parity N -> raid0 */
   1844		if (mddev->new_level == 0 &&
   1845		    mddev->layout == ALGORITHM_PARITY_N)
   1846			return 0;
   1847
   1848		/* raid6 with parity N -> raid4 */
   1849		if (mddev->new_level == 4 &&
   1850		    mddev->layout == ALGORITHM_PARITY_N)
   1851			return 0;
   1852
   1853		/* raid6_*_n with Q-Syndrome N -> raid5_* */
   1854		if (mddev->new_level == 5 &&
   1855		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
   1856		     __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
   1857			return 0;
   1858		break;
   1859
   1860	default:
   1861		break;
   1862	}
   1863
   1864	rs->ti->error = "takeover not possible";
   1865	return -EINVAL;
   1866}
   1867
   1868/* True if @rs requested to be taken over */
   1869static bool rs_takeover_requested(struct raid_set *rs)
   1870{
   1871	return rs->md.new_level != rs->md.level;
   1872}
   1873
   1874/* True if layout is set to reshape. */
   1875static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev)
   1876{
   1877	return (use_mddev ? rs->md.delta_disks : rs->delta_disks) ||
   1878	       rs->md.new_layout != rs->md.layout ||
   1879	       rs->md.new_chunk_sectors != rs->md.chunk_sectors;
   1880}
   1881
   1882/* True if @rs is requested to reshape by ctr */
   1883static bool rs_reshape_requested(struct raid_set *rs)
   1884{
   1885	bool change;
   1886	struct mddev *mddev = &rs->md;
   1887
   1888	if (rs_takeover_requested(rs))
   1889		return false;
   1890
   1891	if (rs_is_raid0(rs))
   1892		return false;
   1893
   1894	change = rs_is_layout_change(rs, false);
   1895
   1896	/* Historical case to support raid1 reshape without delta disks */
   1897	if (rs_is_raid1(rs)) {
   1898		if (rs->delta_disks)
   1899			return !!rs->delta_disks;
   1900
   1901		return !change &&
   1902		       mddev->raid_disks != rs->raid_disks;
   1903	}
   1904
   1905	if (rs_is_raid10(rs))
   1906		return change &&
   1907		       !__is_raid10_far(mddev->new_layout) &&
   1908		       rs->delta_disks >= 0;
   1909
   1910	return change;
   1911}
   1912
   1913/*  Features */
   1914#define	FEATURE_FLAG_SUPPORTS_V190	0x1 /* Supports extended superblock */
   1915
   1916/* State flags for sb->flags */
   1917#define	SB_FLAG_RESHAPE_ACTIVE		0x1
   1918#define	SB_FLAG_RESHAPE_BACKWARDS	0x2
   1919
   1920/*
   1921 * This structure is never routinely used by userspace, unlike md superblocks.
   1922 * Devices with this superblock should only ever be accessed via device-mapper.
   1923 */
   1924#define DM_RAID_MAGIC 0x64526D44
   1925struct dm_raid_superblock {
   1926	__le32 magic;		/* "DmRd" */
   1927	__le32 compat_features;	/* Used to indicate compatible features (like 1.9.0 ondisk metadata extension) */
   1928
   1929	__le32 num_devices;	/* Number of devices in this raid set. (Max 64) */
   1930	__le32 array_position;	/* The position of this drive in the raid set */
   1931
   1932	__le64 events;		/* Incremented by md when superblock updated */
   1933	__le64 failed_devices;	/* Pre 1.9.0 part of bit field of devices to */
   1934				/* indicate failures (see extension below) */
   1935
   1936	/*
   1937	 * This offset tracks the progress of the repair or replacement of
   1938	 * an individual drive.
   1939	 */
   1940	__le64 disk_recovery_offset;
   1941
   1942	/*
   1943	 * This offset tracks the progress of the initial raid set
   1944	 * synchronisation/parity calculation.
   1945	 */
   1946	__le64 array_resync_offset;
   1947
   1948	/*
   1949	 * raid characteristics
   1950	 */
   1951	__le32 level;
   1952	__le32 layout;
   1953	__le32 stripe_sectors;
   1954
   1955	/********************************************************************
   1956	 * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
   1957	 *
   1958	 * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
   1959	 */
   1960
   1961	__le32 flags; /* Flags defining array states for reshaping */
   1962
   1963	/*
   1964	 * This offset tracks the progress of a raid
   1965	 * set reshape in order to be able to restart it
   1966	 */
   1967	__le64 reshape_position;
   1968
   1969	/*
   1970	 * These define the properties of the array in case of an interrupted reshape
   1971	 */
   1972	__le32 new_level;
   1973	__le32 new_layout;
   1974	__le32 new_stripe_sectors;
   1975	__le32 delta_disks;
   1976
   1977	__le64 array_sectors; /* Array size in sectors */
   1978
   1979	/*
   1980	 * Sector offsets to data on devices (reshaping).
   1981	 * Needed to support out of place reshaping, thus
   1982	 * not writing over any stripes whilst converting
   1983	 * them from old to new layout
   1984	 */
   1985	__le64 data_offset;
   1986	__le64 new_data_offset;
   1987
   1988	__le64 sectors; /* Used device size in sectors */
   1989
   1990	/*
   1991	 * Additional bit field of devices indicating failures to support
   1992	 * up to 256 devices with the 1.9.0 on-disk metadata format
   1993	 */
   1994	__le64 extended_failed_devices[DISKS_ARRAY_ELEMS - 1];
   1995
   1996	__le32 incompat_features;	/* Used to indicate any incompatible features */
   1997
   1998	/* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */
   1999} __packed;
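
       /*
        * Note: the superblock occupies the first logical block of each
        * metadata device (sb_start = 0) and must fit within one logical
        * block of at most PAGE_SIZE bytes; see read_disk_sb() and
        * analyse_superblocks() below.
        */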
   2000
   2001/*
   2002 * Check for reshape constraints on raid set @rs:
   2003 *
   2004 * - reshape function non-existent
   2005 * - degraded set
   2006 * - ongoing recovery
   2007 * - ongoing reshape
   2008 *
   2009 * Returns 0 if none of the constraints apply or -EPERM if one does,
   2010 * with the error message set in rs->ti->error.
   2011 */
   2012static int rs_check_reshape(struct raid_set *rs)
   2013{
   2014	struct mddev *mddev = &rs->md;
   2015
   2016	if (!mddev->pers || !mddev->pers->check_reshape)
   2017		rs->ti->error = "Reshape not supported";
   2018	else if (mddev->degraded)
   2019		rs->ti->error = "Can't reshape degraded raid set";
   2020	else if (rs_is_recovering(rs))
   2021		rs->ti->error = "Convert request on recovering raid set prohibited";
   2022	else if (rs_is_reshaping(rs))
   2023		rs->ti->error = "raid set already reshaping!";
   2024	else if (!(rs_is_raid1(rs) || rs_is_raid10(rs) || rs_is_raid456(rs)))
   2025		rs->ti->error = "Reshaping only supported for raid1/4/5/6/10";
   2026	else
   2027		return 0;
   2028
   2029	return -EPERM;
   2030}
   2031
   2032static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
   2033{
   2034	BUG_ON(!rdev->sb_page);
   2035
   2036	if (rdev->sb_loaded && !force_reload)
   2037		return 0;
   2038
   2039	rdev->sb_loaded = 0;
   2040
   2041	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
   2042		DMERR("Failed to read superblock of device at position %d",
   2043		      rdev->raid_disk);
   2044		md_error(rdev->mddev, rdev);
   2045		set_bit(Faulty, &rdev->flags);
   2046		return -EIO;
   2047	}
   2048
   2049	rdev->sb_loaded = 1;
   2050
   2051	return 0;
   2052}
   2053
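       /*
        * Conceptually, failed devices are tracked as one flat bitmap indexed
        * by raid disk number: bit i lives in failed_devices[i / 64] at bit
        * position i % 64.  Element 0 maps to sb->failed_devices; elements 1..
        * map to sb->extended_failed_devices[] (v1.9.0 extension).  E.g. a
        * failure of raid disk #70 would set bit 6 of extended_failed_devices[0].
        */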
   2054static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
   2055{
   2056	failed_devices[0] = le64_to_cpu(sb->failed_devices);
   2057	memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));
   2058
   2059	if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
   2060		int i = ARRAY_SIZE(sb->extended_failed_devices);
   2061
   2062		while (i--)
   2063			failed_devices[i+1] = le64_to_cpu(sb->extended_failed_devices[i]);
   2064	}
   2065}
   2066
   2067static void sb_update_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
   2068{
   2069	int i = ARRAY_SIZE(sb->extended_failed_devices);
   2070
   2071	sb->failed_devices = cpu_to_le64(failed_devices[0]);
   2072	while (i--)
   2073		sb->extended_failed_devices[i] = cpu_to_le64(failed_devices[i+1]);
   2074}
   2075
   2076/*
   2077 * Synchronize the superblock members with the raid set properties
   2078 *
   2079 * All superblock data is little endian.
   2080 */
   2081static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
   2082{
   2083	bool update_failed_devices = false;
   2084	unsigned int i;
   2085	uint64_t failed_devices[DISKS_ARRAY_ELEMS];
   2086	struct dm_raid_superblock *sb;
   2087	struct raid_set *rs = container_of(mddev, struct raid_set, md);
   2088
   2089	/* No metadata device, no superblock */
   2090	if (!rdev->meta_bdev)
   2091		return;
   2092
   2093	BUG_ON(!rdev->sb_page);
   2094
   2095	sb = page_address(rdev->sb_page);
   2096
   2097	sb_retrieve_failed_devices(sb, failed_devices);
   2098
   2099	for (i = 0; i < rs->raid_disks; i++)
   2100		if (!rs->dev[i].data_dev || test_bit(Faulty, &rs->dev[i].rdev.flags)) {
   2101			update_failed_devices = true;
   2102			set_bit(i, (void *) failed_devices);
   2103		}
   2104
   2105	if (update_failed_devices)
   2106		sb_update_failed_devices(sb, failed_devices);
   2107
   2108	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
   2109	sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
   2110
   2111	sb->num_devices = cpu_to_le32(mddev->raid_disks);
   2112	sb->array_position = cpu_to_le32(rdev->raid_disk);
   2113
   2114	sb->events = cpu_to_le64(mddev->events);
   2115
   2116	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
   2117	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
   2118
   2119	sb->level = cpu_to_le32(mddev->level);
   2120	sb->layout = cpu_to_le32(mddev->layout);
   2121	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
   2122
   2123	/********************************************************************
   2124	 * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
   2125	 *
   2126	 * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
   2127	 */
   2128	sb->new_level = cpu_to_le32(mddev->new_level);
   2129	sb->new_layout = cpu_to_le32(mddev->new_layout);
   2130	sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
   2131
   2132	sb->delta_disks = cpu_to_le32(mddev->delta_disks);
   2133
   2134	smp_rmb(); /* Make sure we access most recent reshape position */
   2135	sb->reshape_position = cpu_to_le64(mddev->reshape_position);
   2136	if (le64_to_cpu(sb->reshape_position) != MaxSector) {
   2137		/* Flag ongoing reshape */
   2138		sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE);
   2139
   2140		if (mddev->delta_disks < 0 || mddev->reshape_backwards)
   2141			sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_BACKWARDS);
   2142	} else {
   2143		/* Clear reshape flags */
   2144		sb->flags &= ~(cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE|SB_FLAG_RESHAPE_BACKWARDS));
   2145	}
   2146
   2147	sb->array_sectors = cpu_to_le64(mddev->array_sectors);
   2148	sb->data_offset = cpu_to_le64(rdev->data_offset);
   2149	sb->new_data_offset = cpu_to_le64(rdev->new_data_offset);
   2150	sb->sectors = cpu_to_le64(rdev->sectors);
   2151	sb->incompat_features = cpu_to_le32(0);
   2152
   2153	/* Zero out the rest of the payload after the size of the superblock */
   2154	memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
   2155}
   2156
   2157/*
   2158 * super_load
   2159 *
   2160 * This function creates a superblock if one is not found on the device
   2161 * and will decide which superblock to use if there's a choice.
   2162 *
   2163 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
   2164 */
   2165static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
   2166{
   2167	int r;
   2168	struct dm_raid_superblock *sb;
   2169	struct dm_raid_superblock *refsb;
   2170	uint64_t events_sb, events_refsb;
   2171
   2172	r = read_disk_sb(rdev, rdev->sb_size, false);
   2173	if (r)
   2174		return r;
   2175
   2176	sb = page_address(rdev->sb_page);
   2177
   2178	/*
   2179	 * Two cases in which we want to write new superblocks and rebuild:
   2180	 * 1) New device (no matching magic number)
   2181	 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
   2182	 */
   2183	if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
   2184	    (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
   2185		super_sync(rdev->mddev, rdev);
   2186
   2187		set_bit(FirstUse, &rdev->flags);
   2188		sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
   2189
   2190		/* Force writing of superblocks to disk */
   2191		set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);
   2192
   2193		/* Any superblock is better than none, choose that if given */
   2194		return refdev ? 0 : 1;
   2195	}
   2196
   2197	if (!refdev)
   2198		return 1;
   2199
   2200	events_sb = le64_to_cpu(sb->events);
   2201
   2202	refsb = page_address(refdev->sb_page);
   2203	events_refsb = le64_to_cpu(refsb->events);
   2204
   2205	return (events_sb > events_refsb) ? 1 : 0;
   2206}
   2207
   2208static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
   2209{
   2210	int role;
   2211	unsigned int d;
   2212	struct mddev *mddev = &rs->md;
   2213	uint64_t events_sb;
   2214	uint64_t failed_devices[DISKS_ARRAY_ELEMS];
   2215	struct dm_raid_superblock *sb;
   2216	uint32_t new_devs = 0, rebuild_and_new = 0, rebuilds = 0;
   2217	struct md_rdev *r;
   2218	struct dm_raid_superblock *sb2;
   2219
   2220	sb = page_address(rdev->sb_page);
   2221	events_sb = le64_to_cpu(sb->events);
   2222
   2223	/*
   2224	 * Initialise to 1 if this is a new superblock.
   2225	 */
   2226	mddev->events = events_sb ? : 1;
   2227
   2228	mddev->reshape_position = MaxSector;
   2229
   2230	mddev->raid_disks = le32_to_cpu(sb->num_devices);
   2231	mddev->level = le32_to_cpu(sb->level);
   2232	mddev->layout = le32_to_cpu(sb->layout);
   2233	mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
   2234
   2235	/*
   2236	 * Reshaping is supported, i.e. reshape_position is valid
   2237	 * in superblock and superblock content is authoritative.
   2238	 */
   2239	if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
   2240		/* Superblock is authoritative wrt given raid set layout! */
   2241		mddev->new_level = le32_to_cpu(sb->new_level);
   2242		mddev->new_layout = le32_to_cpu(sb->new_layout);
   2243		mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors);
   2244		mddev->delta_disks = le32_to_cpu(sb->delta_disks);
   2245		mddev->array_sectors = le64_to_cpu(sb->array_sectors);
   2246
   2247		/* raid was reshaping and got interrupted */
   2248		if (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_ACTIVE) {
   2249			if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
   2250				DMERR("Reshape requested but raid set is still reshaping");
   2251				return -EINVAL;
   2252			}
   2253
   2254			if (mddev->delta_disks < 0 ||
   2255			    (!mddev->delta_disks && (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_BACKWARDS)))
   2256				mddev->reshape_backwards = 1;
   2257			else
   2258				mddev->reshape_backwards = 0;
   2259
   2260			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
   2261			rs->raid_type = get_raid_type_by_ll(mddev->level, mddev->layout);
   2262		}
   2263
   2264	} else {
   2265		/*
   2266		 * No takeover/reshaping, because we don't have the extended v1.9.0 metadata
   2267		 */
   2268		struct raid_type *rt_cur = get_raid_type_by_ll(mddev->level, mddev->layout);
   2269		struct raid_type *rt_new = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
   2270
   2271		if (rs_takeover_requested(rs)) {
   2272			if (rt_cur && rt_new)
   2273				DMERR("Takeover raid sets from %s to %s not yet supported by metadata. (raid level change)",
   2274				      rt_cur->name, rt_new->name);
   2275			else
   2276				DMERR("Takeover raid sets not yet supported by metadata. (raid level change)");
   2277			return -EINVAL;
   2278		} else if (rs_reshape_requested(rs)) {
   2279			DMERR("Reshaping raid sets not yet supported by metadata. (raid layout change keeping level)");
   2280			if (mddev->layout != mddev->new_layout) {
   2281				if (rt_cur && rt_new)
   2282					DMERR("	 current layout %s vs new layout %s",
   2283					      rt_cur->name, rt_new->name);
   2284				else
   2285					DMERR("	 current layout 0x%X vs new layout 0x%X",
   2286					      le32_to_cpu(sb->layout), mddev->new_layout);
   2287			}
   2288			if (mddev->chunk_sectors != mddev->new_chunk_sectors)
   2289				DMERR("	 current stripe sectors %u vs new stripe sectors %u",
   2290				      mddev->chunk_sectors, mddev->new_chunk_sectors);
   2291			if (rs->delta_disks)
   2292				DMERR("	 current %u disks vs new %u disks",
   2293				      mddev->raid_disks, mddev->raid_disks + rs->delta_disks);
   2294			if (rs_is_raid10(rs)) {
   2295				DMERR("	 Old layout: %s w/ %u copies",
   2296				      raid10_md_layout_to_format(mddev->layout),
   2297				      raid10_md_layout_to_copies(mddev->layout));
   2298				DMERR("	 New layout: %s w/ %u copies",
   2299				      raid10_md_layout_to_format(mddev->new_layout),
   2300				      raid10_md_layout_to_copies(mddev->new_layout));
   2301			}
   2302			return -EINVAL;
   2303		}
   2304
   2305		DMINFO("Discovered old metadata format; upgrading to extended metadata format");
   2306	}
   2307
   2308	if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
   2309		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
   2310
   2311	/*
   2312	 * During load, we set FirstUse if a new superblock was written.
   2313	 * There are three reasons we might not have a superblock:
   2314	 * 1) The raid set is brand new - in which case, all of the
   2315	 *    devices must have their In_sync bit set.	Also,
   2316	 *    recovery_cp must be 0, unless forced.
   2317	 * 2) This is a new device being added to an old raid set
   2318	 *    and the new device needs to be rebuilt - in which
   2319	 *    case the In_sync bit will /not/ be set and
   2320	 *    recovery_cp must be MaxSector.
   2321	 * 3) This is/are a new device(s) being added to an old
   2322	 *    raid set during takeover to a higher raid level
   2323	 *    to provide capacity for redundancy or during reshape
   2324	 *    to add capacity to grow the raid set.
   2325	 */
   2326	d = 0;
   2327	rdev_for_each(r, mddev) {
   2328		if (test_bit(Journal, &rdev->flags))
   2329			continue;
   2330
   2331		if (test_bit(FirstUse, &r->flags))
   2332			new_devs++;
   2333
   2334		if (!test_bit(In_sync, &r->flags)) {
   2335			DMINFO("Device %d specified for rebuild; clearing superblock",
   2336				r->raid_disk);
   2337			rebuilds++;
   2338
   2339			if (test_bit(FirstUse, &r->flags))
   2340				rebuild_and_new++;
   2341		}
   2342
   2343		d++;
   2344	}
   2345
   2346	if (new_devs == rs->raid_disks || !rebuilds) {
   2347		/* Replace a broken device */
   2348		if (new_devs == rs->raid_disks) {
   2349			DMINFO("Superblocks created for new raid set");
   2350			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
   2351		} else if (new_devs != rebuilds &&
   2352			   new_devs != rs->delta_disks) {
   2353			DMERR("New device injected into existing raid set without "
   2354			      "'delta_disks' or 'rebuild' parameter specified");
   2355			return -EINVAL;
   2356		}
   2357	} else if (new_devs && new_devs != rebuilds) {
   2358		DMERR("%u 'rebuild' devices cannot be injected into"
   2359		      " a raid set with %u other first-time devices",
   2360		      rebuilds, new_devs);
   2361		return -EINVAL;
   2362	} else if (rebuilds) {
   2363		if (rebuild_and_new && rebuilds != rebuild_and_new) {
   2364			DMERR("new device%s provided without 'rebuild'",
   2365			      new_devs > 1 ? "s" : "");
   2366			return -EINVAL;
   2367		} else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
   2368			DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
   2369			      (unsigned long long) mddev->recovery_cp);
   2370			return -EINVAL;
   2371		} else if (rs_is_reshaping(rs)) {
   2372			DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)",
   2373			      (unsigned long long) mddev->reshape_position);
   2374			return -EINVAL;
   2375		}
   2376	}
   2377
   2378	/*
   2379	 * Now we set the Faulty bit for those devices that are
   2380	 * recorded in the superblock as failed.
   2381	 */
   2382	sb_retrieve_failed_devices(sb, failed_devices);
   2383	rdev_for_each(r, mddev) {
   2384		if (test_bit(Journal, &rdev->flags) ||
   2385		    !r->sb_page)
   2386			continue;
   2387		sb2 = page_address(r->sb_page);
   2388		sb2->failed_devices = 0;
   2389		memset(sb2->extended_failed_devices, 0, sizeof(sb2->extended_failed_devices));
   2390
   2391		/*
   2392		 * Check for any device re-ordering.
   2393		 */
   2394		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
   2395			role = le32_to_cpu(sb2->array_position);
   2396			if (role < 0)
   2397				continue;
   2398
   2399			if (role != r->raid_disk) {
   2400				if (rs_is_raid10(rs) && __is_raid10_near(mddev->layout)) {
   2401					if (mddev->raid_disks % __raid10_near_copies(mddev->layout) ||
   2402					    rs->raid_disks % rs->raid10_copies) {
   2403						rs->ti->error =
   2404							"Cannot change raid10 near set to odd # of devices!";
   2405						return -EINVAL;
   2406					}
   2407
   2408					sb2->array_position = cpu_to_le32(r->raid_disk);
   2409
   2410				} else if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)) &&
   2411					   !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) &&
   2412					   !rt_is_raid1(rs->raid_type)) {
   2413					rs->ti->error = "Cannot change device positions in raid set";
   2414					return -EINVAL;
   2415				}
   2416
   2417				DMINFO("raid device #%d now at position #%d", role, r->raid_disk);
   2418			}
   2419
   2420			/*
   2421			 * Partial recovery is performed on
   2422			 * returning failed devices.
   2423			 */
   2424			if (test_bit(role, (void *) failed_devices))
   2425				set_bit(Faulty, &r->flags);
   2426		}
   2427	}
   2428
   2429	return 0;
   2430}
   2431
   2432static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
   2433{
   2434	struct mddev *mddev = &rs->md;
   2435	struct dm_raid_superblock *sb;
   2436
   2437	if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
   2438		return 0;
   2439
   2440	sb = page_address(rdev->sb_page);
   2441
   2442	/*
   2443	 * If mddev->events is not set, we know we have not yet initialized
   2444	 * the array.
   2445	 */
   2446	if (!mddev->events && super_init_validation(rs, rdev))
   2447		return -EINVAL;
   2448
   2449	if (le32_to_cpu(sb->compat_features) &&
   2450	    le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V190) {
   2451		rs->ti->error = "Unable to assemble array: Unknown flag(s) in compatible feature flags";
   2452		return -EINVAL;
   2453	}
   2454
   2455	if (sb->incompat_features) {
   2456		rs->ti->error = "Unable to assemble array: No incompatible feature flags supported yet";
   2457		return -EINVAL;
   2458	}
   2459
   2460	/* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */
   2461	mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096);
   2462	mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
   2463
   2464	if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
   2465		/*
   2466		 * Retrieve rdev size stored in superblock to be prepared for shrink.
   2467		 * Check extended superblock members are present, otherwise the size
   2468		 * will not be set!
   2469		 */
   2470		if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190)
   2471			rdev->sectors = le64_to_cpu(sb->sectors);
   2472
   2473		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
   2474		if (rdev->recovery_offset == MaxSector)
   2475			set_bit(In_sync, &rdev->flags);
   2476		/*
   2477		 * If no reshape in progress -> we're recovering single
   2478		 * disk(s) and have to set the device(s) to out-of-sync
   2479		 */
   2480		else if (!rs_is_reshaping(rs))
   2481			clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */
   2482	}
   2483
   2484	/*
   2485	 * If a device comes back, set it as not In_sync and no longer faulty.
   2486	 */
   2487	if (test_and_clear_bit(Faulty, &rdev->flags)) {
   2488		rdev->recovery_offset = 0;
   2489		clear_bit(In_sync, &rdev->flags);
   2490		rdev->saved_raid_disk = rdev->raid_disk;
   2491	}
   2492
   2493	/* Reshape support -> restore respective data offsets */
   2494	rdev->data_offset = le64_to_cpu(sb->data_offset);
   2495	rdev->new_data_offset = le64_to_cpu(sb->new_data_offset);
   2496
   2497	return 0;
   2498}
   2499
   2500/*
   2501 * Analyse superblocks and select the freshest.
   2502 */
   2503static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
   2504{
   2505	int r;
   2506	struct md_rdev *rdev, *freshest;
   2507	struct mddev *mddev = &rs->md;
   2508
   2509	freshest = NULL;
   2510	rdev_for_each(rdev, mddev) {
   2511		if (test_bit(Journal, &rdev->flags))
   2512			continue;
   2513
   2514		if (!rdev->meta_bdev)
   2515			continue;
   2516
   2517		/* Set superblock offset/size for metadata device. */
   2518		rdev->sb_start = 0;
   2519		rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
   2520		if (rdev->sb_size < sizeof(struct dm_raid_superblock) || rdev->sb_size > PAGE_SIZE) {
   2521			DMERR("superblock size of a logical block is no longer valid");
   2522			return -EINVAL;
   2523		}
   2524
   2525		/*
   2526		 * Skipping super_load due to CTR_FLAG_SYNC will cause
   2527		 * the array to undergo initialization again as
   2528		 * though it were new.	This is the intended effect
   2529		 * of the "sync" directive.
   2530		 *
   2531		 * With reshaping capability added, we must ensure that
   2532		 * the "sync" directive is disallowed during the reshape.
   2533		 */
   2534		if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
   2535			continue;
   2536
   2537		r = super_load(rdev, freshest);
   2538
   2539		switch (r) {
   2540		case 1:
   2541			freshest = rdev;
   2542			break;
   2543		case 0:
   2544			break;
   2545		default:
   2546			/* This is a failure to read the superblock from the metadata device. */
   2547			/*
   2548			 * We have to keep any raid0 data/metadata device pairs or
   2549			 * the MD raid0 personality will fail to start the array.
   2550			 */
   2551			if (rs_is_raid0(rs))
   2552				continue;
   2553
   2554			/*
   2555			 * We keep the dm_devs to be able to emit the device tuple
   2556			 * properly on the table line in raid_status() (rather than
   2557			 * mistakenly acting as if '- -' got passed into the constructor).
   2558			 *
   2559			 * The rdev has to stay on the same_set list to allow for
   2560			 * the attempt to restore faulty devices on second resume.
   2561			 */
   2562			rdev->raid_disk = rdev->saved_raid_disk = -1;
   2563			break;
   2564		}
   2565	}
   2566
   2567	if (!freshest)
   2568		return 0;
   2569
   2570	/*
   2571	 * Validation of the freshest device provides the source of
   2572	 * validation for the remaining devices.
   2573	 */
   2574	rs->ti->error = "Unable to assemble array: Invalid superblocks";
   2575	if (super_validate(rs, freshest))
   2576		return -EINVAL;
   2577
   2578	if (validate_raid_redundancy(rs)) {
   2579		rs->ti->error = "Insufficient redundancy to activate array";
   2580		return -EINVAL;
   2581	}
   2582
   2583	rdev_for_each(rdev, mddev)
   2584		if (!test_bit(Journal, &rdev->flags) &&
   2585		    rdev != freshest &&
   2586		    super_validate(rs, rdev))
   2587			return -EINVAL;
   2588	return 0;
   2589}
   2590
   2591/*
   2592 * Adjust data_offset and new_data_offset on all disk members of @rs
   2593 * for out-of-place reshaping if requested by the constructor
   2594 *
   2595 * We need free space at the beginning of each raid disk for forward
   2596 * and at the end for backward reshapes which userspace has to provide
   2597 * via remapping/reordering of space.
   2598 */
   2599static int rs_adjust_data_offsets(struct raid_set *rs)
   2600{
   2601	sector_t data_offset = 0, new_data_offset = 0;
   2602	struct md_rdev *rdev;
   2603
   2604	/* Constructor did not request data offset change */
   2605	if (!test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
   2606		if (!rs_is_reshapable(rs))
   2607			goto out;
   2608
   2609		return 0;
   2610	}
   2611
   2612	/* HM FIXME: get In_Sync raid_dev? */
   2613	rdev = &rs->dev[0].rdev;
   2614
   2615	if (rs->delta_disks < 0) {
   2616		/*
   2617		 * Removing disks (reshaping backwards):
   2618		 *
   2619		 * - before reshape: data is at offset 0 and free space
   2620		 *		     is at end of each component LV
   2621		 *
   2622		 * - after reshape: data is at offset rs->data_offset != 0 on each component LV
   2623		 */
   2624		data_offset = 0;
   2625		new_data_offset = rs->data_offset;
   2626
   2627	} else if (rs->delta_disks > 0) {
   2628		/*
   2629		 * Adding disks (reshaping forwards):
   2630		 *
   2631		 * - before reshape: data is at offset rs->data_offset != 0 and
   2632		 *		     free space is at begin of each component LV
   2633		 *
   2634		 * - after reshape: data is at offset 0 on each component LV
   2635		 */
   2636		data_offset = rs->data_offset;
   2637		new_data_offset = 0;
   2638
   2639	} else {
   2640		/*
   2641		 * User space passes in 0 for data offset after having removed reshape space
   2642		 *
   2643		 * - or - (data offset != 0)
   2644		 *
   2645		 * Changing RAID layout or chunk size -> toggle offsets
   2646		 *
   2647		 * - before reshape: data is at offset rs->data_offset == 0 and
   2648		 *		     free space is at end of each component LV
   2649		 *		     -or-
   2650		 *                   data is at offset rs->data_offset != 0 and
   2651		 *		     free space is at begin of each component LV
   2652		 *
   2653		 * - after reshape: data is at offset 0 if it was at offset != 0
   2654		 *                  or at offset != 0 if it was at offset 0
   2655		 *                  on each component LV
   2656		 *
   2657		 */
   2658		data_offset = rs->data_offset ? rdev->data_offset : 0;
   2659		new_data_offset = data_offset ? 0 : rs->data_offset;
   2660		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
   2661	}
   2662
   2663	/*
   2664	 * Make sure we got a minimum amount of free sectors per device
   2665	 */
   2666	if (rs->data_offset &&
   2667	    bdev_nr_sectors(rdev->bdev) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) {
   2668		rs->ti->error = data_offset ? "No space for forward reshape" :
   2669					      "No space for backward reshape";
   2670		return -ENOSPC;
   2671	}
   2672out:
   2673	/*
   2674	 * Raise recovery_cp in case data_offset != 0 to
   2675	 * avoid false recovery positives in the constructor.
   2676	 */
   2677	if (rs->md.recovery_cp < rs->md.dev_sectors)
   2678		rs->md.recovery_cp += rs->dev[0].rdev.data_offset;
   2679
   2680	/* Adjust data offsets on all rdevs except any raid4/5/6 journal device */
   2681	rdev_for_each(rdev, &rs->md) {
   2682		if (!test_bit(Journal, &rdev->flags)) {
   2683			rdev->data_offset = data_offset;
   2684			rdev->new_data_offset = new_data_offset;
   2685		}
   2686	}
   2687
   2688	return 0;
   2689}
   2690
   2691/* Userspace reordered disks -> adjust raid_disk indexes in @rs */
   2692static void __reorder_raid_disk_indexes(struct raid_set *rs)
   2693{
   2694	int i = 0;
   2695	struct md_rdev *rdev;
   2696
   2697	rdev_for_each(rdev, &rs->md) {
   2698		if (!test_bit(Journal, &rdev->flags)) {
   2699			rdev->raid_disk = i++;
   2700			rdev->saved_raid_disk = rdev->new_raid_disk = -1;
   2701		}
   2702	}
   2703}
   2704
   2705/*
   2706 * Setup @rs for takeover by a different raid level
   2707 */
   2708static int rs_setup_takeover(struct raid_set *rs)
   2709{
   2710	struct mddev *mddev = &rs->md;
   2711	struct md_rdev *rdev;
   2712	unsigned int d = mddev->raid_disks = rs->raid_disks;
   2713	sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset;
   2714
   2715	if (rt_is_raid10(rs->raid_type)) {
   2716		if (rs_is_raid0(rs)) {
   2717			/* Userspace reordered disks -> adjust raid_disk indexes */
   2718			__reorder_raid_disk_indexes(rs);
   2719
   2720			/* raid0 -> raid10_far layout */
   2721			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
   2722								   rs->raid10_copies);
   2723		} else if (rs_is_raid1(rs))
   2724			/* raid1 -> raid10_near layout */
   2725			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
   2726								   rs->raid_disks);
   2727		else
   2728			return -EINVAL;
   2729
   2730	}
   2731
   2732	clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
   2733	mddev->recovery_cp = MaxSector;
   2734
   2735	while (d--) {
   2736		rdev = &rs->dev[d].rdev;
   2737
   2738		if (test_bit(d, (void *) rs->rebuild_disks)) {
   2739			clear_bit(In_sync, &rdev->flags);
   2740			clear_bit(Faulty, &rdev->flags);
   2741			mddev->recovery_cp = rdev->recovery_offset = 0;
   2742			/* Bitmap has to be created when we do an "up" takeover */
   2743			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
   2744		}
   2745
   2746		rdev->new_data_offset = new_data_offset;
   2747	}
   2748
   2749	return 0;
   2750}
   2751
   2752/* Prepare @rs for reshape */
   2753static int rs_prepare_reshape(struct raid_set *rs)
   2754{
   2755	bool reshape;
   2756	struct mddev *mddev = &rs->md;
   2757
   2758	if (rs_is_raid10(rs)) {
   2759		if (rs->raid_disks != mddev->raid_disks &&
   2760		    __is_raid10_near(mddev->layout) &&
   2761		    rs->raid10_copies &&
   2762		    rs->raid10_copies != __raid10_near_copies(mddev->layout)) {
   2763			/*
   2764			 * raid disks have to be a multiple of data copies to allow this conversion;
   2765			 *
   2766			 * this is actually not a reshape, it is a
   2767			 * rebuild of any additional mirrors per group.
   2768			 */
   2769			if (rs->raid_disks % rs->raid10_copies) {
   2770				rs->ti->error = "Can't reshape raid10 mirror groups";
   2771				return -EINVAL;
   2772			}
   2773
   2774			/* Userspace reordered disks to add/remove mirrors -> adjust raid_disk indexes */
   2775			__reorder_raid_disk_indexes(rs);
   2776			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
   2777								   rs->raid10_copies);
   2778			mddev->new_layout = mddev->layout;
   2779			reshape = false;
   2780		} else
   2781			reshape = true;
   2782
   2783	} else if (rs_is_raid456(rs))
   2784		reshape = true;
   2785
   2786	else if (rs_is_raid1(rs)) {
   2787		if (rs->delta_disks) {
   2788			/* Process raid1 via delta_disks */
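       			/* E.g. (illustrative) growing 2 -> 3 raid1 legs (delta_disks = 1) leaves one leg to recover */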
   2789			mddev->degraded = rs->delta_disks < 0 ? -rs->delta_disks : rs->delta_disks;
   2790			reshape = true;
   2791		} else {
   2792			/* Process raid1 without delta_disks */
   2793			mddev->raid_disks = rs->raid_disks;
   2794			reshape = false;
   2795		}
   2796	} else {
   2797		rs->ti->error = "Called with bogus raid type";
   2798		return -EINVAL;
   2799	}
   2800
   2801	if (reshape) {
   2802		set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
   2803		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
   2804	} else if (mddev->raid_disks < rs->raid_disks)
   2805		/* Create new superblocks and bitmaps, if any new disks */
   2806		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
   2807
   2808	return 0;
   2809}
   2810
   2811/* Get reshape sectors from data_offsets or raid set */
   2812static sector_t _get_reshape_sectors(struct raid_set *rs)
   2813{
   2814	struct md_rdev *rdev;
   2815	sector_t reshape_sectors = 0;
   2816
   2817	rdev_for_each(rdev, &rs->md)
   2818		if (!test_bit(Journal, &rdev->flags)) {
   2819			reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ?
   2820					rdev->data_offset - rdev->new_data_offset :
   2821					rdev->new_data_offset - rdev->data_offset;
   2822			break;
   2823		}
   2824
   2825	return max(reshape_sectors, (sector_t) rs->data_offset);
   2826}
   2827
   2828/*
   2829 * Reshape:
   2830 * - change raid layout
   2831 * - change chunk size
   2832 * - add disks
   2833 * - remove disks
   2834 */
   2835static int rs_setup_reshape(struct raid_set *rs)
   2836{
   2837	int r = 0;
   2838	unsigned int cur_raid_devs, d;
   2839	sector_t reshape_sectors = _get_reshape_sectors(rs);
   2840	struct mddev *mddev = &rs->md;
   2841	struct md_rdev *rdev;
   2842
   2843	mddev->delta_disks = rs->delta_disks;
   2844	cur_raid_devs = mddev->raid_disks;
   2845
   2846	/* Ignore impossible layout change whilst adding/removing disks */
   2847	if (mddev->delta_disks &&
   2848	    mddev->layout != mddev->new_layout) {
   2849		DMINFO("Ignoring invalid layout change with delta_disks=%d", rs->delta_disks);
   2850		mddev->new_layout = mddev->layout;
   2851	}
   2852
   2853	/*
   2854	 * Adjust array size:
   2855	 *
   2856	 * - in case of adding disk(s), array size has
   2857	 *   to grow after the disk adding reshape,
   2858	 *   which'll happen in the event handler;
   2859	 *   reshape will happen forward, so space has to
   2860	 *   be available at the beginning of each disk
   2861	 *
   2862	 * - in case of removing disk(s), array size
   2863	 *   has to shrink before starting the reshape,
   2864	 *   which'll happen here;
   2865	 *   reshape will happen backward, so space has to
   2866	 *   be available at the end of each disk
   2867	 *
   2868	 * - data_offset and new_data_offset are
   2869	 *   adjusted for aforementioned out of place
   2870	 *   reshaping based on userspace passing in
   2871	 *   the "data_offset <sectors>" key/value
   2872	 *   pair via the constructor
   2873	 */
   2874
   2875	/* Add disk(s) */
   2876	if (rs->delta_disks > 0) {
   2877		/* Prepare disks for check in raid4/5/6/10 {check|start}_reshape */
   2878		for (d = cur_raid_devs; d < rs->raid_disks; d++) {
   2879			rdev = &rs->dev[d].rdev;
   2880			clear_bit(In_sync, &rdev->flags);
   2881
   2882			/*
   2883			 * saved_raid_disk needs to be -1, or recovery_offset will be set to 0
   2884			 * by md, which'll store that erroneously in the superblock on reshape
   2885			 */
   2886			rdev->saved_raid_disk = -1;
   2887			rdev->raid_disk = d;
   2888
   2889			rdev->sectors = mddev->dev_sectors;
   2890			rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector;
   2891		}
   2892
   2893		mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */
   2894
   2895	/* Remove disk(s) */
   2896	} else if (rs->delta_disks < 0) {
   2897		r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true);
   2898		mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */
   2899
   2900	/* Change layout and/or chunk size */
   2901	} else {
   2902		/*
   2903		 * Reshape layout (e.g. raid5_ls -> raid5_n) and/or chunk size:
   2904		 *
   2905		 * keeping number of disks and do layout change ->
   2906		 *
   2907		 * toggle reshape_backward depending on data_offset:
   2908		 *
   2909		 * - free space upfront -> reshape forward
   2910		 *
   2911		 * - free space at the end -> reshape backward
   2912		 *
   2913		 *
   2914		 * This utilizes free reshape space avoiding the need
   2915		 * for userspace to move (parts of) LV segments in
   2916		 * case of layout/chunksize change (for disk
   2917		 * adding/removing, reshape space has to be at
   2918		 * the proper address, see above with delta_disks):
   2919		 *
   2920		 * add disk(s)   -> begin
   2921		 * remove disk(s)-> end
   2922		 */
   2923		mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1;
   2924	}
   2925
   2926	/*
   2927	 * Adjust device size for forward reshape
   2928	 * because md_finish_reshape() reduces it.
   2929	 */
   2930	if (!mddev->reshape_backwards)
   2931		rdev_for_each(rdev, &rs->md)
   2932			if (!test_bit(Journal, &rdev->flags))
   2933				rdev->sectors += reshape_sectors;
   2934
   2935	return r;
   2936}
   2937
   2938/*
   2939 * If the md resync thread has updated superblock with max reshape position
   2940 * at the end of a reshape but not (yet) reset the layout configuration
   2941 * changes -> reset the latter.
   2942 */
   2943static void rs_reset_inconclusive_reshape(struct raid_set *rs)
   2944{
   2945	if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) {
   2946		rs_set_cur(rs);
   2947		rs->md.delta_disks = 0;
   2948		rs->md.reshape_backwards = 0;
   2949	}
   2950}
   2951
   2952/*
   2953 * Enable/disable discard support on RAID set depending on
   2954 * RAID level and discard properties of underlying RAID members.
   2955 */
   2956static void configure_discard_support(struct raid_set *rs)
   2957{
   2958	int i;
   2959	bool raid456;
   2960	struct dm_target *ti = rs->ti;
   2961
   2962	/*
   2963	 * XXX: RAID level 4,5,6 require zeroing for safety.
   2964	 */
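       	/*
       	 * For raid4/5/6, discards stay disabled unless the administrator has
       	 * asserted that the devices zero discarded blocks reliably, e.g. by
       	 * booting with dm-raid.devices_handle_discard_safely=Y (see the
       	 * messages below).
       	 */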
   2965	raid456 = rs_is_raid456(rs);
   2966
   2967	for (i = 0; i < rs->raid_disks; i++) {
   2968		if (!rs->dev[i].rdev.bdev ||
   2969		    !bdev_max_discard_sectors(rs->dev[i].rdev.bdev))
   2970			return;
   2971
   2972		if (raid456) {
   2973			if (!devices_handle_discard_safely) {
   2974				DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
   2975				DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
   2976				return;
   2977			}
   2978		}
   2979	}
   2980
   2981	ti->num_discard_bios = 1;
   2982}
   2983
   2984/*
   2985 * Construct a RAID0/1/10/4/5/6 mapping:
   2986 * Args:
   2987 *	<raid_type> <#raid_params> <raid_params>{0,}	\
   2988 *	<#raid_devs> [<meta_dev1> <dev1>]{1,}
   2989 *
   2990 * <raid_params> varies by <raid_type>.	 See 'parse_raid_params' for
   2991 * details on possible <raid_params>.
   2992 *
   2993 * Userspace is free to initialize the metadata devices (and hence the superblocks)
   2994 * to enforce recreation based on the passed-in table parameters.
   2995 *
   2996 */
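       /*
        * Illustrative (hypothetical) table line: a 5-device raid4 set with a
        * 1 MiB chunk size (2048 sectors) and no metadata devices:
        *
        *   0 1960893648 raid raid4 1 2048 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
        */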
   2997static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
   2998{
   2999	int r;
   3000	bool resize = false;
   3001	struct raid_type *rt;
   3002	unsigned int num_raid_params, num_raid_devs;
   3003	sector_t sb_array_sectors, rdev_sectors, reshape_sectors;
   3004	struct raid_set *rs = NULL;
   3005	const char *arg;
   3006	struct rs_layout rs_layout;
   3007	struct dm_arg_set as = { argc, argv }, as_nrd;
   3008	struct dm_arg _args[] = {
   3009		{ 0, as.argc, "Cannot understand number of raid parameters" },
   3010		{ 1, 254, "Cannot understand number of raid devices parameters" }
   3011	};
   3012
   3013	arg = dm_shift_arg(&as);
   3014	if (!arg) {
   3015		ti->error = "No arguments";
   3016		return -EINVAL;
   3017	}
   3018
   3019	rt = get_raid_type(arg);
   3020	if (!rt) {
   3021		ti->error = "Unrecognised raid_type";
   3022		return -EINVAL;
   3023	}
   3024
   3025	/* Must have <#raid_params> */
   3026	if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error))
   3027		return -EINVAL;
   3028
   3029	/* number of raid device tuples <meta_dev data_dev> */
   3030	as_nrd = as;
   3031	dm_consume_args(&as_nrd, num_raid_params);
   3032	_args[1].max = (as_nrd.argc - 1) / 2;
   3033	if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error))
   3034		return -EINVAL;
   3035
   3036	if (!__within_range(num_raid_devs, 1, MAX_RAID_DEVICES)) {
   3037		ti->error = "Invalid number of supplied raid devices";
   3038		return -EINVAL;
   3039	}
   3040
   3041	rs = raid_set_alloc(ti, rt, num_raid_devs);
   3042	if (IS_ERR(rs))
   3043		return PTR_ERR(rs);
   3044
   3045	r = parse_raid_params(rs, &as, num_raid_params);
   3046	if (r)
   3047		goto bad;
   3048
   3049	r = parse_dev_params(rs, &as);
   3050	if (r)
   3051		goto bad;
   3052
   3053	rs->md.sync_super = super_sync;
   3054
   3055	/*
   3056	 * Calculate ctr requested array and device sizes to allow
   3057	 * for superblock analysis needing device sizes defined.
   3058	 *
   3059	 * Any existing superblock will overwrite the array and device sizes
   3060	 */
   3061	r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
   3062	if (r)
   3063		goto bad;
   3064
   3065	/* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */
   3066	rs->array_sectors = rs->md.array_sectors;
   3067	rs->dev_sectors = rs->md.dev_sectors;
   3068
   3069	/*
   3070	 * Backup any new raid set level, layout, ...
   3071	 * requested to be able to compare to superblock
   3072	 * members for conversion decisions.
   3073	 */
   3074	rs_config_backup(rs, &rs_layout);
   3075
   3076	r = analyse_superblocks(ti, rs);
   3077	if (r)
   3078		goto bad;
   3079
   3080	/* All in-core metadata now as of current superblocks after calling analyse_superblocks() */
   3081	sb_array_sectors = rs->md.array_sectors;
   3082	rdev_sectors = __rdev_sectors(rs);
   3083	if (!rdev_sectors) {
   3084		ti->error = "Invalid rdev size";
   3085		r = -EINVAL;
   3086		goto bad;
   3087	}
   3088
   3089
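       	/*
       	 * If the ctr-requested device size differs from the rdev size read
       	 * from the superblocks (minus any out-of-place reshape space), the
       	 * set is being resized; a larger request flags a grow for preresume.
       	 */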
   3090	reshape_sectors = _get_reshape_sectors(rs);
   3091	if (rs->dev_sectors != rdev_sectors) {
   3092		resize = (rs->dev_sectors != rdev_sectors - reshape_sectors);
   3093		if (rs->dev_sectors > rdev_sectors - reshape_sectors)
   3094			set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
   3095	}
   3096
   3097	INIT_WORK(&rs->md.event_work, do_table_event);
   3098	ti->private = rs;
   3099	ti->num_flush_bios = 1;
   3100
   3101	/* Restore any requested new layout for conversion decision */
   3102	rs_config_restore(rs, &rs_layout);
   3103
   3104	/*
   3105	 * Now that we have any superblock metadata available,
   3106	 * check for new, recovering, reshaping, to be taken over,
   3107	 * to be reshaped or an existing, unchanged raid set to
   3108	 * run in sequence.
   3109	 */
   3110	if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) {
   3111		/* A new raid6 set has to be recovered to ensure proper parity and Q-Syndrome */
   3112		if (rs_is_raid6(rs) &&
   3113		    test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
   3114			ti->error = "'nosync' not allowed for new raid6 set";
   3115			r = -EINVAL;
   3116			goto bad;
   3117		}
   3118		rs_setup_recovery(rs, 0);
   3119		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
   3120		rs_set_new(rs);
   3121	} else if (rs_is_recovering(rs)) {
   3122		/* A recovering raid set may be resized */
   3123		goto size_check;
   3124	} else if (rs_is_reshaping(rs)) {
   3125		/* Have to reject size change request during reshape */
   3126		if (resize) {
   3127			ti->error = "Can't resize a reshaping raid set";
   3128			r = -EPERM;
   3129			goto bad;
   3130		}
   3131		/* skip setup rs */
   3132	} else if (rs_takeover_requested(rs)) {
   3133		if (rs_is_reshaping(rs)) {
   3134			ti->error = "Can't takeover a reshaping raid set";
   3135			r = -EPERM;
   3136			goto bad;
   3137		}
   3138
   3139		/* We can't takeover a journaled raid4/5/6 */
   3140		if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
   3141			ti->error = "Can't takeover a journaled raid4/5/6 set";
   3142			r = -EPERM;
   3143			goto bad;
   3144		}
   3145
   3146		/*
   3147		 * If a takeover is needed, userspace sets any additional
   3148		 * devices to rebuild and we can check for a valid request here.
   3149		 *
   3150		 * If acceptable, set the level to the new requested
   3151		 * one, prohibit requesting recovery, allow the raid
   3152		 * set to run and store superblocks during resume.
   3153		 */
   3154		r = rs_check_takeover(rs);
   3155		if (r)
   3156			goto bad;
   3157
   3158		r = rs_setup_takeover(rs);
   3159		if (r)
   3160			goto bad;
   3161
   3162		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
   3163		/* Takeover ain't recovery, so disable recovery */
   3164		rs_setup_recovery(rs, MaxSector);
   3165		rs_set_new(rs);
   3166	} else if (rs_reshape_requested(rs)) {
   3167		/* Only request grow on raid set size extensions, not on reshapes. */
   3168		clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
   3169
   3170		/*
   3171		 * No need to check for 'ongoing' takeover here, because takeover
    3172		 * is an instant operation as opposed to an ongoing reshape.
   3173		 */
   3174
   3175		/* We can't reshape a journaled raid4/5/6 */
   3176		if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
   3177			ti->error = "Can't reshape a journaled raid4/5/6 set";
   3178			r = -EPERM;
   3179			goto bad;
   3180		}
   3181
   3182		/* Out-of-place space has to be available to allow for a reshape unless raid1! */
   3183		if (reshape_sectors || rs_is_raid1(rs)) {
   3184			/*
    3185			 * We can only prepare for a reshape here, because the
    3186			 * raid set needs to run to provide the respective reshape
    3187			 * check functions via its MD personality instance.
    3188			 *
    3189			 * So do the reshape check after md_run() succeeded.
    3190			 */
   3191			r = rs_prepare_reshape(rs);
   3192			if (r)
   3193				goto bad;
   3194
   3195			/* Reshaping ain't recovery, so disable recovery */
   3196			rs_setup_recovery(rs, MaxSector);
   3197		}
   3198		rs_set_cur(rs);
   3199	} else {
   3200size_check:
   3201		/* May not set recovery when a device rebuild is requested */
   3202		if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
   3203			clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
   3204			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
   3205			rs_setup_recovery(rs, MaxSector);
   3206		} else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) {
   3207			/*
   3208			 * Set raid set to current size, i.e. size as of
   3209			 * superblocks to grow to larger size in preresume.
   3210			 */
   3211			r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false);
   3212			if (r)
   3213				goto bad;
   3214
   3215			rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors);
   3216		} else {
    3217		/* There is no size change or the set is shrinking; update the size and record it in the superblocks */
   3218			r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
   3219			if (r)
   3220				goto bad;
   3221
   3222			if (sb_array_sectors > rs->array_sectors)
   3223				set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
   3224		}
   3225		rs_set_cur(rs);
   3226	}
   3227
   3228	/* If constructor requested it, change data and new_data offsets */
   3229	r = rs_adjust_data_offsets(rs);
   3230	if (r)
   3231		goto bad;
   3232
   3233	/* Catch any inconclusive reshape superblock content. */
   3234	rs_reset_inconclusive_reshape(rs);
   3235
   3236	/* Start raid set read-only and assumed clean to change in raid_resume() */
   3237	rs->md.ro = 1;
   3238	rs->md.in_sync = 1;
   3239
   3240	/* Keep array frozen until resume. */
   3241	set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
   3242
    3243	/* The mddev lock has to be held while running the array */
   3244	mddev_lock_nointr(&rs->md);
   3245	r = md_run(&rs->md);
   3246	rs->md.in_sync = 0; /* Assume already marked dirty */
   3247	if (r) {
   3248		ti->error = "Failed to run raid array";
   3249		mddev_unlock(&rs->md);
   3250		goto bad;
   3251	}
   3252
   3253	r = md_start(&rs->md);
   3254	if (r) {
   3255		ti->error = "Failed to start raid array";
   3256		mddev_unlock(&rs->md);
   3257		goto bad_md_start;
   3258	}
   3259
   3260	/* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
   3261	if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
   3262		r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
   3263		if (r) {
   3264			ti->error = "Failed to set raid4/5/6 journal mode";
   3265			mddev_unlock(&rs->md);
   3266			goto bad_journal_mode_set;
   3267		}
   3268	}
   3269
   3270	mddev_suspend(&rs->md);
   3271	set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
   3272
   3273	/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
   3274	if (rs_is_raid456(rs)) {
   3275		r = rs_set_raid456_stripe_cache(rs);
   3276		if (r)
   3277			goto bad_stripe_cache;
   3278	}
   3279
   3280	/* Now do an early reshape check */
   3281	if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
   3282		r = rs_check_reshape(rs);
   3283		if (r)
   3284			goto bad_check_reshape;
   3285
   3286		/* Restore new, ctr requested layout to perform check */
   3287		rs_config_restore(rs, &rs_layout);
   3288
   3289		if (rs->md.pers->start_reshape) {
   3290			r = rs->md.pers->check_reshape(&rs->md);
   3291			if (r) {
   3292				ti->error = "Reshape check failed";
   3293				goto bad_check_reshape;
   3294			}
   3295		}
   3296	}
   3297
   3298	/* Disable/enable discard support on raid set. */
   3299	configure_discard_support(rs);
   3300
   3301	mddev_unlock(&rs->md);
   3302	return 0;
   3303
   3304bad_md_start:
   3305bad_journal_mode_set:
   3306bad_stripe_cache:
   3307bad_check_reshape:
   3308	md_stop(&rs->md);
   3309bad:
   3310	raid_set_free(rs);
   3311
   3312	return r;
   3313}
   3314
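        /* Destructor: stop the MD array and free all raid set resources. */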
   3315static void raid_dtr(struct dm_target *ti)
   3316{
   3317	struct raid_set *rs = ti->private;
   3318
   3319	md_stop(&rs->md);
   3320	raid_set_free(rs);
   3321}
   3322
   3323static int raid_map(struct dm_target *ti, struct bio *bio)
   3324{
   3325	struct raid_set *rs = ti->private;
   3326	struct mddev *mddev = &rs->md;
   3327
   3328	/*
    3329	 * If we're reshaping to add disk(s), ti->len and
    3330	 * mddev->array_sectors will differ during the process
    3331	 * (ti->len > mddev->array_sectors), so we have to requeue
    3332	 * bios with addresses > mddev->array_sectors here, or
    3333	 * accesses past the end of the component data images
    3334	 * would occur and error the raid set.
   3335	 */
   3336	if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
   3337		return DM_MAPIO_REQUEUE;
   3338
   3339	md_handle_request(mddev, bio);
   3340
   3341	return DM_MAPIO_SUBMITTED;
   3342}
   3343
   3344/* Return sync state string for @state */
   3345enum sync_state { st_frozen, st_reshape, st_resync, st_check, st_repair, st_recover, st_idle };
   3346static const char *sync_str(enum sync_state state)
   3347{
   3348	/* Has to be in above sync_state order! */
   3349	static const char *sync_strs[] = {
   3350		"frozen",
   3351		"reshape",
   3352		"resync",
   3353		"check",
   3354		"repair",
   3355		"recover",
   3356		"idle"
   3357	};
   3358
   3359	return __within_range(state, 0, ARRAY_SIZE(sync_strs) - 1) ? sync_strs[state] : "undef";
    3360}
   3361
   3362/* Return enum sync_state for @mddev derived from @recovery flags */
   3363static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long recovery)
   3364{
   3365	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
   3366		return st_frozen;
   3367
   3368	/* The MD sync thread can be done with io or be interrupted but still be running */
   3369	if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
   3370	    (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
   3371	     (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
   3372		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
   3373			return st_reshape;
   3374
   3375		if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
   3376			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
   3377				return st_resync;
   3378			if (test_bit(MD_RECOVERY_CHECK, &recovery))
   3379				return st_check;
   3380			return st_repair;
   3381		}
   3382
   3383		if (test_bit(MD_RECOVERY_RECOVER, &recovery))
   3384			return st_recover;
   3385
   3386		if (mddev->reshape_position != MaxSector)
   3387			return st_reshape;
   3388	}
   3389
   3390	return st_idle;
   3391}
   3392
   3393/*
   3394 * Return status string for @rdev
   3395 *
   3396 * Status characters:
   3397 *
   3398 *  'D' = Dead/Failed raid set component or raid4/5/6 journal device
   3399 *  'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device
   3400 *  'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
    3401 *  '-' = Non-existing device (i.e. userspace passed '- -' into the ctr)
   3402 */
   3403static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev)
   3404{
   3405	if (!rdev->bdev)
   3406		return "-";
   3407	else if (test_bit(Faulty, &rdev->flags))
   3408		return "D";
   3409	else if (test_bit(Journal, &rdev->flags))
   3410		return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
   3411	else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) ||
   3412		 (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) &&
   3413		  !test_bit(In_sync, &rdev->flags)))
   3414		return "a";
   3415	else
   3416		return "A";
   3417}
   3418
    3419/* Helper to return resync/reshape progress for @rs and to set runtime flags for the raid set being in sync / resyncing */
   3420static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
   3421				enum sync_state state, sector_t resync_max_sectors)
   3422{
   3423	sector_t r;
   3424	struct mddev *mddev = &rs->md;
   3425
   3426	clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
   3427	clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
   3428
   3429	if (rs_is_raid0(rs)) {
   3430		r = resync_max_sectors;
   3431		set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
   3432
   3433	} else {
   3434		if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
   3435			r = mddev->recovery_cp;
   3436		else
   3437			r = mddev->curr_resync_completed;
   3438
   3439		if (state == st_idle && r >= resync_max_sectors) {
   3440			/*
   3441			 * Sync complete.
   3442			 */
   3443			/* In case we have finished recovering, the array is in sync. */
   3444			if (test_bit(MD_RECOVERY_RECOVER, &recovery))
   3445				set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
   3446
   3447		} else if (state == st_recover)
   3448			/*
   3449			 * In case we are recovering, the array is not in sync
   3450			 * and health chars should show the recovering legs.
   3451			 *
   3452			 * Already retrieved recovery offset from curr_resync_completed above.
   3453			 */
   3454			;
   3455
   3456		else if (state == st_resync || state == st_reshape)
   3457			/*
   3458			 * If "resync/reshape" is occurring, the raid set
   3459			 * is or may be out of sync hence the health
   3460			 * characters shall be 'a'.
   3461			 */
   3462			set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
   3463
   3464		else if (state == st_check || state == st_repair)
   3465			/*
   3466			 * If "check" or "repair" is occurring, the raid set has
   3467			 * undergone an initial sync and the health characters
   3468			 * should not be 'a' anymore.
   3469			 */
   3470			set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
   3471
   3472		else if (test_bit(MD_RECOVERY_NEEDED, &recovery))
   3473			/*
   3474			 * We are idle and recovery is needed, prevent 'A' chars race
   3475			 * caused by components still set to in-sync by constructor.
   3476			 */
   3477			set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
   3478
   3479		else {
   3480			/*
   3481			 * We are idle and the raid set may be doing an initial
   3482			 * sync, or it may be rebuilding individual components.
   3483			 * If all the devices are In_sync, then it is the raid set
   3484			 * that is being initialized.
   3485			 */
   3486			struct md_rdev *rdev;
   3487
   3488			set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
   3489			rdev_for_each(rdev, mddev)
   3490				if (!test_bit(Journal, &rdev->flags) &&
   3491				    !test_bit(In_sync, &rdev->flags)) {
   3492					clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
   3493					break;
   3494				}
   3495		}
   3496	}
   3497
   3498	return min(r, resync_max_sectors);
   3499}
   3500
   3501/* Helper to return @dev name or "-" if !@dev */
   3502static const char *__get_dev_name(struct dm_dev *dev)
   3503{
   3504	return dev ? dev->name : "-";
   3505}
   3506
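        /*
         * Emit raid set status: STATUSTYPE_INFO for "dmsetup status",
         * STATUSTYPE_TABLE to reproduce the constructor table line and
         * STATUSTYPE_IMA for IMA measurement data.
         *
         * Purely illustrative INFO line for a fully synced two-leg raid1
         * set, following the DMEMIT() sequence below (field values are
         * made up):
         *
         *   raid1 2 AA 125000/125000 idle 0 0 -
         */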
   3507static void raid_status(struct dm_target *ti, status_type_t type,
   3508			unsigned int status_flags, char *result, unsigned int maxlen)
   3509{
   3510	struct raid_set *rs = ti->private;
   3511	struct mddev *mddev = &rs->md;
   3512	struct r5conf *conf = mddev->private;
   3513	int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0;
   3514	unsigned long recovery;
   3515	unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
   3516	unsigned int sz = 0;
   3517	unsigned int rebuild_writemostly_count = 0;
   3518	sector_t progress, resync_max_sectors, resync_mismatches;
   3519	enum sync_state state;
   3520	struct raid_type *rt;
   3521
   3522	switch (type) {
   3523	case STATUSTYPE_INFO:
   3524		/* *Should* always succeed */
   3525		rt = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
   3526		if (!rt)
   3527			return;
   3528
   3529		DMEMIT("%s %d ", rt->name, mddev->raid_disks);
   3530
   3531		/* Access most recent mddev properties for status output */
   3532		smp_rmb();
   3533		/* Get sensible max sectors even if raid set not yet started */
   3534		resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
   3535				      mddev->resync_max_sectors : mddev->dev_sectors;
   3536		recovery = rs->md.recovery;
   3537		state = decipher_sync_action(mddev, recovery);
   3538		progress = rs_get_progress(rs, recovery, state, resync_max_sectors);
   3539		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
   3540				    atomic64_read(&mddev->resync_mismatches) : 0;
   3541
   3542		/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
   3543		for (i = 0; i < rs->raid_disks; i++)
   3544			DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev));
   3545
   3546		/*
   3547		 * In-sync/Reshape ratio:
   3548		 *  The in-sync ratio shows the progress of:
   3549		 *   - Initializing the raid set
   3550		 *   - Rebuilding a subset of devices of the raid set
   3551		 *  The user can distinguish between the two by referring
   3552		 *  to the status characters.
   3553		 *
   3554		 *  The reshape ratio shows the progress of
   3555		 *  changing the raid layout or the number of
   3556		 *  disks of a raid set
   3557		 */
   3558		DMEMIT(" %llu/%llu", (unsigned long long) progress,
   3559				     (unsigned long long) resync_max_sectors);
   3560
   3561		/*
   3562		 * v1.5.0+:
   3563		 *
   3564		 * Sync action:
   3565		 *   See Documentation/admin-guide/device-mapper/dm-raid.rst for
   3566		 *   information on each of these states.
   3567		 */
   3568		DMEMIT(" %s", sync_str(state));
   3569
   3570		/*
   3571		 * v1.5.0+:
   3572		 *
   3573		 * resync_mismatches/mismatch_cnt
   3574		 *   This field shows the number of discrepancies found when
   3575		 *   performing a "check" of the raid set.
   3576		 */
   3577		DMEMIT(" %llu", (unsigned long long) resync_mismatches);
   3578
   3579		/*
   3580		 * v1.9.0+:
   3581		 *
   3582		 * data_offset (needed for out of space reshaping)
   3583		 *   This field shows the data offset into the data
    3584		 *   image LV where the first stripe's data starts.
   3585		 *
   3586		 * We keep data_offset equal on all raid disks of the set,
   3587		 * so retrieving it from the first raid disk is sufficient.
   3588		 */
   3589		DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
   3590
   3591		/*
   3592		 * v1.10.0+:
   3593		 */
   3594		DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
   3595			      __raid_dev_status(rs, &rs->journal_dev.rdev) : "-");
   3596		break;
   3597
   3598	case STATUSTYPE_TABLE:
   3599		/* Report the table line string you would use to construct this raid set */
   3600
   3601		/*
    3602		 * Count any rebuild or writemostly argument pairs and subtract the
    3603		 * hweight count added below for the rebuild and writemostly ctr flags.
   3604		 */
   3605		for (i = 0; i < rs->raid_disks; i++) {
   3606			rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) +
   3607						     (test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0);
   3608		}
   3609		rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) +
   3610					     (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0);
    3611		/* Calculate raid parameter count based on the above rebuild/writemostly argument counts and the ctr flags set. */
   3612		raid_param_cnt += rebuild_writemostly_count +
   3613				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
   3614				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
   3615		/* Emit table line */
   3616		/* This has to be in the documented order for userspace! */
   3617		DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
   3618		if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
   3619			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
   3620		if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
   3621			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
   3622		if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags))
   3623			for (i = 0; i < rs->raid_disks; i++)
   3624				if (test_bit(i, (void *) rs->rebuild_disks))
   3625					DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i);
   3626		if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
   3627			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
   3628					  mddev->bitmap_info.daemon_sleep);
   3629		if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
   3630			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
   3631					 mddev->sync_speed_min);
   3632		if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
   3633			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
   3634					 mddev->sync_speed_max);
   3635		if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags))
   3636			for (i = 0; i < rs->raid_disks; i++)
   3637				if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
   3638					DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
   3639					       rs->dev[i].rdev.raid_disk);
   3640		if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
   3641			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
   3642					  mddev->bitmap_info.max_write_behind);
   3643		if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
   3644			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
   3645					 max_nr_stripes);
   3646		if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
   3647			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
   3648					   (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
   3649		if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
   3650			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
   3651					 raid10_md_layout_to_copies(mddev->layout));
   3652		if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
   3653			DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
   3654					 raid10_md_layout_to_format(mddev->layout));
   3655		if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
   3656			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
   3657					 max(rs->delta_disks, mddev->delta_disks));
   3658		if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
   3659			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
   3660					   (unsigned long long) rs->data_offset);
   3661		if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
   3662			DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
   3663					__get_dev_name(rs->journal_dev.dev));
   3664		if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags))
   3665			DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE),
   3666					 md_journal_mode_to_dm_raid(rs->journal_dev.mode));
   3667		DMEMIT(" %d", rs->raid_disks);
   3668		for (i = 0; i < rs->raid_disks; i++)
   3669			DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
   3670					 __get_dev_name(rs->dev[i].data_dev));
   3671		break;
   3672
   3673	case STATUSTYPE_IMA:
   3674		rt = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
   3675		if (!rt)
   3676			return;
   3677
   3678		DMEMIT_TARGET_NAME_VERSION(ti->type);
   3679		DMEMIT(",raid_type=%s,raid_disks=%d", rt->name, mddev->raid_disks);
   3680
   3681		/* Access most recent mddev properties for status output */
   3682		smp_rmb();
   3683		recovery = rs->md.recovery;
   3684		state = decipher_sync_action(mddev, recovery);
   3685		DMEMIT(",raid_state=%s", sync_str(state));
   3686
   3687		for (i = 0; i < rs->raid_disks; i++) {
   3688			DMEMIT(",raid_device_%d_status=", i);
   3689			DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev));
   3690		}
   3691
   3692		if (rt_is_raid456(rt)) {
   3693			DMEMIT(",journal_dev_mode=");
   3694			switch (rs->journal_dev.mode) {
   3695			case R5C_JOURNAL_MODE_WRITE_THROUGH:
   3696				DMEMIT("%s",
   3697				       _raid456_journal_mode[R5C_JOURNAL_MODE_WRITE_THROUGH].param);
   3698				break;
   3699			case R5C_JOURNAL_MODE_WRITE_BACK:
   3700				DMEMIT("%s",
   3701				       _raid456_journal_mode[R5C_JOURNAL_MODE_WRITE_BACK].param);
   3702				break;
   3703			default:
   3704				DMEMIT("invalid");
   3705				break;
   3706			}
   3707		}
   3708		DMEMIT(";");
   3709		break;
   3710	}
   3711}
   3712
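        /*
         * Handle "dmsetup message" sync action requests; recognized
         * actions are "frozen", "idle", "resync", "recover", "check"
         * and "repair".
         *
         * Illustrative usage (the device name "r1" is made up):
         *
         *   dmsetup message r1 0 check
         */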
   3713static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
   3714			char *result, unsigned maxlen)
   3715{
   3716	struct raid_set *rs = ti->private;
   3717	struct mddev *mddev = &rs->md;
   3718
   3719	if (!mddev->pers || !mddev->pers->sync_request)
   3720		return -EINVAL;
   3721
   3722	if (!strcasecmp(argv[0], "frozen"))
   3723		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
   3724	else
   3725		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
   3726
   3727	if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
   3728		if (mddev->sync_thread) {
   3729			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
   3730			md_reap_sync_thread(mddev);
   3731		}
   3732	} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
   3733		return -EBUSY;
   3734	else if (!strcasecmp(argv[0], "resync"))
   3735		; /* MD_RECOVERY_NEEDED set below */
   3736	else if (!strcasecmp(argv[0], "recover"))
   3737		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
   3738	else {
   3739		if (!strcasecmp(argv[0], "check")) {
   3740			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
   3741			set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
   3742			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
   3743		} else if (!strcasecmp(argv[0], "repair")) {
   3744			set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
   3745			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
   3746		} else
   3747			return -EINVAL;
   3748	}
   3749	if (mddev->ro == 2) {
   3750		/* A write to sync_action is enough to justify
   3751		 * canceling read-auto mode
   3752		 */
   3753		mddev->ro = 0;
   3754		if (!mddev->suspended && mddev->sync_thread)
   3755			md_wakeup_thread(mddev->sync_thread);
   3756	}
   3757	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
   3758	if (!mddev->suspended && mddev->thread)
   3759		md_wakeup_thread(mddev->thread);
   3760
   3761	return 0;
   3762}
   3763
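        /* Call @fn on each data device of the raid set; metadata devices are not iterated. */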
   3764static int raid_iterate_devices(struct dm_target *ti,
   3765				iterate_devices_callout_fn fn, void *data)
   3766{
   3767	struct raid_set *rs = ti->private;
   3768	unsigned int i;
   3769	int r = 0;
   3770
   3771	for (i = 0; !r && i < rs->raid_disks; i++) {
   3772		if (rs->dev[i].data_dev) {
   3773			r = fn(ti, rs->dev[i].data_dev,
   3774			       0, /* No offset on data devs */
   3775			       rs->md.dev_sectors, data);
   3776		}
   3777	}
   3778
   3779	return r;
   3780}
   3781
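        /* Advertise the chunk size as minimum and the full stripe width as optimal I/O size. */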
   3782static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
   3783{
   3784	struct raid_set *rs = ti->private;
   3785	unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors);
   3786
   3787	blk_limits_io_min(limits, chunk_size_bytes);
   3788	blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
   3789}
   3790
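        /* Stop writes and suspend the MD device unless the raid set is already suspended. */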
   3791static void raid_postsuspend(struct dm_target *ti)
   3792{
   3793	struct raid_set *rs = ti->private;
   3794
   3795	if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
   3796		/* Writes have to be stopped before suspending to avoid deadlocks. */
   3797		if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
   3798			md_stop_writes(&rs->md);
   3799
   3800		mddev_lock_nointr(&rs->md);
   3801		mddev_suspend(&rs->md);
   3802		mddev_unlock(&rs->md);
   3803	}
   3804}
   3805
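        /* Try to revive faulty raid set components whose superblocks became readable again. */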
   3806static void attempt_restore_of_faulty_devices(struct raid_set *rs)
   3807{
   3808	int i;
   3809	uint64_t cleared_failed_devices[DISKS_ARRAY_ELEMS];
   3810	unsigned long flags;
   3811	bool cleared = false;
   3812	struct dm_raid_superblock *sb;
   3813	struct mddev *mddev = &rs->md;
   3814	struct md_rdev *r;
   3815
   3816	/* RAID personalities have to provide hot add/remove methods or we need to bail out. */
   3817	if (!mddev->pers || !mddev->pers->hot_add_disk || !mddev->pers->hot_remove_disk)
   3818		return;
   3819
   3820	memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
   3821
   3822	for (i = 0; i < mddev->raid_disks; i++) {
   3823		r = &rs->dev[i].rdev;
   3824		/* HM FIXME: enhance journal device recovery processing */
   3825		if (test_bit(Journal, &r->flags))
   3826			continue;
   3827
   3828		if (test_bit(Faulty, &r->flags) &&
   3829		    r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) {
   3830			DMINFO("Faulty %s device #%d has readable super block."
   3831			       "  Attempting to revive it.",
   3832			       rs->raid_type->name, i);
   3833
   3834			/*
   3835			 * Faulty bit may be set, but sometimes the array can
   3836			 * be suspended before the personalities can respond
   3837			 * by removing the device from the array (i.e. calling
   3838			 * 'hot_remove_disk').	If they haven't yet removed
   3839			 * the failed device, its 'raid_disk' number will be
   3840			 * '>= 0' - meaning we must call this function
   3841			 * ourselves.
   3842			 */
   3843			flags = r->flags;
   3844			clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
   3845			if (r->raid_disk >= 0) {
   3846				if (mddev->pers->hot_remove_disk(mddev, r)) {
   3847					/* Failed to revive this device, try next */
   3848					r->flags = flags;
   3849					continue;
   3850				}
   3851			} else
   3852				r->raid_disk = r->saved_raid_disk = i;
   3853
   3854			clear_bit(Faulty, &r->flags);
   3855			clear_bit(WriteErrorSeen, &r->flags);
   3856
   3857			if (mddev->pers->hot_add_disk(mddev, r)) {
   3858				/* Failed to revive this device, try next */
   3859				r->raid_disk = r->saved_raid_disk = -1;
   3860				r->flags = flags;
   3861			} else {
   3862				clear_bit(In_sync, &r->flags);
   3863				r->recovery_offset = 0;
   3864				set_bit(i, (void *) cleared_failed_devices);
   3865				cleared = true;
   3866			}
   3867		}
   3868	}
   3869
   3870	/* If any failed devices could be cleared, update all sbs failed_devices bits */
   3871	if (cleared) {
   3872		uint64_t failed_devices[DISKS_ARRAY_ELEMS];
   3873
   3874		rdev_for_each(r, &rs->md) {
   3875			if (test_bit(Journal, &r->flags))
   3876				continue;
   3877
   3878			sb = page_address(r->sb_page);
   3879			sb_retrieve_failed_devices(sb, failed_devices);
   3880
   3881			for (i = 0; i < DISKS_ARRAY_ELEMS; i++)
   3882				failed_devices[i] &= ~cleared_failed_devices[i];
   3883
   3884			sb_update_failed_devices(sb, failed_devices);
   3885		}
   3886	}
   3887}
   3888
   3889static int __load_dirty_region_bitmap(struct raid_set *rs)
   3890{
   3891	int r = 0;
   3892
   3893	/* Try loading the bitmap unless "raid0", which does not have one */
   3894	if (!rs_is_raid0(rs) &&
   3895	    !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
   3896		r = md_bitmap_load(&rs->md);
   3897		if (r)
   3898			DMERR("Failed to load bitmap");
   3899	}
   3900
   3901	return r;
   3902}
   3903
   3904/* Enforce updating all superblocks */
   3905static void rs_update_sbs(struct raid_set *rs)
   3906{
   3907	struct mddev *mddev = &rs->md;
   3908	int ro = mddev->ro;
   3909
   3910	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
   3911	mddev->ro = 0;
   3912	md_update_sb(mddev, 1);
   3913	mddev->ro = ro;
   3914}
   3915
   3916/*
    3917 * Reshape changes the raid algorithm of @rs to a new one within the personality
    3918 * (e.g. raid6_zr -> raid6_nc), changes the stripe size, adds/removes
    3919 * disks from the raid set (thus growing/shrinking it) or resizes the set.
   3920 *
   3921 * Call mddev_lock_nointr() before!
   3922 */
   3923static int rs_start_reshape(struct raid_set *rs)
   3924{
   3925	int r;
   3926	struct mddev *mddev = &rs->md;
   3927	struct md_personality *pers = mddev->pers;
   3928
   3929	/* Don't allow the sync thread to work until the table gets reloaded. */
   3930	set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
   3931
   3932	r = rs_setup_reshape(rs);
   3933	if (r)
   3934		return r;
   3935
   3936	/*
    3937	 * Check any reshape constraints enforced by the personality.
    3938	 *
    3939	 * May as well already kick the reshape off so that pers->start_reshape() becomes optional.
   3940	 */
   3941	r = pers->check_reshape(mddev);
   3942	if (r) {
   3943		rs->ti->error = "pers->check_reshape() failed";
   3944		return r;
   3945	}
   3946
   3947	/*
    3948	 * The personality may not provide a start_reshape method, in which
    3949	 * case check_reshape() above has already covered everything.
   3950	 */
   3951	if (pers->start_reshape) {
   3952		r = pers->start_reshape(mddev);
   3953		if (r) {
   3954			rs->ti->error = "pers->start_reshape() failed";
   3955			return r;
   3956		}
   3957	}
   3958
   3959	/*
    3960	 * Now that the reshape is set up, update the superblocks to
    3961	 * reflect that fact so that a table reload will
    3962	 * access proper superblock content in the ctr.
   3963	 */
   3964	rs_update_sbs(rs);
   3965
   3966	return 0;
   3967}
   3968
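        /*
         * Prepare for resume: write out superblocks if required, load the
         * dirty region bitmap, apply any requested size grow and initiate
         * a reshape if one was set up in the constructor.
         */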
   3969static int raid_preresume(struct dm_target *ti)
   3970{
   3971	int r;
   3972	struct raid_set *rs = ti->private;
   3973	struct mddev *mddev = &rs->md;
   3974
   3975	/* This is a resume after a suspend of the set -> it's already started. */
   3976	if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
   3977		return 0;
   3978
   3979	/*
    3980	 * The superblocks need to be updated on disk if the
    3981	 * array is new or new devices got added (and thus were
    3982	 * zeroed out by userspace); otherwise __load_dirty_region_bitmap
    3983	 * will overwrite them in core with old data or fail.
   3984	 */
   3985	if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags))
   3986		rs_update_sbs(rs);
   3987
   3988	/* Load the bitmap from disk unless raid0 */
   3989	r = __load_dirty_region_bitmap(rs);
   3990	if (r)
   3991		return r;
   3992
   3993	/* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. */
   3994	if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) {
   3995		mddev->array_sectors = rs->array_sectors;
   3996		mddev->dev_sectors = rs->dev_sectors;
   3997		rs_set_rdev_sectors(rs);
   3998		rs_set_capacity(rs);
   3999	}
   4000
   4001	/* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */
    4002	if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
   4003	    (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) ||
   4004	     (rs->requested_bitmap_chunk_sectors &&
   4005	       mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
   4006		int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
   4007
   4008		r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
   4009		if (r)
   4010			DMERR("Failed to resize bitmap");
   4011	}
   4012
   4013	/* Check for any resize/reshape on @rs and adjust/initiate */
   4014	/* Be prepared for mddev_resume() in raid_resume() */
   4015	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
   4016	if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
   4017		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
   4018		mddev->resync_min = mddev->recovery_cp;
   4019		if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags))
   4020			mddev->resync_max_sectors = mddev->dev_sectors;
   4021	}
   4022
   4023	/* Check for any reshape request unless new raid set */
   4024	if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
   4025		/* Initiate a reshape. */
   4026		rs_set_rdev_sectors(rs);
   4027		mddev_lock_nointr(mddev);
   4028		r = rs_start_reshape(rs);
   4029		mddev_unlock(mddev);
   4030		if (r)
   4031			DMWARN("Failed to check/start reshape, continuing without change");
   4032		r = 0;
   4033	}
   4034
   4035	return r;
   4036}
   4037
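        /*
         * Resume the raid set: on a repeated resume try to revive faulty
         * devices; if the set was suspended, unfreeze recovery and resume
         * the MD device.
         */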
   4038static void raid_resume(struct dm_target *ti)
   4039{
   4040	struct raid_set *rs = ti->private;
   4041	struct mddev *mddev = &rs->md;
   4042
   4043	if (test_and_set_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
   4044		/*
   4045		 * A secondary resume while the device is active.
   4046		 * Take this opportunity to check whether any failed
   4047		 * devices are reachable again.
   4048		 */
   4049		attempt_restore_of_faulty_devices(rs);
   4050	}
   4051
   4052	if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
    4053		/* Only reduce the raid set size before running a disk-removing reshape. */
   4054		if (mddev->delta_disks < 0)
   4055			rs_set_capacity(rs);
   4056
   4057		mddev_lock_nointr(mddev);
   4058		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
   4059		mddev->ro = 0;
   4060		mddev->in_sync = 0;
   4061		mddev_resume(mddev);
   4062		mddev_unlock(mddev);
   4063	}
   4064}
   4065
   4066static struct target_type raid_target = {
   4067	.name = "raid",
   4068	.version = {1, 15, 1},
   4069	.module = THIS_MODULE,
   4070	.ctr = raid_ctr,
   4071	.dtr = raid_dtr,
   4072	.map = raid_map,
   4073	.status = raid_status,
   4074	.message = raid_message,
   4075	.iterate_devices = raid_iterate_devices,
   4076	.io_hints = raid_io_hints,
   4077	.postsuspend = raid_postsuspend,
   4078	.preresume = raid_preresume,
   4079	.resume = raid_resume,
   4080};
   4081
   4082static int __init dm_raid_init(void)
   4083{
   4084	DMINFO("Loading target version %u.%u.%u",
   4085	       raid_target.version[0],
   4086	       raid_target.version[1],
   4087	       raid_target.version[2]);
   4088	return dm_register_target(&raid_target);
   4089}
   4090
   4091static void __exit dm_raid_exit(void)
   4092{
   4093	dm_unregister_target(&raid_target);
   4094}
   4095
   4096module_init(dm_raid_init);
   4097module_exit(dm_raid_exit);
   4098
   4099module_param(devices_handle_discard_safely, bool, 0644);
   4100MODULE_PARM_DESC(devices_handle_discard_safely,
   4101		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
   4102
   4103MODULE_DESCRIPTION(DM_NAME " raid0/1/10/4/5/6 target");
   4104MODULE_ALIAS("dm-raid0");
   4105MODULE_ALIAS("dm-raid1");
   4106MODULE_ALIAS("dm-raid10");
   4107MODULE_ALIAS("dm-raid4");
   4108MODULE_ALIAS("dm-raid5");
   4109MODULE_ALIAS("dm-raid6");
   4110MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
   4111MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
   4112MODULE_LICENSE("GPL");