From 770b1d216d7371c94c88713824da4be4bc39a4e0 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Mon, 15 Nov 2021 17:23:17 -0800
Subject: md/raid5: play nice with PREEMPT_RT

raid_run_ops() relies on the implicitly disabled preemption for
its percpu ops, although this is really about CPU locality. This
breaks RT semantics as it can take regular (and thus sleeping)
spinlocks, such as stripe_lock.

Add a local_lock such that non-RT does not change and continues
to be just map to preempt_disable/enable, but makes RT happy as
the region will use a per-CPU spinlock and thus be preemptible
and still guarantee CPU locality.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9c1a5877cf9f..1240a5c16af8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2215,10 +2215,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	struct r5conf *conf = sh->raid_conf;
 	int level = conf->level;
 	struct raid5_percpu *percpu;
-	unsigned long cpu;
 
-	cpu = get_cpu();
-	percpu = per_cpu_ptr(conf->percpu, cpu);
+	local_lock(&conf->percpu->lock);
+	percpu = this_cpu_ptr(conf->percpu);
 	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
 		ops_run_biofill(sh);
 		overlap_clear++;
@@ -2271,13 +2270,14 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			BUG();
 	}
 
-	if (overlap_clear && !sh->batch_head)
+	if (overlap_clear && !sh->batch_head) {
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_and_clear_bit(R5_Overlap, &dev->flags))
 				wake_up(&sh->raid_conf->wait_for_overlap);
 		}
-	put_cpu();
+	}
+	local_unlock(&conf->percpu->lock);
 }
 
 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
@@ -7052,6 +7052,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
 		return -ENOMEM;
 	}
 
+	local_lock_init(&percpu->lock);
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From bf2c411bb1cfc45f73eb6c55b5755bcb990063ae Mon Sep 17 00:00:00 2001
From: Vishal Verma <vverma@digitalocean.com>
Date: Tue, 21 Dec 2021 20:06:22 +0000
Subject: md: raid456 add nowait support

Returns EAGAIN in case the raid456 driver would block waiting for reshape.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Vishal Verma <vverma@digitalocean.com>
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/raid5.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1240a5c16af8..beb544be9058 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5686,6 +5686,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	struct stripe_head *sh;
 	int stripe_sectors;
 
+	/* We need to handle this when io_uring supports discard/trim */
+	if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT))
+		return;
+
 	if (mddev->reshape_position != MaxSector)
 		/* Skip discard while reshape is happening */
 		return;
@@ -5819,6 +5823,17 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
 
+	/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
+	if ((bi->bi_opf & REQ_NOWAIT) &&
+	    (conf->reshape_progress != MaxSector) &&
+	    (mddev->reshape_backwards
+	    ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
+	    : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
+		bio_wouldblock_error(bi);
+		if (rw == WRITE)
+			md_write_end(mddev);
+		return true;
+	}
 	md_account_bio(mddev, &bi);
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
-- 
cgit v1.2.3-71-gd317


From 0c031fd37f69deb0cd8c43bbfcfccd62ebd7e952 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni@redhat.com>
Date: Fri, 10 Dec 2021 17:31:15 +0800
Subject: md: Move alloc/free acct bioset in to personality

bioset acct is only needed for raid0 and raid5. Therefore, md_run only
allocates it for raid0 and raid5. However, this does not cover
personality takeover, which may cause uninitialized bioset. For example,
the following repro steps:

  mdadm -CR /dev/md0 -l1 -n2 /dev/loop0 /dev/loop1
  mdadm --wait /dev/md0
  mkfs.xfs /dev/md0
  mdadm /dev/md0 --grow -l5
  mount /dev/md0 /mnt

causes panic like:

[  225.933939] BUG: kernel NULL pointer dereference, address: 0000000000000000
[  225.934903] #PF: supervisor instruction fetch in kernel mode
[  225.935639] #PF: error_code(0x0010) - not-present page
[  225.936361] PGD 0 P4D 0
[  225.936677] Oops: 0010 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN PTI
[  225.937525] CPU: 27 PID: 1133 Comm: mount Not tainted 5.16.0-rc3+ #706
[  225.938416] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.module_el8.4.0+547+a85d02ba 04/01/2014
[  225.939922] RIP: 0010:0x0
[  225.940289] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
[  225.941196] RSP: 0018:ffff88815897eff0 EFLAGS: 00010246
[  225.941897] RAX: 0000000000000000 RBX: 0000000000092800 RCX: ffffffff81370a39
[  225.942813] RDX: dffffc0000000000 RSI: 0000000000000000 RDI: 0000000000092800
[  225.943772] RBP: 1ffff1102b12fe04 R08: fffffbfff0b43c01 R09: fffffbfff0b43c01
[  225.944807] R10: ffffffff85a1e007 R11: fffffbfff0b43c00 R12: ffff88810eaaaf58
[  225.945757] R13: 0000000000000000 R14: ffff88810eaaafb8 R15: ffff88815897f040
[  225.946709] FS:  00007ff3f2505080(0000) GS:ffff888fb5e00000(0000) knlGS:0000000000000000
[  225.947814] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  225.948556] CR2: ffffffffffffffd6 CR3: 000000015aa5a006 CR4: 0000000000370ee0
[  225.949537] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  225.950455] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  225.951414] Call Trace:
[  225.951787]  <TASK>
[  225.952120]  mempool_alloc+0xe5/0x250
[  225.952625]  ? mempool_resize+0x370/0x370
[  225.953187]  ? rcu_read_lock_sched_held+0xa1/0xd0
[  225.953862]  ? rcu_read_lock_bh_held+0xb0/0xb0
[  225.954464]  ? sched_clock_cpu+0x15/0x120
[  225.955019]  ? find_held_lock+0xac/0xd0
[  225.955564]  bio_alloc_bioset+0x1ed/0x2a0
[  225.956080]  ? lock_downgrade+0x3a0/0x3a0
[  225.956644]  ? bvec_alloc+0xc0/0xc0
[  225.957135]  bio_clone_fast+0x19/0x80
[  225.957651]  raid5_make_request+0x1370/0x1b70
[  225.958286]  ? sched_clock_cpu+0x15/0x120
[  225.958797]  ? __lock_acquire+0x8b2/0x3510
[  225.959339]  ? raid5_get_active_stripe+0xce0/0xce0
[  225.959986]  ? lock_is_held_type+0xd8/0x130
[  225.960528]  ? rcu_read_lock_sched_held+0xa1/0xd0
[  225.961135]  ? rcu_read_lock_bh_held+0xb0/0xb0
[  225.961703]  ? sched_clock_cpu+0x15/0x120
[  225.962232]  ? lock_release+0x27a/0x6c0
[  225.962746]  ? do_wait_intr_irq+0x130/0x130
[  225.963302]  ? lock_downgrade+0x3a0/0x3a0
[  225.963815]  ? lock_release+0x6c0/0x6c0
[  225.964348]  md_handle_request+0x342/0x530
[  225.964888]  ? set_in_sync+0x170/0x170
[  225.965397]  ? blk_queue_split+0x133/0x150
[  225.965988]  ? __blk_queue_split+0x8b0/0x8b0
[  225.966524]  ? submit_bio_checks+0x3b2/0x9d0
[  225.967069]  md_submit_bio+0x127/0x1c0
[...]

Fix this by moving alloc/free of acct bioset to pers->run and pers->free.

While we are on this, properly handle md_integrity_register() error in
raid0_run().

Fixes: daee2024715d (md: check level before create and exit io_acct_set)
Cc: stable@vger.kernel.org
Acked-by: Guoqing Jiang <guoqing.jiang@linux.dev>
Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/md.c    | 27 +++++++++++++++++----------
 drivers/md/md.h    |  2 ++
 drivers/md/raid0.c | 38 ++++++++++++++++++++++++++++----------
 drivers/md/raid5.c | 41 ++++++++++++++++++++++++++++++-----------
 4 files changed, 77 insertions(+), 31 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3f75c5896b92..3f57843918ec 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5881,13 +5881,6 @@ int md_run(struct mddev *mddev)
 		if (err)
 			goto exit_bio_set;
 	}
-	if (mddev->level != 1 && mddev->level != 10 &&
-	    !bioset_initialized(&mddev->io_acct_set)) {
-		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
-				  offsetof(struct md_io_acct, bio_clone), 0);
-		if (err)
-			goto exit_sync_set;
-	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -6064,9 +6057,6 @@ bitmap_abort:
 	module_put(pers->owner);
 	md_bitmap_destroy(mddev);
 abort:
-	if (mddev->level != 1 && mddev->level != 10)
-		bioset_exit(&mddev->io_acct_set);
-exit_sync_set:
 	bioset_exit(&mddev->sync_set);
 exit_bio_set:
 	bioset_exit(&mddev->bio_set);
@@ -8608,6 +8598,23 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 }
 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
 
+int acct_bioset_init(struct mddev *mddev)
+{
+	int err = 0;
+
+	if (!bioset_initialized(&mddev->io_acct_set))
+		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
+			offsetof(struct md_io_acct, bio_clone), 0);
+	return err;
+}
+EXPORT_SYMBOL_GPL(acct_bioset_init);
+
+void acct_bioset_exit(struct mddev *mddev)
+{
+	bioset_exit(&mddev->io_acct_set);
+}
+EXPORT_SYMBOL_GPL(acct_bioset_exit);
+
 static void md_end_io_acct(struct bio *bio)
 {
 	struct md_io_acct *md_io_acct = bio->bi_private;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 53ea7a6961de..f1bf3625ef4c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -721,6 +721,8 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
 extern void md_finish_reshape(struct mddev *mddev);
 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 			struct bio *bio, sector_t start, sector_t size);
+int acct_bioset_init(struct mddev *mddev);
+void acct_bioset_exit(struct mddev *mddev);
 void md_account_bio(struct mddev *mddev, struct bio **bio);
 
 extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 62c8b6adac70..b59a77b31b90 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -356,7 +356,21 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
 	return array_sectors;
 }
 
-static void raid0_free(struct mddev *mddev, void *priv);
+static void free_conf(struct mddev *mddev, struct r0conf *conf)
+{
+	kfree(conf->strip_zone);
+	kfree(conf->devlist);
+	kfree(conf);
+	mddev->private = NULL;
+}
+
+static void raid0_free(struct mddev *mddev, void *priv)
+{
+	struct r0conf *conf = priv;
+
+	free_conf(mddev, conf);
+	acct_bioset_exit(mddev);
+}
 
 static int raid0_run(struct mddev *mddev)
 {
@@ -370,11 +384,16 @@ static int raid0_run(struct mddev *mddev)
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
 
+	if (acct_bioset_init(mddev)) {
+		pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
+		return -ENOMEM;
+	}
+
 	/* if private is not null, we are here after takeover */
 	if (mddev->private == NULL) {
 		ret = create_strip_zones(mddev, &conf);
 		if (ret < 0)
-			return ret;
+			goto exit_acct_set;
 		mddev->private = conf;
 	}
 	conf = mddev->private;
@@ -413,17 +432,16 @@ static int raid0_run(struct mddev *mddev)
 	dump_zones(mddev);
 
 	ret = md_integrity_register(mddev);
+	if (ret)
+		goto free;
 
 	return ret;
-}
 
-static void raid0_free(struct mddev *mddev, void *priv)
-{
-	struct r0conf *conf = priv;
-
-	kfree(conf->strip_zone);
-	kfree(conf->devlist);
-	kfree(conf);
+free:
+	free_conf(mddev, conf);
+exit_acct_set:
+	acct_bioset_exit(mddev);
+	return ret;
 }
 
 static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index beb544be9058..ffe720c73b0a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7462,12 +7462,19 @@ static int raid5_run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	struct md_rdev *journal_dev = NULL;
 	sector_t reshape_offset = 0;
-	int i;
+	int i, ret = 0;
 	long long min_offset_diff = 0;
 	int first = 1;
 
-	if (mddev_init_writes_pending(mddev) < 0)
+	if (acct_bioset_init(mddev)) {
+		pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
 		return -ENOMEM;
+	}
+
+	if (mddev_init_writes_pending(mddev) < 0) {
+		ret = -ENOMEM;
+		goto exit_acct_set;
+	}
 
 	if (mddev->recovery_cp != MaxSector)
 		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7498,7 +7505,8 @@ static int raid5_run(struct mddev *mddev)
 	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
 		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
 			  mdname(mddev));
-		return -EINVAL;
+		ret = -EINVAL;
+		goto exit_acct_set;
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -7523,13 +7531,15 @@ static int raid5_run(struct mddev *mddev)
 		if (journal_dev) {
 			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
 				mdname(mddev));
-			return -EINVAL;
+			ret = -EINVAL;
+			goto exit_acct_set;
 		}
 
 		if (mddev->new_level != mddev->level) {
 			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
 				mdname(mddev));
-			return -EINVAL;
+			ret = -EINVAL;
+			goto exit_acct_set;
 		}
 		old_disks = mddev->raid_disks - mddev->delta_disks;
 		/* reshape_position must be on a new-stripe boundary, and one
@@ -7545,7 +7555,8 @@ static int raid5_run(struct mddev *mddev)
 		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
 			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
 				mdname(mddev));
-			return -EINVAL;
+			ret = -EINVAL;
+			goto exit_acct_set;
 		}
 		reshape_offset = here_new * chunk_sectors;
 		/* here_new is the stripe we will write to */
@@ -7567,7 +7578,8 @@ static int raid5_run(struct mddev *mddev)
 			else if (mddev->ro == 0) {
 				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
 					mdname(mddev));
-				return -EINVAL;
+				ret = -EINVAL;
+				goto exit_acct_set;
 			}
 		} else if (mddev->reshape_backwards
 		    ? (here_new * chunk_sectors + min_offset_diff <=
@@ -7577,7 +7589,8 @@ static int raid5_run(struct mddev *mddev)
 			/* Reading from the same stripe as writing to - bad */
 			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
 				mdname(mddev));
-			return -EINVAL;
+			ret = -EINVAL;
+			goto exit_acct_set;
 		}
 		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
 		/* OK, we should be able to continue; */
@@ -7601,8 +7614,10 @@ static int raid5_run(struct mddev *mddev)
 	else
 		conf = mddev->private;
 
-	if (IS_ERR(conf))
-		return PTR_ERR(conf);
+	if (IS_ERR(conf)) {
+		ret = PTR_ERR(conf);
+		goto exit_acct_set;
+	}
 
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
 		if (!journal_dev) {
@@ -7799,7 +7814,10 @@ abort:
 	free_conf(conf);
 	mddev->private = NULL;
 	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
-	return -EIO;
+	ret = -EIO;
+exit_acct_set:
+	acct_bioset_exit(mddev);
+	return ret;
 }
 
 static void raid5_free(struct mddev *mddev, void *priv)
@@ -7807,6 +7825,7 @@ static void raid5_free(struct mddev *mddev, void *priv)
 	struct r5conf *conf = priv;
 
 	free_conf(conf);
+	acct_bioset_exit(mddev);
 	mddev->to_remove = &raid5_attrs_group;
 }
 
-- 
cgit v1.2.3-71-gd317