From 775c5033a0d164622d9d10dd0f0a5531639ed3ed Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 4 Mar 2021 11:09:12 +0200
Subject: fuse: fix live lock in fuse_iget()

Commit 5d069dbe8aaf ("fuse: fix bad inode") replaced make_bad_inode()
in fuse_iget() with a private implementation fuse_make_bad().

The private implementation fails to remove the bad inode from inode
cache, so the retry loop with iget5_locked() finds the same bad inode
and marks it bad forever.

kmsg snip:

[ ] rcu: INFO: rcu_sched self-detected stall on CPU
...
[ ]  ? bit_wait_io+0x50/0x50
[ ]  ? fuse_init_file_inode+0x70/0x70
[ ]  ? find_inode.isra.32+0x60/0xb0
[ ]  ? fuse_init_file_inode+0x70/0x70
[ ]  ilookup5_nowait+0x65/0x90
[ ]  ? fuse_init_file_inode+0x70/0x70
[ ]  ilookup5.part.36+0x2e/0x80
[ ]  ? fuse_init_file_inode+0x70/0x70
[ ]  ? fuse_inode_eq+0x20/0x20
[ ]  iget5_locked+0x21/0x80
[ ]  ? fuse_inode_eq+0x20/0x20
[ ]  fuse_iget+0x96/0x1b0

Fixes: 5d069dbe8aaf ("fuse: fix bad inode")
Cc: stable@vger.kernel.org # 5.10+
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/fuse_i.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 68cca8d4db6e..63d97a15ffde 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -863,6 +863,7 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
 
 static inline void fuse_make_bad(struct inode *inode)
 {
+	remove_inode_hash(inode);
 	set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
 }
 
-- 
cgit v1.2.3-71-gd317


From 3f9b9efd82a84f27e95d0414f852caf1fa839e83 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 9 Feb 2021 17:47:54 -0500
Subject: virtiofs: Fail dax mount if device does not support it

Right now "mount -t virtiofs -o dax myfs /mnt/virtiofs" succeeds even
if filesystem deivce does not have a cache window and hence DAX can't
be supported.

This gives a false sense to user that they are using DAX with virtiofs
but fact of the matter is that they are not.

Fix this by returning error if dax can't be supported and user has asked
for it.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/virtio_fs.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 8868ac31a3c0..4ee6f734ba83 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1324,8 +1324,15 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
 
 	/* virtiofs allocates and installs its own fuse devices */
 	ctx->fudptr = NULL;
-	if (ctx->dax)
+	if (ctx->dax) {
+		if (!fs->dax_dev) {
+			err = -EINVAL;
+			pr_err("virtio-fs: dax can't be enabled as filesystem"
+			       " device does not support it.\n");
+			goto err_free_fuse_devs;
+		}
 		ctx->dax_dev = fs->dax_dev;
+	}
 	err = fuse_fill_super_common(sb, ctx);
 	if (err < 0)
 		goto err_free_fuse_devs;
-- 
cgit v1.2.3-71-gd317


From efc61345274d6c7a46a0570efbc916fcbe3e927b Mon Sep 17 00:00:00 2001
From: Eric Whitney <enwlinux@gmail.com>
Date: Thu, 18 Feb 2021 10:11:32 -0500
Subject: ext4: shrink race window in ext4_should_retry_alloc()

When generic/371 is run on kvm-xfstests using 5.10 and 5.11 kernels, it
fails at significant rates on the two test scenarios that disable
delayed allocation (ext3conv and data_journal) and force actual block
allocation for the fallocate and pwrite functions in the test.  The
failure rate on 5.10 for both ext3conv and data_journal on one test
system typically runs about 85%.  On 5.11, the failure rate on ext3conv
sometimes drops to as low as 1% while the rate on data_journal
increases to nearly 100%.

The observed failures are largely due to ext4_should_retry_alloc()
cutting off block allocation retries when s_mb_free_pending (used to
indicate that a transaction in progress will free blocks) is 0.
However, free space is usually available when this occurs during runs
of generic/371.  It appears that a thread attempting to allocate
blocks is just missing transaction commits in other threads that
increase the free cluster count and reset s_mb_free_pending while
the allocating thread isn't running.  Explicitly testing for free space
availability avoids this race.

The current code uses a post-increment operator in the conditional
expression that determines whether the retry limit has been exceeded.
This means that the conditional expression uses the value of the
retry counter before it's increased, resulting in an extra retry cycle.
The current code actually retries twice before hitting its retry limit
rather than once.

Increasing the retry limit to 3 from the current actual maximum retry
count of 2 in combination with the change described above reduces the
observed failure rate to less that 0.1% on both ext3conv and
data_journal with what should be limited impact on users sensitive to
the overhead caused by retries.

A per filesystem percpu counter exported via sysfs is added to allow
users or developers to track the number of times the retry limit is
exceeded without resorting to debugging methods.  This should provide
some insight into worst case retry behavior.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
Link: https://lore.kernel.org/r/20210218151132.19678-1-enwlinux@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/balloc.c | 38 ++++++++++++++++++++++++++------------
 fs/ext4/ext4.h   |  1 +
 fs/ext4/super.c  |  5 +++++
 fs/ext4/sysfs.c  |  7 +++++++
 4 files changed, 39 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f45f9feebe59..74a5172c2d83 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -626,27 +626,41 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
 
 /**
  * ext4_should_retry_alloc() - check if a block allocation should be retried
- * @sb:			super block
- * @retries:		number of attemps has been made
+ * @sb:			superblock
+ * @retries:		number of retry attempts made so far
  *
- * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
- * it is profitable to retry the operation, this function will wait
- * for the current or committing transaction to complete, and then
- * return TRUE.  We will only retry once.
+ * ext4_should_retry_alloc() is called when ENOSPC is returned while
+ * attempting to allocate blocks.  If there's an indication that a pending
+ * journal transaction might free some space and allow another attempt to
+ * succeed, this function will wait for the current or committing transaction
+ * to complete and then return TRUE.
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
-	    (*retries)++ > 1 ||
-	    !EXT4_SB(sb)->s_journal)
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (!sbi->s_journal)
 		return 0;
 
-	smp_mb();
-	if (EXT4_SB(sb)->s_mb_free_pending == 0)
+	if (++(*retries) > 3) {
+		percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
 		return 0;
+	}
 
+	/*
+	 * if there's no indication that blocks are about to be freed it's
+	 * possible we just missed a transaction commit that did so
+	 */
+	smp_mb();
+	if (sbi->s_mb_free_pending == 0)
+		return ext4_has_free_clusters(sbi, 1, 0);
+
+	/*
+	 * it's possible we've just missed a transaction commit here,
+	 * so ignore the returned status
+	 */
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
-	jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+	(void) jbd2_journal_force_commit_nested(sbi->s_journal);
 	return 1;
 }
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 644fd69185d3..ea3b41579f38 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1484,6 +1484,7 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
 	struct percpu_counter s_dirtyclusters_counter;
+	struct percpu_counter s_sra_exceeded_retry_limit;
 	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
 	struct kobject s_kobj;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ad34a37278cd..a0a256859662 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1210,6 +1210,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
 	percpu_free_rwsem(&sbi->s_writepages_rwsem);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -5011,6 +5012,9 @@ no_journal:
 	if (!err)
 		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
 					  GFP_KERNEL);
+	if (!err)
+		err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
+					  GFP_KERNEL);
 	if (!err)
 		err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
 
@@ -5124,6 +5128,7 @@ failed_mount6:
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
 	percpu_free_rwsem(&sbi->s_writepages_rwsem);
 failed_mount5:
 	ext4_ext_release(sb);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 075aa3a19ff5..a3d08276d441 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -24,6 +24,7 @@ typedef enum {
 	attr_session_write_kbytes,
 	attr_lifetime_write_kbytes,
 	attr_reserved_clusters,
+	attr_sra_exceeded_retry_limit,
 	attr_inode_readahead,
 	attr_trigger_test_error,
 	attr_first_error_time,
@@ -202,6 +203,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
 EXT4_ATTR_FUNC(session_write_kbytes, 0444);
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
 		 ext4_sb_info, s_inode_readahead_blks);
@@ -251,6 +253,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
 	ATTR_LIST(reserved_clusters),
+	ATTR_LIST(sra_exceeded_retry_limit),
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
@@ -374,6 +377,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
 		return snprintf(buf, PAGE_SIZE, "%llu\n",
 				(unsigned long long)
 				atomic64_read(&sbi->s_resv_clusters));
+	case attr_sra_exceeded_retry_limit:
+		return snprintf(buf, PAGE_SIZE, "%llu\n",
+				(unsigned long long)
+			percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
 	case attr_inode_readahead:
 	case attr_pointer_ui:
 		if (!ptr)
-- 
cgit v1.2.3-71-gd317


From 163f0ec1df33cf468509ff38cbcbb5eb0d7fac60 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 22 Feb 2021 18:16:26 +0100
Subject: ext4: add reclaim checks to xattr code

Syzbot is reporting that ext4 can enter fs reclaim from kvmalloc() while
the transaction is started like:

  fs_reclaim_acquire+0x117/0x150 mm/page_alloc.c:4340
  might_alloc include/linux/sched/mm.h:193 [inline]
  slab_pre_alloc_hook mm/slab.h:493 [inline]
  slab_alloc_node mm/slub.c:2817 [inline]
  __kmalloc_node+0x5f/0x430 mm/slub.c:4015
  kmalloc_node include/linux/slab.h:575 [inline]
  kvmalloc_node+0x61/0xf0 mm/util.c:587
  kvmalloc include/linux/mm.h:781 [inline]
  ext4_xattr_inode_cache_find fs/ext4/xattr.c:1465 [inline]
  ext4_xattr_inode_lookup_create fs/ext4/xattr.c:1508 [inline]
  ext4_xattr_set_entry+0x1ce6/0x3780 fs/ext4/xattr.c:1649
  ext4_xattr_ibody_set+0x78/0x2b0 fs/ext4/xattr.c:2224
  ext4_xattr_set_handle+0x8f4/0x13e0 fs/ext4/xattr.c:2380
  ext4_xattr_set+0x13a/0x340 fs/ext4/xattr.c:2493

This should be impossible since transaction start sets PF_MEMALLOC_NOFS.
Add some assertions to the code to catch if something isn't working as
expected early.

Link: https://lore.kernel.org/linux-ext4/000000000000563a0205bafb7970@google.com/
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210222171626.21884-1-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/xattr.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 372208500f4e..083c95126781 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1462,6 +1462,9 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 	if (!ce)
 		return NULL;
 
+	WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
+		     !(current->flags & PF_MEMALLOC_NOFS));
+
 	ea_data = kvmalloc(value_len, GFP_KERNEL);
 	if (!ea_data) {
 		mb_cache_entry_put(ea_inode_cache, ce);
@@ -2327,6 +2330,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 			error = -ENOSPC;
 			goto cleanup;
 		}
+		WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
 	}
 
 	error = ext4_reserve_inode_write(handle, inode, &is.iloc);
-- 
cgit v1.2.3-71-gd317


From f91436d55a279f045987e8b8c1385585dca54be9 Mon Sep 17 00:00:00 2001
From: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Date: Wed, 24 Feb 2021 15:58:00 +0600
Subject: fs/ext4: fix integer overflow in s_log_groups_per_flex

syzbot found UBSAN: shift-out-of-bounds in ext4_mb_init [1], when
1 << sbi->s_es->s_log_groups_per_flex is bigger than UINT_MAX,
where sbi->s_mb_prefetch is unsigned integer type.

32 is the maximum allowed power of s_log_groups_per_flex. Following if
check will also trigger UBSAN shift-out-of-bound:

if (1 << sbi->s_es->s_log_groups_per_flex >= UINT_MAX) {

So I'm checking it against the raw number, perhaps there is another way
to calculate UINT_MAX max power. Also use min_t as to make sure it's
uint type.

[1] UBSAN: shift-out-of-bounds in fs/ext4/mballoc.c:2713:24
shift exponent 60 is too large for 32-bit type 'int'
Call Trace:
 __dump_stack lib/dump_stack.c:79 [inline]
 dump_stack+0x137/0x1be lib/dump_stack.c:120
 ubsan_epilogue lib/ubsan.c:148 [inline]
 __ubsan_handle_shift_out_of_bounds+0x432/0x4d0 lib/ubsan.c:395
 ext4_mb_init_backend fs/ext4/mballoc.c:2713 [inline]
 ext4_mb_init+0x19bc/0x19f0 fs/ext4/mballoc.c:2898
 ext4_fill_super+0xc2ec/0xfbe0 fs/ext4/super.c:4983

Reported-by: syzbot+a8b4b0c60155e87e9484@syzkaller.appspotmail.com
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210224095800.3350002-1-snovitoll@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 99bf091fee10..a02fadf4fc84 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2709,8 +2709,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	}
 
 	if (ext4_has_feature_flex_bg(sb)) {
-		/* a single flex group is supposed to be read by a single IO */
-		sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex,
+		/* a single flex group is supposed to be read by a single IO.
+		 * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
+		 * unsigned integer, so the maximum shift is 32.
+		 */
+		if (sbi->s_es->s_log_groups_per_flex >= 32) {
+			ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
+			goto err_freesgi;
+		}
+		sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
 			BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
 		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
 	} else {
-- 
cgit v1.2.3-71-gd317


From c915fb80eaa6194fa9bd0a4487705cd5b0dda2f1 Mon Sep 17 00:00:00 2001
From: Zhaolong Zhang <zhangzl2013@126.com>
Date: Tue, 2 Mar 2021 17:42:31 +0800
Subject: ext4: fix bh ref count on error paths

__ext4_journalled_writepage should drop bhs' ref count on error paths

Signed-off-by: Zhaolong Zhang <zhangzl2013@126.com>
Link: https://lore.kernel.org/r/1614678151-70481-1-git-send-email-zhangzl2013@126.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 650c5acd2f2d..a79a9ea58c56 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1938,13 +1938,13 @@ static int __ext4_journalled_writepage(struct page *page,
 	if (!ret)
 		ret = err;
 
-	if (!ext4_has_inline_data(inode))
-		ext4_walk_page_buffers(NULL, page_bufs, 0, len,
-				       NULL, bput_one);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
 	unlock_page(page);
 out_no_pagelock:
+	if (!inline_data && page_bufs)
+		ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+				       NULL, bput_one);
 	brelse(inode_bh);
 	return ret;
 }
-- 
cgit v1.2.3-71-gd317


From d30881f573e565ebb5dbb50b31ed6106b5c81328 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 18 Feb 2021 21:02:07 -0500
Subject: nfsd: Don't keep looking up unhashed files in the nfsd file cache

If a file is unhashed, then we're going to reject it anyway and retry,
so make sure we skip it when we're doing the RCU lockless lookup.
This avoids a number of unnecessary nfserr_jukebox returns from
nfsd_file_acquire()

Fixes: 65294c1f2c5e ("nfsd: add a new struct file caching facility to nfsd")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/filecache.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 53fcbf79bdca..7629248fdd53 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -898,6 +898,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
 			continue;
 		if (!nfsd_match_cred(nf->nf_cred, current_cred()))
 			continue;
+		if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+			continue;
 		if (nfsd_file_get(nf) != NULL)
 			return nf;
 	}
-- 
cgit v1.2.3-71-gd317


From 7005227369079963d25fb2d5d736d0feb2c44cf6 Mon Sep 17 00:00:00 2001
From: Julian Braha <julianbraha@gmail.com>
Date: Fri, 19 Feb 2021 16:56:10 -0500
Subject: fs: nfsd: fix kconfig dependency warning for NFSD_V4

When NFSD_V4 is enabled and CRYPTO is disabled,
Kbuild gives the following warning:

WARNING: unmet direct dependencies detected for CRYPTO_SHA256
  Depends on [n]: CRYPTO [=n]
  Selected by [y]:
  - NFSD_V4 [=y] && NETWORK_FILESYSTEMS [=y] && NFSD [=y] && PROC_FS [=y]

WARNING: unmet direct dependencies detected for CRYPTO_MD5
  Depends on [n]: CRYPTO [=n]
  Selected by [y]:
  - NFSD_V4 [=y] && NETWORK_FILESYSTEMS [=y] && NFSD [=y] && PROC_FS [=y]

This is because NFSD_V4 selects CRYPTO_MD5 and CRYPTO_SHA256,
without depending on or selecting CRYPTO, despite those config options
being subordinate to CRYPTO.

Signed-off-by: Julian Braha <julianbraha@gmail.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 821e5913faee..d6cff5fbe705 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -73,6 +73,7 @@ config NFSD_V4
 	select NFSD_V3
 	select FS_POSIX_ACL
 	select SUNRPC_GSS
+	select CRYPTO
 	select CRYPTO_MD5
 	select CRYPTO_SHA256
 	select GRACE_PERIOD
-- 
cgit v1.2.3-71-gd317


From bfdd89f232aa2de5a4b3fc985cba894148b830a8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 24 Feb 2021 13:39:50 -0500
Subject: nfsd: don't abort copies early

The typical result of the backwards comparison here is that the source
server in a server-to-server copy will return BAD_STATEID within a few
seconds of the copy starting, instead of giving the copy a full lease
period, so the copy_file_range() call will end up unnecessarily
returning a short read.

Fixes: 624322f1adc5 "NFSD add COPY_NOTIFY operation"
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 423fd6683f3a..61552e89bd89 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5389,7 +5389,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 	idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
 		cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid);
 		if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID &&
-				cps->cpntf_time > cutoff)
+				cps->cpntf_time < cutoff)
 			_free_cpntf_state_locked(nn, cps);
 	}
 	spin_unlock(&nn->s2s_cp_lock);
-- 
cgit v1.2.3-71-gd317


From 4aa5e002034f0701c3335379fd6c22d7f3338cce Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 8 Mar 2021 10:51:51 -0500
Subject: Revert "nfsd4: remove check_conflicting_opens warning"

This reverts commit 50747dd5e47b "nfsd4: remove check_conflicting_opens
warning", as a prerequisite for reverting 94415b06eb8a, which has a
serious bug.

Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 61552e89bd89..482e241c9639 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4952,6 +4952,7 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
 		writes--;
 	if (fp->fi_fds[O_RDWR])
 		writes--;
+	WARN_ON_ONCE(writes < 0);
 	if (writes > 0)
 		return -EAGAIN;
 	spin_lock(&fp->fi_lock);
-- 
cgit v1.2.3-71-gd317


From 6ee65a773096ab3f39d9b00311ac983be5bdeb7c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 8 Mar 2021 10:52:29 -0500
Subject: Revert "nfsd4: a client's own opens needn't prevent delegations"

This reverts commit 94415b06eb8aed13481646026dc995f04a3a534a.

That commit claimed to allow a client to get a read delegation when it
was the only writer.  Actually it allowed a client to get a read
delegation when *any* client has a write open!

The main problem is that it's depending on nfs4_clnt_odstate structures
that are actually only maintained for pnfs exports.

This causes clients to miss writes performed by other clients, even when
there have been intervening closes and opens, violating close-to-open
cache consistency.

We can do this a different way, but first we should just revert this.

I've added pynfs 4.1 test DELEG19 to test for this, as I should have
done originally!

Cc: stable@vger.kernel.org
Reported-by: Timo Rothenpieler <timo@rothenpieler.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/locks.c          |  3 ---
 fs/nfsd/nfs4state.c | 54 ++++++++++++++---------------------------------------
 2 files changed, 14 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 99ca97e81b7a..6125d2de39b8 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1808,9 +1808,6 @@ check_conflicting_open(struct file *filp, const long arg, int flags)
 
 	if (flags & FL_LAYOUT)
 		return 0;
-	if (flags & FL_DELEG)
-		/* We leave these checks to the caller. */
-		return 0;
 
 	if (arg == F_RDLCK)
 		return inode_is_open_for_write(inode) ? -EAGAIN : 0;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 482e241c9639..97447a64bad0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4940,32 +4940,6 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
 	return fl;
 }
 
-static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
-						struct nfs4_file *fp)
-{
-	struct nfs4_clnt_odstate *co;
-	struct file *f = fp->fi_deleg_file->nf_file;
-	struct inode *ino = locks_inode(f);
-	int writes = atomic_read(&ino->i_writecount);
-
-	if (fp->fi_fds[O_WRONLY])
-		writes--;
-	if (fp->fi_fds[O_RDWR])
-		writes--;
-	WARN_ON_ONCE(writes < 0);
-	if (writes > 0)
-		return -EAGAIN;
-	spin_lock(&fp->fi_lock);
-	list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) {
-		if (co->co_client != clp) {
-			spin_unlock(&fp->fi_lock);
-			return -EAGAIN;
-		}
-	}
-	spin_unlock(&fp->fi_lock);
-	return 0;
-}
-
 static struct nfs4_delegation *
 nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 		    struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
@@ -4985,12 +4959,9 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 
 	nf = find_readable_file(fp);
 	if (!nf) {
-		/*
-		 * We probably could attempt another open and get a read
-		 * delegation, but for now, don't bother until the
-		 * client actually sends us one.
-		 */
-		return ERR_PTR(-EAGAIN);
+		/* We should always have a readable file here */
+		WARN_ON_ONCE(1);
+		return ERR_PTR(-EBADF);
 	}
 	spin_lock(&state_lock);
 	spin_lock(&fp->fi_lock);
@@ -5020,19 +4991,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 	if (!fl)
 		goto out_clnt_odstate;
 
-	status = nfsd4_check_conflicting_opens(clp, fp);
-	if (status) {
-		locks_free_lock(fl);
-		goto out_clnt_odstate;
-	}
 	status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
 	if (fl)
 		locks_free_lock(fl);
 	if (status)
 		goto out_clnt_odstate;
-	status = nfsd4_check_conflicting_opens(clp, fp);
-	if (status)
-		goto out_clnt_odstate;
 
 	spin_lock(&state_lock);
 	spin_lock(&fp->fi_lock);
@@ -5114,6 +5077,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
 				goto out_no_deleg;
 			if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
 				goto out_no_deleg;
+			/*
+			 * Also, if the file was opened for write or
+			 * create, there's a good chance the client's
+			 * about to write to it, resulting in an
+			 * immediate recall (since we don't support
+			 * write delegations):
+			 */
+			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+				goto out_no_deleg;
+			if (open->op_create == NFS4_OPEN_CREATE)
+				goto out_no_deleg;
 			break;
 		default:
 			goto out_no_deleg;
-- 
cgit v1.2.3-71-gd317


From 5808fecc572391867fcd929662b29c12e6d08d81 Mon Sep 17 00:00:00 2001
From: Ritesh Harjani <riteshh@linux.ibm.com>
Date: Tue, 9 Mar 2021 09:29:11 -0800
Subject: iomap: Fix negative assignment to unsigned sis->pages in
 iomap_swapfile_activate

In case if isi.nr_pages is 0, we are making sis->pages (which is
unsigned int) a huge value in iomap_swapfile_activate() by assigning -1.
This could cause a kernel crash in kernel v4.18 (with below signature).
Or could lead to unknown issues on latest kernel if the fake big swap gets
used.

Fix this issue by returning -EINVAL in case of nr_pages is 0, since it
is anyway a invalid swapfile. Looks like this issue will be hit when
we have pagesize < blocksize type of configuration.

I was able to hit the issue in case of a tiny swap file with below
test script.
https://raw.githubusercontent.com/riteshharjani/LinuxStudy/master/scripts/swap-issue.sh

kernel crash analysis on v4.18
==============================
On v4.18 kernel, it causes a kernel panic, since sis->pages becomes
a huge value and isi.nr_extents is 0. When 0 is returned it is
considered as a swapfile over NFS and SWP_FILE is set (sis->flags |= SWP_FILE).
Then when swapoff was getting called it was calling a_ops->swap_deactivate()
if (sis->flags & SWP_FILE) is true. Since a_ops->swap_deactivate() is
NULL in case of XFS, it causes below panic.

Panic signature on v4.18 kernel:
=======================================
root@qemu:/home/qemu# [ 8291.723351] XFS (loop2): Unmounting Filesystem
[ 8292.123104] XFS (loop2): Mounting V5 Filesystem
[ 8292.132451] XFS (loop2): Ending clean mount
[ 8292.263362] Adding 4294967232k swap on /mnt1/test/swapfile.  Priority:-2 extents:1 across:274877906880k
[ 8292.277834] Unable to handle kernel paging request for instruction fetch
[ 8292.278677] Faulting instruction address: 0x00000000
cpu 0x19: Vector: 400 (Instruction Access) at [c0000009dd5b7ad0]
    pc: 0000000000000000
    lr: c0000000003eb9dc: destroy_swap_extents+0xfc/0x120
    sp: c0000009dd5b7d50
   msr: 8000000040009033
  current = 0xc0000009b6710080
  paca    = 0xc00000003ffcb280   irqmask: 0x03   irq_happened: 0x01
    pid   = 5604, comm = swapoff
Linux version 4.18.0 (riteshh@xxxxxxx) (gcc version 8.4.0 (Ubuntu 8.4.0-1ubuntu1~18.04)) #57 SMP Wed Mar 3 01:33:04 CST 2021
enter ? for help
[link register   ] c0000000003eb9dc destroy_swap_extents+0xfc/0x120
[c0000009dd5b7d50] c0000000025a7058 proc_poll_event+0x0/0x4 (unreliable)
[c0000009dd5b7da0] c0000000003f0498 sys_swapoff+0x3f8/0x910
[c0000009dd5b7e30] c00000000000bbe4 system_call+0x5c/0x70
Exception: c01 (System Call) at 00007ffff7d208d8

Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
[djwong: rework the comment to provide more details]
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/swapfile.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index a648dbf6991e..a5e478de1417 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -170,6 +170,16 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
 			return ret;
 	}
 
+	/*
+	 * If this swapfile doesn't contain even a single page-aligned
+	 * contiguous range of blocks, reject this useless swapfile to
+	 * prevent confusion later on.
+	 */
+	if (isi.nr_pages == 0) {
+		pr_warn("swapon: Cannot find a single usable page in file.\n");
+		return -EINVAL;
+	}
+
 	*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
 	sis->max = isi.nr_pages;
 	sis->pages = isi.nr_pages - 1;
-- 
cgit v1.2.3-71-gd317


From b5a08423da9da59c7f38ed8dbb6dd6cbbe9024a4 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Tue, 2 Mar 2021 09:32:52 -0800
Subject: xfs: fix quota accounting when a mount is idmapped

Nowadays, we indirectly use the idmap-aware helper functions in the VFS
to set the initial uid and gid of a file being created.  Unfortunately,
we didn't convert the quota code, which means we attach the wrong dquots
to files created on an idmapped mount.

Fixes: f736d93d76d3 ("xfs: support idmapped mounts")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_inode.c   | 14 ++++++++------
 fs/xfs/xfs_symlink.c |  3 ++-
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 46a861d55e48..f93370bd7b1e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1007,9 +1007,10 @@ xfs_create(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-					XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-					&udqp, &gdqp, &pdqp);
+	error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
+			fsgid_into_mnt(mnt_userns), prid,
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+			&udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
 
@@ -1157,9 +1158,10 @@ xfs_create_tmpfile(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-				XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-				&udqp, &gdqp, &pdqp);
+	error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
+			fsgid_into_mnt(mnt_userns), prid,
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+			&udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 1379013d74b8..7f368b10ded1 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -182,7 +182,8 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
+			fsgid_into_mnt(mnt_userns), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
-- 
cgit v1.2.3-71-gd317


From 614c9750173e412663728215152cc6d12bcb3425 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 9 Mar 2021 09:41:14 -0500
Subject: NFSD: fix dest to src mount in inter-server COPY

A cleanup of the inter SSC copy needs to call fput() of the source
file handle to make sure that file structure is freed as well as
drop the reference on the superblock to unmount the source server.

Fixes: 36e1e5ba90fb ("NFSD: Fix use-after-free warning when doing inter-server copy")
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index acdb3cd806a1..dd9f38d072dd 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1302,7 +1302,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
 			struct nfsd_file *dst)
 {
 	nfs42_ssc_close(src->nf_file);
-	/* 'src' is freed by nfsd4_do_async_copy */
+	fput(src->nf_file);
 	nfsd_file_put(dst);
 	mntput(ss_mnt);
 }
-- 
cgit v1.2.3-71-gd317


From f053cf7aa66cd9d592b0fc967f4d887c2abff1b7 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 2 Mar 2021 12:04:19 -0800
Subject: ext4: fix error handling in ext4_end_enable_verity()

ext4 didn't properly clean up if verity failed to be enabled on a file:

- It left verity metadata (pages past EOF) in the page cache, which
  would be exposed to userspace if the file was later extended.

- It didn't truncate the verity metadata at all (either from cache or
  from disk) if an error occurred while setting the verity bit.

Fix these bugs by adding a call to truncate_inode_pages() and ensuring
that we truncate the verity metadata (both from cache and from disk) in
all error paths.  Also rework the code to cleanly separate the success
path from the error paths, which makes it much easier to understand.

Reported-by: Yunlei He <heyunlei@hihonor.com>
Fixes: c93d8f885809 ("ext4: add basic fs-verity support")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20210302200420.137977-2-ebiggers@kernel.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/verity.c | 89 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 5b7ba8f71153..00e3cbde472e 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -201,55 +201,76 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
 	struct inode *inode = file_inode(filp);
 	const int credits = 2; /* superblock and inode for ext4_orphan_del() */
 	handle_t *handle;
+	struct ext4_iloc iloc;
 	int err = 0;
-	int err2;
 
-	if (desc != NULL) {
-		/* Succeeded; write the verity descriptor. */
-		err = ext4_write_verity_descriptor(inode, desc, desc_size,
-						   merkle_tree_size);
-
-		/* Write all pages before clearing VERITY_IN_PROGRESS. */
-		if (!err)
-			err = filemap_write_and_wait(inode->i_mapping);
-	}
+	/*
+	 * If an error already occurred (which fs/verity/ signals by passing
+	 * desc == NULL), then only clean-up is needed.
+	 */
+	if (desc == NULL)
+		goto cleanup;
 
-	/* If we failed, truncate anything we wrote past i_size. */
-	if (desc == NULL || err)
-		ext4_truncate(inode);
+	/* Append the verity descriptor. */
+	err = ext4_write_verity_descriptor(inode, desc, desc_size,
+					   merkle_tree_size);
+	if (err)
+		goto cleanup;
 
 	/*
-	 * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
-	 * deleting the inode from the orphan list, even if something failed.
-	 * If everything succeeded, we'll also set the verity bit in the same
-	 * transaction.
+	 * Write all pages (both data and verity metadata).  Note that this must
+	 * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages
+	 * beyond i_size won't be written properly.  For crash consistency, this
+	 * also must happen before the verity inode flag gets persisted.
 	 */
+	err = filemap_write_and_wait(inode->i_mapping);
+	if (err)
+		goto cleanup;
 
-	ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+	/*
+	 * Finally, set the verity inode flag and remove the inode from the
+	 * orphan list (in a single transaction).
+	 */
 
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
 	if (IS_ERR(handle)) {
-		ext4_orphan_del(NULL, inode);
-		return PTR_ERR(handle);
+		err = PTR_ERR(handle);
+		goto cleanup;
 	}
 
-	err2 = ext4_orphan_del(handle, inode);
-	if (err2)
-		goto out_stop;
+	err = ext4_orphan_del(handle, inode);
+	if (err)
+		goto stop_and_cleanup;
 
-	if (desc != NULL && !err) {
-		struct ext4_iloc iloc;
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto stop_and_cleanup;
 
-		err = ext4_reserve_inode_write(handle, inode, &iloc);
-		if (err)
-			goto out_stop;
-		ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
-		ext4_set_inode_flags(inode, false);
-		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-	}
-out_stop:
+	ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+	ext4_set_inode_flags(inode, false);
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	if (err)
+		goto stop_and_cleanup;
+
+	ext4_journal_stop(handle);
+
+	ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+	return 0;
+
+stop_and_cleanup:
 	ext4_journal_stop(handle);
-	return err ?: err2;
+cleanup:
+	/*
+	 * Verity failed to be enabled, so clean up by truncating any verity
+	 * metadata that was written beyond i_size (both from cache and from
+	 * disk), removing the inode from the orphan list (if it wasn't done
+	 * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS.
+	 */
+	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	ext4_truncate(inode);
+	ext4_orphan_del(NULL, inode);
+	ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+	return err;
 }
 
 static int ext4_get_verity_descriptor_location(struct inode *inode,
-- 
cgit v1.2.3-71-gd317


From b4250dd868d1b42c0a65de11ef3afbee67ba5d2f Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Thu, 11 Mar 2021 10:55:00 -0500
Subject: NFSD: fix error handling in NFSv4.0 callbacks

When the server tries to do a callback and a client fails it due to
authentication problems, we need the server to set callback down
flag in RENEW so that client can recover.

Suggested-by: Bruce Fields <bfields@redhat.com>
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Link: https://lore.kernel.org/linux-nfs/FB84E90A-1A03-48B3-8BF7-D9D10AC2C9FE@oracle.com/T/#t
---
 fs/nfsd/nfs4callback.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 052be5bf9ef5..7325592b456e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1189,6 +1189,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 		switch (task->tk_status) {
 		case -EIO:
 		case -ETIMEDOUT:
+		case -EACCES:
 			nfsd4_mark_cb_down(clp, task->tk_status);
 		}
 		break;
-- 
cgit v1.2.3-71-gd317


From 16efa4fce3b7af17bb45d635c3e89992d721e0f3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 12 Mar 2021 20:26:13 -0700
Subject: io_uring: allow IO worker threads to be frozen

With the freezer using the proper signaling to notify us of when it's
time to freeze a thread, we can re-enable normal freezer usage for the
IO threads. Ensure that SQPOLL, io-wq, and the io-wq manager call
try_to_freeze() appropriately, and remove the default setting of
PF_NOFREEZE from create_io_thread().

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 6 +++++-
 fs/io_uring.c | 1 +
 kernel/fork.c | 1 -
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 0ae9ecadf295..e05f996d088f 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -488,6 +488,8 @@ static int io_wqe_worker(void *data)
 	set_task_comm(current, buf);
 
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+		long ret;
+
 		set_current_state(TASK_INTERRUPTIBLE);
 loop:
 		raw_spin_lock_irq(&wqe->lock);
@@ -498,7 +500,8 @@ loop:
 		__io_worker_idle(wqe, worker);
 		raw_spin_unlock_irq(&wqe->lock);
 		io_flush_signals();
-		if (schedule_timeout(WORKER_IDLE_TIMEOUT))
+		ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
+		if (try_to_freeze() || ret)
 			continue;
 		if (fatal_signal_pending(current))
 			break;
@@ -709,6 +712,7 @@ static int io_wq_manager(void *data)
 		set_current_state(TASK_INTERRUPTIBLE);
 		io_wq_check_workers(wq);
 		schedule_timeout(HZ);
+		try_to_freeze();
 		if (fatal_signal_pending(current))
 			set_bit(IO_WQ_BIT_EXIT, &wq->state);
 	} while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a4bce17af506..05adc4887ef3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6752,6 +6752,7 @@ static int io_sq_thread(void *data)
 
 			up_read(&sqd->rw_lock);
 			schedule();
+			try_to_freeze();
 			down_read(&sqd->rw_lock);
 			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 				io_ring_clear_wakeup_flag(ctx);
diff --git a/kernel/fork.c b/kernel/fork.c
index 72e444cd0ffe..d3171e8e88e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2436,7 +2436,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 	if (!IS_ERR(tsk)) {
 		sigfillset(&tsk->blocked);
 		sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
-		tsk->flags |= PF_NOFREEZE;
 	}
 	return tsk;
 }
-- 
cgit v1.2.3-71-gd317


From 9e15c3a0ced5a61f320b989072c24983cb1620c1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 13 Mar 2021 12:29:43 -0700
Subject: io_uring: convert io_buffer_idr to XArray

Like we did for the personality idr, convert the IO buffer idr to use
XArray. This avoids a use-after-free on removal of entries, since idr
doesn't like doing so from inside an iterator, and it nicely reduces
the amount of code we need to support this feature.

Fixes: 5a2e745d4d43 ("io_uring: buffer registration infrastructure")
Cc: stable@vger.kernel.org
Cc: Matthew Wilcox <willy@infradead.org>
Cc: yangerkun <yangerkun@huawei.com>
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 43 +++++++++++++++----------------------------
 1 file changed, 15 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 05adc4887ef3..58d62dd9f8e4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -402,7 +402,7 @@ struct io_ring_ctx {
 	struct socket		*ring_sock;
 #endif
 
-	struct idr		io_buffer_idr;
+	struct xarray		io_buffers;
 
 	struct xarray		personalities;
 	u32			pers_next;
@@ -1135,7 +1135,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->cq_wait);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	init_completion(&ctx->ref_comp);
-	idr_init(&ctx->io_buffer_idr);
+	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
@@ -2843,7 +2843,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
 
 	lockdep_assert_held(&req->ctx->uring_lock);
 
-	head = idr_find(&req->ctx->io_buffer_idr, bgid);
+	head = xa_load(&req->ctx->io_buffers, bgid);
 	if (head) {
 		if (!list_empty(&head->list)) {
 			kbuf = list_last_entry(&head->list, struct io_buffer,
@@ -2851,7 +2851,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
 			list_del(&kbuf->list);
 		} else {
 			kbuf = head;
-			idr_remove(&req->ctx->io_buffer_idr, bgid);
+			xa_erase(&req->ctx->io_buffers, bgid);
 		}
 		if (*len > kbuf->len)
 			*len = kbuf->len;
@@ -3892,7 +3892,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
 	}
 	i++;
 	kfree(buf);
-	idr_remove(&ctx->io_buffer_idr, bgid);
+	xa_erase(&ctx->io_buffers, bgid);
 
 	return i;
 }
@@ -3910,7 +3910,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 	lockdep_assert_held(&ctx->uring_lock);
 
 	ret = -ENOENT;
-	head = idr_find(&ctx->io_buffer_idr, p->bgid);
+	head = xa_load(&ctx->io_buffers, p->bgid);
 	if (head)
 		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
 	if (ret < 0)
@@ -3993,21 +3993,14 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 
 	lockdep_assert_held(&ctx->uring_lock);
 
-	list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
+	list = head = xa_load(&ctx->io_buffers, p->bgid);
 
 	ret = io_add_buffers(p, &head);
-	if (ret < 0)
-		goto out;
-
-	if (!list) {
-		ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
-					GFP_KERNEL);
-		if (ret < 0) {
+	if (ret >= 0 && !list) {
+		ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
+		if (ret < 0)
 			__io_remove_buffers(ctx, head, p->bgid, -1U);
-			goto out;
-		}
 	}
-out:
 	if (ret < 0)
 		req_set_fail_links(req);
 
@@ -8333,19 +8326,13 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 	return -ENXIO;
 }
 
-static int __io_destroy_buffers(int id, void *p, void *data)
-{
-	struct io_ring_ctx *ctx = data;
-	struct io_buffer *buf = p;
-
-	__io_remove_buffers(ctx, buf, id, -1U);
-	return 0;
-}
-
 static void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
-	idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
-	idr_destroy(&ctx->io_buffer_idr);
+	struct io_buffer *buf;
+	unsigned long index;
+
+	xa_for_each(&ctx->io_buffers, index, buf)
+		__io_remove_buffers(ctx, buf, index, -1U);
 }
 
 static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
-- 
cgit v1.2.3-71-gd317


From 5171317dfd9afcf729799d31fffdbb9e71e45402 Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Wed, 10 Mar 2021 10:22:27 +0000
Subject: cifs: update new ACE pointer after populate_new_aces.

After the fix for retaining externally set ACEs with cifsacl and
modefromsid,idsfromsid, there was an issue in populating the
inherited ACEs after setting the ACEs introduced by these two modes.
Fixed this by updating the ACE pointer again after the call to
populate_new_aces.

Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Reviewed-by: Rohith Surabattula <rohiths@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/cifsacl.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 9d29eb9660c2..2be22a5c690f 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1118,7 +1118,6 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
 	/* Retain old ACEs which we can retain */
 	for (i = 0; i < src_num_aces; ++i) {
 		pntace = (struct cifs_ace *) (acl_base + size);
-		pnntace = (struct cifs_ace *) (nacl_base + nsize);
 
 		if (!new_aces_set && (pntace->flags & INHERITED_ACE)) {
 			/* Place the new ACEs in between existing explicit and inherited */
@@ -1131,14 +1130,18 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
 		}
 
 		/* If it's any one of the ACE we're replacing, skip! */
-		if ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) ||
+		if (!mode_from_sid &&
+				((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) ||
 				(compare_sids(&pntace->sid, pownersid) == 0) ||
 				(compare_sids(&pntace->sid, pgrpsid) == 0) ||
 				(compare_sids(&pntace->sid, &sid_everyone) == 0) ||
-				(compare_sids(&pntace->sid, &sid_authusers) == 0)) {
+				(compare_sids(&pntace->sid, &sid_authusers) == 0))) {
 			goto next_ace;
 		}
 
+		/* update the pointer to the next ACE to populate*/
+		pnntace = (struct cifs_ace *) (nacl_base + nsize);
+
 		nsize += cifs_copy_ace(pnntace, pntace, NULL);
 		num_aces++;
 
-- 
cgit v1.2.3-71-gd317


From 05946d4b7a7349ae58bfa2d51ae832e64a394c2d Mon Sep 17 00:00:00 2001
From: Vincent Whitchurch <vincent.whitchurch@axis.com>
Date: Wed, 10 Mar 2021 13:20:40 +0100
Subject: cifs: Fix preauth hash corruption

smb311_update_preauth_hash() uses the shash in server->secmech without
appropriate locking, and this can lead to sessions corrupting each
other's preauth hashes.

The following script can easily trigger the problem:

	#!/bin/sh -e

	NMOUNTS=10
	for i in $(seq $NMOUNTS);
		mkdir -p /tmp/mnt$i
		umount /tmp/mnt$i 2>/dev/null || :
	done
	while :; do
		for i in $(seq $NMOUNTS); do
			mount -t cifs //192.168.0.1/test /tmp/mnt$i -o ... &
		done
		wait
		for i in $(seq $NMOUNTS); do
			umount /tmp/mnt$i
		done
	done

Usually within seconds this leads to one or more of the mounts failing
with the following errors, and a "Bad SMB2 signature for message" is
seen in the server logs:

 CIFS: VFS: \\192.168.0.1 failed to connect to IPC (rc=-13)
 CIFS: VFS: cifs_mount failed w/return code = -13

Fix it by holding the server mutex just like in the other places where
the shashes are used.

Fixes: 8bd68c6e47abff34e4 ("CIFS: implement v3.11 preauth integrity")
Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
CC: <stable@vger.kernel.org>
Reviewed-by: Aurelien Aptel <aaptel@suse.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/transport.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 007d99437c77..c1725b55f364 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -1196,9 +1196,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 	/*
 	 * Compounding is never used during session establish.
 	 */
-	if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP))
+	if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
+		mutex_lock(&server->srv_mutex);
 		smb311_update_preauth_hash(ses, rqst[0].rq_iov,
 					   rqst[0].rq_nvec);
+		mutex_unlock(&server->srv_mutex);
+	}
 
 	for (i = 0; i < num_rqst; i++) {
 		rc = wait_for_response(server, midQ[i]);
@@ -1266,7 +1269,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 			.iov_base = resp_iov[0].iov_base,
 			.iov_len = resp_iov[0].iov_len
 		};
+		mutex_lock(&server->srv_mutex);
 		smb311_update_preauth_hash(ses, &iov, 1);
+		mutex_unlock(&server->srv_mutex);
 	}
 
 out:
-- 
cgit v1.2.3-71-gd317


From efe814a471e0e58f28f1efaf430c8784a4f36626 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 14 Mar 2021 20:57:08 +0000
Subject: io_uring: fix ->flags races by linked timeouts

It's racy to modify req->flags from a not owning context, e.g. linked
timeout calling req_set_fail_links() for the master request might race
with that request setting/clearing flags while being executed
concurrently. Just remove req_set_fail_links(prev) from
io_link_timeout_fn(), io_async_find_and_cancel() and functions down the
line take care of setting the fail bit.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 58d62dd9f8e4..217f72d50ff5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6197,7 +6197,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	if (prev) {
-		req_set_fail_links(prev);
 		io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
 		io_put_req_deferred(prev, 1);
 	} else {
-- 
cgit v1.2.3-71-gd317


From 180f829fe4026bd192447d261e712b6cb84f6202 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 14 Mar 2021 20:57:09 +0000
Subject: io_uring: fix complete_post use ctx after free

If io_req_complete_post() put not a final ref, we can't rely on the
request's ctx ref, and so ctx may potentially be freed while
complete_post() is in io_cqring_ev_posted()/etc.

In that case get an additional ctx reference, and put it in the end, so
protecting following io_cqring_ev_posted(). And also prolong ctx
lifetime until spin_unlock happens, as we do with mutexes, so added
percpu_ref_get() doesn't race with ctx free.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 217f72d50ff5..f8a683607b25 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1550,14 +1550,17 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
 		io_put_task(req->task, 1);
 		list_add(&req->compl.list, &cs->locked_free_list);
 		cs->locked_free_nr++;
-	} else
-		req = NULL;
+	} else {
+		if (!percpu_ref_tryget(&ctx->refs))
+			req = NULL;
+	}
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	io_cqring_ev_posted(ctx);
 
-	if (req)
+	if (req) {
+		io_cqring_ev_posted(ctx);
 		percpu_ref_put(&ctx->refs);
+	}
 }
 
 static void io_req_complete_state(struct io_kiocb *req, long res,
@@ -8373,11 +8376,13 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	/*
 	 * Some may use context even when all refs and requests have been put,
-	 * and they are free to do so while still holding uring_lock, see
-	 * __io_req_task_submit(). Wait for them to finish.
+	 * and they are free to do so while still holding uring_lock or
+	 * completion_lock, see __io_req_task_submit(). Wait for them to finish.
 	 */
 	mutex_lock(&ctx->uring_lock);
 	mutex_unlock(&ctx->uring_lock);
+	spin_lock_irq(&ctx->completion_lock);
+	spin_unlock_irq(&ctx->completion_lock);
 
 	io_sq_thread_finish(ctx);
 	io_sqe_buffers_unregister(ctx);
-- 
cgit v1.2.3-71-gd317


From 09a6f4efaa6536e760385f949e24078fd78305ad Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 14 Mar 2021 20:57:10 +0000
Subject: io_uring: replace sqd rw_semaphore with mutex

The only user of read-locking of sqd->rw_lock is sq_thread itself, which
is by definition alone, so we don't really need rw_semaphore, but mutex
will do. Replace it with a mutex, and kill read-to-write upgrading and
extra task_work handling in io_sq_thread().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8a683607b25..6de779aafd33 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -258,7 +258,7 @@ enum {
 
 struct io_sq_data {
 	refcount_t		refs;
-	struct rw_semaphore	rw_lock;
+	struct mutex		lock;
 
 	/* ctx's that are using this sqd */
 	struct list_head	ctx_list;
@@ -6689,16 +6689,15 @@ static int io_sq_thread(void *data)
 		set_cpus_allowed_ptr(current, cpu_online_mask);
 	current->flags |= PF_NO_SETAFFINITY;
 
-	down_read(&sqd->rw_lock);
-
+	mutex_lock(&sqd->lock);
 	while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
 		int ret;
 		bool cap_entries, sqt_spin, needs_sched;
 
 		if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
-			up_read(&sqd->rw_lock);
+			mutex_unlock(&sqd->lock);
 			cond_resched();
-			down_read(&sqd->rw_lock);
+			mutex_lock(&sqd->lock);
 			io_run_task_work();
 			timeout = jiffies + sqd->sq_thread_idle;
 			continue;
@@ -6745,10 +6744,10 @@ static int io_sq_thread(void *data)
 			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 				io_ring_set_wakeup_flag(ctx);
 
-			up_read(&sqd->rw_lock);
+			mutex_unlock(&sqd->lock);
 			schedule();
 			try_to_freeze();
-			down_read(&sqd->rw_lock);
+			mutex_lock(&sqd->lock);
 			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 				io_ring_clear_wakeup_flag(ctx);
 		}
@@ -6756,20 +6755,13 @@ static int io_sq_thread(void *data)
 		finish_wait(&sqd->wait, &wait);
 		timeout = jiffies + sqd->sq_thread_idle;
 	}
-	up_read(&sqd->rw_lock);
-	down_write(&sqd->rw_lock);
-	/*
-	 * someone may have parked and added a cancellation task_work, run
-	 * it first because we don't want it in io_uring_cancel_sqpoll()
-	 */
-	io_run_task_work();
 
 	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 		io_uring_cancel_sqpoll(ctx);
 	sqd->thread = NULL;
 	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 		io_ring_set_wakeup_flag(ctx);
-	up_write(&sqd->rw_lock);
+	mutex_unlock(&sqd->lock);
 
 	io_run_task_work();
 	complete(&sqd->exited);
@@ -7071,21 +7063,21 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 }
 
 static void io_sq_thread_unpark(struct io_sq_data *sqd)
-	__releases(&sqd->rw_lock)
+	__releases(&sqd->lock)
 {
 	WARN_ON_ONCE(sqd->thread == current);
 
 	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
-	up_write(&sqd->rw_lock);
+	mutex_unlock(&sqd->lock);
 }
 
 static void io_sq_thread_park(struct io_sq_data *sqd)
-	__acquires(&sqd->rw_lock)
+	__acquires(&sqd->lock)
 {
 	WARN_ON_ONCE(sqd->thread == current);
 
 	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
-	down_write(&sqd->rw_lock);
+	mutex_lock(&sqd->lock);
 	/* set again for consistency, in case concurrent parks are happening */
 	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
 	if (sqd->thread)
@@ -7096,11 +7088,11 @@ static void io_sq_thread_stop(struct io_sq_data *sqd)
 {
 	WARN_ON_ONCE(sqd->thread == current);
 
-	down_write(&sqd->rw_lock);
+	mutex_lock(&sqd->lock);
 	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
 	if (sqd->thread)
 		wake_up_process(sqd->thread);
-	up_write(&sqd->rw_lock);
+	mutex_unlock(&sqd->lock);
 	wait_for_completion(&sqd->exited);
 }
 
@@ -7182,7 +7174,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
 
 	refcount_set(&sqd->refs, 1);
 	INIT_LIST_HEAD(&sqd->ctx_list);
-	init_rwsem(&sqd->rw_lock);
+	mutex_init(&sqd->lock);
 	init_waitqueue_head(&sqd->wait);
 	init_completion(&sqd->exited);
 	return sqd;
-- 
cgit v1.2.3-71-gd317


From f6d54255f4235448d4bbe442362d4caa62da97d5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 14 Mar 2021 20:57:11 +0000
Subject: io_uring: halt SQO submission on ctx exit

io_sq_thread_finish() is called in io_ring_ctx_free(), so SQPOLL task is
potentially running submitting new requests. It's not a disaster because
of using a "try" variant of percpu_ref_get, but is far from nice.

Remove ctx from the sqd ctx list earlier, before cancellation loop, so
SQPOLL can't find it and so won't submit new requests.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6de779aafd33..b87012a21775 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8564,6 +8564,14 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 		io_unregister_personality(ctx, index);
 	mutex_unlock(&ctx->uring_lock);
 
+	/* prevent SQPOLL from submitting new requests */
+	if (ctx->sq_data) {
+		io_sq_thread_park(ctx->sq_data);
+		list_del_init(&ctx->sqd_list);
+		io_sqd_update_thread_idle(ctx->sq_data);
+		io_sq_thread_unpark(ctx->sq_data);
+	}
+
 	io_kill_timeouts(ctx, NULL, NULL);
 	io_poll_remove_all(ctx, NULL, NULL);
 
-- 
cgit v1.2.3-71-gd317


From 9e138a48345427fa42f6076396ea069cebf3c08f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 14 Mar 2021 20:57:12 +0000
Subject: io_uring: fix concurrent parking

If io_sq_thread_park() of one task got rescheduled right after
set_bit(), before it gets back to mutex_lock() there can happen
park()/unpark() by another task with SQPOLL locking again and
continuing running never seeing that first set_bit(SHOULD_PARK),
so won't even try to put the mutex down for parking.

It will get parked eventually when SQPOLL drops the lock for reschedule,
but may be problematic and will get in the way of further fixes.

Account number of tasks waiting for parking with a new atomic variable
park_pending and adjust SHOULD_PARK accordingly. It doesn't entirely
replaces SHOULD_PARK bit with this atomic var because it's convenient
to have it as a bit in the state and will help to do optimisations
later.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b87012a21775..70ceb8ed5950 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -258,6 +258,7 @@ enum {
 
 struct io_sq_data {
 	refcount_t		refs;
+	atomic_t		park_pending;
 	struct mutex		lock;
 
 	/* ctx's that are using this sqd */
@@ -7067,7 +7068,13 @@ static void io_sq_thread_unpark(struct io_sq_data *sqd)
 {
 	WARN_ON_ONCE(sqd->thread == current);
 
+	/*
+	 * Do the dance but not conditional clear_bit() because it'd race with
+	 * other threads incrementing park_pending and setting the bit.
+	 */
 	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+	if (atomic_dec_return(&sqd->park_pending))
+		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
 	mutex_unlock(&sqd->lock);
 }
 
@@ -7076,10 +7083,9 @@ static void io_sq_thread_park(struct io_sq_data *sqd)
 {
 	WARN_ON_ONCE(sqd->thread == current);
 
+	atomic_inc(&sqd->park_pending);
 	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
 	mutex_lock(&sqd->lock);
-	/* set again for consistency, in case concurrent parks are happening */
-	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
 	if (sqd->thread)
 		wake_up_process(sqd->thread);
 }
@@ -7099,6 +7105,8 @@ static void io_sq_thread_stop(struct io_sq_data *sqd)
 static void io_put_sq_data(struct io_sq_data *sqd)
 {
 	if (refcount_dec_and_test(&sqd->refs)) {
+		WARN_ON_ONCE(atomic_read(&sqd->park_pending));
+
 		io_sq_thread_stop(sqd);
 		kfree(sqd);
 	}
@@ -7172,6 +7180,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
 	if (!sqd)
 		return ERR_PTR(-ENOMEM);
 
+	atomic_set(&sqd->park_pending, 0);
 	refcount_set(&sqd->refs, 1);
 	INIT_LIST_HEAD(&sqd->ctx_list);
 	mutex_init(&sqd->lock);
-- 
cgit v1.2.3-71-gd317


From 9b46571142e47503ed4f3ae3be5ed3968d8cb9cc Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Mar 2021 14:23:07 +0000
Subject: io_uring: add generic callback_head helpers

We already have helpers to run/add callback_head but taking ctx and
working with ctx->exit_task_work. Extract generic versions of them
implemented in terms of struct callback_head, it will be used later.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 62 ++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 70ceb8ed5950..5c2de43b99f5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1929,17 +1929,44 @@ static int io_req_task_work_add(struct io_kiocb *req)
 	return ret;
 }
 
-static void io_req_task_work_add_fallback(struct io_kiocb *req,
-					  task_work_func_t cb)
+static bool io_run_task_work_head(struct callback_head **work_head)
+{
+	struct callback_head *work, *next;
+	bool executed = false;
+
+	do {
+		work = xchg(work_head, NULL);
+		if (!work)
+			break;
+
+		do {
+			next = work->next;
+			work->func(work);
+			work = next;
+			cond_resched();
+		} while (work);
+		executed = true;
+	} while (1);
+
+	return executed;
+}
+
+static void io_task_work_add_head(struct callback_head **work_head,
+				  struct callback_head *task_work)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	struct callback_head *head;
 
-	init_task_work(&req->task_work, cb);
 	do {
-		head = READ_ONCE(ctx->exit_task_work);
-		req->task_work.next = head;
-	} while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
+		head = READ_ONCE(*work_head);
+		task_work->next = head;
+	} while (cmpxchg(work_head, head, task_work) != head);
+}
+
+static void io_req_task_work_add_fallback(struct io_kiocb *req,
+					  task_work_func_t cb)
+{
+	init_task_work(&req->task_work, cb);
+	io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work);
 }
 
 static void __io_req_task_cancel(struct io_kiocb *req, int error)
@@ -8471,26 +8498,9 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 	return -EINVAL;
 }
 
-static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
+static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
 {
-	struct callback_head *work, *next;
-	bool executed = false;
-
-	do {
-		work = xchg(&ctx->exit_task_work, NULL);
-		if (!work)
-			break;
-
-		do {
-			next = work->next;
-			work->func(work);
-			work = next;
-			cond_resched();
-		} while (work);
-		executed = true;
-	} while (1);
-
-	return executed;
+	return io_run_task_work_head(&ctx->exit_task_work);
 }
 
 struct io_tctx_exit {
-- 
cgit v1.2.3-71-gd317


From b7f5a0bfe2061b2c7b2164de06fa4072d7373a45 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Mar 2021 14:23:08 +0000
Subject: io_uring: fix sqpoll cancellation via task_work

Running sqpoll cancellations via task_work_run() is a bad idea because
it depends on other task works to be run, but those may be locked in
currently running task_work_run() because of how it's (splicing the list
in batches).

Enqueue and run them through a separate callback head, namely
struct io_sq_data::park_task_work. As a nice bonus we now precisely
control where it's run, that's much safer than guessing where it can
happen as it was before.

Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5c2de43b99f5..5f4e312111ea 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -274,6 +274,7 @@ struct io_sq_data {
 
 	unsigned long		state;
 	struct completion	exited;
+	struct callback_head	*park_task_work;
 };
 
 #define IO_IOPOLL_BATCH			8
@@ -6727,6 +6728,7 @@ static int io_sq_thread(void *data)
 			cond_resched();
 			mutex_lock(&sqd->lock);
 			io_run_task_work();
+			io_run_task_work_head(&sqd->park_task_work);
 			timeout = jiffies + sqd->sq_thread_idle;
 			continue;
 		}
@@ -6781,6 +6783,7 @@ static int io_sq_thread(void *data)
 		}
 
 		finish_wait(&sqd->wait, &wait);
+		io_run_task_work_head(&sqd->park_task_work);
 		timeout = jiffies + sqd->sq_thread_idle;
 	}
 
@@ -6792,6 +6795,7 @@ static int io_sq_thread(void *data)
 	mutex_unlock(&sqd->lock);
 
 	io_run_task_work();
+	io_run_task_work_head(&sqd->park_task_work);
 	complete(&sqd->exited);
 	do_exit(0);
 }
@@ -8890,7 +8894,7 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
 	if (task) {
 		init_completion(&work.completion);
 		init_task_work(&work.task_work, io_sqpoll_cancel_cb);
-		WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL));
+		io_task_work_add_head(&sqd->park_task_work, &work.task_work);
 		wake_up_process(task);
 	}
 	io_sq_thread_unpark(sqd);
-- 
cgit v1.2.3-71-gd317


From d336f7ebc65007f5831e2297e6f3383ae8dbf8ed Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Tue, 2 Mar 2021 09:32:53 -0800
Subject: xfs: force log and push AIL to clear pinned inodes when aborting
 mount

If we allocate quota inodes in the process of mounting a filesystem but
then decide to abort the mount, it's possible that the quota inodes are
sitting around pinned by the log.  Now that inode reclaim relies on the
AIL to flush inodes, we have to force the log and push the AIL in
between releasing the quota inodes and kicking off reclaim to tear down
all the incore inodes.  Do this by extracting the bits we need from the
unmount path and reusing them.  As an added bonus, failed writes during
a failed mount will not retry forever now.

This was originally found during a fuzz test of metadata directories
(xfs/1546), but the actual symptom was that reclaim hung up on the quota
inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_mount.c | 90 ++++++++++++++++++++++++++----------------------------
 1 file changed, 44 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 52370d0a3f43..1c97b155a8ee 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -634,6 +634,47 @@ xfs_check_summary_counts(
 	return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
 }
 
+/*
+ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+ * so we need to unpin them, write them back and/or reclaim them before unmount
+ * can proceed.
+ *
+ * An inode cluster that has been freed can have its buffer still pinned in
+ * memory because the transaction is still sitting in a iclog. The stale inodes
+ * on that buffer will be pinned to the buffer until the transaction hits the
+ * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
+ * may never see the pinned buffer, so nothing will push out the iclog and
+ * unpin the buffer.
+ *
+ * Hence we need to force the log to unpin everything first. However, log
+ * forces don't wait for the discards they issue to complete, so we have to
+ * explicitly wait for them to complete here as well.
+ *
+ * Then we can tell the world we are unmounting so that error handling knows
+ * that the filesystem is going away and we should error out anything that we
+ * have been retrying in the background.  This will prevent never-ending
+ * retries in AIL pushing from hanging the unmount.
+ *
+ * Finally, we can push the AIL to clean all the remaining dirty objects, then
+ * reclaim the remaining inodes that are still in memory at this point in time.
+ */
+static void
+xfs_unmount_flush_inodes(
+	struct xfs_mount	*mp)
+{
+	xfs_log_force(mp, XFS_LOG_SYNC);
+	xfs_extent_busy_wait_all(mp);
+	flush_workqueue(xfs_discard_wq);
+
+	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
+	xfs_ail_push_all_sync(mp->m_ail);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	xfs_reclaim_inodes(mp);
+	xfs_health_unmount(mp);
+}
+
 /*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
@@ -1008,7 +1049,7 @@ xfs_mountfs(
 	/* Clean out dquots that might be in memory after quotacheck. */
 	xfs_qm_unmount(mp);
 	/*
-	 * Cancel all delayed reclaim work and reclaim the inodes directly.
+	 * Flush all inode reclamation work and flush the log.
 	 * We have to do this /after/ rtunmount and qm_unmount because those
 	 * two will have scheduled delayed reclaim for the rt/quota inodes.
 	 *
@@ -1018,11 +1059,8 @@ xfs_mountfs(
 	 * qm_unmount_quotas and therefore rely on qm_unmount to release the
 	 * quota inodes.
 	 */
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	xfs_reclaim_inodes(mp);
-	xfs_health_unmount(mp);
+	xfs_unmount_flush_inodes(mp);
  out_log_dealloc:
-	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
 	xfs_log_mount_cancel(mp);
  out_fail_wait:
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -1063,47 +1101,7 @@ xfs_unmountfs(
 	xfs_rtunmount_inodes(mp);
 	xfs_irele(mp->m_rootip);
 
-	/*
-	 * We can potentially deadlock here if we have an inode cluster
-	 * that has been freed has its buffer still pinned in memory because
-	 * the transaction is still sitting in a iclog. The stale inodes
-	 * on that buffer will be pinned to the buffer until the
-	 * transaction hits the disk and the callbacks run. Pushing the AIL will
-	 * skip the stale inodes and may never see the pinned buffer, so
-	 * nothing will push out the iclog and unpin the buffer. Hence we
-	 * need to force the log here to ensure all items are flushed into the
-	 * AIL before we go any further.
-	 */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/*
-	 * Wait for all busy extents to be freed, including completion of
-	 * any discard operation.
-	 */
-	xfs_extent_busy_wait_all(mp);
-	flush_workqueue(xfs_discard_wq);
-
-	/*
-	 * We now need to tell the world we are unmounting. This will allow
-	 * us to detect that the filesystem is going away and we should error
-	 * out anything that we have been retrying in the background. This will
-	 * prevent neverending retries in AIL pushing from hanging the unmount.
-	 */
-	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
-
-	/*
-	 * Flush all pending changes from the AIL.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/*
-	 * Reclaim all inodes. At this point there should be no dirty inodes and
-	 * none should be pinned or locked. Stop background inode reclaim here
-	 * if it is still running.
-	 */
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	xfs_reclaim_inodes(mp);
-	xfs_health_unmount(mp);
+	xfs_unmount_flush_inodes(mp);
 
 	xfs_qm_unmount(mp);
 
-- 
cgit v1.2.3-71-gd317


From 8723d5ba8bdae1c41be7a6fc8469dc9aa551e7d0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 14 Mar 2021 10:59:39 -0700
Subject: xfs: also reject BULKSTAT_SINGLE in a mount user namespace

BULKSTAT_SINGLE exposed the ondisk uids/gids just like bulkstat, and can
be called on any inode, including ones not visible in the current mount.

Fixes: f736d93d76d3 ("xfs: support idmapped mounts")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_itable.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index ca310a125d1e..3498b97fb06d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -168,6 +168,12 @@ xfs_bulkstat_one(
 	};
 	int			error;
 
+	if (breq->mnt_userns != &init_user_ns) {
+		xfs_warn_ratelimited(breq->mp,
+			"bulkstat not supported inside of idmapped mounts.");
+		return -EINVAL;
+	}
+
 	ASSERT(breq->icount == 1);
 
 	bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
-- 
cgit v1.2.3-71-gd317


From d2dcc8ed8ec650a793e81d8b2222146eb6ddd84f Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 8 Mar 2021 17:20:17 +0800
Subject: btrfs: fix wrong offset to zero out range beyond i_size

[BUG]
The test generic/091 fails , with the following output:

  fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -W
  mapped writes DISABLED
  Seed set to 1
  main: filesystem does not support fallocate mode FALLOC_FL_COLLAPSE_RANGE, disabling!
  main: filesystem does not support fallocate mode FALLOC_FL_INSERT_RANGE, disabling!
  skipping zero size read
  truncating to largest ever: 0xe400
  copying to largest ever: 0x1f400
  cloning to largest ever: 0x70000
  cloning to largest ever: 0x77000
  fallocating to largest ever: 0x7a120
  Mapped Read: non-zero data past EOF (0x3a7ff) page offset 0x800 is 0xf2e1 <<<
  ...

[CAUSE]
In commit c28ea613fafa ("btrfs: subpage: fix the false data csum mismatch error")
end_bio_extent_readpage() changes to only zero the range inside the bvec
for incoming subpage support.

But that commit is using incorrect offset to calculate the start.

For subpage, we can have a case that the whole bvec is beyond isize,
thus we need to calculate the correct offset.

But the offending commit is using @end (bvec end), other than @start
(bvec start) to calculate the start offset.

This means, we only zero the last byte of the bvec, not from the isize.
This stupid bug makes the range beyond isize is not properly zeroed, and
failed above test.

[FIX]
Use correct @start to calculate the range start.

Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: c28ea613fafa ("btrfs: subpage: fix the false data csum mismatch error")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4671c99d468d..f3d7be975c3a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3020,7 +3020,7 @@ readpage_ok:
 			 */
 			if (page->index == end_index && i_size <= end) {
 				u32 zero_start = max(offset_in_page(i_size),
-						     offset_in_page(end));
+						     offset_in_page(start));
 
 				zero_user_segment(page, zero_start,
 						  offset_in_page(end) + 1);
-- 
cgit v1.2.3-71-gd317


From fbf48bb0b197e6894a04c714728c952af7153bf3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 3 Mar 2021 18:41:51 +0800
Subject: btrfs: track qgroup released data in own variable in
 insert_prealloc_file_extent

There is a piece of weird code in insert_prealloc_file_extent(), which
looks like:

	ret = btrfs_qgroup_release_data(inode, file_offset, len);
	if (ret < 0)
		return ERR_PTR(ret);
	if (trans) {
		ret = insert_reserved_file_extent(trans, inode,
						  file_offset, &stack_fi,
						  true, ret);
	...
	}
	extent_info.is_new_extent = true;
	extent_info.qgroup_reserved = ret;
	...

Note how the variable @ret is abused here, and if anyone is adding code
just after btrfs_qgroup_release_data() call, it's super easy to
overwrite the @ret and cause tons of qgroup related bugs.

Fix such abuse by introducing new variable @qgroup_released, so that we
won't reuse the existing variable @ret.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c35b724a5611..77182be403c5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9873,6 +9873,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
 	struct btrfs_path *path;
 	u64 start = ins->objectid;
 	u64 len = ins->offset;
+	int qgroup_released;
 	int ret;
 
 	memset(&stack_fi, 0, sizeof(stack_fi));
@@ -9885,14 +9886,14 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
 	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
 	/* Encryption and other encoding is reserved and all 0 */
 
-	ret = btrfs_qgroup_release_data(inode, file_offset, len);
-	if (ret < 0)
-		return ERR_PTR(ret);
+	qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
+	if (qgroup_released < 0)
+		return ERR_PTR(qgroup_released);
 
 	if (trans) {
 		ret = insert_reserved_file_extent(trans, inode,
 						  file_offset, &stack_fi,
-						  true, ret);
+						  true, qgroup_released);
 		if (ret)
 			return ERR_PTR(ret);
 		return trans;
@@ -9905,7 +9906,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
 	extent_info.file_offset = file_offset;
 	extent_info.extent_buf = (char *)&stack_fi;
 	extent_info.is_new_extent = true;
-	extent_info.qgroup_reserved = ret;
+	extent_info.qgroup_reserved = qgroup_released;
 	extent_info.insertions = 0;
 
 	path = btrfs_alloc_path();
-- 
cgit v1.2.3-71-gd317


From a3ee79bd8fe17812d2305ccc4bf81bfeab395576 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 3 Mar 2021 18:41:52 +0800
Subject: btrfs: fix qgroup data rsv leak caused by falloc failure

[BUG]
When running fsstress with only falloc workload, and a very low qgroup
limit set, we can get qgroup data rsv leak at unmount time.

 BTRFS warning (device dm-0): qgroup 0/5 has unreleased space, type 0 rsv 20480
 BTRFS error (device dm-0): qgroup reserved space leaked

The minimal reproducer looks like:

  #!/bin/bash
  dev=/dev/test/test
  mnt="/mnt/btrfs"
  fsstress=~/xfstests-dev/ltp/fsstress
  runtime=8

  workload()
  {
          umount $dev &> /dev/null
          umount $mnt &> /dev/null
          mkfs.btrfs -f $dev > /dev/null
          mount $dev $mnt

          btrfs quota en $mnt
          btrfs quota rescan -w $mnt
          btrfs qgroup limit 16m 0/5 $mnt

          $fsstress -w -z -f creat=10 -f fallocate=10 -p 2 -n 100 \
  		-d $mnt -v > /tmp/fsstress

          umount $mnt
          if dmesg | grep leak ; then
		echo "!!! FAILED !!!"
  		exit 1
          fi
  }

  for (( i=0; i < $runtime; i++)); do
          echo "=== $i/$runtime==="
          workload
  done

Normally it would fail before round 4.

[CAUSE]
In function insert_prealloc_file_extent(), we first call
btrfs_qgroup_release_data() to know how many bytes are reserved for
qgroup data rsv.

Then use that @qgroup_released number to continue our work.

But after we call btrfs_qgroup_release_data(), we should either queue
@qgroup_released to delayed ref or free them manually in error path.

Unfortunately, we lack the error handling to free the released bytes,
leaking qgroup data rsv.

All the error handling function outside won't help at all, as we have
released the range, meaning in inode io tree, the EXTENT_QGROUP_RESERVED
bit is already cleared, thus all btrfs_qgroup_free_data() call won't
free any data rsv.

[FIX]
Add free_qgroup tag to manually free the released qgroup data rsv.

Reported-by: Nikolay Borisov <nborisov@suse.com>
Reported-by: David Sterba <dsterba@suse.cz>
Fixes: 9729f10a608f ("btrfs: inode: move qgroup reserved space release to the callers of insert_reserved_file_extent()")
CC: stable@vger.kernel.org # 5.10+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 77182be403c5..ea5ede619220 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9895,7 +9895,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
 						  file_offset, &stack_fi,
 						  true, qgroup_released);
 		if (ret)
-			return ERR_PTR(ret);
+			goto free_qgroup;
 		return trans;
 	}
 
@@ -9910,17 +9910,31 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
 	extent_info.insertions = 0;
 
 	path = btrfs_alloc_path();
-	if (!path)
-		return ERR_PTR(-ENOMEM);
+	if (!path) {
+		ret = -ENOMEM;
+		goto free_qgroup;
+	}
 
 	ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
 				     file_offset + len - 1, &extent_info,
 				     &trans);
 	btrfs_free_path(path);
 	if (ret)
-		return ERR_PTR(ret);
-
+		goto free_qgroup;
 	return trans;
+
+free_qgroup:
+	/*
+	 * We have released qgroup data range at the beginning of the function,
+	 * and normally qgroup_released bytes will be freed when committing
+	 * transaction.
+	 * But if we error out early, we have to free what we have released
+	 * or we leak qgroup data reservation.
+	 */
+	btrfs_qgroup_free_refroot(inode->root->fs_info,
+			inode->root->root_key.objectid, qgroup_released,
+			BTRFS_QGROUP_RSV_DATA);
+	return ERR_PTR(ret);
 }
 
 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
-- 
cgit v1.2.3-71-gd317


From e3d3b4157610164b0ec43d968b0dfedfe7c68992 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 11 Mar 2021 15:13:30 +0000
Subject: btrfs: zoned: fix linked list corruption after log root tree
 allocation failure

When using a zoned filesystem, while syncing the log, if we fail to
allocate the root node for the log root tree, we are not removing the
log context we allocated on stack from the list of log contexts of the
log root tree. This means after the return from btrfs_sync_log() we get
a corrupted linked list.

Fix this by allocating the node before adding our stack allocated context
to the list of log contexts of the log root tree.

Fixes: 3ddebf27fcd3a9 ("btrfs: zoned: reorder log node allocation on zoned filesystem")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2f1acc9aea9e..92a368627791 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3169,10 +3169,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&log_root_tree->log_mutex);
 
-	index2 = log_root_tree->log_transid % 2;
-	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
-	root_log_ctx.log_transid = log_root_tree->log_transid;
-
 	if (btrfs_is_zoned(fs_info)) {
 		if (!log_root_tree->node) {
 			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
@@ -3183,6 +3179,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	index2 = log_root_tree->log_transid % 2;
+	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+	root_log_ctx.log_transid = log_root_tree->log_transid;
+
 	/*
 	 * Now we are safe to update the log_root_tree because we're under the
 	 * log_mutex, and we're a current writer so we're holding the commit
-- 
cgit v1.2.3-71-gd317


From 64fcbb6158ecc684d84c64424830a9c37c77c5b9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Mar 2021 10:26:45 +0000
Subject: afs: Fix accessing YFS xattrs on a non-YFS server

If someone attempts to access YFS-related xattrs (e.g. afs.yfs.acl) on a
file on a non-YFS AFS server (such as OpenAFS), then the kernel will jump
to a NULL function pointer because the afs_fetch_acl_operation descriptor
doesn't point to a function for issuing an operation on a non-YFS
server[1].

Fix this by making afs_wait_for_operation() check that the issue_afs_rpc
method is set before jumping to it and setting -ENOTSUPP if not.  This fix
also covers other potential operations that also only exist on YFS servers.

afs_xattr_get/set_yfs() then need to translate -ENOTSUPP to -ENODATA as the
former error is internal to the kernel.

The bug shows up as an oops like the following:

	BUG: kernel NULL pointer dereference, address: 0000000000000000
	[...]
	Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
	[...]
	Call Trace:
	 afs_wait_for_operation+0x83/0x1b0 [kafs]
	 afs_xattr_get_yfs+0xe6/0x270 [kafs]
	 __vfs_getxattr+0x59/0x80
	 vfs_getxattr+0x11c/0x140
	 getxattr+0x181/0x250
	 ? __check_object_size+0x13f/0x150
	 ? __fput+0x16d/0x250
	 __x64_sys_fgetxattr+0x64/0xb0
	 do_syscall_64+0x49/0xc0
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9
	RIP: 0033:0x7fb120a9defe

This was triggered with "cp -a" which attempts to copy xattrs, including
afs ones, but is easier to reproduce with getfattr, e.g.:

	getfattr -d -m ".*" /afs/openafs.org/

Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept")
Reported-by: Gaja Sophie Peters <gaja.peters@math.uni-hamburg.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Gaja Sophie Peters <gaja.peters@math.uni-hamburg.de>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
Reviewed-by: Jeffrey Altman <jaltman@auristor.com>
cc: linux-afs@lists.infradead.org
Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003498.html [1]
Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003566.html # v1
Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003572.html # v2
---
 fs/afs/fs_operation.c | 7 +++++--
 fs/afs/xattr.c        | 8 +++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 97cab12b0a6c..71c58723763d 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -181,10 +181,13 @@ void afs_wait_for_operation(struct afs_operation *op)
 		if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
 		    op->ops->issue_yfs_rpc)
 			op->ops->issue_yfs_rpc(op);
-		else
+		else if (op->ops->issue_afs_rpc)
 			op->ops->issue_afs_rpc(op);
+		else
+			op->ac.error = -ENOTSUPP;
 
-		op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
+		if (op->call)
+			op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
 	}
 
 	switch (op->error) {
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index c629caae5002..4934e325a14a 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -231,6 +231,8 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler,
 			else
 				ret = -ERANGE;
 		}
+	} else if (ret == -ENOTSUPP) {
+		ret = -ENODATA;
 	}
 
 error_yacl:
@@ -256,6 +258,7 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler,
 {
 	struct afs_operation *op;
 	struct afs_vnode *vnode = AFS_FS_I(inode);
+	int ret;
 
 	if (flags == XATTR_CREATE ||
 	    strcmp(name, "acl") != 0)
@@ -270,7 +273,10 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler,
 		return afs_put_operation(op);
 
 	op->ops = &yfs_store_opaque_acl2_operation;
-	return afs_do_sync_operation(op);
+	ret = afs_do_sync_operation(op);
+	if (ret == -ENOTSUPP)
+		ret = -ENODATA;
+	return ret;
 }
 
 static const struct xattr_handler afs_xattr_yfs_handler = {
-- 
cgit v1.2.3-71-gd317


From a7889c6320b9200e3fe415238f546db677310fa9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 9 Mar 2021 08:27:39 +0000
Subject: afs: Stop listxattr() from listing "afs.*" attributes

afs_listxattr() lists all the available special afs xattrs (i.e. those in
the "afs.*" space), no matter what type of server we're dealing with.  But
OpenAFS servers, for example, cannot deal with some of the extra-capable
attributes that AuriStor (YFS) servers provide.  Unfortunately, the
presence of the afs.yfs.* attributes causes errors[1] for anything that
tries to read them if the server is of the wrong type.

Fix the problem by removing afs_listxattr() so that none of the special
xattrs are listed (AFS doesn't support xattrs).  It does mean, however,
that getfattr won't list them, though they can still be accessed with
getxattr() and setxattr().

This can be tested with something like:

	getfattr -d -m ".*" /afs/example.com/path/to/file

With this change, none of the afs.* attributes should be visible.

Changes:
ver #2:
 - Hide all of the afs.* xattrs, not just the ACL ones.

Fixes: ae46578b963f ("afs: Get YFS ACLs and information through xattrs")
Reported-by: Gaja Sophie Peters <gaja.peters@math.uni-hamburg.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Gaja Sophie Peters <gaja.peters@math.uni-hamburg.de>
Reviewed-by: Jeffrey Altman <jaltman@auristor.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003502.html [1]
Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003567.html # v1
Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003573.html # v2
---
 fs/afs/dir.c      |  1 -
 fs/afs/file.c     |  1 -
 fs/afs/inode.c    |  1 -
 fs/afs/internal.h |  1 -
 fs/afs/mntpt.c    |  1 -
 fs/afs/xattr.c    | 23 -----------------------
 6 files changed, 28 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 714fcca9af99..17548c1faf02 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -70,7 +70,6 @@ const struct inode_operations afs_dir_inode_operations = {
 	.permission	= afs_permission,
 	.getattr	= afs_getattr,
 	.setattr	= afs_setattr,
-	.listxattr	= afs_listxattr,
 };
 
 const struct address_space_operations afs_dir_aops = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 85f5adf21aa0..960b64268623 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -43,7 +43,6 @@ const struct inode_operations afs_file_inode_operations = {
 	.getattr	= afs_getattr,
 	.setattr	= afs_setattr,
 	.permission	= afs_permission,
-	.listxattr	= afs_listxattr,
 };
 
 const struct address_space_operations afs_fs_aops = {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1156b2df28d3..12be88716e4c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -27,7 +27,6 @@
 
 static const struct inode_operations afs_symlink_inode_operations = {
 	.get_link	= page_get_link,
-	.listxattr	= afs_listxattr,
 };
 
 static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index b626e38e9ab5..1627b1872812 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1509,7 +1509,6 @@ extern int afs_launder_page(struct page *);
  * xattr.c
  */
 extern const struct xattr_handler *afs_xattr_handlers[];
-extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
 
 /*
  * yfsclient.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 052dab2f5c03..bbb2c210d139 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -32,7 +32,6 @@ const struct inode_operations afs_mntpt_inode_operations = {
 	.lookup		= afs_mntpt_lookup,
 	.readlink	= page_readlink,
 	.getattr	= afs_getattr,
-	.listxattr	= afs_listxattr,
 };
 
 const struct inode_operations afs_autocell_inode_operations = {
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 4934e325a14a..7751b0b3f81d 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -11,29 +11,6 @@
 #include <linux/xattr.h>
 #include "internal.h"
 
-static const char afs_xattr_list[] =
-	"afs.acl\0"
-	"afs.cell\0"
-	"afs.fid\0"
-	"afs.volume\0"
-	"afs.yfs.acl\0"
-	"afs.yfs.acl_inherited\0"
-	"afs.yfs.acl_num_cleaned\0"
-	"afs.yfs.vol_acl";
-
-/*
- * Retrieve a list of the supported xattrs.
- */
-ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	if (size == 0)
-		return sizeof(afs_xattr_list);
-	if (size < sizeof(afs_xattr_list))
-		return -ERANGE;
-	memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list));
-	return sizeof(afs_xattr_list);
-}
-
 /*
  * Deal with the result of a successful fetch ACL operation.
  */
-- 
cgit v1.2.3-71-gd317


From 1601ea068b886da1f8f8d4e18b9403e9e24adef6 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Mon, 15 Mar 2021 12:43:55 +0900
Subject: zonefs: prevent use of seq files as swap file

The sequential write constraint of sequential zone file prevent their
use as swap files. Only allow conventional zone files to be used as swap
files.

Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
Cc: <stable@vger.kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 fs/zonefs/super.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 0fe76f376dee..a3d074f98660 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -165,6 +165,21 @@ static int zonefs_writepages(struct address_space *mapping,
 	return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
 }
 
+static int zonefs_swap_activate(struct swap_info_struct *sis,
+				struct file *swap_file, sector_t *span)
+{
+	struct inode *inode = file_inode(swap_file);
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+	if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
+		zonefs_err(inode->i_sb,
+			   "swap file: not a conventional zone file\n");
+		return -EINVAL;
+	}
+
+	return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
+}
+
 static const struct address_space_operations zonefs_file_aops = {
 	.readpage		= zonefs_readpage,
 	.readahead		= zonefs_readahead,
@@ -177,6 +192,7 @@ static const struct address_space_operations zonefs_file_aops = {
 	.is_partially_uptodate	= iomap_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 	.direct_IO		= noop_direct_IO,
+	.swap_activate		= zonefs_swap_activate,
 };
 
 static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
-- 
cgit v1.2.3-71-gd317


From ebfd68cd0c1e81267c757332385cb96df30dacce Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 10 Mar 2021 15:20:28 +0900
Subject: zonefs: Fix O_APPEND async write handling

zonefs updates the size of a sequential zone file inode only on
completion of direct writes. When executing asynchronous append writes
(with a file open with O_APPEND or using RWF_APPEND), the use of the
current inode size in generic_write_checks() to set an iocb offset thus
leads to unaligned write if an application issues an append write
operation with another write already being executed.

Fix this problem by introducing zonefs_write_checks() as a modified
version of generic_write_checks() using the file inode wp_offset for an
append write iocb offset. Also introduce zonefs_write_check_limits() to
replace generic_write_check_limits() call. This zonefs special helper
makes sure that the maximum file limit used is the maximum size of the
file being accessed.

Since zonefs_write_checks() already truncates the iov_iter, the calls
to iov_iter_truncate() in zonefs_file_dio_write() and
zonefs_file_buffered_write() are removed.

Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
Cc: <stable@vger.kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 fs/zonefs/super.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 68 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index a3d074f98660..3427c99abb4d 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -743,6 +743,68 @@ out_release:
 	return ret;
 }
 
+/*
+ * Do not exceed the LFS limits nor the file zone size. If pos is under the
+ * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
+ */
+static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
+					loff_t count)
+{
+	struct inode *inode = file_inode(file);
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+	loff_t limit = rlimit(RLIMIT_FSIZE);
+	loff_t max_size = zi->i_max_size;
+
+	if (limit != RLIM_INFINITY) {
+		if (pos >= limit) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+		count = min(count, limit - pos);
+	}
+
+	if (!(file->f_flags & O_LARGEFILE))
+		max_size = min_t(loff_t, MAX_NON_LFS, max_size);
+
+	if (unlikely(pos >= max_size))
+		return -EFBIG;
+
+	return min(count, max_size - pos);
+}
+
+static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+	loff_t count;
+
+	if (IS_SWAPFILE(inode))
+		return -ETXTBSY;
+
+	if (!iov_iter_count(from))
+		return 0;
+
+	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+		return -EINVAL;
+
+	if (iocb->ki_flags & IOCB_APPEND) {
+		if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+			return -EINVAL;
+		mutex_lock(&zi->i_truncate_mutex);
+		iocb->ki_pos = zi->i_wpoffset;
+		mutex_unlock(&zi->i_truncate_mutex);
+	}
+
+	count = zonefs_write_check_limits(file, iocb->ki_pos,
+					  iov_iter_count(from));
+	if (count < 0)
+		return count;
+
+	iov_iter_truncate(from, count);
+	return iov_iter_count(from);
+}
+
 /*
  * Handle direct writes. For sequential zone files, this is the only possible
  * write path. For these files, check that the user is issuing writes
@@ -760,8 +822,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 	struct super_block *sb = inode->i_sb;
 	bool sync = is_sync_kiocb(iocb);
 	bool append = false;
-	size_t count;
-	ssize_t ret;
+	ssize_t ret, count;
 
 	/*
 	 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
@@ -779,12 +840,11 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 		inode_lock(inode);
 	}
 
-	ret = generic_write_checks(iocb, from);
-	if (ret <= 0)
+	count = zonefs_write_checks(iocb, from);
+	if (count <= 0) {
+		ret = count;
 		goto inode_unlock;
-
-	iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos);
-	count = iov_iter_count(from);
+	}
 
 	if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 		ret = -EINVAL;
@@ -844,12 +904,10 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 		inode_lock(inode);
 	}
 
-	ret = generic_write_checks(iocb, from);
+	ret = zonefs_write_checks(iocb, from);
 	if (ret <= 0)
 		goto inode_unlock;
 
-	iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos);
-
 	ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
 	if (ret > 0)
 		iocb->ki_pos += ret;
-- 
cgit v1.2.3-71-gd317


From d9bb77d51e668a1a6d4530c1ea471574d0ce465f Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 15 Mar 2021 13:39:14 +0800
Subject: btrfs: subpage: fix wild pointer access during metadata read failure

[BUG]
When running fstests for btrfs subpage read-write test, it has a very
high chance to crash at generic/475 with the following stack:

 BTRFS warning (device dm-8): direct IO failed ino 510 rw 1,34817 sector 0xcdf0 len 94208 err no 10
 Unable to handle kernel paging request at virtual address ffff80001157e7c0
 CPU: 2 PID: 687125 Comm: kworker/u12:4 Tainted: G        WC        5.12.0-rc2-custom+ #5
 Hardware name: Khadas VIM3 (DT)
 Workqueue: btrfs-endio-meta btrfs_work_helper [btrfs]
 pc : queued_spin_lock_slowpath+0x1a0/0x390
 lr : do_raw_spin_lock+0xc4/0x11c
 Call trace:
  queued_spin_lock_slowpath+0x1a0/0x390
  _raw_spin_lock+0x68/0x84
  btree_readahead_hook+0x38/0xc0 [btrfs]
  end_bio_extent_readpage+0x504/0x5f4 [btrfs]
  bio_endio+0x170/0x1a4
  end_workqueue_fn+0x3c/0x60 [btrfs]
  btrfs_work_helper+0x1b0/0x1b4 [btrfs]
  process_one_work+0x22c/0x430
  worker_thread+0x70/0x3a0
  kthread+0x13c/0x140
  ret_from_fork+0x10/0x30
 Code: 910020e0 8b0200c2 f861d884 aa0203e1 (f8246827)

[CAUSE]
In end_bio_extent_readpage(), if we hit an error during read, we will
handle the error differently for data and metadata.
For data we queue a repair, while for metadata, we record the error and
let the caller choose what to do.

But the code is still using page->private to grab extent buffer, which
no longer points to extent buffer for subpage metadata pages.

Thus this wild pointer access leads to above crash.

[FIX]
Introduce a helper, find_extent_buffer_readpage(), to grab extent
buffer.

The difference against find_extent_buffer_nospinlock() is:

- Also handles regular sectorsize == PAGE_SIZE case
- No extent buffer refs increase/decrease
  As extent buffer under IO must have non-zero refs, so this is safe

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f3d7be975c3a..0d00a9cd123a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2885,6 +2885,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
 		btrfs_subpage_end_reader(fs_info, page, start, len);
 }
 
+/*
+ * Find extent buffer for a givne bytenr.
+ *
+ * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
+ * in endio context.
+ */
+static struct extent_buffer *find_extent_buffer_readpage(
+		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+	struct extent_buffer *eb;
+
+	/*
+	 * For regular sectorsize, we can use page->private to grab extent
+	 * buffer
+	 */
+	if (fs_info->sectorsize == PAGE_SIZE) {
+		ASSERT(PagePrivate(page) && page->private);
+		return (struct extent_buffer *)page->private;
+	}
+
+	/* For subpage case, we need to lookup buffer radix tree */
+	rcu_read_lock();
+	eb = radix_tree_lookup(&fs_info->buffer_radix,
+			       bytenr >> fs_info->sectorsize_bits);
+	rcu_read_unlock();
+	ASSERT(eb);
+	return eb;
+}
+
 /*
  * after a readpage IO is done, we need to:
  * clear the uptodate bits on error
@@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio)
 		} else {
 			struct extent_buffer *eb;
 
-			eb = (struct extent_buffer *)page->private;
+			eb = find_extent_buffer_readpage(fs_info, page, start);
 			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 			eb->read_mirror = mirror;
 			atomic_dec(&eb->io_pages);
-- 
cgit v1.2.3-71-gd317


From 60484cd9d50117017cf53d5310c6cd629600dc69 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 15 Mar 2021 13:39:15 +0800
Subject: btrfs: subpage: make readahead work properly

In readahead infrastructure, we are using a lot of hard coded PAGE_SHIFT
while we're not doing anything specific to PAGE_SIZE.

One of the most affected part is the radix tree operation of
btrfs_fs_info::reada_tree.

If using PAGE_SHIFT, subpage metadata readahead is broken and does no
help reading metadata ahead.

Fix the problem by using btrfs_fs_info::sectorsize_bits so that
readahead could work for subpage.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/reada.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 20fd4aa48a8c..06713a8fe26b 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err)
 	/* find extent */
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree,
-			       eb->start >> PAGE_SHIFT);
+			       eb->start >> fs_info->sectorsize_bits);
 	if (re)
 		re->refcnt++;
 	spin_unlock(&fs_info->reada_lock);
@@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
 	zone = NULL;
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
-				     logical >> PAGE_SHIFT, 1);
+				     logical >> fs_info->sectorsize_bits, 1);
 	if (ret == 1 && logical >= zone->start && logical <= zone->end) {
 		kref_get(&zone->refcnt);
 		spin_unlock(&fs_info->reada_lock);
@@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
 
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&dev->reada_zones,
-				(unsigned long)(zone->end >> PAGE_SHIFT),
-				zone);
+			(unsigned long)(zone->end >> fs_info->sectorsize_bits),
+			zone);
 
 	if (ret == -EEXIST) {
 		kfree(zone);
 		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
-					     logical >> PAGE_SHIFT, 1);
+					logical >> fs_info->sectorsize_bits, 1);
 		if (ret == 1 && logical >= zone->start && logical <= zone->end)
 			kref_get(&zone->refcnt);
 		else
@@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 	u64 length;
 	int real_stripes;
 	int nzones = 0;
-	unsigned long index = logical >> PAGE_SHIFT;
+	unsigned long index = logical >> fs_info->sectorsize_bits;
 	int dev_replace_is_ongoing;
 	int have_zone = 0;
 
@@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
 			     struct reada_extent *re)
 {
 	int i;
-	unsigned long index = re->logical >> PAGE_SHIFT;
+	unsigned long index = re->logical >> fs_info->sectorsize_bits;
 
 	spin_lock(&fs_info->reada_lock);
 	if (--re->refcnt) {
@@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
 static void reada_zone_release(struct kref *kref)
 {
 	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+	struct btrfs_fs_info *fs_info = zone->device->fs_info;
 
-	lockdep_assert_held(&zone->device->fs_info->reada_lock);
+	lockdep_assert_held(&fs_info->reada_lock);
 
 	radix_tree_delete(&zone->device->reada_zones,
-			  zone->end >> PAGE_SHIFT);
+			  zone->end >> fs_info->sectorsize_bits);
 
 	kfree(zone);
 }
@@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
 static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
 {
 	int i;
-	unsigned long index = zone->end >> PAGE_SHIFT;
+	unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;
 
 	for (i = 0; i < zone->ndevs; ++i) {
 		struct reada_zone *peer;
@@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
 					     (void **)&zone, index, 1);
 		if (ret == 0)
 			break;
-		index = (zone->end >> PAGE_SHIFT) + 1;
+		index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
 		if (zone->locked) {
 			if (zone->elems > top_locked_elems) {
 				top_locked_elems = zone->elems;
@@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
 	 * plugging to speed things up
 	 */
 	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-				     dev->reada_next >> PAGE_SHIFT, 1);
+				dev->reada_next >> fs_info->sectorsize_bits, 1);
 	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
 		ret = reada_pick_zone(dev);
 		if (!ret) {
@@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
 		}
 		re = NULL;
 		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-					dev->reada_next >> PAGE_SHIFT, 1);
+				dev->reada_next >> fs_info->sectorsize_bits, 1);
 	}
 	if (ret == 0) {
 		spin_unlock(&fs_info->reada_lock);
@@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 				pr_cont(" curr off %llu",
 					device->reada_next - zone->start);
 			pr_cont("\n");
-			index = (zone->end >> PAGE_SHIFT) + 1;
+			index = (zone->end >> fs_info->sectorsize_bits) + 1;
 		}
 		cnt = 0;
 		index = 0;
@@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 				}
 			}
 			pr_cont("\n");
-			index = (re->logical >> PAGE_SHIFT) + 1;
+			index = (re->logical >> fs_info->sectorsize_bits) + 1;
 			if (++cnt > 15)
 				break;
 		}
@@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 		if (ret == 0)
 			break;
 		if (!re->scheduled) {
-			index = (re->logical >> PAGE_SHIFT) + 1;
+			index = (re->logical >> fs_info->sectorsize_bits) + 1;
 			continue;
 		}
 		pr_debug("re: logical %llu size %u list empty %d scheduled %d",
@@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 			}
 		}
 		pr_cont("\n");
-		index = (re->logical >> PAGE_SHIFT) + 1;
+		index = (re->logical >> fs_info->sectorsize_bits) + 1;
 	}
 	spin_unlock(&fs_info->reada_lock);
 }
-- 
cgit v1.2.3-71-gd317


From f8425c9396639cc462bcce44b1051f8b4e62fddb Mon Sep 17 00:00:00 2001
From: Alessio Balsini <balsini@android.com>
Date: Mon, 25 Jan 2021 15:30:51 +0000
Subject: fuse: 32-bit user space ioctl compat for fuse device

With a 64-bit kernel build the FUSE device cannot handle ioctl requests
coming from 32-bit user space.  This is due to the ioctl command
translation that generates different command identifiers that thus cannot
be used for direct comparisons without proper manipulation.

Explicitly extract type and number from the ioctl command to enable 32-bit
user space compatibility on 64-bit kernel builds.

Signed-off-by: Alessio Balsini <balsini@android.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/dev.c             | 26 ++++++++++++++++----------
 include/uapi/linux/fuse.h |  3 ++-
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c6636b4c4ccf..c0fee830a34e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2229,19 +2229,21 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
 static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 			   unsigned long arg)
 {
-	int err = -ENOTTY;
+	int res;
+	int oldfd;
+	struct fuse_dev *fud = NULL;
 
-	if (cmd == FUSE_DEV_IOC_CLONE) {
-		int oldfd;
+	if (_IOC_TYPE(cmd) != FUSE_DEV_IOC_MAGIC)
+		return -ENOTTY;
 
-		err = -EFAULT;
-		if (!get_user(oldfd, (__u32 __user *) arg)) {
+	switch (_IOC_NR(cmd)) {
+	case _IOC_NR(FUSE_DEV_IOC_CLONE):
+		res = -EFAULT;
+		if (!get_user(oldfd, (__u32 __user *)arg)) {
 			struct file *old = fget(oldfd);
 
-			err = -EINVAL;
+			res = -EINVAL;
 			if (old) {
-				struct fuse_dev *fud = NULL;
-
 				/*
 				 * Check against file->f_op because CUSE
 				 * uses the same ioctl handler.
@@ -2252,14 +2254,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 
 				if (fud) {
 					mutex_lock(&fuse_mutex);
-					err = fuse_device_clone(fud->fc, file);
+					res = fuse_device_clone(fud->fc, file);
 					mutex_unlock(&fuse_mutex);
 				}
 				fput(old);
 			}
 		}
+		break;
+	default:
+		res = -ENOTTY;
+		break;
 	}
-	return err;
+	return res;
 }
 
 const struct file_operations fuse_dev_operations = {
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 98ca64d1beb6..54442612c48b 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -903,7 +903,8 @@ struct fuse_notify_retrieve_in {
 };
 
 /* Device ioctls: */
-#define FUSE_DEV_IOC_CLONE	_IOR(229, 0, uint32_t)
+#define FUSE_DEV_IOC_MAGIC		229
+#define FUSE_DEV_IOC_CLONE		_IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
 
 struct fuse_lseek_in {
 	uint64_t	fh;
-- 
cgit v1.2.3-71-gd317


From 34e49994d0dcdb2d31d4d2908d04f4e9ce57e4d7 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 15 Mar 2021 15:18:24 +0100
Subject: btrfs: fix slab cache flags for free space tree bitmap

The free space tree bitmap slab cache is created with SLAB_RED_ZONE but
that's a debugging flag and not always enabled. Also the other slabs are
created with at least SLAB_MEM_SPREAD that we want as well to average
the memory placement cost.

Reported-by: Vlastimil Babka <vbabka@suse.cz>
Fixes: 3acd48507dc4 ("btrfs: fix allocation of free space cache v1 bitmap pages")
CC: stable@vger.kernel.org # 5.4+
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ea5ede619220..5092dea21448 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9005,7 +9005,7 @@ int __init btrfs_init_cachep(void)
 
 	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
 							PAGE_SIZE, PAGE_SIZE,
-							SLAB_RED_ZONE, NULL);
+							SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_free_space_bitmap_cachep)
 		goto fail;
 
-- 
cgit v1.2.3-71-gd317


From dbcc7d57bffc0c8cac9dac11bec548597d59a6a5 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 11 Mar 2021 14:31:05 +0000
Subject: btrfs: fix race when cloning extent buffer during rewind of an old
 root

While resolving backreferences, as part of a logical ino ioctl call or
fiemap, we can end up hitting a BUG_ON() when replaying tree mod log
operations of a root, triggering a stack trace like the following:

  ------------[ cut here ]------------
  kernel BUG at fs/btrfs/ctree.c:1210!
  invalid opcode: 0000 [#1] SMP KASAN PTI
  CPU: 1 PID: 19054 Comm: crawl_335 Tainted: G        W         5.11.0-2d11c0084b02-misc-next+ #89
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
  RIP: 0010:__tree_mod_log_rewind+0x3b1/0x3c0
  Code: 05 48 8d 74 10 (...)
  RSP: 0018:ffffc90001eb70b8 EFLAGS: 00010297
  RAX: 0000000000000000 RBX: ffff88812344e400 RCX: ffffffffb28933b6
  RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff88812344e42c
  RBP: ffffc90001eb7108 R08: 1ffff11020b60a20 R09: ffffed1020b60a20
  R10: ffff888105b050f9 R11: ffffed1020b60a1f R12: 00000000000000ee
  R13: ffff8880195520c0 R14: ffff8881bc958500 R15: ffff88812344e42c
  FS:  00007fd1955e8700(0000) GS:ffff8881f5600000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007efdb7928718 CR3: 000000010103a006 CR4: 0000000000170ee0
  Call Trace:
   btrfs_search_old_slot+0x265/0x10d0
   ? lock_acquired+0xbb/0x600
   ? btrfs_search_slot+0x1090/0x1090
   ? free_extent_buffer.part.61+0xd7/0x140
   ? free_extent_buffer+0x13/0x20
   resolve_indirect_refs+0x3e9/0xfc0
   ? lock_downgrade+0x3d0/0x3d0
   ? __kasan_check_read+0x11/0x20
   ? add_prelim_ref.part.11+0x150/0x150
   ? lock_downgrade+0x3d0/0x3d0
   ? __kasan_check_read+0x11/0x20
   ? lock_acquired+0xbb/0x600
   ? __kasan_check_write+0x14/0x20
   ? do_raw_spin_unlock+0xa8/0x140
   ? rb_insert_color+0x30/0x360
   ? prelim_ref_insert+0x12d/0x430
   find_parent_nodes+0x5c3/0x1830
   ? resolve_indirect_refs+0xfc0/0xfc0
   ? lock_release+0xc8/0x620
   ? fs_reclaim_acquire+0x67/0xf0
   ? lock_acquire+0xc7/0x510
   ? lock_downgrade+0x3d0/0x3d0
   ? lockdep_hardirqs_on_prepare+0x160/0x210
   ? lock_release+0xc8/0x620
   ? fs_reclaim_acquire+0x67/0xf0
   ? lock_acquire+0xc7/0x510
   ? poison_range+0x38/0x40
   ? unpoison_range+0x14/0x40
   ? trace_hardirqs_on+0x55/0x120
   btrfs_find_all_roots_safe+0x142/0x1e0
   ? find_parent_nodes+0x1830/0x1830
   ? btrfs_inode_flags_to_xflags+0x50/0x50
   iterate_extent_inodes+0x20e/0x580
   ? tree_backref_for_extent+0x230/0x230
   ? lock_downgrade+0x3d0/0x3d0
   ? read_extent_buffer+0xdd/0x110
   ? lock_downgrade+0x3d0/0x3d0
   ? __kasan_check_read+0x11/0x20
   ? lock_acquired+0xbb/0x600
   ? __kasan_check_write+0x14/0x20
   ? _raw_spin_unlock+0x22/0x30
   ? __kasan_check_write+0x14/0x20
   iterate_inodes_from_logical+0x129/0x170
   ? iterate_inodes_from_logical+0x129/0x170
   ? btrfs_inode_flags_to_xflags+0x50/0x50
   ? iterate_extent_inodes+0x580/0x580
   ? __vmalloc_node+0x92/0xb0
   ? init_data_container+0x34/0xb0
   ? init_data_container+0x34/0xb0
   ? kvmalloc_node+0x60/0x80
   btrfs_ioctl_logical_to_ino+0x158/0x230
   btrfs_ioctl+0x205e/0x4040
   ? __might_sleep+0x71/0xe0
   ? btrfs_ioctl_get_supported_features+0x30/0x30
   ? getrusage+0x4b6/0x9c0
   ? __kasan_check_read+0x11/0x20
   ? lock_release+0xc8/0x620
   ? __might_fault+0x64/0xd0
   ? lock_acquire+0xc7/0x510
   ? lock_downgrade+0x3d0/0x3d0
   ? lockdep_hardirqs_on_prepare+0x210/0x210
   ? lockdep_hardirqs_on_prepare+0x210/0x210
   ? __kasan_check_read+0x11/0x20
   ? do_vfs_ioctl+0xfc/0x9d0
   ? ioctl_file_clone+0xe0/0xe0
   ? lock_downgrade+0x3d0/0x3d0
   ? lockdep_hardirqs_on_prepare+0x210/0x210
   ? __kasan_check_read+0x11/0x20
   ? lock_release+0xc8/0x620
   ? __task_pid_nr_ns+0xd3/0x250
   ? lock_acquire+0xc7/0x510
   ? __fget_files+0x160/0x230
   ? __fget_light+0xf2/0x110
   __x64_sys_ioctl+0xc3/0x100
   do_syscall_64+0x37/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7fd1976e2427
  Code: 00 00 90 48 8b 05 (...)
  RSP: 002b:00007fd1955e5cf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
  RAX: ffffffffffffffda RBX: 00007fd1955e5f40 RCX: 00007fd1976e2427
  RDX: 00007fd1955e5f48 RSI: 00000000c038943b RDI: 0000000000000004
  RBP: 0000000001000000 R08: 0000000000000000 R09: 00007fd1955e6120
  R10: 0000557835366b00 R11: 0000000000000246 R12: 0000000000000004
  R13: 00007fd1955e5f48 R14: 00007fd1955e5f40 R15: 00007fd1955e5ef8
  Modules linked in:
  ---[ end trace ec8931a1c36e57be ]---

  (gdb) l *(__tree_mod_log_rewind+0x3b1)
  0xffffffff81893521 is in __tree_mod_log_rewind (fs/btrfs/ctree.c:1210).
  1205                     * the modification. as we're going backwards, we do the
  1206                     * opposite of each operation here.
  1207                     */
  1208                    switch (tm->op) {
  1209                    case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
  1210                            BUG_ON(tm->slot < n);
  1211                            fallthrough;
  1212                    case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
  1213                    case MOD_LOG_KEY_REMOVE:
  1214                            btrfs_set_node_key(eb, &tm->key, tm->slot);

Here's what happens to hit that BUG_ON():

1) We have one tree mod log user (through fiemap or the logical ino ioctl),
   with a sequence number of 1, so we have fs_info->tree_mod_seq == 1;

2) Another task is at ctree.c:balance_level() and we have eb X currently as
   the root of the tree, and we promote its single child, eb Y, as the new
   root.

   Then, at ctree.c:balance_level(), we call:

      tree_mod_log_insert_root(eb X, eb Y, 1);

3) At tree_mod_log_insert_root() we create tree mod log elements for each
   slot of eb X, of operation type MOD_LOG_KEY_REMOVE_WHILE_FREEING each
   with a ->logical pointing to ebX->start. These are placed in an array
   named tm_list.
   Lets assume there are N elements (N pointers in eb X);

4) Then, still at tree_mod_log_insert_root(), we create a tree mod log
   element of operation type MOD_LOG_ROOT_REPLACE, ->logical set to
   ebY->start, ->old_root.logical set to ebX->start, ->old_root.level set
   to the level of eb X and ->generation set to the generation of eb X;

5) Then tree_mod_log_insert_root() calls tree_mod_log_free_eb() with
   tm_list as argument. After that, tree_mod_log_free_eb() calls
   __tree_mod_log_insert() for each member of tm_list in reverse order,
   from highest slot in eb X, slot N - 1, to slot 0 of eb X;

6) __tree_mod_log_insert() sets the sequence number of each given tree mod
   log operation - it increments fs_info->tree_mod_seq and sets
   fs_info->tree_mod_seq as the sequence number of the given tree mod log
   operation.

   This means that for the tm_list created at tree_mod_log_insert_root(),
   the element corresponding to slot 0 of eb X has the highest sequence
   number (1 + N), and the element corresponding to the last slot has the
   lowest sequence number (2);

7) Then, after inserting tm_list's elements into the tree mod log rbtree,
   the MOD_LOG_ROOT_REPLACE element is inserted, which gets the highest
   sequence number, which is N + 2;

8) Back to ctree.c:balance_level(), we free eb X by calling
   btrfs_free_tree_block() on it. Because eb X was created in the current
   transaction, has no other references and writeback did not happen for
   it, we add it back to the free space cache/tree;

9) Later some other task T allocates the metadata extent from eb X, since
   it is marked as free space in the space cache/tree, and uses it as a
   node for some other btree;

10) The tree mod log user task calls btrfs_search_old_slot(), which calls
    get_old_root(), and finally that calls __tree_mod_log_oldest_root()
    with time_seq == 1 and eb_root == eb Y;

11) First iteration of the while loop finds the tree mod log element with
    sequence number N + 2, for the logical address of eb Y and of type
    MOD_LOG_ROOT_REPLACE;

12) Because the operation type is MOD_LOG_ROOT_REPLACE, we don't break out
    of the loop, and set root_logical to point to tm->old_root.logical
    which corresponds to the logical address of eb X;

13) On the next iteration of the while loop, the call to
    tree_mod_log_search_oldest() returns the smallest tree mod log element
    for the logical address of eb X, which has a sequence number of 2, an
    operation type of MOD_LOG_KEY_REMOVE_WHILE_FREEING and corresponds to
    the old slot N - 1 of eb X (eb X had N items in it before being freed);

14) We then break out of the while loop and return the tree mod log operation
    of type MOD_LOG_ROOT_REPLACE (eb Y), and not the one for slot N - 1 of
    eb X, to get_old_root();

15) At get_old_root(), we process the MOD_LOG_ROOT_REPLACE operation
    and set "logical" to the logical address of eb X, which was the old
    root. We then call tree_mod_log_search() passing it the logical
    address of eb X and time_seq == 1;

16) Then before calling tree_mod_log_search(), task T adds a key to eb X,
    which results in adding a tree mod log operation of type
    MOD_LOG_KEY_ADD to the tree mod log - this is done at
    ctree.c:insert_ptr() - but after adding the tree mod log operation
    and before updating the number of items in eb X from 0 to 1...

17) The task at get_old_root() calls tree_mod_log_search() and gets the
    tree mod log operation of type MOD_LOG_KEY_ADD just added by task T.
    Then it enters the following if branch:

    if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
       (...)
    } (...)

    Calls read_tree_block() for eb X, which gets a reference on eb X but
    does not lock it - task T has it locked.
    Then it clones eb X while it has nritems set to 0 in its header, before
    task T sets nritems to 1 in eb X's header. From hereupon we use the
    clone of eb X which no other task has access to;

18) Then we call __tree_mod_log_rewind(), passing it the MOD_LOG_KEY_ADD
    mod log operation we just got from tree_mod_log_search() in the
    previous step and the cloned version of eb X;

19) At __tree_mod_log_rewind(), we set the local variable "n" to the number
    of items set in eb X's clone, which is 0. Then we enter the while loop,
    and in its first iteration we process the MOD_LOG_KEY_ADD operation,
    which just decrements "n" from 0 to (u32)-1, since "n" is declared with
    a type of u32. At the end of this iteration we call rb_next() to find the
    next tree mod log operation for eb X, that gives us the mod log operation
    of type MOD_LOG_KEY_REMOVE_WHILE_FREEING, for slot 0, with a sequence
    number of N + 1 (steps 3 to 6);

20) Then we go back to the top of the while loop and trigger the following
    BUG_ON():

        (...)
        switch (tm->op) {
        case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                 BUG_ON(tm->slot < n);
                 fallthrough;
        (...)

    Because "n" has a value of (u32)-1 (4294967295) and tm->slot is 0.

Fix this by taking a read lock on the extent buffer before cloning it at
ctree.c:get_old_root(). This should be done regardless of the extent
buffer having been freed and reused, as a concurrent task might be
modifying it (while holding a write lock on it).

Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Link: https://lore.kernel.org/linux-btrfs/20210227155037.GN28049@hungrycats.org/
Fixes: 834328a8493079 ("Btrfs: tree mod log's old roots could still be part of the tree")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d56730a67885..34b929bd5c1a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1365,7 +1365,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 				   "failed to read tree block %llu from get_old_root",
 				   logical);
 		} else {
+			btrfs_tree_read_lock(old);
 			eb = btrfs_clone_extent_buffer(old);
+			btrfs_tree_read_unlock(old);
 			free_extent_buffer(old);
 		}
 	} else if (old_root) {
-- 
cgit v1.2.3-71-gd317


From 485df75554257e883d0ce39bb886e8212349748e Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 11 Mar 2021 14:31:06 +0000
Subject: btrfs: always pin deleted leaves when there are active tree mod log
 users

When freeing a tree block we may end up adding its extent back to the
free space cache/tree, as long as there are no more references for it,
it was created in the current transaction and writeback for it never
happened. This is generally fine, however when we have tree mod log
operations it can result in inconsistent versions of a btree after
unwinding extent buffers with the recorded tree mod log operations.

This is because:

* We only log operations for nodes (adding and removing key/pointers),
  for leaves we don't do anything;

* This means that we can log a MOD_LOG_KEY_REMOVE_WHILE_FREEING operation
  for a node that points to a leaf that was deleted;

* Before we apply the logged operation to unwind a node, we can have
  that leaf's extent allocated again, either as a node or as a leaf, and
  possibly for another btree. This is possible if the leaf was created in
  the current transaction and writeback for it never started, in which
  case btrfs_free_tree_block() returns its extent back to the free space
  cache/tree;

* Then, before applying the tree mod log operation, some task allocates
  the metadata extent just freed before, and uses it either as a leaf or
  as a node for some btree (can be the same or another one, it does not
  matter);

* After applying the MOD_LOG_KEY_REMOVE_WHILE_FREEING operation we now
  get the target node with an item pointing to the metadata extent that
  now has content different from what it had before the leaf was deleted.
  It might now belong to a different btree and be a node and not a leaf
  anymore.

  As a consequence, the results of searches after the unwinding can be
  unpredictable and produce unexpected results.

So make sure we pin extent buffers corresponding to leaves when there
are tree mod log users.

CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 78ad31a59e59..36a3c973fda1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3323,6 +3323,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
 	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
 		struct btrfs_block_group *cache;
+		bool must_pin = false;
 
 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 			ret = check_ref_cleanup(trans, buf->start);
@@ -3340,7 +3341,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			goto out;
 		}
 
-		if (btrfs_is_zoned(fs_info)) {
+		/*
+		 * If this is a leaf and there are tree mod log users, we may
+		 * have recorded mod log operations that point to this leaf.
+		 * So we must make sure no one reuses this leaf's extent before
+		 * mod log operations are applied to a node, otherwise after
+		 * rewinding a node using the mod log operations we get an
+		 * inconsistent btree, as the leaf's extent may now be used as
+		 * a node or leaf for another different btree.
+		 * We are safe from races here because at this point no other
+		 * node or root points to this extent buffer, so if after this
+		 * check a new tree mod log user joins, it will not be able to
+		 * find a node pointing to this leaf and record operations that
+		 * point to this leaf.
+		 */
+		if (btrfs_header_level(buf) == 0) {
+			read_lock(&fs_info->tree_mod_log_lock);
+			must_pin = !list_empty(&fs_info->tree_mod_seq_list);
+			read_unlock(&fs_info->tree_mod_log_lock);
+		}
+
+		if (must_pin || btrfs_is_zoned(fs_info)) {
 			btrfs_redirty_list_add(trans->transaction, buf);
 			pin_down_extent(trans, cache, buf->start, buf->len, 1);
 			btrfs_put_block_group(cache);
-- 
cgit v1.2.3-71-gd317


From 5abbe51a526253b9f003e9a0a195638dc882d660 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 1 Feb 2021 18:46:41 +0100
Subject: kernel, fs: Introduce and use set_restart_fn() and
 arch_set_restart_data()

Preparation for fixing get_nr_restart_syscall() on X86 for COMPAT.

Add a new helper which sets restart_block->fn and calls a dummy
arch_set_restart_data() helper.

Fixes: 609c19a385c8 ("x86/ptrace: Stop setting TS_COMPAT in ptrace code")
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210201174641.GA17871@redhat.com
---
 fs/select.c                    | 10 ++++------
 include/linux/thread_info.h    | 13 +++++++++++++
 kernel/futex.c                 |  3 +--
 kernel/time/alarmtimer.c       |  2 +-
 kernel/time/hrtimer.c          |  2 +-
 kernel/time/posix-cpu-timers.c |  2 +-
 6 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 37aaa8317f3a..945896d0ac9e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1055,10 +1055,9 @@ static long do_restart_poll(struct restart_block *restart_block)
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	if (ret == -ERESTARTNOHAND) {
-		restart_block->fn = do_restart_poll;
-		ret = -ERESTART_RESTARTBLOCK;
-	}
+	if (ret == -ERESTARTNOHAND)
+		ret = set_restart_fn(restart_block, do_restart_poll);
+
 	return ret;
 }
 
@@ -1080,7 +1079,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 		struct restart_block *restart_block;
 
 		restart_block = &current->restart_block;
-		restart_block->fn = do_restart_poll;
 		restart_block->poll.ufds = ufds;
 		restart_block->poll.nfds = nfds;
 
@@ -1091,7 +1089,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 		} else
 			restart_block->poll.has_timeout = 0;
 
-		ret = -ERESTART_RESTARTBLOCK;
+		ret = set_restart_fn(restart_block, do_restart_poll);
 	}
 	return ret;
 }
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9b2158c69275..157762db9d4b 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/restart_block.h>
+#include <linux/errno.h>
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 /*
@@ -59,6 +60,18 @@ enum syscall_work_bit {
 
 #ifdef __KERNEL__
 
+#ifndef arch_set_restart_data
+#define arch_set_restart_data(restart) do { } while (0)
+#endif
+
+static inline long set_restart_fn(struct restart_block *restart,
+					long (*fn)(struct restart_block *))
+{
+	restart->fn = fn;
+	arch_set_restart_data(restart);
+	return -ERESTART_RESTARTBLOCK;
+}
+
 #ifndef THREAD_ALIGN
 #define THREAD_ALIGN	THREAD_SIZE
 #endif
diff --git a/kernel/futex.c b/kernel/futex.c
index e68db7745039..00febd6dea9c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2728,14 +2728,13 @@ retry:
 		goto out;
 
 	restart = &current->restart_block;
-	restart->fn = futex_wait_restart;
 	restart->futex.uaddr = uaddr;
 	restart->futex.val = val;
 	restart->futex.time = *abs_time;
 	restart->futex.bitset = bitset;
 	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
-	ret = -ERESTART_RESTARTBLOCK;
+	ret = set_restart_fn(restart, futex_wait_restart);
 
 out:
 	if (to) {
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 98d7a15e8cf6..4d94e2b5499d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 	if (flags == TIMER_ABSTIME)
 		return -ERESTARTNOHAND;
 
-	restart->fn = alarm_timer_nsleep_restart;
 	restart->nanosleep.clockid = type;
 	restart->nanosleep.expires = exp;
+	set_restart_fn(restart, alarm_timer_nsleep_restart);
 	return ret;
 }
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 788b9d137de4..5c9d968187ae 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1957,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
 	}
 
 	restart = &current->restart_block;
-	restart->fn = hrtimer_nanosleep_restart;
 	restart->nanosleep.clockid = t.timer.base->clockid;
 	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+	set_restart_fn(restart, hrtimer_nanosleep_restart);
 out:
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..9abe15255bc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		if (flags & TIMER_ABSTIME)
 			return -ERESTARTNOHAND;
 
-		restart_block->fn = posix_cpu_nsleep_restart;
 		restart_block->nanosleep.clockid = which_clock;
+		set_restart_fn(restart_block, posix_cpu_nsleep_restart);
 	}
 	return error;
 }
-- 
cgit v1.2.3-71-gd317


From 6980d29ce4da223ad7f0751c7f1d61d3c6b54ab3 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Tue, 16 Mar 2021 20:30:26 +0800
Subject: zonefs: fix to update .i_wr_refcnt correctly in zonefs_open_zone()

In zonefs_open_zone(), if opened zone count is larger than
.s_max_open_zones threshold, we missed to recover .i_wr_refcnt,
fix this.

Fixes: b5c00e975779 ("zonefs: open/close zone on file open/close")
Cc: <stable@vger.kernel.org>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 fs/zonefs/super.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 3427c99abb4d..049e36c69ed7 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1040,9 +1040,7 @@ static int zonefs_open_zone(struct inode *inode)
 
 	mutex_lock(&zi->i_truncate_mutex);
 
-	zi->i_wr_refcnt++;
-	if (zi->i_wr_refcnt == 1) {
-
+	if (!zi->i_wr_refcnt) {
 		if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
 			atomic_dec(&sbi->s_open_zones);
 			ret = -EBUSY;
@@ -1052,7 +1050,6 @@ static int zonefs_open_zone(struct inode *inode)
 		if (i_size_read(inode) < zi->i_max_size) {
 			ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 			if (ret) {
-				zi->i_wr_refcnt--;
 				atomic_dec(&sbi->s_open_zones);
 				goto unlock;
 			}
@@ -1060,6 +1057,8 @@ static int zonefs_open_zone(struct inode *inode)
 		}
 	}
 
+	zi->i_wr_refcnt++;
+
 unlock:
 	mutex_unlock(&zi->i_truncate_mutex);
 
-- 
cgit v1.2.3-71-gd317


From 76cd979f4f38a27df22efb5773a0d567181a9392 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Tue, 16 Mar 2021 16:33:27 +0100
Subject: io_uring: imply MSG_NOSIGNAL for send[msg]()/recv[msg]() calls

We never want to generate any SIGPIPE, -EPIPE only is much better.

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Link: https://lore.kernel.org/r/38961085c3ec49fd21550c7788f214d1ff02d2d4.1615908477.git.metze@samba.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5f4e312111ea..a81f7a30ea70 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4384,7 +4384,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 		kmsg = &iomsg;
 	}
 
-	flags = req->sr_msg.msg_flags;
+	flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	else if (issue_flags & IO_URING_F_NONBLOCK)
@@ -4428,7 +4428,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	msg.msg_controllen = 0;
 	msg.msg_namelen = 0;
 
-	flags = req->sr_msg.msg_flags;
+	flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	else if (issue_flags & IO_URING_F_NONBLOCK)
@@ -4618,7 +4618,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 				1, req->sr_msg.len);
 	}
 
-	flags = req->sr_msg.msg_flags;
+	flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	else if (force_nonblock)
@@ -4677,7 +4677,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	msg.msg_iocb = NULL;
 	msg.msg_flags = 0;
 
-	flags = req->sr_msg.msg_flags;
+	flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	else if (force_nonblock)
-- 
cgit v1.2.3-71-gd317


From 53e043b2b432ef2294efec04dd8a88d96c024624 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Mon, 15 Mar 2021 12:56:56 +0100
Subject: io_uring: remove structures from include/linux/io_uring.h

Link: https://lore.kernel.org/r/8c1d14f3748105f4caeda01716d47af2fa41d11c.1615809009.git.metze@samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.h               | 10 +++++++++-
 fs/io_uring.c            | 16 ++++++++++++++++
 include/linux/io_uring.h | 25 -------------------------
 3 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 1ac2f3248088..80d590564ff9 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -2,7 +2,6 @@
 #define INTERNAL_IO_WQ_H
 
 #include <linux/refcount.h>
-#include <linux/io_uring.h>
 
 struct io_wq;
 
@@ -21,6 +20,15 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
+struct io_wq_work_node {
+	struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+	struct io_wq_work_node *first;
+	struct io_wq_work_node *last;
+};
+
 static inline void wq_list_add_after(struct io_wq_work_node *node,
 				     struct io_wq_work_node *pos,
 				     struct io_wq_work_list *list)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a81f7a30ea70..52ba8d7f3eb8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -456,6 +456,22 @@ struct io_ring_ctx {
 	struct list_head		tctx_list;
 };
 
+struct io_uring_task {
+	/* submission side */
+	struct xarray		xa;
+	struct wait_queue_head	wait;
+	void			*last;
+	void			*io_wq;
+	struct percpu_counter	inflight;
+	atomic_t		in_idle;
+	bool			sqpoll;
+
+	spinlock_t		task_lock;
+	struct io_wq_work_list	task_list;
+	unsigned long		task_state;
+	struct callback_head	task_work;
+};
+
 /*
  * First field must be the file pointer in all the
  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 9761a0ec9f95..79cde9906be0 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,31 +5,6 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 
-struct io_wq_work_node {
-	struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
-	struct io_wq_work_node *first;
-	struct io_wq_work_node *last;
-};
-
-struct io_uring_task {
-	/* submission side */
-	struct xarray		xa;
-	struct wait_queue_head	wait;
-	void			*last;
-	void			*io_wq;
-	struct percpu_counter	inflight;
-	atomic_t		in_idle;
-	bool			sqpoll;
-
-	spinlock_t		task_lock;
-	struct io_wq_work_list	task_list;
-	unsigned long		task_state;
-	struct callback_head	task_work;
-};
-
 #if defined(CONFIG_IO_URING)
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_task_cancel(void);
-- 
cgit v1.2.3-71-gd317


From ee53fb2b197b72b126ca0387ae636da75d969428 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Mon, 15 Mar 2021 12:56:57 +0100
Subject: io_uring: use typesafe pointers in io_uring_task

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Link: https://lore.kernel.org/r/ce2a598e66e48347bb04afbaf2acc67c0cc7971a.1615809009.git.metze@samba.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 52ba8d7f3eb8..675073966760 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -460,8 +460,8 @@ struct io_uring_task {
 	/* submission side */
 	struct xarray		xa;
 	struct wait_queue_head	wait;
-	void			*last;
-	void			*io_wq;
+	const struct io_ring_ctx *last;
+	struct io_wq		*io_wq;
 	struct percpu_counter	inflight;
 	atomic_t		in_idle;
 	bool			sqpoll;
-- 
cgit v1.2.3-71-gd317


From de75a3d3f5a14c9ab3c4883de3471d3c92a8ee78 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 18 Mar 2021 11:54:35 +0000
Subject: io_uring: don't leak creds on SQO attach error

Attaching to already dead/dying SQPOLL task is disallowed in
io_sq_offload_create(), but cleanup is hand coded by calling
io_put_sq_data()/etc., that miss to put ctx->sq_creds.

Defer everything to error-path io_sq_thread_finish(), adding
ctx->sqd_list in the error case as well as finish will handle it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 675073966760..ea2d3e120555 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7910,22 +7910,17 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 
 		ret = 0;
 		io_sq_thread_park(sqd);
+		list_add(&ctx->sqd_list, &sqd->ctx_list);
+		io_sqd_update_thread_idle(sqd);
 		/* don't attach to a dying SQPOLL thread, would be racy */
-		if (attached && !sqd->thread) {
+		if (attached && !sqd->thread)
 			ret = -ENXIO;
-		} else {
-			list_add(&ctx->sqd_list, &sqd->ctx_list);
-			io_sqd_update_thread_idle(sqd);
-		}
 		io_sq_thread_unpark(sqd);
 
-		if (ret < 0) {
-			io_put_sq_data(sqd);
-			ctx->sq_data = NULL;
-			return ret;
-		} else if (attached) {
+		if (ret < 0)
+			goto err;
+		if (attached)
 			return 0;
-		}
 
 		if (p->flags & IORING_SETUP_SQ_AFF) {
 			int cpu = p->sq_thread_cpu;
-- 
cgit v1.2.3-71-gd317


From 403dba003d17b3f0c1627b355cec2d74041cf648 Mon Sep 17 00:00:00 2001
From: Liu xuzhi <liu.xuzhi@zte.com.cn>
Date: Thu, 18 Mar 2021 17:46:57 -0700
Subject: fs/cifs/: fix misspellings using codespell tool

A typo is found out by codespell tool in 251th lines of cifs_swn.c:

$ codespell ./fs/cifs/
./cifs_swn.c:251: funciton  ==> function

Fix a typo found by codespell.

Signed-off-by: Liu xuzhi <liu.xuzhi@zte.com.cn>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/cifs_swn.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index f2d730fffccb..d829b8bf833e 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -248,7 +248,7 @@ nlmsg_fail:
 
 /*
  * Try to find a matching registration for the tcon's server name and share name.
- * Calls to this funciton must be protected by cifs_swnreg_idr_mutex.
+ * Calls to this function must be protected by cifs_swnreg_idr_mutex.
  * TODO Try to avoid memory allocations
  */
 static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon)
-- 
cgit v1.2.3-71-gd317


From af3ef3b1031634724a3763606695ebcd113d782b Mon Sep 17 00:00:00 2001
From: Aurelien Aptel <aaptel@suse.com>
Date: Thu, 18 Mar 2021 19:17:10 +0100
Subject: cifs: warn and fail if trying to use rootfs without the config option

If CONFIG_CIFS_ROOT is not set, rootfs mount option is invalid

Signed-off-by: Aurelien Aptel <aaptel@suse.com>
CC: <stable@vger.kernel.org> # v5.11
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/fs_context.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 892f51a21278..78889024a7ed 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -1196,9 +1196,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
 		pr_warn_once("Witness protocol support is experimental\n");
 		break;
 	case Opt_rootfs:
-#ifdef CONFIG_CIFS_ROOT
-		ctx->rootfs = true;
+#ifndef CONFIG_CIFS_ROOT
+		cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n");
+		goto cifs_parse_mount_err;
 #endif
+		ctx->rootfs = true;
 		break;
 	case Opt_posixpaths:
 		if (result.negated)
-- 
cgit v1.2.3-71-gd317


From 65af8f0166f4d15e61c63db498ec7981acdd897f Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Fri, 19 Mar 2021 00:05:48 -0500
Subject: cifs: fix allocation size on newly created files

Applications that create and extend and write to a file do not
expect to see 0 allocation size.  When file is extended,
set its allocation size to a plausible value until we have a
chance to query the server for it.  When the file is cached
this will prevent showing an impossible number of allocated
blocks (like 0).  This fixes e.g. xfstests 614 which does

    1) create a file and set its size to 64K
    2) mmap write 64K to the file
    3) stat -c %b for the file (to query the number of allocated blocks)

It was failing because we returned 0 blocks.  Even though we would
return the correct cached file size, we returned an impossible
allocation size.

Signed-off-by: Steve French <stfrench@microsoft.com>
CC: <stable@vger.kernel.org>
Reviewed-by: Aurelien Aptel <aaptel@suse.com>
---
 fs/cifs/inode.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7c61bc9573c0..f2df4422e54a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2395,7 +2395,7 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 	 * We need to be sure that all dirty pages are written and the server
 	 * has actual ctime, mtime and file length.
 	 */
-	if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE)) &&
+	if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE | STATX_BLOCKS)) &&
 	    !CIFS_CACHE_READ(CIFS_I(inode)) &&
 	    inode->i_mapping && inode->i_mapping->nrpages != 0) {
 		rc = filemap_fdatawait(inode->i_mapping);
@@ -2585,6 +2585,14 @@ set_size_out:
 	if (rc == 0) {
 		cifsInode->server_eof = attrs->ia_size;
 		cifs_setsize(inode, attrs->ia_size);
+		/*
+		 * i_blocks is not related to (i_size / i_blksize), but instead
+		 * 512 byte (2**9) size is required for calculating num blocks.
+		 * Until we can query the server for actual allocation size,
+		 * this is best estimate we have for blocks allocated for a file
+		 * Number of blocks must be rounded up so size 1 is not 0 blocks
+		 */
+		inode->i_blocks = (512 - 1 + attrs->ia_size) >> 9;
 
 		/*
 		 * The man page of truncate says if the size changed,
-- 
cgit v1.2.3-71-gd317


From b7ff91fd030dc9d72ed91b1aab36e445a003af4f Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Wed, 3 Mar 2021 21:17:02 +0800
Subject: ext4: find old entry again if failed to rename whiteout

If we failed to add new entry on rename whiteout, we cannot reset the
old->de entry directly, because the old->de could have moved from under
us during make indexed dir. So find the old entry again before reset is
needed, otherwise it may corrupt the filesystem as below.

  /dev/sda: Entry '00000001' in ??? (12) has deleted/unused inode 15. CLEARED.
  /dev/sda: Unattached inode 75
  /dev/sda: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.

Fixes: 6b4b8e6b4ad ("ext4: fix bug for rename with RENAME_WHITEOUT")
Cc: stable@vger.kernel.org
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Link: https://lore.kernel.org/r/20210303131703.330415-1-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/namei.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 686bf982c84e..4aae59d55288 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3613,6 +3613,31 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
 	return retval;
 }
 
+static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
+			  unsigned ino, unsigned file_type)
+{
+	struct ext4_renament old = *ent;
+	int retval = 0;
+
+	/*
+	 * old->de could have moved from under us during make indexed dir,
+	 * so the old->de may no longer valid and need to find it again
+	 * before reset old inode info.
+	 */
+	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+	if (IS_ERR(old.bh))
+		retval = PTR_ERR(old.bh);
+	if (!old.bh)
+		retval = -ENOENT;
+	if (retval) {
+		ext4_std_error(old.dir->i_sb, retval);
+		return;
+	}
+
+	ext4_setent(handle, &old, ino, file_type);
+	brelse(old.bh);
+}
+
 static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
 				  const struct qstr *d_name)
 {
@@ -3937,8 +3962,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 end_rename:
 	if (whiteout) {
 		if (retval) {
-			ext4_setent(handle, &old,
-				old.inode->i_ino, old_file_type);
+			ext4_resetent(handle, &old,
+				      old.inode->i_ino, old_file_type);
 			drop_nlink(whiteout);
 		}
 		unlock_new_inode(whiteout);
-- 
cgit v1.2.3-71-gd317


From 5dccdc5a1916d4266edd251f20bbbb113a5c495f Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Wed, 3 Mar 2021 21:17:03 +0800
Subject: ext4: do not iput inode under running transaction in ext4_rename()

In ext4_rename(), when RENAME_WHITEOUT failed to add new entry into
directory, it ends up dropping new created whiteout inode under the
running transaction. After commit <9b88f9fb0d2> ("ext4: Do not iput inode
under running transaction"), we follow the assumptions that evict() does
not get called from a transaction context but in ext4_rename() it breaks
this suggestion. Although it's not a real problem, better to obey it, so
this patch add inode to orphan list and stop transaction before final
iput().

Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Link: https://lore.kernel.org/r/20210303131703.330415-2-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/namei.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4aae59d55288..1fe8370dbf0b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3799,14 +3799,14 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	 */
 	retval = -ENOENT;
 	if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
-		goto end_rename;
+		goto release_bh;
 
 	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
 				 &new.de, &new.inlined);
 	if (IS_ERR(new.bh)) {
 		retval = PTR_ERR(new.bh);
 		new.bh = NULL;
-		goto end_rename;
+		goto release_bh;
 	}
 	if (new.bh) {
 		if (!new.inode) {
@@ -3823,15 +3823,13 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 		handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
 		if (IS_ERR(handle)) {
 			retval = PTR_ERR(handle);
-			handle = NULL;
-			goto end_rename;
+			goto release_bh;
 		}
 	} else {
 		whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle);
 		if (IS_ERR(whiteout)) {
 			retval = PTR_ERR(whiteout);
-			whiteout = NULL;
-			goto end_rename;
+			goto release_bh;
 		}
 	}
 
@@ -3965,16 +3963,18 @@ end_rename:
 			ext4_resetent(handle, &old,
 				      old.inode->i_ino, old_file_type);
 			drop_nlink(whiteout);
+			ext4_orphan_add(handle, whiteout);
 		}
 		unlock_new_inode(whiteout);
+		ext4_journal_stop(handle);
 		iput(whiteout);
-
+	} else {
+		ext4_journal_stop(handle);
 	}
+release_bh:
 	brelse(old.dir_bh);
 	brelse(old.bh);
 	brelse(new.bh);
-	if (handle)
-		ext4_journal_stop(handle);
 	return retval;
 }
 
-- 
cgit v1.2.3-71-gd317


From 6b22489911b726eebbf169caee52fea52013fbdd Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Fri, 5 Mar 2021 20:05:08 +0800
Subject: ext4: do not try to set xattr into ea_inode if value is empty

Syzbot report a warning that ext4 may create an empty ea_inode if set
an empty extent attribute to a file on the file system which is no free
blocks left.

  WARNING: CPU: 6 PID: 10667 at fs/ext4/xattr.c:1640 ext4_xattr_set_entry+0x10f8/0x1114 fs/ext4/xattr.c:1640
  ...
  Call trace:
   ext4_xattr_set_entry+0x10f8/0x1114 fs/ext4/xattr.c:1640
   ext4_xattr_block_set+0x1d0/0x1b1c fs/ext4/xattr.c:1942
   ext4_xattr_set_handle+0x8a0/0xf1c fs/ext4/xattr.c:2390
   ext4_xattr_set+0x120/0x1f0 fs/ext4/xattr.c:2491
   ext4_xattr_trusted_set+0x48/0x5c fs/ext4/xattr_trusted.c:37
   __vfs_setxattr+0x208/0x23c fs/xattr.c:177
  ...

Now, ext4 try to store extent attribute into an external inode if
ext4_xattr_block_set() return -ENOSPC, but for the case of store an
empty extent attribute, store the extent entry into the extent
attribute block is enough. A simple reproduce below.

  fallocate test.img -l 1M
  mkfs.ext4 -F -b 2048 -O ea_inode test.img
  mount test.img /mnt
  dd if=/dev/zero of=/mnt/foo bs=2048 count=500
  setfattr -n "user.test" /mnt/foo

Reported-by: syzbot+98b881fdd8ebf45ab4ae@syzkaller.appspotmail.com
Fixes: 9c6e7853c531 ("ext4: reserve space for xattr entries/names")
Cc: stable@kernel.org
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Link: https://lore.kernel.org/r/20210305120508.298465-1-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 083c95126781..6c1018223c54 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2404,7 +2404,7 @@ retry_inode:
 				 * external inode if possible.
 				 */
 				if (ext4_has_feature_ea_inode(inode->i_sb) &&
-				    !i.in_inode) {
+				    i.value_len && !i.in_inode) {
 					i.in_inode = 1;
 					goto retry_inode;
 				}
-- 
cgit v1.2.3-71-gd317


From 7d8bd3c76da1d94b85e6c9b7007e20e980bfcfe6 Mon Sep 17 00:00:00 2001
From: Shijie Luo <luoshijie1@huawei.com>
Date: Fri, 12 Mar 2021 01:50:51 -0500
Subject: ext4: fix potential error in ext4_do_update_inode

If set_large_file = 1 and errors occur in ext4_handle_dirty_metadata(),
the error code will be overridden, go to out_brelse to avoid this
situation.

Signed-off-by: Shijie Luo <luoshijie1@huawei.com>
Link: https://lore.kernel.org/r/20210312065051.36314-1-luoshijie1@huawei.com
Cc: stable@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a79a9ea58c56..927e47db7f00 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5026,7 +5026,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct buffer_head *bh = iloc->bh;
 	struct super_block *sb = inode->i_sb;
-	int err = 0, rc, block;
+	int err = 0, block;
 	int need_datasync = 0, set_large_file = 0;
 	uid_t i_uid;
 	gid_t i_gid;
@@ -5138,9 +5138,9 @@ static int ext4_do_update_inode(handle_t *handle,
 					      bh->b_data);
 
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
-	if (!err)
-		err = rc;
+	err = ext4_handle_dirty_metadata(handle, NULL, bh);
+	if (err)
+		goto out_brelse;
 	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 	if (set_large_file) {
 		BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
-- 
cgit v1.2.3-71-gd317


From 2a4ae3bcdf05b8639406eaa09a2939f3c6dd8e75 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 15 Mar 2021 17:59:06 +0100
Subject: ext4: fix timer use-after-free on failed mount

When filesystem mount fails because of corrupted filesystem we first
cancel the s_err_report timer reminding fs errors every day and only
then we flush s_error_work. However s_error_work may report another fs
error and re-arm timer thus resulting in timer use-after-free. Fix the
problem by first flushing the work and only after that canceling the
s_err_report timer.

Reported-by: syzbot+628472a2aac693ab0fcd@syzkaller.appspotmail.com
Fixes: 2d01ddc86606 ("ext4: save error info to sb through journal if available")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210315165906.2175-1-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a0a256859662..b9693680463a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5154,8 +5154,8 @@ failed_mount_wq:
 failed_mount3a:
 	ext4_es_unregister_shrinker(sbi);
 failed_mount3:
-	del_timer_sync(&sbi->s_err_report);
 	flush_work(&sbi->s_error_work);
+	del_timer_sync(&sbi->s_err_report);
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
-- 
cgit v1.2.3-71-gd317


From 8210bb29c1b66200cff7b25febcf6e39baf49fbf Mon Sep 17 00:00:00 2001
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Date: Tue, 16 Mar 2021 15:19:21 -0700
Subject: ext4: fix rename whiteout with fast commit

This patch adds rename whiteout support in fast commits. Note that the
whiteout object that gets created is actually char device. Which
imples, the function ext4_inode_journal_mode(struct inode *inode)
would return "JOURNAL_DATA" for this inode. This has a consequence in
fast commit code that it will make creation of the whiteout object a
fast-commit ineligible behavior and thus will fall back to full
commits. With this patch, this can be observed by running fast commits
with rename whiteout and seeing the stats generated by ext4_fc_stats
tracepoint as follows:

ext4_fc_stats: dev 254:32 fc ineligible reasons:
XATTR:0, CROSS_RENAME:0, JOURNAL_FLAG_CHANGE:0, NO_MEM:0, SWAP_BOOT:0,
RESIZE:0, RENAME_DIR:0, FALLOC_RANGE:0, INODE_JOURNAL_DATA:16;
num_commits:6, ineligible: 6, numblks: 3

So in short, this patch guarantees that in case of rename whiteout, we
fall back to full commits.

Amir mentioned that instead of creating a new whiteout object for
every rename, we can create a static whiteout object with irrelevant
nlink. That will make fast commits to not fall back to full
commit. But until this happens, this patch will ensure correctness by
falling back to full commits.

Fixes: 8016e29f4362 ("ext4: fast commit recovery path")
Cc: stable@kernel.org
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Link: https://lore.kernel.org/r/20210316221921.1124955-1-harshadshirwadkar@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h        | 2 ++
 fs/ext4/fast_commit.c | 9 +++++++--
 fs/ext4/namei.c       | 3 +++
 3 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ea3b41579f38..826a56e3bbd2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2794,6 +2794,8 @@ void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
 	struct dentry *dentry);
 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+			    struct dentry *dentry);
 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
 void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
 void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 6c4f19b0a556..7541d0b5d706 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -513,10 +513,10 @@ void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
 }
 
-void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+			  struct dentry *dentry)
 {
 	struct __track_dentry_update_args args;
-	struct inode *inode = d_inode(dentry);
 	int ret;
 
 	args.dentry = dentry;
@@ -527,6 +527,11 @@ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 	trace_ext4_fc_track_create(inode, dentry, ret);
 }
 
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+{
+	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
+}
+
 /* __track_fn for inode tracking */
 static int __track_inode(struct inode *inode, void *arg, bool update)
 {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1fe8370dbf0b..883e2a7cd4ab 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3873,6 +3873,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 		retval = ext4_mark_inode_dirty(handle, whiteout);
 		if (unlikely(retval))
 			goto end_rename;
+
 	}
 	if (!new.bh) {
 		retval = ext4_add_entry(handle, new.dentry, old.inode);
@@ -3946,6 +3947,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 			ext4_fc_track_unlink(handle, new.dentry);
 		__ext4_fc_track_link(handle, old.inode, new.dentry);
 		__ext4_fc_track_unlink(handle, old.inode, old.dentry);
+		if (whiteout)
+			__ext4_fc_track_create(handle, whiteout, old.dentry);
 	}
 
 	if (new.inode) {
-- 
cgit v1.2.3-71-gd317


From 512c15ef05d73a04f1aef18a3bc61a8bb516f323 Mon Sep 17 00:00:00 2001
From: Pan Bian <bianpan2016@163.com>
Date: Sun, 17 Jan 2021 00:57:32 -0800
Subject: ext4: stop inode update before return

The inode update should be stopped before returing the error code.

Signed-off-by: Pan Bian <bianpan2016@163.com>
Link: https://lore.kernel.org/r/20210117085732.93788-1-bianpan2016@163.com
Fixes: 8016e29f4362 ("ext4: fast commit recovery path")
Cc: stable@kernel.org
Reviewed-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 927e47db7f00..0948a43f1b3d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5387,8 +5387,10 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 			inode->i_gid = attr->ia_gid;
 		error = ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_stop(handle);
-		if (unlikely(error))
+		if (unlikely(error)) {
+			ext4_fc_stop_update(inode);
 			return error;
+		}
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-- 
cgit v1.2.3-71-gd317


From 64395d950bc476106b39341e42ebfd4d2eb71d2c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 21 Mar 2021 00:45:37 -0400
Subject: ext4: initialize ret to suppress smatch warning

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 77c7c8a54da7..77c84d6f1af6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4382,7 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 {
 	struct inode *inode = file_inode(file);
 	handle_t *handle;
-	int ret, ret2 = 0, ret3 = 0;
+	int ret = 0, ret2 = 0, ret3 = 0;
 	int retries = 0;
 	int depth = 0;
 	struct ext4_map_blocks map;
-- 
cgit v1.2.3-71-gd317


From 00ddff431a458bbf143ea7c4c42d022676da1b17 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 21 Mar 2021 07:06:56 -0600
Subject: io-wq: ensure task is running before processing task_work

Mark the current task as running if we need to run task_work from the
io-wq threads as part of work handling. If that is the case, then return
as such so that the caller can appropriately loop back and reset if it
was part of a going-to-sleep flush.

Fixes: 3bfe6106693b ("io-wq: fork worker threads from original task")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index e05f996d088f..3dc10bfd8c3b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -386,13 +386,16 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 	return NULL;
 }
 
-static void io_flush_signals(void)
+static bool io_flush_signals(void)
 {
 	if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
+		__set_current_state(TASK_RUNNING);
 		if (current->task_works)
 			task_work_run();
 		clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
+		return true;
 	}
+	return false;
 }
 
 static void io_assign_current_work(struct io_worker *worker,
@@ -499,7 +502,8 @@ loop:
 		}
 		__io_worker_idle(wqe, worker);
 		raw_spin_unlock_irq(&wqe->lock);
-		io_flush_signals();
+		if (io_flush_signals())
+			continue;
 		ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
 		if (try_to_freeze() || ret)
 			continue;
-- 
cgit v1.2.3-71-gd317


From 0031275d119efe16711cd93519b595e6f9b4b330 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Sat, 20 Mar 2021 20:33:36 +0100
Subject: io_uring: call req_set_fail_links() on short send[msg]()/recv[msg]()
 with MSG_WAITALL

Without that it's not safe to use them in a linked combination with
others.

Now combinations like IORING_OP_SENDMSG followed by IORING_OP_SPLICE
should be possible.

We already handle short reads and writes for the following opcodes:

- IORING_OP_READV
- IORING_OP_READ_FIXED
- IORING_OP_READ
- IORING_OP_WRITEV
- IORING_OP_WRITE_FIXED
- IORING_OP_WRITE
- IORING_OP_SPLICE
- IORING_OP_TEE

Now we have it for these as well:

- IORING_OP_SENDMSG
- IORING_OP_SEND
- IORING_OP_RECVMSG
- IORING_OP_RECV

For IORING_OP_RECVMSG we also check for the MSG_TRUNC and MSG_CTRUNC
flags in order to call req_set_fail_links().

There might be applications arround depending on the behavior
that even short send[msg]()/recv[msg]() retuns continue an
IOSQE_IO_LINK chain.

It's very unlikely that such applications pass in MSG_WAITALL,
which is only defined in 'man 2 recvmsg', but not in 'man 2 sendmsg'.

It's expected that the low level sock_sendmsg() call just ignores
MSG_WAITALL, as MSG_ZEROCOPY is also ignored without explicitly set
SO_ZEROCOPY.

We also expect the caller to know about the implicit truncation to
MAX_RW_COUNT, which we don't detect.

cc: netdev@vger.kernel.org
Link: https://lore.kernel.org/r/c4e1a4cc0d905314f4d5dc567e65a7b09621aab3.1615908477.git.metze@samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ea2d3e120555..543551d70327 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4386,6 +4386,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
 	unsigned flags;
+	int min_ret = 0;
 	int ret;
 
 	sock = sock_from_file(req->file);
@@ -4406,6 +4407,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	else if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
 	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
 		return io_setup_async_msg(req, kmsg);
@@ -4416,7 +4420,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	if (ret < 0)
+	if (ret < min_ret)
 		req_set_fail_links(req);
 	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
@@ -4429,6 +4433,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	struct iovec iov;
 	struct socket *sock;
 	unsigned flags;
+	int min_ret = 0;
 	int ret;
 
 	sock = sock_from_file(req->file);
@@ -4450,6 +4455,9 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	else if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
 	msg.msg_flags = flags;
 	ret = sock_sendmsg(sock, &msg);
 	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
@@ -4457,7 +4465,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 
-	if (ret < 0)
+	if (ret < min_ret)
 		req_set_fail_links(req);
 	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
@@ -4609,6 +4617,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	struct socket *sock;
 	struct io_buffer *kbuf;
 	unsigned flags;
+	int min_ret = 0;
 	int ret, cflags = 0;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
@@ -4640,6 +4649,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	else if (force_nonblock)
 		flags |= MSG_DONTWAIT;
 
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
 	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
 					kmsg->uaddr, flags);
 	if (force_nonblock && ret == -EAGAIN)
@@ -4653,7 +4665,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	if (ret < 0)
+	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
 		req_set_fail_links(req);
 	__io_req_complete(req, issue_flags, ret, cflags);
 	return 0;
@@ -4668,6 +4680,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	struct socket *sock;
 	struct iovec iov;
 	unsigned flags;
+	int min_ret = 0;
 	int ret, cflags = 0;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
@@ -4699,6 +4712,9 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	else if (force_nonblock)
 		flags |= MSG_DONTWAIT;
 
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
 	ret = sock_recvmsg(sock, &msg, flags);
 	if (force_nonblock && ret == -EAGAIN)
 		return -EAGAIN;
@@ -4707,7 +4723,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 out_free:
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_recv_kbuf(req);
-	if (ret < 0)
+	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
 		req_set_fail_links(req);
 	__io_req_complete(req, issue_flags, ret, cflags);
 	return 0;
-- 
cgit v1.2.3-71-gd317