From 5298d4bfe80f6ae6ae2777bcd1357b0022d98573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 07:56:14 +0100 Subject: unicode: clean up the Kconfig symbol confusion Turn the CONFIG_UNICODE symbol into a tristate that generates some always built in code and remove the confusing CONFIG_UNICODE_UTF8_DATA symbol. Note that a lot of the IS_ENABLED() checks could be turned from cpp statements into normal ifs, but this change is intended to be fairly mechanic, so that should be cleaned up later. Fixes: 2b3d04787012 ("unicode: Add utf8-data module") Reported-by: Linus Torvalds Reviewed-by: Eric Biggers Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..fdac22d16c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,7 @@ struct super_block { #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif -- cgit v1.2.3-71-gd317 From 41d36a9f3e5336f5b48c3adba0777b8e217020d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 07:05:28 +0100 Subject: fs: remove kiocb.ki_hint This field is entirely unused now except for a tracepoint in f2fs, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220308060529.736277-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/aio.c | 1 - fs/cachefiles/io.c | 2 -- fs/f2fs/file.c | 6 ------ fs/io_uring.c | 1 - include/linux/fs.h | 12 ------------ include/trace/events/f2fs.h | 3 +-- 6 files changed, 1 insertion(+), 24 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/aio.c b/fs/aio.c index 4ceba13a7db0..eb0948bb74f1 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1478,7 +1478,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; - req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 753986ea1583..bc7c7a7d9260 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -138,7 +138,6 @@ static int cachefiles_read(struct netfs_cache_resources *cres, ki->iocb.ki_filp = file; ki->iocb.ki_pos = start_pos + skipped; ki->iocb.ki_flags = IOCB_DIRECT; - ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); ki->skipped = skipped; ki->object = object; @@ -313,7 +312,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres, ki->iocb.ki_filp = file; ki->iocb.ki_pos = start_pos; ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; - ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); ki->object = object; ki->inval_counter = cres->inval_counter; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3c98ef6af97d..45076c01a2ba 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4479,10 +4479,8 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); const bool do_opu = f2fs_lfs_mode(sbi); - const int whint_mode = F2FS_OPTION(sbi).whint_mode; const loff_t pos = iocb->ki_pos; const ssize_t count = iov_iter_count(from); - const enum rw_hint hint = iocb->ki_hint; unsigned int dio_flags; struct iomap_dio *dio; ssize_t ret; @@ -4515,8 +4513,6 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, if (do_opu) down_read(&fi->i_gc_rwsem[READ]); } - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; /* * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of @@ -4539,8 +4535,6 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, ret = iomap_dio_complete(dio); } - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; if (do_opu) up_read(&fi->i_gc_rwsem[READ]); up_read(&fi->i_gc_rwsem[WRITE]); diff --git a/fs/io_uring.c b/fs/io_uring.c index 4715980e9015..36e09169ff93 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3747,7 +3747,6 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file)); return io_prep_rw(req, sqe); } diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..d5658ac5d8c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -327,7 +327,6 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; - u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ struct wait_page_queue *ki_waitq; /* for async buffered IO */ randomized_struct_fields_end @@ -2225,21 +2224,11 @@ static inline enum rw_hint file_write_hint(struct file *file) static inline int iocb_flags(struct file *file); -static inline u16 ki_hint_validate(enum rw_hint hint) -{ - typeof(((struct kiocb *)0)->ki_hint) max_hint = -1; - - if (hint <= max_hint) - return hint; - return 0; -} - static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = iocb_flags(filp), - .ki_hint = ki_hint_validate(file_write_hint(filp)), .ki_ioprio = get_current_ioprio(), }; } @@ -2250,7 +2239,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, - .ki_hint = kiocb_src->ki_hint, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index f701bb23f83c..1779e133cea0 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -956,12 +956,11 @@ TRACE_EVENT(f2fs_direct_IO_enter, __entry->rw = rw; ), - TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_hint = %x ki_ioprio = %x rw = %d", + TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_ioprio = %x rw = %d", show_dev_ino(__entry), __entry->iocb->ki_pos, __entry->len, __entry->iocb->ki_flags, - __entry->iocb->ki_hint, __entry->iocb->ki_ioprio, __entry->rw) ); -- cgit v1.2.3-71-gd317 From 7b12e49669c99f63bc12351c57e581f1f14d4adf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 07:05:29 +0100 Subject: fs: remove fs.f_write_hint The value is now completely unused except for reporting it back through the F_GET_FILE_RW_HINT ioctl, so remove the value and the two ioctls for it. Trying to use the F_SET_FILE_RW_HINT and F_GET_FILE_RW_HINT fcntls will now return EINVAL, just like it would on a kernel that never supported this functionality in the first place. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220308060529.736277-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/fcntl.c | 18 ------------------ fs/open.c | 1 - include/linux/fs.h | 9 --------- 3 files changed, 28 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/fcntl.c b/fs/fcntl.c index 9c6c6a3e2de5..f15d885b9796 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -291,22 +291,6 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd, u64 h; switch (cmd) { - case F_GET_FILE_RW_HINT: - h = file_write_hint(file); - if (copy_to_user(argp, &h, sizeof(*argp))) - return -EFAULT; - return 0; - case F_SET_FILE_RW_HINT: - if (copy_from_user(&h, argp, sizeof(h))) - return -EFAULT; - hint = (enum rw_hint) h; - if (!rw_hint_valid(hint)) - return -EINVAL; - - spin_lock(&file->f_lock); - file->f_write_hint = hint; - spin_unlock(&file->f_lock); - return 0; case F_GET_RW_HINT: h = inode->i_write_hint; if (copy_to_user(argp, &h, sizeof(*argp))) @@ -431,8 +415,6 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_GET_RW_HINT: case F_SET_RW_HINT: - case F_GET_FILE_RW_HINT: - case F_SET_FILE_RW_HINT: err = fcntl_rw_hint(filp, cmd, arg); break; default: diff --git a/fs/open.c b/fs/open.c index 9ff2f621b760..1315253e0247 100644 --- a/fs/open.c +++ b/fs/open.c @@ -835,7 +835,6 @@ static int do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; - f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/include/linux/fs.h b/include/linux/fs.h index d5658ac5d8c6..a1fc3b41cd82 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -966,7 +966,6 @@ struct file { * Must not be taken from IRQ context. */ spinlock_t f_lock; - enum rw_hint f_write_hint; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -2214,14 +2213,6 @@ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns, !gid_valid(i_gid_into_mnt(mnt_userns, inode)); } -static inline enum rw_hint file_write_hint(struct file *file) -{ - if (file->f_write_hint != WRITE_LIFE_NOT_SET) - return file->f_write_hint; - - return file_inode(file)->i_write_hint; -} - static inline int iocb_flags(struct file *file); static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -- cgit v1.2.3-71-gd317 From 871129332d74c9e94bd110932ac4445833995639 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 4 Sep 2019 12:13:25 -0700 Subject: fs: export rw_verify_area() I'm adding btrfs ioctls to read and write compressed data, and rather than duplicating the checks in rw_verify_area(), let's just export it. Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/internal.h | 5 ----- fs/read_write.c | 1 + include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/internal.h b/fs/internal.h index 8590c973c2f4..711bdc00ec7c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -157,11 +157,6 @@ extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); -/* - * read_write.c - */ -extern int rw_verify_area(int, struct file *, const loff_t *, size_t); - /* * pipe.c */ diff --git a/fs/read_write.c b/fs/read_write.c index 0074afa7ecb3..4d60146243df 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -385,6 +385,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t return security_file_permission(file, read_write == READ ? MAY_READ : MAY_WRITE); } +EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..0ebfc2519212 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3173,6 +3173,7 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); +int rw_verify_area(int, struct file *, const loff_t *, size_t); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); extern int stream_open(struct inode * inode, struct file * filp); -- cgit v1.2.3-71-gd317 From f6f7a25a650818c61defa97d60a79b216618315f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 12 Aug 2021 15:34:57 -0700 Subject: fs: export variant of generic_write_checks without iov_iter Encoded I/O in Btrfs needs to check a write with a given logical size without an iov_iter that matches that size (because the iov_iter we have is for the compressed data). So, factor out the parts of generic_write_check() that don't need an iov_iter into a new generic_write_checks_count() function and export that. Reviewed-by: Nikolay Borisov Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/read_write.c | 33 ++++++++++++++++++++------------- include/linux/fs.h | 1 + 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/read_write.c b/fs/read_write.c index 4d60146243df..dc5000173b80 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1618,24 +1618,16 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) return 0; } -/* - * Performs necessary checks before doing a write - * - * Can adjust writing position or amount of bytes to write. - * Returns appropriate error code that caller should return or - * zero in case that write should be allowed. - */ -ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) +/* Like generic_write_checks(), but takes size of write instead of iter. */ +int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - loff_t count; - int ret; if (IS_SWAPFILE(inode)) return -ETXTBSY; - if (!iov_iter_count(from)) + if (!*count) return 0; /* FIXME: this is for backwards compatibility with 2.4 */ @@ -1645,8 +1637,23 @@ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; - count = iov_iter_count(from); - ret = generic_write_check_limits(file, iocb->ki_pos, &count); + return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); +} +EXPORT_SYMBOL(generic_write_checks_count); + +/* + * Performs necessary checks before doing a write + * + * Can adjust writing position or amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. + */ +ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + loff_t count = iov_iter_count(from); + int ret; + + ret = generic_write_checks_count(iocb, &count); if (ret) return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index 0ebfc2519212..27746a3da8fd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3130,6 +3130,7 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); +int generic_write_checks_count(struct kiocb *iocb, loff_t *count); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); -- cgit v1.2.3-71-gd317 From 2e7e80f7e7e9dbbb3c2a85ee923ca32826052816 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:27 +0000 Subject: fs: Convert is_partially_uptodate to folios Since the uptodate property is maintained on a per-folio basis, the is_partially_uptodate method should also take a folio. Fix the types at the same time so it's clear that it returns true/false and takes the count in bytes, not blocks. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 2 +- Documentation/filesystems/vfs.rst | 10 +++++----- fs/buffer.c | 26 ++++++++++++------------- fs/iomap/buffered-io.c | 36 ++++++++++++++++------------------- include/linux/buffer_head.h | 3 +-- include/linux/fs.h | 4 ++-- include/linux/iomap.h | 3 +-- mm/filemap.c | 4 ++-- 8 files changed, 40 insertions(+), 48 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 3f9b1497ebb8..88b33524687f 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -258,7 +258,7 @@ prototypes:: int (*migratepage)(struct address_space *, struct page *, struct page *); void (*putback_page) (struct page *); int (*launder_page)(struct page *); - int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); + bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); int (*swap_activate)(struct file *); int (*swap_deactivate)(struct file *); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index bf5c48066fac..da3e7b470f0a 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -747,8 +747,8 @@ cache in your filesystem. The following members are defined: void (*putback_page) (struct page *); int (*launder_page) (struct page *); - int (*is_partially_uptodate) (struct page *, unsigned long, - unsigned long); + bool (*is_partially_uptodate) (struct folio *, size_t from, + size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page) (struct mapping *mapping, struct page *page); int (*swap_activate)(struct file *); @@ -937,9 +937,9 @@ cache in your filesystem. The following members are defined: ``is_partially_uptodate`` Called by the VM when reading a file through the pagecache when - the underlying blocksize != pagesize. If the required block is - up to date then the read can complete without needing the IO to - bring the whole page up to date. + the underlying blocksize is smaller than the size of the folio. + If the required block is up to date then the read can complete + without needing I/O to bring the whole page up to date. ``is_dirty_writeback`` Called by the VM when attempting to reclaim a page. The VM uses diff --git a/fs/buffer.c b/fs/buffer.c index 8e112b6bd371..929061995cf8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2206,29 +2206,27 @@ int generic_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(generic_write_end); /* - * block_is_partially_uptodate checks whether buffers within a page are + * block_is_partially_uptodate checks whether buffers within a folio are * uptodate or not. * - * Returns true if all buffers which correspond to a file portion - * we want to read are uptodate. + * Returns true if all buffers which correspond to the specified part + * of the folio are uptodate. */ -int block_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count) +bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; - int ret = 1; - - if (!page_has_buffers(page)) - return 0; + bool ret = true; - head = page_buffers(page); + head = folio_buffers(folio); + if (!head) + return false; blocksize = head->b_size; - to = min_t(unsigned, PAGE_SIZE - from, count); + to = min_t(unsigned, folio_size(folio) - from, count); to = from + to; - if (from < blocksize && to > PAGE_SIZE - blocksize) - return 0; + if (from < blocksize && to > folio_size(folio) - blocksize) + return false; bh = head; block_start = 0; @@ -2236,7 +2234,7 @@ int block_is_partially_uptodate(struct page *page, unsigned long from, block_end = block_start + blocksize; if (block_end > from && block_start < to) { if (!buffer_uptodate(bh)) { - ret = 0; + ret = false; break; } if (block_end >= to) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d020a2e81a24..da0a7b15a64e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -424,37 +424,33 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_readahead); /* - * iomap_is_partially_uptodate checks whether blocks within a page are + * iomap_is_partially_uptodate checks whether blocks within a folio are * uptodate or not. * - * Returns true if all blocks which correspond to a file portion - * we want to read within the page are uptodate. + * Returns true if all blocks which correspond to the specified part + * of the folio are uptodate. */ -int -iomap_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count) +bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { - struct folio *folio = page_folio(page); struct iomap_page *iop = to_iomap_page(folio); - struct inode *inode = page->mapping->host; - unsigned len, first, last; - unsigned i; + struct inode *inode = folio->mapping->host; + size_t len; + unsigned first, last, i; - /* Limit range to one page */ - len = min_t(unsigned, PAGE_SIZE - from, count); + if (!iop) + return false; + + /* Limit range to this folio */ + len = min(folio_size(folio) - from, count); /* First and last blocks in range within page */ first = from >> inode->i_blkbits; last = (from + len - 1) >> inode->i_blkbits; - if (iop) { - for (i = first; i <= last; i++) - if (!test_bit(i, iop->uptodate)) - return 0; - return 1; - } - - return 0; + for (i = first; i <= last; i++) + if (!test_bit(i, iop->uptodate)) + return false; + return true; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3451f1fcda12..79d465057889 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -225,8 +225,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); int block_read_full_page(struct page*, get_block_t*); -int block_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count); +bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, get_block_t *get_block); int __block_write_begin(struct page *page, loff_t pos, unsigned len, diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..5939e6694ada 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -400,8 +400,8 @@ struct address_space_operations { bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); int (*launder_page) (struct page *); - int (*is_partially_uptodate) (struct page *, unsigned long, - unsigned long); + bool (*is_partially_uptodate) (struct folio *, size_t from, + size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page)(struct address_space *, struct page *); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 97a3a2edb585..3bcbb264f83f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -227,8 +227,7 @@ ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); -int iomap_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count); +bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); int iomap_releasepage(struct page *page, gfp_t gfp_mask); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); void iomap_invalidatepage(struct page *page, unsigned int offset, diff --git a/mm/filemap.c b/mm/filemap.c index ad8c39d90bf9..9639b844dd31 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2452,7 +2452,7 @@ static bool filemap_range_uptodate(struct address_space *mapping, pos -= folio_pos(folio); } - return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count); + return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, @@ -2844,7 +2844,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas, offset = offset_in_folio(folio, start) & ~(bsz - 1); do { - if (ops->is_partially_uptodate(&folio->page, offset, bsz) == + if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; start = (start + bsz) & ~(bsz - 1); -- cgit v1.2.3-71-gd317 From 128d1f8241d62ab014eef6dd4ef9bb977dbeadb2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:32 +0000 Subject: fs: Add invalidate_folio() aops method This is used in preference to invalidatepage, if defined. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 13 +++++++------ Documentation/filesystems/vfs.rst | 11 ++++++----- include/linux/fs.h | 1 + mm/truncate.c | 8 +++++++- 4 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 88b33524687f..29a045fd3860 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -250,6 +250,7 @@ prototypes:: loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidate_folio) (struct folio *, size_t start, size_t len); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); @@ -278,6 +279,7 @@ readpages: no shared write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: +invalidate_folio: yes exclusive invalidatepage: yes exclusive releasepage: yes freepage: yes @@ -370,13 +372,12 @@ not locked. filesystems and by the swapper. The latter will eventually go away. Please, keep it that way and don't breed new callers. -->invalidatepage() is called when the filesystem must attempt to drop +->invalidate_folio() is called when the filesystem must attempt to drop some or all of the buffers from the page when it is being truncated. It -returns zero on success. If ->invalidatepage is zero, the kernel uses -block_invalidatepage() instead. The filesystem must exclusively acquire -invalidate_lock before invalidating page cache in truncate / hole punch path -(and thus calling into ->invalidatepage) to block races between page cache -invalidation and page cache filling functions (fault, read, ...). +returns zero on success. The filesystem must exclusively acquire +invalidate_lock before invalidating page cache in truncate / hole punch +path (and thus calling into ->invalidate_folio) to block races between page +cache invalidation and page cache filling functions (fault, read, ...). ->releasepage() is called when the kernel is about to try to drop the buffers from the page in preparation for freeing it. It returns zero to diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index da3e7b470f0a..26c090cd8cf5 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -735,6 +735,7 @@ cache in your filesystem. The following members are defined: loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidate_folio) (struct folio *, size_t start, size_t len); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); @@ -868,15 +869,15 @@ cache in your filesystem. The following members are defined: to find out where the blocks in the file are and uses those addresses directly. -``invalidatepage`` - If a page has PagePrivate set, then invalidatepage will be - called when part or all of the page is to be removed from the +``invalidate_folio`` + If a folio has private data, then invalidate_folio will be + called when part or all of the folio is to be removed from the address space. This generally corresponds to either a truncation, punch hole or a complete invalidation of the address space (in the latter case 'offset' will always be 0 and 'length' - will be PAGE_SIZE). Any private data associated with the page + will be folio_size()). Any private data associated with the page should be updated to reflect this truncation. If offset is 0 - and length is PAGE_SIZE, then the private data should be + and length is folio_size(), then the private data should be released, because the page must be able to be completely discarded. This may be done by calling the ->releasepage function, but in this case the release MUST succeed. diff --git a/include/linux/fs.h b/include/linux/fs.h index 5939e6694ada..bcdb613cd652 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -387,6 +387,7 @@ struct address_space_operations { /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidate_folio) (struct folio *, size_t offset, size_t len); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); diff --git a/mm/truncate.c b/mm/truncate.c index aa0ed373789d..b9ad298e6ce7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -154,9 +154,15 @@ static int invalidate_exceptional_entry2(struct address_space *mapping, */ void folio_invalidate(struct folio *folio, size_t offset, size_t length) { + const struct address_space_operations *aops = folio->mapping->a_ops; void (*invalidatepage)(struct page *, unsigned int, unsigned int); - invalidatepage = folio->mapping->a_ops->invalidatepage; + if (aops->invalidate_folio) { + aops->invalidate_folio(folio, offset, length); + return; + } + + invalidatepage = aops->invalidatepage; #ifdef CONFIG_BLOCK if (!invalidatepage) invalidatepage = block_invalidatepage; -- cgit v1.2.3-71-gd317 From 5660a8630dab61a28e07ec00c42bf605b182d725 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:35 +0000 Subject: fs: Remove noop_invalidatepage() We used to have to use noop_invalidatepage() to prevent block_invalidatepage() from being called, but that behaviour is now gone. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- drivers/dax/device.c | 1 - fs/ext2/inode.c | 1 - fs/ext4/inode.c | 1 - fs/fuse/dax.c | 1 - fs/libfs.c | 11 ----------- fs/xfs/xfs_aops.c | 1 - include/linux/fs.h | 2 -- 7 files changed, 18 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index d33a0613ed0c..7a59ca51217e 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -347,7 +347,6 @@ static unsigned long dax_get_unmapped_area(struct file *filp, static const struct address_space_operations dev_dax_aops = { .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, }; static int dax_open(struct inode *inode, struct file *filp) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 1e14777c3ca6..9b579ee56eaf 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1001,7 +1001,6 @@ static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, }; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 07ef3f84db9e..d7086209572a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3632,7 +3632,6 @@ static const struct address_space_operations ext4_dax_aops = { .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, .bmap = ext4_bmap, - .invalidatepage = noop_invalidatepage, .swap_activate = ext4_iomap_swap_activate, }; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 182b24a14804..b11fa10b88d8 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1327,7 +1327,6 @@ static const struct address_space_operations fuse_dax_file_aops = { .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, }; static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) diff --git a/fs/libfs.c b/fs/libfs.c index 974125270a42..4e047841e61d 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1198,17 +1198,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void noop_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - /* - * There is no page cache to invalidate in the dax case, however - * we need this callback defined to prevent falling back to - * block_invalidatepage() in do_invalidatepage(). - */ -} -EXPORT_SYMBOL_GPL(noop_invalidatepage); - ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 51a040b658cb..7dd314f2288f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -582,6 +582,5 @@ const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index bcdb613cd652..a40ea82248da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3323,8 +3323,6 @@ extern int simple_rename(struct user_namespace *, struct inode *, extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); -extern void noop_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, -- cgit v1.2.3-71-gd317 From f50015a596fa106bf642bd85fbf6e6b52cc913d0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:51 +0000 Subject: fs: Remove aops->invalidatepage With all users migrated to ->invalidate_folio, remove the old operation. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 2 -- Documentation/filesystems/vfs.rst | 1 - include/linux/fs.h | 1 - mm/truncate.c | 14 +++----------- 4 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 29a045fd3860..8e9cbc0fb70f 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -251,7 +251,6 @@ prototypes:: struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); @@ -280,7 +279,6 @@ write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: invalidate_folio: yes exclusive -invalidatepage: yes exclusive releasepage: yes freepage: yes direct_IO: diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 26c090cd8cf5..28704831652c 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -736,7 +736,6 @@ cache in your filesystem. The following members are defined: struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); diff --git a/include/linux/fs.h b/include/linux/fs.h index a40ea82248da..af9ae091bd82 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -388,7 +388,6 @@ struct address_space_operations { /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); diff --git a/mm/truncate.c b/mm/truncate.c index 28650151091a..8010461a59bd 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,8 +19,7 @@ #include #include #include -#include /* grr. try_to_release_page, - do_invalidatepage */ +#include /* grr. try_to_release_page */ #include #include #include "internal.h" @@ -155,16 +154,9 @@ static int invalidate_exceptional_entry2(struct address_space *mapping, void folio_invalidate(struct folio *folio, size_t offset, size_t length) { const struct address_space_operations *aops = folio->mapping->a_ops; - void (*invalidatepage)(struct page *, unsigned int, unsigned int); - if (aops->invalidate_folio) { + if (aops->invalidate_folio) aops->invalidate_folio(folio, offset, length); - return; - } - - invalidatepage = aops->invalidatepage; - if (invalidatepage) - (*invalidatepage)(&folio->page, offset, length); } EXPORT_SYMBOL_GPL(folio_invalidate); @@ -334,7 +326,7 @@ int invalidate_inode_page(struct page *page) * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * - * Note that since ->invalidatepage() accepts range to invalidate + * Note that since ->invalidate_folio() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ -- cgit v1.2.3-71-gd317 From affa80e8c6a1df473694c2087259901872309cc4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:52 +0000 Subject: fs: Add aops->launder_folio Since the only difference between ->launder_page and ->launder_folio is the type of the pointer, these can safely use a union without affecting bisectability. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 10 +++++----- Documentation/filesystems/vfs.rst | 8 ++++---- include/linux/fs.h | 5 ++++- mm/truncate.c | 8 ++++---- 4 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 8e9cbc0fb70f..dee512efb458 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -257,7 +257,7 @@ prototypes:: bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); void (*putback_page) (struct page *); - int (*launder_page)(struct page *); + int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); int (*swap_activate)(struct file *); @@ -285,7 +285,7 @@ direct_IO: isolate_page: yes migratepage: yes (both) putback_page: yes -launder_page: yes +launder_folio: yes is_partially_uptodate: yes error_remove_page: yes swap_activate: no @@ -385,9 +385,9 @@ the kernel assumes that the fs has no private interest in the buffers. ->freepage() is called when the kernel is done dropping the page from the page cache. -->launder_page() may be called prior to releasing a page if -it is still found to be dirty. It returns zero if the page was successfully -cleaned, or an error value if not. Note that in order to prevent the page +->launder_folio() may be called prior to releasing a folio if +it is still found to be dirty. It returns zero if the folio was successfully +cleaned, or an error value if not. Note that in order to prevent the folio getting mapped back in and redirtied, it needs to be kept locked across the entire operation. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 28704831652c..c54ca4d88ed6 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -745,7 +745,7 @@ cache in your filesystem. The following members are defined: int (*migratepage) (struct page *, struct page *); /* put migration-failed page back to right list */ void (*putback_page) (struct page *); - int (*launder_page) (struct page *); + int (*launder_folio) (struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); @@ -930,9 +930,9 @@ cache in your filesystem. The following members are defined: ``putback_page`` Called by the VM when isolated page's migration fails. -``launder_page`` - Called before freeing a page - it writes back the dirty page. - To prevent redirtying the page, it is kept locked during the +``launder_folio`` + Called before freeing a folio - it writes back the dirty folio. + To prevent redirtying the folio, it is kept locked during the whole operation. ``is_partially_uptodate`` diff --git a/include/linux/fs.h b/include/linux/fs.h index af9ae091bd82..0af3075cdff2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -399,7 +399,10 @@ struct address_space_operations { struct page *, struct page *, enum migrate_mode); bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); - int (*launder_page) (struct page *); + union { + int (*launder_page) (struct page *); + int (*launder_folio) (struct folio *); + }; bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); diff --git a/mm/truncate.c b/mm/truncate.c index 8010461a59bd..6ad44b546dff 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -614,13 +614,13 @@ failed: return 0; } -static int do_launder_folio(struct address_space *mapping, struct folio *folio) +static int folio_launder(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return 0; - if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL) + if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) return 0; - return mapping->a_ops->launder_page(&folio->page); + return mapping->a_ops->launder_folio(folio); } /** @@ -686,7 +686,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); - ret2 = do_launder_folio(mapping, folio); + ret2 = folio_launder(mapping, folio); if (ret2 == 0) { if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; -- cgit v1.2.3-71-gd317 From 072acba6d08730beba5bad293c7ce6d0c4b0624c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:59 +0000 Subject: fs: Remove aops->launder_page With all users converted to ->launder_folio, remove ->launder_page. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- include/linux/fs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0af3075cdff2..055be40084f1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -399,10 +399,7 @@ struct address_space_operations { struct page *, struct page *, enum migrate_mode); bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); - union { - int (*launder_page) (struct page *); - int (*launder_folio) (struct folio *); - }; + int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); -- cgit v1.2.3-71-gd317 From 6f31a5a261dbbe7bf7f585dfe81f8acd4b25ec3b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:22:00 +0000 Subject: fs: Add aops->dirty_folio This replaces ->set_page_dirty(). It returns a bool instead of an int and takes the address_space as a parameter instead of expecting the implementations to retrieve the address_space from the page. This is particularly important for filesystems which use FS_OPS for swap. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 15 ++++++++------- Documentation/filesystems/vfs.rst | 16 ++++++++-------- include/linux/fs.h | 1 + mm/page-writeback.c | 17 ++++++++++------- mm/page_io.c | 5 ++++- 5 files changed, 31 insertions(+), 23 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index dee512efb458..72fa12dabd39 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -239,7 +239,7 @@ prototypes:: int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); - int (*set_page_dirty)(struct page *page); + bool (*dirty_folio)(struct address_space *, struct folio *folio); void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); @@ -264,7 +264,7 @@ prototypes:: int (*swap_deactivate)(struct file *); locking rules: - All except set_page_dirty and freepage may block + All except dirty_folio and freepage may block ====================== ======================== ========= =============== ops PageLocked(page) i_rwsem invalidate_lock @@ -272,7 +272,7 @@ ops PageLocked(page) i_rwsem invalidate_lock writepage: yes, unlocks (see below) readpage: yes, unlocks shared writepages: -set_page_dirty no +dirty_folio maybe readahead: yes, unlocks shared readpages: no shared write_begin: locks the page exclusive @@ -361,10 +361,11 @@ If nr_to_write is NULL, all dirty pages must be written. writepages should _only_ write pages which are present on mapping->io_pages. -->set_page_dirty() is called from various places in the kernel -when the target page is marked as needing writeback. It may be called -under spinlock (it cannot block) and is sometimes called with the page -not locked. +->dirty_folio() is called from various places in the kernel when +the target folio is marked as needing writeback. The folio cannot be +truncated because either the caller holds the folio lock, or the caller +has found the folio while holding the page table lock which will block +truncation. ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some filesystems and by the swapper. The latter will eventually go away. Please, diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index c54ca4d88ed6..d16bee420326 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -658,7 +658,7 @@ pages, however the address_space has finer control of write sizes. The read process essentially only requires 'readpage'. The write process is more complicated and uses write_begin/write_end or -set_page_dirty to write data into the address_space, and writepage and +dirty_folio to write data into the address_space, and writepage and writepages to writeback data to storage. Adding and removing pages to/from an address_space is protected by the @@ -724,7 +724,7 @@ cache in your filesystem. The following members are defined: int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); - int (*set_page_dirty)(struct page *page); + bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); @@ -793,13 +793,13 @@ cache in your filesystem. The following members are defined: This will choose pages from the address space that are tagged as DIRTY and will pass them to ->writepage. -``set_page_dirty`` - called by the VM to set a page dirty. This is particularly - needed if an address space attaches private data to a page, and - that data needs to be updated when a page is dirtied. This is +``dirty_folio`` + called by the VM to mark a folio as dirty. This is particularly + needed if an address space attaches private data to a folio, and + that data needs to be updated when a folio is dirtied. This is called, for example, when a memory mapped page gets modified. - If defined, it should set the PageDirty flag, and the - PAGECACHE_TAG_DIRTY tag in the radix tree. + If defined, it should set the folio dirty flag, and the + PAGECACHE_TAG_DIRTY search mark in i_pages. ``readahead`` Called by the VM to read pages associated with the address_space diff --git a/include/linux/fs.h b/include/linux/fs.h index 055be40084f1..c3d5db8851ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -369,6 +369,7 @@ struct address_space_operations { /* Set a page dirty. Return true if this dirtied it */ int (*set_page_dirty)(struct page *page); + bool (*dirty_folio)(struct address_space *, struct folio *); /* * Reads in the requested pages. Unlike ->readpage(), this is diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d163f8d36b..27a87ae4502c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2616,7 +2616,7 @@ EXPORT_SYMBOL(folio_redirty_for_writepage); * folio_mark_dirty - Mark a folio as being modified. * @folio: The folio. * - * For folios with a mapping this should be done under the page lock + * For folios with a mapping this should be done with the folio lock held * for the benefit of asynchronous memory errors who prefer a consistent * dirty state. This rule can be broken in some special cases, * but should be better not to. @@ -2630,16 +2630,19 @@ bool folio_mark_dirty(struct folio *folio) if (likely(mapping)) { /* * readahead/lru_deactivate_page could remain - * PG_readahead/PG_reclaim due to race with end_page_writeback - * About readahead, if the page is written, the flags would be + * PG_readahead/PG_reclaim due to race with folio_end_writeback + * About readahead, if the folio is written, the flags would be * reset. So no problem. - * About lru_deactivate_page, if the page is redirty, the flag - * will be reset. So no problem. but if the page is used by readahead - * it will confuse readahead and make it restart the size rampup - * process. But it's a trivial problem. + * About lru_deactivate_page, if the folio is redirtied, + * the flag will be reset. So no problem. but if the + * folio is used by readahead it will confuse readahead + * and make it restart the size rampup process. But it's + * a trivial problem. */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); + if (mapping->a_ops->dirty_folio) + return mapping->a_ops->dirty_folio(mapping, folio); return mapping->a_ops->set_page_dirty(&folio->page); } if (!folio_test_dirty(folio)) { diff --git a/mm/page_io.c b/mm/page_io.c index 0bf8e40f4e57..24c975fb4e21 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -444,9 +444,12 @@ int swap_set_page_dirty(struct page *page) if (data_race(sis->flags & SWP_FS_OPS)) { struct address_space *mapping = sis->swap_file->f_mapping; + const struct address_space_operations *aops = mapping->a_ops; VM_BUG_ON_PAGE(!PageSwapCache(page), page); - return mapping->a_ops->set_page_dirty(page); + if (aops->dirty_folio) + return aops->dirty_folio(mapping, page_folio(page)); + return aops->set_page_dirty(page); } else { return __set_page_dirty_no_writeback(page); } -- cgit v1.2.3-71-gd317 From 3a3bae50af5d73fab5da20484029de77ca67bb2e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:22:15 +0000 Subject: fs: Remove aops ->set_page_dirty With all implementations converted to ->dirty_folio, we can stop calling this fallback method and remove it entirely. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- fs/ecryptfs/mmap.c | 2 +- include/linux/fs.h | 3 +-- mm/page-writeback.c | 11 +++-------- mm/page_io.c | 4 +--- 4 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 9aabcb2f52e9..9ad61b582f07 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -540,7 +540,7 @@ const struct address_space_operations ecryptfs_aops = { * XXX: This is pretty broken for multiple reasons: ecryptfs does not * actually use buffer_heads, and ecryptfs will crash without * CONFIG_BLOCK. But it matches the behavior before the default for - * address_space_operations without the ->set_page_dirty method was + * address_space_operations without the ->dirty_folio method was * cleaned up, so this is the best we can do without maintainer * feedback. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index c3d5db8851ae..b472d78f00b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,8 +367,7 @@ struct address_space_operations { /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); - /* Set a page dirty. Return true if this dirtied it */ - int (*set_page_dirty)(struct page *page); + /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4557a8d3dfea..0997738545dd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2641,15 +2641,10 @@ bool folio_mark_dirty(struct folio *folio) */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); - if (mapping->a_ops->dirty_folio) - return mapping->a_ops->dirty_folio(mapping, folio); - return mapping->a_ops->set_page_dirty(&folio->page); + return mapping->a_ops->dirty_folio(mapping, folio); } - if (!folio_test_dirty(folio)) { - if (!folio_test_set_dirty(folio)) - return true; - } - return false; + + return noop_dirty_folio(mapping, folio); } EXPORT_SYMBOL(folio_mark_dirty); diff --git a/mm/page_io.c b/mm/page_io.c index e3333973335e..6bde053b9d9d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -449,9 +449,7 @@ bool swap_dirty_folio(struct address_space *mapping, struct folio *folio) aops = mapping->a_ops; VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); - if (aops->dirty_folio) - return aops->dirty_folio(mapping, folio); - return aops->set_page_dirty(&folio->page); + return aops->dirty_folio(mapping, folio); } else { return noop_dirty_folio(mapping, folio); } -- cgit v1.2.3-71-gd317 From c56109dd35c9204cd6c49d2116ef36e5044ef867 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 17:22:10 -0500 Subject: mm/truncate: Combine invalidate_mapping_pagevec() and __invalidate_mapping_pages() We can save a function call by combining these two functions, which are identical except for the return value. Also move the prototype to mm/internal.h. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- include/linux/fs.h | 4 ---- mm/internal.h | 2 ++ mm/truncate.c | 32 +++++++++++++------------------- 3 files changed, 15 insertions(+), 23 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..85c584c5c623 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2749,10 +2749,6 @@ extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -void invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, - unsigned long *nr_pagevec); - static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || diff --git a/mm/internal.h b/mm/internal.h index 7c441f43ba31..6047268076e7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -97,6 +97,8 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long invalidate_inode_page(struct page *page); +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); /** * folio_evictable - Test whether a folio is evictable. diff --git a/mm/truncate.c b/mm/truncate.c index 9ed62cb3c503..cace6e3e4e8c 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -494,7 +494,18 @@ void truncate_inode_pages_final(struct address_space *mapping) } EXPORT_SYMBOL(truncate_inode_pages_final); -static unsigned long __invalidate_mapping_pages(struct address_space *mapping, +/** + * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * @nr_pagevec: invalidate failed page number for caller + * + * This helper is similar to invalidate_mapping_pages(), except that it accounts + * for pages that are likely on a pagevec and counts them in @nr_pagevec, which + * will be used by the caller. + */ +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; @@ -559,27 +570,10 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { - return __invalidate_mapping_pages(mapping, start, end, NULL); + return invalidate_mapping_pagevec(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); -/** - * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * @nr_pagevec: invalidate failed page number for caller - * - * This helper is similar to invalidate_mapping_pages(), except that it accounts - * for pages that are likely on a pagevec and counts them in @nr_pagevec, which - * will be used by the caller. - */ -void invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) -{ - __invalidate_mapping_pages(mapping, start, end, nr_pagevec); -} - /* * This is like invalidate_inode_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger -- cgit v1.2.3-71-gd317 From cbcc268bb1ce5b539e7652d398e08e9b83dc4cef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 17:23:58 -0500 Subject: fs: Move many prototypes to pagemap.h These functions are page cache functionality and don't need to be declared in fs.h. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- drivers/block/xen-blkback/xenbus.c | 1 + drivers/usb/gadget/function/f_mass_storage.c | 1 + fs/coda/file.c | 1 + fs/iomap/fiemap.c | 1 + fs/nfsd/filecache.c | 1 + fs/nfsd/vfs.c | 1 + fs/vboxsf/utils.c | 1 + include/linux/fs.h | 116 --------------------------- include/linux/pagemap.h | 114 ++++++++++++++++++++++++++ 9 files changed, 121 insertions(+), 116 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 62125fd4af4a..f09040435e2e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include "common.h" diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 46dd11dcb3a8..7371c6e65b10 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -179,6 +179,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/coda/file.c b/fs/coda/file.c index 29dd87be2fb8..3f3c81e6b1ab 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 66cf267c68ae..610ca6f1ec9b 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -7,6 +7,7 @@ #include #include #include +#include static int iomap_to_fiemap(struct fiemap_extent_info *fi, const struct iomap *iomap, u32 flags) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 8bc807c5fea4..47f804e0ec93 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 91600e71be19..fe0d7abbc1b1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index aec2ebf7d25a..e1db0f3f7e5e 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "vfsmod.h" diff --git a/include/linux/fs.h b/include/linux/fs.h index 85c584c5c623..0961c979e949 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2746,50 +2746,6 @@ extern void init_special_inode(struct inode *, umode_t, dev_t); extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end); - -static inline void invalidate_remote_inode(struct inode *inode) -{ - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) - invalidate_mapping_pages(inode->i_mapping, 0, -1); -} -extern int invalidate_inode_pages2(struct address_space *mapping); -extern int invalidate_inode_pages2_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); -extern int write_inode_now(struct inode *, int); -extern int filemap_fdatawrite(struct address_space *); -extern int filemap_flush(struct address_space *); -extern int filemap_fdatawait_keep_errors(struct address_space *mapping); -extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, - loff_t lend); -extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping, - loff_t start_byte, loff_t end_byte); - -static inline int filemap_fdatawait(struct address_space *mapping) -{ - return filemap_fdatawait_range(mapping, 0, LLONG_MAX); -} - -extern bool filemap_range_has_page(struct address_space *, loff_t lstart, - loff_t lend); -extern int filemap_write_and_wait_range(struct address_space *mapping, - loff_t lstart, loff_t lend); -extern int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); -extern int filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end); -extern int filemap_check_errors(struct address_space *mapping); -extern void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); - -static inline int filemap_write_and_wait(struct address_space *mapping) -{ - return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); -} - extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); @@ -2801,67 +2757,6 @@ static inline int file_write_and_wait(struct file *file) return file_write_and_wait_range(file, 0, LLONG_MAX); } -/** - * filemap_set_wb_err - set a writeback error on an address_space - * @mapping: mapping in which to set writeback error - * @err: error to be set in mapping - * - * When writeback fails in some way, we must record that error so that - * userspace can be informed when fsync and the like are called. We endeavor - * to report errors on any file that was open at the time of the error. Some - * internal callers also need to know when writeback errors have occurred. - * - * When a writeback error occurs, most filesystems will want to call - * filemap_set_wb_err to record the error in the mapping so that it will be - * automatically reported whenever fsync is called on the file. - */ -static inline void filemap_set_wb_err(struct address_space *mapping, int err) -{ - /* Fastpath for common case of no error */ - if (unlikely(err)) - __filemap_set_wb_err(mapping, err); -} - -/** - * filemap_check_wb_err - has an error occurred since the mark was sampled? - * @mapping: mapping to check for writeback errors - * @since: previously-sampled errseq_t - * - * Grab the errseq_t value from the mapping, and see if it has changed "since" - * the given value was sampled. - * - * If it has then report the latest error set, otherwise return 0. - */ -static inline int filemap_check_wb_err(struct address_space *mapping, - errseq_t since) -{ - return errseq_check(&mapping->wb_err, since); -} - -/** - * filemap_sample_wb_err - sample the current errseq_t to test for later errors - * @mapping: mapping to be sampled - * - * Writeback errors are always reported relative to a particular sample point - * in the past. This function provides those sample points. - */ -static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) -{ - return errseq_sample(&mapping->wb_err); -} - -/** - * file_sample_sb_err - sample the current errseq_t to test for later errors - * @file: file pointer to be sampled - * - * Grab the most current superblock-level errseq_t value for the given - * struct file. - */ -static inline errseq_t file_sample_sb_err(struct file *file) -{ - return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); -} - extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); @@ -3604,15 +3499,4 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); -/* - * Flush file data before changing attributes. Caller must hold any locks - * required to prevent further writes to this file until we're done setting - * flags. - */ -static inline int inode_drain_writes(struct inode *inode) -{ - inode_dio_wait(inode); - return filemap_write_and_wait(inode->i_mapping); -} - #endif /* _LINUX_FS_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cdb3f118603a..f968b36ad771 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,6 +18,120 @@ struct folio_batch; +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end); + +static inline void invalidate_remote_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + invalidate_mapping_pages(inode->i_mapping, 0, -1); +} +int invalidate_inode_pages2(struct address_space *mapping); +int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); +int write_inode_now(struct inode *, int sync); +int filemap_fdatawrite(struct address_space *); +int filemap_flush(struct address_space *); +int filemap_fdatawait_keep_errors(struct address_space *mapping); +int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); +int filemap_fdatawait_range_keep_errors(struct address_space *mapping, + loff_t start_byte, loff_t end_byte); + +static inline int filemap_fdatawait(struct address_space *mapping) +{ + return filemap_fdatawait_range(mapping, 0, LLONG_MAX); +} + +bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); +int filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend); +int __filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end, int sync_mode); +int filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end); +int filemap_check_errors(struct address_space *mapping); +void __filemap_set_wb_err(struct address_space *mapping, int err); +int filemap_fdatawrite_wbc(struct address_space *mapping, + struct writeback_control *wbc); + +static inline int filemap_write_and_wait(struct address_space *mapping) +{ + return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); +} + +/** + * filemap_set_wb_err - set a writeback error on an address_space + * @mapping: mapping in which to set writeback error + * @err: error to be set in mapping + * + * When writeback fails in some way, we must record that error so that + * userspace can be informed when fsync and the like are called. We endeavor + * to report errors on any file that was open at the time of the error. Some + * internal callers also need to know when writeback errors have occurred. + * + * When a writeback error occurs, most filesystems will want to call + * filemap_set_wb_err to record the error in the mapping so that it will be + * automatically reported whenever fsync is called on the file. + */ +static inline void filemap_set_wb_err(struct address_space *mapping, int err) +{ + /* Fastpath for common case of no error */ + if (unlikely(err)) + __filemap_set_wb_err(mapping, err); +} + +/** + * filemap_check_wb_err - has an error occurred since the mark was sampled? + * @mapping: mapping to check for writeback errors + * @since: previously-sampled errseq_t + * + * Grab the errseq_t value from the mapping, and see if it has changed "since" + * the given value was sampled. + * + * If it has then report the latest error set, otherwise return 0. + */ +static inline int filemap_check_wb_err(struct address_space *mapping, + errseq_t since) +{ + return errseq_check(&mapping->wb_err, since); +} + +/** + * filemap_sample_wb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Writeback errors are always reported relative to a particular sample point + * in the past. This function provides those sample points. + */ +static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) +{ + return errseq_sample(&mapping->wb_err); +} + +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @file: file pointer to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. + */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + +/* + * Flush file data before changing attributes. Caller must hold any locks + * required to prevent further writes to this file until we're done setting + * flags. + */ +static inline int inode_drain_writes(struct inode *inode) +{ + inode_dio_wait(inode); + return filemap_write_and_wait(inode->i_mapping); +} + static inline bool mapping_empty(struct address_space *mapping) { return xa_empty(&mapping->i_pages); -- cgit v1.2.3-71-gd317 From 84dacdbd5352bfef82423760fa2e8bffaeef9e05 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:38:51 -0700 Subject: mm: document and polish read-ahead code Add some "big-picture" documentation for read-ahead and polish the code to make it fit this documentation. The meaning of ->async_size is clarified to match its name. i.e. Any request to ->readahead() has a sync part and an async part. The caller will wait for the sync pages to complete, but will not wait for the async pages. The first async page is still marked PG_readahead Note that the current function names page_cache_sync_ra() and page_cache_async_ra() are misleading. All ra request are partly sync and partly async, so either part can be empty. A page_cache_sync_ra() request will usually set ->async_size non-zero, implying it is not all synchronous. When a non-zero req_count is passed to page_cache_async_ra(), the implication is that some prefix of the request is synchronous, though the calculation made there is incorrect - I haven't tried to fix it. Link: https://lkml.kernel.org/r/164549983734.9187.11586890887006601405.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/mm-api.rst | 19 +++++++- Documentation/filesystems/vfs.rst | 16 ++++--- include/linux/fs.h | 9 +++- mm/readahead.c | 99 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 10 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 395835f9289f..f5b2f92822c8 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -58,15 +58,30 @@ Virtually Contiguous Mappings File Mapping and Page Cache =========================== -.. kernel-doc:: mm/readahead.c - :export: +Filemap +------- .. kernel-doc:: mm/filemap.c :export: +Readahead +--------- + +.. kernel-doc:: mm/readahead.c + :doc: Readahead Overview + +.. kernel-doc:: mm/readahead.c + :export: + +Writeback +--------- + .. kernel-doc:: mm/page-writeback.c :export: +Truncate +-------- + .. kernel-doc:: mm/truncate.c :export: diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index bf5c48066fac..b4a0baa46dcc 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -806,12 +806,16 @@ cache in your filesystem. The following members are defined: object. The pages are consecutive in the page cache and are locked. The implementation should decrement the page refcount after starting I/O on each page. Usually the page will be - unlocked by the I/O completion handler. If the filesystem decides - to stop attempting I/O before reaching the end of the readahead - window, it can simply return. The caller will decrement the page - refcount and unlock the remaining pages for you. Set PageUptodate - if the I/O completes successfully. Setting PageError on any page - will be ignored; simply unlock the page if an I/O error occurs. + unlocked by the I/O completion handler. The set of pages are + divided into some sync pages followed by some async pages, + rac->ra->async_size gives the number of async pages. The + filesystem should attempt to read all sync pages but may decide + to stop once it reaches the async pages. If it does decide to + stop attempting I/O, it can simply return. The caller will + remove the remaining pages from the address space, unlock them + and decrement the page refcount. Set PageUptodate if the I/O + completes successfully. Setting PageError on any page will be + ignored; simply unlock the page if an I/O error occurs. ``readpages`` called by the VM to read pages associated with the address_space diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..8b5c486bd4a2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -930,10 +930,15 @@ struct fown_struct { * struct file_ra_state - Track a file's readahead state. * @start: Where the most recent readahead started. * @size: Number of pages read in the most recent readahead. - * @async_size: Start next readahead when this many pages are left. - * @ra_pages: Maximum size of a readahead request. + * @async_size: Numer of pages that were/are not needed immediately + * and so were/are genuinely "ahead". Start next readahead when + * the first of these pages is accessed. + * @ra_pages: Maximum size of a readahead request, copied from the bdi. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. + * + * When this structure is passed to ->readahead(), the "most recent" + * readahead means the current readahead. */ struct file_ra_state { pgoff_t start; diff --git a/mm/readahead.c b/mm/readahead.c index cf0dcf89eb69..73b2bc5302e0 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,6 +8,105 @@ * Initial version. */ +/** + * DOC: Readahead Overview + * + * Readahead is used to read content into the page cache before it is + * explicitly requested by the application. Readahead only ever + * attempts to read pages that are not yet in the page cache. If a + * page is present but not up-to-date, readahead will not try to read + * it. In that case a simple ->readpage() will be requested. + * + * Readahead is triggered when an application read request (whether a + * systemcall or a page fault) finds that the requested page is not in + * the page cache, or that it is in the page cache and has the + * %PG_readahead flag set. This flag indicates that the page was loaded + * as part of a previous read-ahead request and now that it has been + * accessed, it is time for the next read-ahead. + * + * Each readahead request is partly synchronous read, and partly async + * read-ahead. This is reflected in the struct file_ra_state which + * contains ->size being to total number of pages, and ->async_size + * which is the number of pages in the async section. The first page in + * this async section will have %PG_readahead set as a trigger for a + * subsequent read ahead. Once a series of sequential reads has been + * established, there should be no need for a synchronous component and + * all read ahead request will be fully asynchronous. + * + * When either of the triggers causes a readahead, three numbers need to + * be determined: the start of the region, the size of the region, and + * the size of the async tail. + * + * The start of the region is simply the first page address at or after + * the accessed address, which is not currently populated in the page + * cache. This is found with a simple search in the page cache. + * + * The size of the async tail is determined by subtracting the size that + * was explicitly requested from the determined request size, unless + * this would be less than zero - then zero is used. NOTE THIS + * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED + * PAGE. + * + * The size of the region is normally determined from the size of the + * previous readahead which loaded the preceding pages. This may be + * discovered from the struct file_ra_state for simple sequential reads, + * or from examining the state of the page cache when multiple + * sequential reads are interleaved. Specifically: where the readahead + * was triggered by the %PG_readahead flag, the size of the previous + * readahead is assumed to be the number of pages from the triggering + * page to the start of the new readahead. In these cases, the size of + * the previous readahead is scaled, often doubled, for the new + * readahead, though see get_next_ra_size() for details. + * + * If the size of the previous read cannot be determined, the number of + * preceding pages in the page cache is used to estimate the size of + * a previous read. This estimate could easily be misled by random + * reads being coincidentally adjacent, so it is ignored unless it is + * larger than the current request, and it is not scaled up, unless it + * is at the start of file. + * + * In general read ahead is accelerated at the start of the file, as + * reads from there are often sequential. There are other minor + * adjustments to the read ahead size in various special cases and these + * are best discovered by reading the code. + * + * The above calculation determines the readahead, to which any requested + * read size may be added. + * + * Readahead requests are sent to the filesystem using the ->readahead() + * address space operation, for which mpage_readahead() is a canonical + * implementation. ->readahead() should normally initiate reads on all + * pages, but may fail to read any or all pages without causing an IO + * error. The page cache reading code will issue a ->readpage() request + * for any page which ->readahead() does not provided, and only an error + * from this will be final. + * + * ->readahead() will generally call readahead_page() repeatedly to get + * each page from those prepared for read ahead. It may fail to read a + * page by: + * + * * not calling readahead_page() sufficiently many times, effectively + * ignoring some pages, as might be appropriate if the path to + * storage is congested. + * + * * failing to actually submit a read request for a given page, + * possibly due to insufficient resources, or + * + * * getting an error during subsequent processing of a request. + * + * In the last two cases, the page should be unlocked to indicate that + * the read attempt has failed. In the first case the page will be + * unlocked by the caller. + * + * Those pages not in the final ``async_size`` of the request should be + * considered to be important and ->readahead() should not fail them due + * to congestion or temporary resource unavailability, but should wait + * for necessary resources (e.g. memory or indexing information) to + * become available. Pages in the final ``async_size`` may be + * considered less urgent and failure to read them is more acceptable. + * They will eventually be read individually using ->readpage(). + */ + #include #include #include -- cgit v1.2.3-71-gd317 From a128b054ce029554a4a52fc3abb8c1df8bafcaef Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Tue, 22 Mar 2022 14:39:22 -0700 Subject: mount: warn only once about timestamp range expiration Commit f8b92ba67c5d ("mount: Add mount warning for impending timestamp expiry") introduced a mount warning regarding filesystem timestamp limits, that is printed upon each writable mount or remount. This can result in a lot of unnecessary messages in the kernel log in setups where filesystems are being frequently remounted (or mounted multiple times). Avoid this by setting a superblock flag which indicates that the warning has been emitted at least once for any particular mount, as suggested in [1]. Link: https://lore.kernel.org/CAHk-=wim6VGnxQmjfK_tDg6fbHYKL4EFkmnTjVr9QnRqjDBAeA@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20220119202934.26495-1-ailiop@suse.com Signed-off-by: Anthony Iliopoulos Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Reviewed-by: Darrick J. Wong Cc: Alexander Viro Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 ++ include/linux/fs.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux/fs.h') diff --git a/fs/namespace.c b/fs/namespace.c index de6fae84f1a1..0044feef59d0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2597,6 +2597,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); @@ -2611,6 +2612,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; } } diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b5c486bd4a2..ca9445f6cf3d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1440,6 +1440,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ /* Possible states of 'frozen' field */ enum { -- cgit v1.2.3-71-gd317 From 8b9f3ac5b01db85c6cf74c2c3a71280cc3045c9c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:00 -0700 Subject: fs: introduce alloc_inode_sb() to allocate filesystems specific inode The allocated inode cache is supposed to be added to its memcg list_lru which should be allocated as well in advance. That can be done by kmem_cache_alloc_lru() which allocates object and list_lru. The file systems is main user of it. So introduce alloc_inode_sb() to allocate file system specific inodes and set up the inode reclaim context properly. The file system is supposed to use alloc_inode_sb() to allocate inodes. In later patches, we will convert all users to the new API. Link: https://lkml.kernel.org/r/20220228122126.37293-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Roman Gushchin Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/porting.rst | 6 ++++++ fs/inode.c | 2 +- include/linux/fs.h | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index bf19fd6b86e7..7c1583dbeb59 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -45,6 +45,12 @@ typically between calling iget_locked() and unlocking the inode. At some point that will become mandatory. +**mandatory** + +The foo_inode_info should always be allocated through alloc_inode_sb() rather +than kmem_cache_alloc() or kmalloc() related to set up the inode reclaim context +correctly. + --- **mandatory** diff --git a/fs/inode.c b/fs/inode.c index 63324df6fa27..9d9b422504d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -259,7 +259,7 @@ static struct inode *alloc_inode(struct super_block *sb) if (ops->alloc_inode) inode = ops->alloc_inode(sb); else - inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index ca9445f6cf3d..58a73e59e4c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -3114,6 +3115,16 @@ extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_privs(struct file *); +/* + * This must be used for allocating filesystems specific inodes to set + * up the inode reclaim context correctly. + */ +static inline void * +alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp) +{ + return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp); +} + extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { -- cgit v1.2.3-71-gd317 From 704528d895dd3e7b173e672116b4eb2b0a0fceb0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 23 Mar 2022 21:29:04 -0400 Subject: fs: Remove ->readpages address space operation All filesystems have now been converted to use ->readahead, so remove the ->readpages operation and fix all the comments that used to refer to it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- Documentation/filesystems/fsverity.rst | 6 +++--- Documentation/filesystems/locking.rst | 6 ------ Documentation/filesystems/vfs.rst | 11 ----------- fs/btrfs/reflink.c | 4 ++-- fs/cifs/cifssmb.c | 2 +- fs/cifs/inode.c | 2 +- fs/crypto/crypto.c | 2 +- fs/ext4/readpage.c | 2 +- fs/f2fs/data.c | 4 ++-- fs/fuse/fuse_i.h | 2 +- fs/verity/verify.c | 4 ++-- include/linux/fs.h | 6 ------ include/linux/fsverity.h | 2 +- mm/filemap.c | 2 +- mm/readahead.c | 15 ++------------- 15 files changed, 18 insertions(+), 52 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 1d831e3cbcb3..8cc536d08f51 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -549,7 +549,7 @@ Pagecache ~~~~~~~~~ For filesystems using Linux's pagecache, the ``->readpage()`` and -``->readpages()`` methods must be modified to verify pages before they +``->readahead()`` methods must be modified to verify pages before they are marked Uptodate. Merely hooking ``->read_iter()`` would be insufficient, since ``->read_iter()`` is not used for memory maps. @@ -611,7 +611,7 @@ workqueue, and then the workqueue work does the decryption or verification. Finally, pages where no decryption or verity error occurred are marked Uptodate, and the pages are unlocked. -Files on ext4 and f2fs may contain holes. Normally, ``->readpages()`` +Files on ext4 and f2fs may contain holes. Normally, ``->readahead()`` simply zeroes holes and sets the corresponding pages Uptodate; no bios are issued. To prevent this case from bypassing fs-verity, these filesystems use fsverity_verify_page() to verify hole pages. @@ -778,7 +778,7 @@ weren't already directly answered in other parts of this document. - To prevent bypassing verification, pages must not be marked Uptodate until they've been verified. Currently, each filesystem is responsible for marking pages Uptodate via - ``->readpages()``. Therefore, currently it's not possible for + ``->readahead()``. Therefore, currently it's not possible for the VFS to do the verification on its own. Changing this would require significant changes to the VFS and all filesystems. diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 2998cec9af4b..c26d854275a0 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -241,8 +241,6 @@ prototypes:: int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *folio); void (*readahead)(struct readahead_control *); - int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); @@ -274,7 +272,6 @@ readpage: yes, unlocks shared writepages: dirty_folio maybe readahead: yes, unlocks shared -readpages: no shared write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: @@ -300,9 +297,6 @@ completion. ->readahead() unlocks the pages that I/O is attempted on like ->readpage(). -->readpages() populates the pagecache with the passed pages and starts -I/O against them. They come unlocked upon I/O completion. - ->writepage() is used for two purposes: for "memory cleansing" and for "sync". These are quite different operations and the behaviour may differ depending upon the mode. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 4f14edf93941..794bd1a66bfb 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -726,8 +726,6 @@ cache in your filesystem. The following members are defined: int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); - int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); @@ -817,15 +815,6 @@ cache in your filesystem. The following members are defined: completes successfully. Setting PageError on any page will be ignored; simply unlock the page if an I/O error occurs. -``readpages`` - called by the VM to read pages associated with the address_space - object. This is essentially just a vector version of readpage. - Instead of just one page, several pages are requested. - readpages is only used for read-ahead, so read errors are - ignored. If anything goes wrong, feel free to give up. - This interface is deprecated and will be removed by the end of - 2020; implement readahead instead. - ``write_begin`` Called by the generic buffered write code to ask the filesystem to prepare to write len bytes at the given offset in the file. diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 04a88bfe4fcf..998e3f180d90 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -645,7 +645,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, int ret; /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); @@ -739,7 +739,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, } /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, off, inode, destoff, len); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 071e2f21a7db..bc3ded4f34f6 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -597,7 +597,7 @@ CIFSSMBNegotiate(const unsigned int xid, set_credits(server, server->maxReq); /* probably no need to store and check maxvcs */ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); - /* set up max_read for readpages check */ + /* set up max_read for readahead check */ server->max_read = server->maxBuf; server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 60d853c92f6a..2f9e7d2f81b6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -49,7 +49,7 @@ static void cifs_set_ops(struct inode *inode) inode->i_fop = &cifs_file_ops; } - /* check if server can support readpages */ + /* check if server can support readahead */ if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read < PAGE_SIZE + MAX_CIFS_HDR_SIZE) inode->i_data.a_ops = &cifs_addr_ops_smallbuf; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 4fcca79f39ae..526a4c1bed99 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -248,7 +248,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); * which must still be locked and not uptodate. Normally, blocksize == * PAGE_SIZE and the whole page is decrypted at once. * - * This is for use by the filesystem's ->readpages() method. + * This is for use by the filesystem's ->readahead() method. * * Return: 0 on success; -errno on failure */ diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 1aa26d6634fc..af491e170c4a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -109,7 +109,7 @@ static void verity_work(struct work_struct *work) struct bio *bio = ctx->bio; /* - * fsverity_verify_bio() may call readpages() again, and although verity + * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f8fcbe91059b..c92920c8661d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -164,7 +164,7 @@ static void f2fs_verify_bio(struct work_struct *work) bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* - * fsverity_verify_bio() may call readpages() again, and while verity + * fsverity_verify_bio() may call readahead() again, and while verity * will be disabled for this, decryption and/or decompression may still * be needed, resulting in another bio_post_read_ctx being allocated. * So to prevent deadlocks we need to release the current ctx to the @@ -2392,7 +2392,7 @@ static void f2fs_readahead(struct readahead_control *rac) if (!f2fs_is_compress_backend_ready(inode)) return; - /* If the file has inline data, skip readpages */ + /* If the file has inline data, skip readahead */ if (f2fs_has_inline_data(inode)) return; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index eac4984cc753..488b460e046f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -627,7 +627,7 @@ struct fuse_conn { /** Connection successful. Only set in INIT */ unsigned conn_init:1; - /** Do readpages asynchronously? Only set in INIT */ + /** Do readahead asynchronously? Only set in INIT */ unsigned async_read:1; /** Return an unique read error after abort. Only set in INIT */ diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 0adb970f4e73..14e2fb49cff5 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Data verification functions, i.e. hooks for ->readpages() + * Data verification functions, i.e. hooks for ->readahead() * * Copyright 2019 Google LLC */ @@ -214,7 +214,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * that fail verification are set to the Error state. Verification is skipped * for pages already in the Error state, e.g. due to fscrypt decryption failure. * - * This is a helper function for use by the ->readpages() method of filesystems + * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. Filesystems that * populate the page cache without issuing bios (e.g. non block-based * filesystems) must instead call fsverity_verify_page() directly on each page. diff --git a/include/linux/fs.h b/include/linux/fs.h index 183160872133..7c81887cc7e8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -370,12 +370,6 @@ struct address_space_operations { /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); - /* - * Reads in the requested pages. Unlike ->readpage(), this is - * PURELY used for read-ahead!. - */ - int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index b568b3c7d095..a7afc800bd8d 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -221,7 +221,7 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work) * * This checks whether ->i_verity_info has been set. * - * Filesystems call this from ->readpages() to check whether the pages need to + * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) diff --git a/mm/filemap.c b/mm/filemap.c index 647d72bf23b6..d904cd7e4181 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2538,7 +2538,7 @@ static int filemap_create_folio(struct file *file, * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate - * pages or ->readpages() that need to hold invalidate_lock + * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. */ diff --git a/mm/readahead.c b/mm/readahead.c index 9097af639beb..297bd0719cda 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -170,13 +170,6 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, unlock_page(page); put_page(page); } - } else if (aops->readpages) { - aops->readpages(rac->file, rac->mapping, pages, - readahead_count(rac)); - /* Clean up the remaining pages */ - put_pages_list(pages); - rac->_index += rac->_nr_pages; - rac->_nr_pages = 0; } else { while ((page = readahead_page(rac))) { aops->readpage(rac->file, page); @@ -253,10 +246,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; - if (mapping->a_ops->readpages) { - folio->index = index + i; - list_add(&folio->lru, &page_pool); - } else if (filemap_add_folio(mapping, folio, index + i, + if (filemap_add_folio(mapping, folio, index + i, gfp_mask) < 0) { folio_put(folio); read_pages(ractl, &page_pool, true); @@ -318,8 +308,7 @@ void force_page_cache_ra(struct readahead_control *ractl, struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && - !mapping->a_ops->readahead)) + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readahead)) return; /* -- cgit v1.2.3-71-gd317 From a9fcd89d67bb8c4ad613b54ab691fc603c94a03a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Feb 2022 09:13:43 -0500 Subject: fs: Remove read_actor_t This typedef is not used any more. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7c81887cc7e8..7588d3a0ced8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -357,9 +357,6 @@ typedef struct { int error; } read_descriptor_t; -typedef int (*read_actor_t)(read_descriptor_t *, struct page *, - unsigned long, unsigned long); - struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); -- cgit v1.2.3-71-gd317 From b2403a61308533c576c9dd783fcb73a9186e0b37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Feb 2022 09:15:34 -0500 Subject: fs, net: Move read_descriptor_t to net.h fs.h has no more need for this typedef; networking is now the sole user of the read_descriptor_t. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- include/linux/fs.h | 19 ------------------- include/linux/net.h | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7588d3a0ced8..8ff28939de60 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -338,25 +338,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) return kiocb->ki_complete == NULL; } -/* - * "descriptor" for what we're up to with a read. - * This allows us to use the same read code yet - * have multiple different users of the data that - * we read from a file. - * - * The simplest case just copies the data to user - * mode. - */ -typedef struct { - size_t written; - size_t count; - union { - char __user *buf; - void *data; - } arg; - int error; -} read_descriptor_t; - struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); diff --git a/include/linux/net.h b/include/linux/net.h index ba736b457a06..12093f4db50c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -125,6 +125,25 @@ struct socket { struct socket_wq wq; }; +/* + * "descriptor" for what we're up to with a read. + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + union { + char __user *buf; + void *data; + } arg; + int error; +} read_descriptor_t; + struct vm_area_struct; struct page; struct sockaddr; -- cgit v1.2.3-71-gd317 From 800ba29547e16d5fbe67ca764ba660e049e9f1bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 19 Feb 2022 23:19:49 -0500 Subject: fs: Pass an iocb to generic_perform_write() We can extract both the file pointer and the pos from the iocb. This simplifies each caller as well as allowing generic_perform_write() to see more of the iocb contents in the future. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Al Viro Acked-by: Al Viro --- fs/ceph/file.c | 2 +- fs/ext4/file.c | 2 +- fs/f2fs/file.c | 2 +- fs/nfs/file.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 10 ++++++---- 6 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index feb75eb1cd82..6c9e837aa1d3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1869,7 +1869,7 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_perform_write(file, from, pos); + written = generic_perform_write(iocb, from); if (likely(written >= 0)) iocb->ki_pos = pos + written; ceph_end_io_write(inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8bd66cdc41be..6feb07e3e1eb 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -267,7 +267,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, goto out; current->backing_dev_info = inode_to_bdi(inode); - ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + ret = generic_perform_write(iocb, from); current->backing_dev_info = NULL; out: diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d3f39a704b8b..5b89af0f27f0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4448,7 +4448,7 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, return -EOPNOTSUPP; current->backing_dev_info = inode_to_bdi(inode); - ret = generic_perform_write(file, from, iocb->ki_pos); + ret = generic_perform_write(iocb, from); current->backing_dev_info = NULL; if (ret > 0) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index b0ca244c50d0..150b7fa8f0a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -646,7 +646,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) result = generic_write_checks(iocb, from); if (result > 0) { current->backing_dev_info = inode_to_bdi(inode); - result = generic_perform_write(file, from, iocb->ki_pos); + result = generic_perform_write(iocb, from); current->backing_dev_info = NULL; } nfs_end_io_write(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8ff28939de60..468dc7ec821f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2999,7 +2999,7 @@ extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); -extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); +ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); diff --git a/mm/filemap.c b/mm/filemap.c index d904cd7e4181..3a5ffb5587cd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3752,9 +3752,10 @@ out: } EXPORT_SYMBOL(generic_file_direct_write); -ssize_t generic_perform_write(struct file *file, - struct iov_iter *i, loff_t pos) +ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { + struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; long status = 0; @@ -3884,7 +3885,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) goto out; - status = generic_perform_write(file, from, pos = iocb->ki_pos); + pos = iocb->ki_pos; + status = generic_perform_write(iocb, from); /* * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were @@ -3916,7 +3918,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) */ } } else { - written = generic_perform_write(file, from, iocb->ki_pos); + written = generic_perform_write(iocb, from); if (likely(written > 0)) iocb->ki_pos += written; } -- cgit v1.2.3-71-gd317 From d7414ba14a3a67f81321069219dc7dbc095022c3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 20 Feb 2022 22:28:03 -0500 Subject: filemap: Remove AOP_FLAG_CONT_EXPAND This flag is no longer used, so remove it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- fs/buffer.c | 3 +-- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/buffer.c b/fs/buffer.c index d67fbe063a3a..2b5561ae5d0b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2352,8 +2352,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) if (err) goto out; - err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_CONT_EXPAND, &page, &fsdata); + err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); if (err) goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 468dc7ec821f..bbde95387a23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -275,7 +275,6 @@ enum positive_aop_returns { AOP_TRUNCATED_PAGE = 0x80001, }; -#define AOP_FLAG_CONT_EXPAND 0x0001 /* called from cont_expand */ #define AOP_FLAG_NOFS 0x0002 /* used by filesystem to direct * helper code (eg buffer layer) * to clear GFP_FS from alloc */ -- cgit v1.2.3-71-gd317