page-io.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
page-io.c (15431B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * linux/fs/ext4/page-io.c
      4 *
      5 * This contains the new page_io functions for ext4
      6 *
      7 * Written by Theodore Ts'o, 2010.
      8 */
      9
     10#include <linux/fs.h>
     11#include <linux/time.h>
     12#include <linux/highuid.h>
     13#include <linux/pagemap.h>
     14#include <linux/quotaops.h>
     15#include <linux/string.h>
     16#include <linux/buffer_head.h>
     17#include <linux/writeback.h>
     18#include <linux/pagevec.h>
     19#include <linux/mpage.h>
     20#include <linux/namei.h>
     21#include <linux/uio.h>
     22#include <linux/bio.h>
     23#include <linux/workqueue.h>
     24#include <linux/kernel.h>
     25#include <linux/slab.h>
     26#include <linux/mm.h>
     27#include <linux/sched/mm.h>
     28
     29#include "ext4_jbd2.h"
     30#include "xattr.h"
     31#include "acl.h"
     32
     33static struct kmem_cache *io_end_cachep;
     34static struct kmem_cache *io_end_vec_cachep;
     35
     36int __init ext4_init_pageio(void)
     37{
     38	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
     39	if (io_end_cachep == NULL)
     40		return -ENOMEM;
     41
     42	io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
     43	if (io_end_vec_cachep == NULL) {
     44		kmem_cache_destroy(io_end_cachep);
     45		return -ENOMEM;
     46	}
     47	return 0;
     48}
     49
/*
 * Destroy the slab caches created by ext4_init_pageio().
 */
void ext4_exit_pageio(void)
{
	kmem_cache_destroy(io_end_cachep);
	kmem_cache_destroy(io_end_vec_cachep);
}
     55
     56struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
     57{
     58	struct ext4_io_end_vec *io_end_vec;
     59
     60	io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
     61	if (!io_end_vec)
     62		return ERR_PTR(-ENOMEM);
     63	INIT_LIST_HEAD(&io_end_vec->list);
     64	list_add_tail(&io_end_vec->list, &io_end->list_vec);
     65	return io_end_vec;
     66}
     67
     68static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
     69{
     70	struct ext4_io_end_vec *io_end_vec, *tmp;
     71
     72	if (list_empty(&io_end->list_vec))
     73		return;
     74	list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
     75		list_del(&io_end_vec->list);
     76		kmem_cache_free(io_end_vec_cachep, io_end_vec);
     77	}
     78}
     79
     80struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
     81{
     82	BUG_ON(list_empty(&io_end->list_vec));
     83	return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
     84}
     85
     86/*
     87 * Print an buffer I/O error compatible with the fs/buffer.c.  This
     88 * provides compatibility with dmesg scrapers that look for a specific
     89 * buffer I/O error message.  We really need a unified error reporting
     90 * structure to userspace ala Digital Unix's uerf system, but it's
     91 * probably not going to happen in my lifetime, due to LKML politics...
     92 */
     93static void buffer_io_error(struct buffer_head *bh)
     94{
     95	printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
     96		       bh->b_bdev,
     97			(unsigned long long)bh->b_blocknr);
     98}
     99
/*
 * Finish writeback for every page segment carried by @bio.
 *
 * For each segment the buffers covered by the segment have their
 * async_write flag cleared (and are flagged/logged as failed if the bio
 * completed with an error).  Page writeback is ended only once no other
 * buffer of the page is still marked async_write.
 */
static void ext4_finish_bio(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;
		struct page *bounce_page = NULL;
		struct buffer_head *bh, *head;
		/* Byte range within the page that this segment covers. */
		unsigned bio_start = bvec->bv_offset;
		unsigned bio_end = bio_start + bvec->bv_len;
		/* Buffers outside this segment that are still async_write. */
		unsigned under_io = 0;
		unsigned long flags;

		/*
		 * The bio may carry an fscrypt bounce page; all state is
		 * tracked on the pagecache page behind it.
		 */
		if (fscrypt_is_bounce_page(page)) {
			bounce_page = page;
			page = fscrypt_pagecache_page(bounce_page);
		}

		if (bio->bi_status) {
			SetPageError(page);
			mapping_set_error(page->mapping, -EIO);
		}
		bh = head = page_buffers(page);
		/*
		 * We check all buffers in the page under b_uptodate_lock
		 * to avoid races with other end io clearing async_write flags
		 */
		spin_lock_irqsave(&head->b_uptodate_lock, flags);
		do {
			/* Buffer not fully inside this segment: only count it. */
			if (bh_offset(bh) < bio_start ||
			    bh_offset(bh) + bh->b_size > bio_end) {
				if (buffer_async_write(bh))
					under_io++;
				continue;
			}
			clear_buffer_async_write(bh);
			if (bio->bi_status) {
				set_buffer_write_io_error(bh);
				buffer_io_error(bh);
			}
		} while ((bh = bh->b_this_page) != head);
		spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
		/* Last outstanding write on this page: release it. */
		if (!under_io) {
			fscrypt_free_bounce_page(bounce_page);
			end_page_writeback(page);
		}
	}
}
    149
    150static void ext4_release_io_end(ext4_io_end_t *io_end)
    151{
    152	struct bio *bio, *next_bio;
    153
    154	BUG_ON(!list_empty(&io_end->list));
    155	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
    156	WARN_ON(io_end->handle);
    157
    158	for (bio = io_end->bio; bio; bio = next_bio) {
    159		next_bio = bio->bi_private;
    160		ext4_finish_bio(bio);
    161		bio_put(bio);
    162	}
    163	ext4_free_io_end_vec(io_end);
    164	kmem_cache_free(io_end_cachep, io_end);
    165}
    166
    167/*
    168 * Check a range of space and convert unwritten extents to written. Note that
    169 * we are protected from truncate touching same part of extent tree by the
    170 * fact that truncate code waits for all DIO to finish (thus exclusion from
    171 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
    172 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
    173 * completed (happens from ext4_free_ioend()).
    174 */
    175static int ext4_end_io_end(ext4_io_end_t *io_end)
    176{
    177	struct inode *inode = io_end->inode;
    178	handle_t *handle = io_end->handle;
    179	int ret = 0;
    180
    181	ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
    182		   "list->prev 0x%p\n",
    183		   io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
    184
    185	io_end->handle = NULL;	/* Following call will use up the handle */
    186	ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
    187	if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
    188		ext4_msg(inode->i_sb, KERN_EMERG,
    189			 "failed to convert unwritten extents to written "
    190			 "extents -- potential data loss!  "
    191			 "(inode %lu, error %d)", inode->i_ino, ret);
    192	}
    193	ext4_clear_io_unwritten_flag(io_end);
    194	ext4_release_io_end(io_end);
    195	return ret;
    196}
    197
    198static void dump_completed_IO(struct inode *inode, struct list_head *head)
    199{
    200#ifdef	EXT4FS_DEBUG
    201	struct list_head *cur, *before, *after;
    202	ext4_io_end_t *io_end, *io_end0, *io_end1;
    203
    204	if (list_empty(head))
    205		return;
    206
    207	ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
    208	list_for_each_entry(io_end, head, list) {
    209		cur = &io_end->list;
    210		before = cur->prev;
    211		io_end0 = container_of(before, ext4_io_end_t, list);
    212		after = cur->next;
    213		io_end1 = container_of(after, ext4_io_end_t, list);
    214
    215		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
    216			    io_end, inode->i_ino, io_end0, io_end1);
    217	}
    218#endif
    219}
    220
    221/* Add the io_end to per-inode completed end_io list. */
    222static void ext4_add_complete_io(ext4_io_end_t *io_end)
    223{
    224	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
    225	struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
    226	struct workqueue_struct *wq;
    227	unsigned long flags;
    228
    229	/* Only reserved conversions from writeback should enter here */
    230	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
    231	WARN_ON(!io_end->handle && sbi->s_journal);
    232	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
    233	wq = sbi->rsv_conversion_wq;
    234	if (list_empty(&ei->i_rsv_conversion_list))
    235		queue_work(wq, &ei->i_rsv_conversion_work);
    236	list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
    237	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
    238}
    239
    240static int ext4_do_flush_completed_IO(struct inode *inode,
    241				      struct list_head *head)
    242{
    243	ext4_io_end_t *io_end;
    244	struct list_head unwritten;
    245	unsigned long flags;
    246	struct ext4_inode_info *ei = EXT4_I(inode);
    247	int err, ret = 0;
    248
    249	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
    250	dump_completed_IO(inode, head);
    251	list_replace_init(head, &unwritten);
    252	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
    253
    254	while (!list_empty(&unwritten)) {
    255		io_end = list_entry(unwritten.next, ext4_io_end_t, list);
    256		BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
    257		list_del_init(&io_end->list);
    258
    259		err = ext4_end_io_end(io_end);
    260		if (unlikely(!ret && err))
    261			ret = err;
    262	}
    263	return ret;
    264}
    265
    266/*
    267 * work on completed IO, to convert unwritten extents to extents
    268 */
    269void ext4_end_io_rsv_work(struct work_struct *work)
    270{
    271	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
    272						  i_rsv_conversion_work);
    273	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
    274}
    275
    276ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
    277{
    278	ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
    279
    280	if (io_end) {
    281		io_end->inode = inode;
    282		INIT_LIST_HEAD(&io_end->list);
    283		INIT_LIST_HEAD(&io_end->list_vec);
    284		refcount_set(&io_end->count, 1);
    285	}
    286	return io_end;
    287}
    288
    289void ext4_put_io_end_defer(ext4_io_end_t *io_end)
    290{
    291	if (refcount_dec_and_test(&io_end->count)) {
    292		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
    293				list_empty(&io_end->list_vec)) {
    294			ext4_release_io_end(io_end);
    295			return;
    296		}
    297		ext4_add_complete_io(io_end);
    298	}
    299}
    300
    301int ext4_put_io_end(ext4_io_end_t *io_end)
    302{
    303	int err = 0;
    304
    305	if (refcount_dec_and_test(&io_end->count)) {
    306		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
    307			err = ext4_convert_unwritten_io_end_vec(io_end->handle,
    308								io_end);
    309			io_end->handle = NULL;
    310			ext4_clear_io_unwritten_flag(io_end);
    311		}
    312		ext4_release_io_end(io_end);
    313	}
    314	return err;
    315}
    316
/*
 * Take an additional reference on @io_end and return it, so the call can
 * be used directly in an assignment.
 */
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
	refcount_inc(&io_end->count);
	return io_end;
}
    322
    323/* BIO completion function for page writeback */
    324static void ext4_end_bio(struct bio *bio)
    325{
    326	ext4_io_end_t *io_end = bio->bi_private;
    327	sector_t bi_sector = bio->bi_iter.bi_sector;
    328
    329	if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
    330		      bio->bi_bdev,
    331		      (long long) bio->bi_iter.bi_sector,
    332		      (unsigned) bio_sectors(bio),
    333		      bio->bi_status)) {
    334		ext4_finish_bio(bio);
    335		bio_put(bio);
    336		return;
    337	}
    338	bio->bi_end_io = NULL;
    339
    340	if (bio->bi_status) {
    341		struct inode *inode = io_end->inode;
    342
    343		ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
    344			     "starting block %llu)",
    345			     bio->bi_status, inode->i_ino,
    346			     (unsigned long long)
    347			     bi_sector >> (inode->i_blkbits - 9));
    348		mapping_set_error(inode->i_mapping,
    349				blk_status_to_errno(bio->bi_status));
    350	}
    351
    352	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
    353		/*
    354		 * Link bio into list hanging from io_end. We have to do it
    355		 * atomically as bio completions can be racing against each
    356		 * other.
    357		 */
    358		bio->bi_private = xchg(&io_end->bio, bio);
    359		ext4_put_io_end_defer(io_end);
    360	} else {
    361		/*
    362		 * Drop io_end reference early. Inode can get freed once
    363		 * we finish the bio.
    364		 */
    365		ext4_put_io_end_defer(io_end);
    366		ext4_finish_bio(bio);
    367		bio_put(bio);
    368	}
    369}
    370
    371void ext4_io_submit(struct ext4_io_submit *io)
    372{
    373	struct bio *bio = io->io_bio;
    374
    375	if (bio) {
    376		if (io->io_wbc->sync_mode == WB_SYNC_ALL)
    377			io->io_bio->bi_opf |= REQ_SYNC;
    378		submit_bio(io->io_bio);
    379	}
    380	io->io_bio = NULL;
    381}
    382
/*
 * Prepare @io for a new round of submissions: remember the writeback
 * control @wbc and start with no bio and no io_end attached.
 */
void ext4_io_submit_init(struct ext4_io_submit *io,
			 struct writeback_control *wbc)
{
	io->io_wbc = wbc;
	io->io_bio = NULL;
	io->io_end = NULL;
}
    390
/*
 * Start a new write bio in @io, positioned at the disk location of @bh.
 *
 * The bio completes through ext4_end_bio() and holds its own reference
 * on io->io_end via bi_private.  io->io_next_block is set to @bh's block
 * number.
 */
static void io_submit_init_bio(struct ext4_io_submit *io,
			       struct buffer_head *bh)
{
	struct bio *bio;

	/*
	 * bio_alloc will _always_ be able to allocate a bio if
	 * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
	 */
	bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
	/* Convert the block number into 512-byte sectors. */
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_end_io = ext4_end_bio;
	bio->bi_private = ext4_get_io_end(io->io_end);
	io->io_bio = bio;
	io->io_next_block = bh->b_blocknr;
	wbc_init_bio(io->io_wbc, bio);
}
    409
    410static void io_submit_add_bh(struct ext4_io_submit *io,
    411			     struct inode *inode,
    412			     struct page *page,
    413			     struct buffer_head *bh)
    414{
    415	int ret;
    416
    417	if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
    418			   !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
    419submit_and_retry:
    420		ext4_io_submit(io);
    421	}
    422	if (io->io_bio == NULL)
    423		io_submit_init_bio(io, bh);
    424	ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
    425	if (ret != bh->b_size)
    426		goto submit_and_retry;
    427	wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
    428	io->io_next_block++;
    429}
    430
/*
 * Queue the writable buffers of @page for writeback through @io.
 *
 * @io:           submission context; its current bio may be submitted and
 *                replaced while buffers are added
 * @page:         page to write; must be locked and must not already be
 *                under writeback (both are BUG_ON-checked)
 * @len:          number of bytes of the page to write; the rest of the
 *                page up to PAGE_SIZE is zeroed, and buffers starting at
 *                or beyond @len are only marked clean and uptodate
 * @keep_towrite: selects set_page_writeback_keepwrite() instead of
 *                set_page_writeback()
 *
 * The page is unlocked before returning.  If no buffer was submitted,
 * page writeback is ended here.
 *
 * Returns 0 on success.  If encrypting into a bounce page fails, the
 * page is redirtied, nothing is submitted and the error from
 * fscrypt_encrypt_pagecache_blocks() is returned.
 */
int ext4_bio_write_page(struct ext4_io_submit *io,
			struct page *page,
			int len,
			bool keep_towrite)
{
	struct page *bounce_page = NULL;
	struct inode *inode = page->mapping->host;
	unsigned block_start;
	struct buffer_head *bh, *head;
	int ret = 0;
	int nr_submitted = 0;
	int nr_to_submit = 0;
	struct writeback_control *wbc = io->io_wbc;

	BUG_ON(!PageLocked(page));
	BUG_ON(PageWriteback(page));

	if (keep_towrite)
		set_page_writeback_keepwrite(page);
	else
		set_page_writeback(page);
	ClearPageError(page);

	/*
	 * Comments copied from block_write_full_page:
	 *
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	if (len < PAGE_SIZE)
		zero_user_segment(page, len, PAGE_SIZE);
	/*
	 * In the first loop we prepare and mark buffers to submit. We have to
	 * mark all buffers in the page before submitting so that
	 * end_page_writeback() cannot be called from ext4_end_bio() when IO
	 * on the first buffer finishes and we are still working on submitting
	 * the second buffer.
	 */
	bh = head = page_buffers(page);
	do {
		block_start = bh_offset(bh);
		/* Buffer lies entirely beyond the range being written. */
		if (block_start >= len) {
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
			continue;
		}
		/*
		 * Skip buffers that are clean, delayed, unmapped or
		 * unwritten; any bio being built is submitted first.
		 */
		if (!buffer_dirty(bh) || buffer_delay(bh) ||
		    !buffer_mapped(bh) || buffer_unwritten(bh)) {
			/* A hole? We can safely clear the dirty bit */
			if (!buffer_mapped(bh))
				clear_buffer_dirty(bh);
			if (io->io_bio)
				ext4_io_submit(io);
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		set_buffer_async_write(bh);
		nr_to_submit++;
	} while ((bh = bh->b_this_page) != head);

	/* Restart from the first buffer for the loops below. */
	bh = head = page_buffers(page);

	/*
	 * If any blocks are being written to an encrypted file, encrypt them
	 * into a bounce page.  For simplicity, just encrypt until the last
	 * block which might be needed.  This may cause some unneeded blocks
	 * (e.g. holes) to be unnecessarily encrypted, but this is rare and
	 * can't happen in the common case of blocksize == PAGE_SIZE.
	 */
	if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
		gfp_t gfp_flags = GFP_NOFS;
		unsigned int enc_bytes = round_up(len, i_blocksize(inode));

		/*
		 * Since bounce page allocation uses a mempool, we can only use
		 * a waiting mask (i.e. request guaranteed allocation) on the
		 * first page of the bio.  Otherwise it can deadlock.
		 */
		if (io->io_bio)
			gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
	retry_encrypt:
		bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
							       0, gfp_flags);
		if (IS_ERR(bounce_page)) {
			ret = PTR_ERR(bounce_page);
			/*
			 * On -ENOMEM retry with GFP_NOFS: after submitting
			 * the pending bio if there is one, otherwise (for
			 * WB_SYNC_ALL only) with __GFP_NOFAIL added.
			 */
			if (ret == -ENOMEM &&
			    (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
				gfp_t new_gfp_flags = GFP_NOFS;
				if (io->io_bio)
					ext4_io_submit(io);
				else
					new_gfp_flags |= __GFP_NOFAIL;
				memalloc_retry_wait(gfp_flags);
				gfp_flags = new_gfp_flags;
				goto retry_encrypt;
			}

			/*
			 * Give up: redirty the page and undo the async_write
			 * marking on every buffer of the page.
			 */
			printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
			redirty_page_for_writepage(wbc, page);
			do {
				clear_buffer_async_write(bh);
				bh = bh->b_this_page;
			} while (bh != head);
			goto unlock;
		}
	}

	/* Now submit buffers to write */
	do {
		if (!buffer_async_write(bh))
			continue;
		/* Encrypted data is written from the bounce page. */
		io_submit_add_bh(io, inode,
				 bounce_page ? bounce_page : page, bh);
		nr_submitted++;
		clear_buffer_dirty(bh);
	} while ((bh = bh->b_this_page) != head);

unlock:
	unlock_page(page);
	/* Nothing submitted - we have to end page writeback */
	if (!nr_submitted)
		end_page_writeback(page);
	return ret;
}