page_io.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
page_io.c (12992B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  linux/mm/page_io.c
      4 *
      5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
      6 *
      7 *  Swap reorganised 29.12.95, 
      8 *  Asynchronous swapping added 30.12.95. Stephen Tweedie
      9 *  Removed race in async swapping. 14.4.1996. Bruno Haible
     10 *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
     11 *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
     12 */
     13
     14#include <linux/mm.h>
     15#include <linux/kernel_stat.h>
     16#include <linux/gfp.h>
     17#include <linux/pagemap.h>
     18#include <linux/swap.h>
     19#include <linux/bio.h>
     20#include <linux/swapops.h>
     21#include <linux/buffer_head.h>
     22#include <linux/writeback.h>
     23#include <linux/frontswap.h>
     24#include <linux/blkdev.h>
     25#include <linux/psi.h>
     26#include <linux/uio.h>
     27#include <linux/sched/task.h>
     28#include <linux/delayacct.h>
     29#include "swap.h"
     30
     31void end_swap_bio_write(struct bio *bio)
     32{
     33	struct page *page = bio_first_page_all(bio);
     34
     35	if (bio->bi_status) {
     36		SetPageError(page);
     37		/*
     38		 * We failed to write the page out to swap-space.
     39		 * Re-dirty the page in order to avoid it being reclaimed.
     40		 * Also print a dire warning that things will go BAD (tm)
     41		 * very quickly.
     42		 *
     43		 * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
     44		 */
     45		set_page_dirty(page);
     46		pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
     47				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
     48				     (unsigned long long)bio->bi_iter.bi_sector);
     49		ClearPageReclaim(page);
     50	}
     51	end_page_writeback(page);
     52	bio_put(bio);
     53}
     54
     55static void end_swap_bio_read(struct bio *bio)
     56{
     57	struct page *page = bio_first_page_all(bio);
     58	struct task_struct *waiter = bio->bi_private;
     59
     60	if (bio->bi_status) {
     61		SetPageError(page);
     62		ClearPageUptodate(page);
     63		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
     64				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
     65				     (unsigned long long)bio->bi_iter.bi_sector);
     66		goto out;
     67	}
     68
     69	SetPageUptodate(page);
     70out:
     71	unlock_page(page);
     72	WRITE_ONCE(bio->bi_private, NULL);
     73	bio_put(bio);
     74	if (waiter) {
     75		blk_wake_io_task(waiter);
     76		put_task_struct(waiter);
     77	}
     78}
     79
     80int generic_swapfile_activate(struct swap_info_struct *sis,
     81				struct file *swap_file,
     82				sector_t *span)
     83{
     84	struct address_space *mapping = swap_file->f_mapping;
     85	struct inode *inode = mapping->host;
     86	unsigned blocks_per_page;
     87	unsigned long page_no;
     88	unsigned blkbits;
     89	sector_t probe_block;
     90	sector_t last_block;
     91	sector_t lowest_block = -1;
     92	sector_t highest_block = 0;
     93	int nr_extents = 0;
     94	int ret;
     95
     96	blkbits = inode->i_blkbits;
     97	blocks_per_page = PAGE_SIZE >> blkbits;
     98
     99	/*
    100	 * Map all the blocks into the extent tree.  This code doesn't try
    101	 * to be very smart.
    102	 */
    103	probe_block = 0;
    104	page_no = 0;
    105	last_block = i_size_read(inode) >> blkbits;
    106	while ((probe_block + blocks_per_page) <= last_block &&
    107			page_no < sis->max) {
    108		unsigned block_in_page;
    109		sector_t first_block;
    110
    111		cond_resched();
    112
    113		first_block = probe_block;
    114		ret = bmap(inode, &first_block);
    115		if (ret || !first_block)
    116			goto bad_bmap;
    117
    118		/*
    119		 * It must be PAGE_SIZE aligned on-disk
    120		 */
    121		if (first_block & (blocks_per_page - 1)) {
    122			probe_block++;
    123			goto reprobe;
    124		}
    125
    126		for (block_in_page = 1; block_in_page < blocks_per_page;
    127					block_in_page++) {
    128			sector_t block;
    129
    130			block = probe_block + block_in_page;
    131			ret = bmap(inode, &block);
    132			if (ret || !block)
    133				goto bad_bmap;
    134
    135			if (block != first_block + block_in_page) {
    136				/* Discontiguity */
    137				probe_block++;
    138				goto reprobe;
    139			}
    140		}
    141
    142		first_block >>= (PAGE_SHIFT - blkbits);
    143		if (page_no) {	/* exclude the header page */
    144			if (first_block < lowest_block)
    145				lowest_block = first_block;
    146			if (first_block > highest_block)
    147				highest_block = first_block;
    148		}
    149
    150		/*
    151		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
    152		 */
    153		ret = add_swap_extent(sis, page_no, 1, first_block);
    154		if (ret < 0)
    155			goto out;
    156		nr_extents += ret;
    157		page_no++;
    158		probe_block += blocks_per_page;
    159reprobe:
    160		continue;
    161	}
    162	ret = nr_extents;
    163	*span = 1 + highest_block - lowest_block;
    164	if (page_no == 0)
    165		page_no = 1;	/* force Empty message */
    166	sis->max = page_no;
    167	sis->pages = page_no - 1;
    168	sis->highest_bit = page_no - 1;
    169out:
    170	return ret;
    171bad_bmap:
    172	pr_err("swapon: swapfile has holes\n");
    173	ret = -EINVAL;
    174	goto out;
    175}
    176
    177/*
    178 * We may have stale swap cache pages in memory: notice
    179 * them here and get rid of the unnecessary final write.
    180 */
    181int swap_writepage(struct page *page, struct writeback_control *wbc)
    182{
    183	int ret = 0;
    184
    185	if (try_to_free_swap(page)) {
    186		unlock_page(page);
    187		goto out;
    188	}
    189	/*
    190	 * Arch code may have to preserve more data than just the page
    191	 * contents, e.g. memory tags.
    192	 */
    193	ret = arch_prepare_to_swap(page);
    194	if (ret) {
    195		set_page_dirty(page);
    196		unlock_page(page);
    197		goto out;
    198	}
    199	if (frontswap_store(page) == 0) {
    200		set_page_writeback(page);
    201		unlock_page(page);
    202		end_page_writeback(page);
    203		goto out;
    204	}
    205	ret = __swap_writepage(page, wbc, end_swap_bio_write);
    206out:
    207	return ret;
    208}
    209
    210static inline void count_swpout_vm_event(struct page *page)
    211{
    212#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    213	if (unlikely(PageTransHuge(page)))
    214		count_vm_event(THP_SWPOUT);
    215#endif
    216	count_vm_events(PSWPOUT, thp_nr_pages(page));
    217}
    218
    219#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
    220static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
    221{
    222	struct cgroup_subsys_state *css;
    223	struct mem_cgroup *memcg;
    224
    225	memcg = page_memcg(page);
    226	if (!memcg)
    227		return;
    228
    229	rcu_read_lock();
    230	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
    231	bio_associate_blkg_from_css(bio, css);
    232	rcu_read_unlock();
    233}
    234#else
    235#define bio_associate_blkg_from_page(bio, page)		do { } while (0)
    236#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
    237
    238struct swap_iocb {
    239	struct kiocb		iocb;
    240	struct bio_vec		bvec[SWAP_CLUSTER_MAX];
    241	int			pages;
    242	int			len;
    243};
    244static mempool_t *sio_pool;
    245
    246int sio_pool_init(void)
    247{
    248	if (!sio_pool) {
    249		mempool_t *pool = mempool_create_kmalloc_pool(
    250			SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
    251		if (cmpxchg(&sio_pool, NULL, pool))
    252			mempool_destroy(pool);
    253	}
    254	if (!sio_pool)
    255		return -ENOMEM;
    256	return 0;
    257}
    258
    259static void sio_write_complete(struct kiocb *iocb, long ret)
    260{
    261	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
    262	struct page *page = sio->bvec[0].bv_page;
    263	int p;
    264
    265	if (ret != sio->len) {
    266		/*
    267		 * In the case of swap-over-nfs, this can be a
    268		 * temporary failure if the system has limited
    269		 * memory for allocating transmit buffers.
    270		 * Mark the page dirty and avoid
    271		 * folio_rotate_reclaimable but rate-limit the
    272		 * messages but do not flag PageError like
    273		 * the normal direct-to-bio case as it could
    274		 * be temporary.
    275		 */
    276		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
    277				   ret, page_file_offset(page));
    278		for (p = 0; p < sio->pages; p++) {
    279			page = sio->bvec[p].bv_page;
    280			set_page_dirty(page);
    281			ClearPageReclaim(page);
    282		}
    283	} else {
    284		for (p = 0; p < sio->pages; p++)
    285			count_swpout_vm_event(sio->bvec[p].bv_page);
    286	}
    287
    288	for (p = 0; p < sio->pages; p++)
    289		end_page_writeback(sio->bvec[p].bv_page);
    290
    291	mempool_free(sio, sio_pool);
    292}
    293
    294static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
    295{
    296	struct swap_iocb *sio = NULL;
    297	struct swap_info_struct *sis = page_swap_info(page);
    298	struct file *swap_file = sis->swap_file;
    299	loff_t pos = page_file_offset(page);
    300
    301	set_page_writeback(page);
    302	unlock_page(page);
    303	if (wbc->swap_plug)
    304		sio = *wbc->swap_plug;
    305	if (sio) {
    306		if (sio->iocb.ki_filp != swap_file ||
    307		    sio->iocb.ki_pos + sio->len != pos) {
    308			swap_write_unplug(sio);
    309			sio = NULL;
    310		}
    311	}
    312	if (!sio) {
    313		sio = mempool_alloc(sio_pool, GFP_NOIO);
    314		init_sync_kiocb(&sio->iocb, swap_file);
    315		sio->iocb.ki_complete = sio_write_complete;
    316		sio->iocb.ki_pos = pos;
    317		sio->pages = 0;
    318		sio->len = 0;
    319	}
    320	sio->bvec[sio->pages].bv_page = page;
    321	sio->bvec[sio->pages].bv_len = thp_size(page);
    322	sio->bvec[sio->pages].bv_offset = 0;
    323	sio->len += thp_size(page);
    324	sio->pages += 1;
    325	if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
    326		swap_write_unplug(sio);
    327		sio = NULL;
    328	}
    329	if (wbc->swap_plug)
    330		*wbc->swap_plug = sio;
    331
    332	return 0;
    333}
    334
    335int __swap_writepage(struct page *page, struct writeback_control *wbc,
    336		     bio_end_io_t end_write_func)
    337{
    338	struct bio *bio;
    339	int ret;
    340	struct swap_info_struct *sis = page_swap_info(page);
    341
    342	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
    343	/*
    344	 * ->flags can be updated non-atomicially (scan_swap_map_slots),
    345	 * but that will never affect SWP_FS_OPS, so the data_race
    346	 * is safe.
    347	 */
    348	if (data_race(sis->flags & SWP_FS_OPS))
    349		return swap_writepage_fs(page, wbc);
    350
    351	ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
    352	if (!ret) {
    353		count_swpout_vm_event(page);
    354		return 0;
    355	}
    356
    357	bio = bio_alloc(sis->bdev, 1,
    358			REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
    359			GFP_NOIO);
    360	bio->bi_iter.bi_sector = swap_page_sector(page);
    361	bio->bi_end_io = end_write_func;
    362	bio_add_page(bio, page, thp_size(page), 0);
    363
    364	bio_associate_blkg_from_page(bio, page);
    365	count_swpout_vm_event(page);
    366	set_page_writeback(page);
    367	unlock_page(page);
    368	submit_bio(bio);
    369
    370	return 0;
    371}
    372
    373void swap_write_unplug(struct swap_iocb *sio)
    374{
    375	struct iov_iter from;
    376	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
    377	int ret;
    378
    379	iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len);
    380	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
    381	if (ret != -EIOCBQUEUED)
    382		sio_write_complete(&sio->iocb, ret);
    383}
    384
    385static void sio_read_complete(struct kiocb *iocb, long ret)
    386{
    387	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
    388	int p;
    389
    390	if (ret == sio->len) {
    391		for (p = 0; p < sio->pages; p++) {
    392			struct page *page = sio->bvec[p].bv_page;
    393
    394			SetPageUptodate(page);
    395			unlock_page(page);
    396		}
    397		count_vm_events(PSWPIN, sio->pages);
    398	} else {
    399		for (p = 0; p < sio->pages; p++) {
    400			struct page *page = sio->bvec[p].bv_page;
    401
    402			SetPageError(page);
    403			ClearPageUptodate(page);
    404			unlock_page(page);
    405		}
    406		pr_alert_ratelimited("Read-error on swap-device\n");
    407	}
    408	mempool_free(sio, sio_pool);
    409}
    410
    411static void swap_readpage_fs(struct page *page,
    412			     struct swap_iocb **plug)
    413{
    414	struct swap_info_struct *sis = page_swap_info(page);
    415	struct swap_iocb *sio = NULL;
    416	loff_t pos = page_file_offset(page);
    417
    418	if (plug)
    419		sio = *plug;
    420	if (sio) {
    421		if (sio->iocb.ki_filp != sis->swap_file ||
    422		    sio->iocb.ki_pos + sio->len != pos) {
    423			swap_read_unplug(sio);
    424			sio = NULL;
    425		}
    426	}
    427	if (!sio) {
    428		sio = mempool_alloc(sio_pool, GFP_KERNEL);
    429		init_sync_kiocb(&sio->iocb, sis->swap_file);
    430		sio->iocb.ki_pos = pos;
    431		sio->iocb.ki_complete = sio_read_complete;
    432		sio->pages = 0;
    433		sio->len = 0;
    434	}
    435	sio->bvec[sio->pages].bv_page = page;
    436	sio->bvec[sio->pages].bv_len = thp_size(page);
    437	sio->bvec[sio->pages].bv_offset = 0;
    438	sio->len += thp_size(page);
    439	sio->pages += 1;
    440	if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
    441		swap_read_unplug(sio);
    442		sio = NULL;
    443	}
    444	if (plug)
    445		*plug = sio;
    446}
    447
    448int swap_readpage(struct page *page, bool synchronous,
    449		  struct swap_iocb **plug)
    450{
    451	struct bio *bio;
    452	int ret = 0;
    453	struct swap_info_struct *sis = page_swap_info(page);
    454	bool workingset = PageWorkingset(page);
    455	unsigned long pflags;
    456
    457	VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
    458	VM_BUG_ON_PAGE(!PageLocked(page), page);
    459	VM_BUG_ON_PAGE(PageUptodate(page), page);
    460
    461	/*
    462	 * Count submission time as memory stall. When the device is congested,
    463	 * or the submitting cgroup IO-throttled, submission can be a
    464	 * significant part of overall IO time.
    465	 */
    466	if (workingset)
    467		psi_memstall_enter(&pflags);
    468	delayacct_swapin_start();
    469
    470	if (frontswap_load(page) == 0) {
    471		SetPageUptodate(page);
    472		unlock_page(page);
    473		goto out;
    474	}
    475
    476	if (data_race(sis->flags & SWP_FS_OPS)) {
    477		swap_readpage_fs(page, plug);
    478		goto out;
    479	}
    480
    481	if (sis->flags & SWP_SYNCHRONOUS_IO) {
    482		ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
    483		if (!ret) {
    484			count_vm_event(PSWPIN);
    485			goto out;
    486		}
    487	}
    488
    489	ret = 0;
    490	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
    491	bio->bi_iter.bi_sector = swap_page_sector(page);
    492	bio->bi_end_io = end_swap_bio_read;
    493	bio_add_page(bio, page, thp_size(page), 0);
    494	/*
    495	 * Keep this task valid during swap readpage because the oom killer may
    496	 * attempt to access it in the page fault retry time check.
    497	 */
    498	if (synchronous) {
    499		get_task_struct(current);
    500		bio->bi_private = current;
    501	}
    502	count_vm_event(PSWPIN);
    503	bio_get(bio);
    504	submit_bio(bio);
    505	while (synchronous) {
    506		set_current_state(TASK_UNINTERRUPTIBLE);
    507		if (!READ_ONCE(bio->bi_private))
    508			break;
    509
    510		blk_io_schedule();
    511	}
    512	__set_current_state(TASK_RUNNING);
    513	bio_put(bio);
    514
    515out:
    516	if (workingset)
    517		psi_memstall_leave(&pflags);
    518	delayacct_swapin_end();
    519	return ret;
    520}
    521
    522void __swap_read_unplug(struct swap_iocb *sio)
    523{
    524	struct iov_iter from;
    525	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
    526	int ret;
    527
    528	iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len);
    529	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
    530	if (ret != -EIOCBQUEUED)
    531		sio_read_complete(&sio->iocb, ret);
    532}