cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

swap_state.c (24961B)


// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
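
/*
 * The per-VMA readahead state packs three fields into a single long: with
 * 4 KiB pages (PAGE_SHIFT == 12, so SWAP_RA_WIN_SHIFT == 6) the hit count
 * sits in bits 0-5, the window size in bits 6-11, and the page-aligned
 * fault address in the remaining high bits.  For example,
 * SWAP_RA_VAL(0x7f0000003000, 8, 4) is 0x7f0000003000 | (8 << 6) | 4
 * == 0x7f0000003204.
 */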

#define INC_CACHE_INFO(x)	data_race(swap_cache_info.x++)
#define ADD_CACHE_INFO(x, nr)	data_race(swap_cache_info.x += (nr))

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

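/*
 * Shadow entries are xarray value entries (not page pointers) left in place
 * of a page when it is deleted from the swap cache; they carry workingset
 * information that workingset_refault() consumes on the next swapin (see
 * __read_swap_cache_async() below).
 */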
void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	struct page *page;

	page = xa_load(&address_space->i_pages, idx);
	if (xa_is_value(page))
		return page;
	return NULL;
}

/*
 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry,
			gfp_t gfp, void **shadowp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
	unsigned long i, nr = thp_nr_pages(page);
	void *old;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
			old = xas_load(&xas);
			if (xa_is_value(old)) {
				if (shadowp)
					*shadowp = old;
			}
			set_page_private(page + i, entry.val + i);
			xas_store(&xas, page);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		__mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
		ADD_CACHE_INFO(add_total, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (!xas_error(&xas))
		return 0;

	ClearPageSwapCache(page);
	page_ref_sub(page, nr);
	return xas_error(&xas);
}
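
/*
 * add_to_swap_cache() fills nr consecutive slots for a compound page: e.g.
 * a 2 MiB THP (nr == 512 with 4 KiB pages) added at swap offset 1024 fills
 * slots 1024..1535, and each subpage's page_private() is set to its own
 * swap entry value (entry.val + i).
 */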

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page,
			swp_entry_t entry, void *shadow)
{
	struct address_space *address_space = swap_address_space(entry);
	int i, nr = thp_nr_pages(page);
	pgoff_t idx = swp_offset(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, shadow);
		VM_BUG_ON_PAGE(entry != page, entry);
		set_page_private(page + i, 0);
		xas_next(&xas);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	__mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
bool add_to_swap(struct folio *folio)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	entry = folio_alloc_swap(folio);
	if (!entry.val)
		return false;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(&folio->page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the folio will be dirtied in unmap because its
	 * pte should be dirty. A special case is an MADV_FREE folio: its
	 * pte could have the dirty bit cleared while the folio's
	 * SwapBacked flag is still set, because clearing the dirty bit
	 * and the SwapBacked flag is not protected by a lock. For such a
	 * folio, unmap will not set the dirty bit, so folio reclaim will
	 * not write the folio out. This can cause data corruption when
	 * the folio is swapped in later. Always setting the dirty flag
	 * for the folio solves the problem.
	 */
	folio_mark_dirty(folio);

	return true;

fail:
	put_swap_page(&folio->page, entry);
	return false;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(page, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_page(page, entry);
	page_ref_sub(page, thp_nr_pages(page));
}

void clear_shadow_from_swap_cache(int type, unsigned long begin,
				unsigned long end)
{
	unsigned long curr = begin;
	void *old;

	for (;;) {
		swp_entry_t entry = swp_entry(type, curr);
		struct address_space *address_space = swap_address_space(entry);
		XA_STATE(xas, &address_space->i_pages, curr);

		xa_lock_irq(&address_space->i_pages);
		xas_for_each(&xas, old, end) {
			if (!xa_is_value(old))
				continue;
			xas_store(&xas, NULL);
		}
		xa_unlock_irq(&address_space->i_pages);

		/* search the next swapcache until we meet end */
		curr >>= SWAP_ADDRESS_SPACE_SHIFT;
		curr++;
		curr <<= SWAP_ADDRESS_SPACE_SHIFT;
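		/*
		 * curr is now rounded up to the first slot of the next swap
		 * address space (each one covers 1 << SWAP_ADDRESS_SPACE_SHIFT
		 * slots, e.g. offset 20000 -> 32768 with the usual shift of
		 * 14), so stop once that jump passes end.
		 */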
		if (curr > end)
			break;
	}
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck inside
 * try_to_free_swap() _with_ the lock.
 * 					- Marcelo
 */
void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;
	struct swap_info_struct *si;

	si = get_swap_device(entry);
	if (!si)
		return NULL;
	page = find_get_page(swap_address_space(entry), swp_offset(entry));
	put_swap_device(si);

	INC_CACHE_INFO(find_total);
	if (page) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		INC_CACHE_INFO(find_success);
		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(PageTransCompound(page)))
			return page;

		readahead = TestClearPageReadahead(page);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	}

	return page;
}

/**
 * find_get_incore_page - Find and get a page from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * This differs from find_get_page() in that it will also look for the
 * page in the swap cache.
 *
 * Return: The found page or %NULL.
 */
struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
{
	swp_entry_t swp;
	struct swap_info_struct *si;
	struct page *page = pagecache_get_page(mapping, index,
						FGP_ENTRY | FGP_HEAD, 0);

	if (!page)
		return page;
	if (!xa_is_value(page))
		return find_subpage(page, index);
	if (!shmem_mapping(mapping))
		return NULL;

	swp = radix_to_swp_entry(page);
	/* There might be swapin error entries in shmem mapping. */
	if (non_swap_entry(swp))
		return NULL;
	/* Prevent swapoff from happening to us */
	si = get_swap_device(swp);
	if (!si)
		return NULL;
	page = find_get_page(swap_address_space(swp), swp_offset(swp));
	put_swap_device(si);
	return page;
}

struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct swap_info_struct *si;
	struct page *page;
	void *shadow = NULL;

	*new_page_allocated = false;

	for (;;) {
		int err;
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		si = get_swap_device(entry);
		if (!si)
			return NULL;
		page = find_get_page(swap_address_space(entry),
				     swp_offset(entry));
		put_swap_device(si);
		if (page)
			return page;

		/*
		 * Just skip readahead for an unused swap slot.
		 * During swapoff, when swap_slot_cache is disabled,
		 * we have to handle the race between putting the
		 * swap entry into the swap cache and marking the swap slot
		 * as SWAP_HAS_CACHE.  That's done in a later part of this code,
		 * or else swapoff would be aborted if we returned NULL here.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			return NULL;

		/*
		 * Get a new page to read into from swap.  Allocate it now,
		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
		 * cause any racers to loop around until we add it to cache.
		 */
		page = alloc_page_vma(gfp_mask, vma, addr);
		if (!page)
			return NULL;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (!err)
			break;

		put_page(page);
		if (err != -EEXIST)
			return NULL;

		/*
		 * We might race against __delete_from_swap_cache(), and
		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
		 * has not yet been cleared.  Or race against another
		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
		 * in swap_map, but not yet added its page to swap cache.
		 */
		schedule_timeout_uninterruptible(1);
	}

	/*
	 * The swap entry is ours to swap in. Prepare the new page.
	 */

	__SetPageLocked(page);
	__SetPageSwapBacked(page);

	if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
		goto fail_unlock;

	/* May fail (-ENOMEM) if XArray node allocation failed. */
	if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
		goto fail_unlock;

	mem_cgroup_swapin_uncharge_swap(entry);

	if (shadow)
		workingset_refault(page_folio(page), shadow);

	/* Caller will initiate read into locked page */
	lru_cache_add(page);
	*new_page_allocated = true;
	return page;

fail_unlock:
	put_swap_page(page, entry);
	unlock_page(page);
	put_page(page);
	return NULL;
}
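
/*
 * A minimal caller sketch (GFP_KERNEL chosen just for illustration): pages
 * returned with *new_page_allocated set are locked but not yet populated,
 * so the read must be kicked off explicitly:
 *
 *	bool allocated;
 *	struct page *p = __read_swap_cache_async(entry, GFP_KERNEL, vma,
 *						 addr, &allocated);
 *	if (p && allocated)
 *		swap_readpage(p, false, NULL);
 *
 * read_swap_cache_async() below is exactly this pattern.
 */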

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
				   struct vm_area_struct *vma,
				   unsigned long addr, bool do_poll,
				   struct swap_iocb **plug)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll, plug);

	return retpage;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}
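
/*
 * Worked example of the heuristic above: with no hits and a non-adjacent
 * offset the window stays at 1; one hit gives 1 + 2 = 3, rounded up to 4;
 * the initial hits value of 4 gives 4 + 2 = 6, rounded up to 8, which is
 * already the cap when page_cluster has its usual default of 3
 * (max_pages == 1 << 3).  The final step keeps the window from shrinking
 * below half of the previous one.
 */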

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
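	/*
	 * E.g. a window of 8 gives mask == 7, so a fault at offset 53 reads
	 * the aligned range 48..55: the faulting entry plus its neighbours
	 * in one sequential pass over the device.
	 */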
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false, &splug);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
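	/*
	 * One address_space is created per SWAP_ADDRESS_SPACE_PAGES slots to
	 * spread lock contention: e.g. a 1 GiB swap device with 4 KiB pages
	 * (262144 slots) and the usual 16384-slot spaces gets 16 of them.
	 */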
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	int i;
	struct address_space *spaces = swapper_spaces[type];

	for (i = 0; i < nr_swapper_spaces[type]; i++)
		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
	kvfree(spaces);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

static void swap_ra_info(struct vm_fault *vmf,
			struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);

	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1) {
		pte_unmap(orig_pte);
		return;
	}

	/* Copy the PTEs because the page table may be unmapped */
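	/*
	 * Place the window around the fault according to the access pattern:
	 * a forward sequential fault (fpfn == pfn + 1) reads win pages
	 * starting at the fault, a backward one reads win pages ending at it,
	 * and anything else centres the window, e.g. win == 4 covers
	 * fpfn - 1 .. fpfn + 2.
	 */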
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @fentry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				       struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {
		.win = 1,
	};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (!is_swap_pte(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false, &splug);
			if (i != ra_info.offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The page was likely read above, so no need for plugging here */
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     ra_info.win == 1, NULL);
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * This is the main entry point for swap readahead. Depending on the
 * configuration, it reads ahead either cluster-based (i.e. physical,
 * disk-based) or vma-based (i.e. around the faulting virtual address).
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	return swap_use_vma_readahead() ?
			swap_vma_readahead(entry, gfp_mask, vmf) :
			swap_cluster_readahead(entry, gfp_mask, vmf);
}

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &enable_vma_readahead);
	if (ret)
		return ret;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
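
/*
 * The attribute above appears as /sys/kernel/mm/swap/vma_ra_enabled once
 * swap_init_sysfs() below has run; e.g. writing 0 to it disables VMA-based
 * readahead, so swapin_readahead() falls back to cluster readahead.
 */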

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif