cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

swapfile.c (94283B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  linux/mm/swapfile.c
      4 *
      5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
      6 *  Swap reorganised 29.12.95, Stephen Tweedie
      7 */
      8
      9#include <linux/blkdev.h>
     10#include <linux/mm.h>
     11#include <linux/sched/mm.h>
     12#include <linux/sched/task.h>
     13#include <linux/hugetlb.h>
     14#include <linux/mman.h>
     15#include <linux/slab.h>
     16#include <linux/kernel_stat.h>
     17#include <linux/swap.h>
     18#include <linux/vmalloc.h>
     19#include <linux/pagemap.h>
     20#include <linux/namei.h>
     21#include <linux/shmem_fs.h>
     22#include <linux/blk-cgroup.h>
     23#include <linux/random.h>
     24#include <linux/writeback.h>
     25#include <linux/proc_fs.h>
     26#include <linux/seq_file.h>
     27#include <linux/init.h>
     28#include <linux/ksm.h>
     29#include <linux/rmap.h>
     30#include <linux/security.h>
     31#include <linux/backing-dev.h>
     32#include <linux/mutex.h>
     33#include <linux/capability.h>
     34#include <linux/syscalls.h>
     35#include <linux/memcontrol.h>
     36#include <linux/poll.h>
     37#include <linux/oom.h>
     38#include <linux/frontswap.h>
     39#include <linux/swapfile.h>
     40#include <linux/export.h>
     41#include <linux/swap_slots.h>
     42#include <linux/sort.h>
     43#include <linux/completion.h>
     44
     45#include <asm/tlbflush.h>
     46#include <linux/swapops.h>
     47#include <linux/swap_cgroup.h>
     48#include "swap.h"
     49
     50static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
     51				 unsigned char);
     52static void free_swap_count_continuations(struct swap_info_struct *);
     53
     54static DEFINE_SPINLOCK(swap_lock);
     55static unsigned int nr_swapfiles;
     56atomic_long_t nr_swap_pages;
     57/*
     58 * Some modules use swappable objects and may try to swap them out under
     59 * memory pressure (via the shrinker). Before doing so, they may wish to
     60 * check to see if any swap space is available.
     61 */
     62EXPORT_SYMBOL_GPL(nr_swap_pages);
     63/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
     64long total_swap_pages;
     65static int least_priority = -1;
     66
     67static const char Bad_file[] = "Bad swap file entry ";
     68static const char Unused_file[] = "Unused swap file entry ";
     69static const char Bad_offset[] = "Bad swap offset entry ";
     70static const char Unused_offset[] = "Unused swap offset entry ";
     71
     72/*
     73 * all active swap_info_structs
     74 * protected with swap_lock, and ordered by priority.
     75 */
     76static PLIST_HEAD(swap_active_head);
     77
     78/*
     79 * all available (active, not full) swap_info_structs
     80 * protected with swap_avail_lock, ordered by priority.
     81 * This is used by folio_alloc_swap() instead of swap_active_head
     82 * because swap_active_head includes all swap_info_structs,
     83 * but folio_alloc_swap() doesn't need to look at full ones.
     84 * This uses its own lock instead of swap_lock because when a
     85 * swap_info_struct changes between not-full/full, it needs to
     86 * add/remove itself to/from this list, but the swap_info_struct->lock
     87 * is held and the locking order requires swap_lock to be taken
     88 * before any swap_info_struct->lock.
     89 */
     90static struct plist_head *swap_avail_heads;
     91static DEFINE_SPINLOCK(swap_avail_lock);
     92
     93struct swap_info_struct *swap_info[MAX_SWAPFILES];
     94
     95static DEFINE_MUTEX(swapon_mutex);
     96
     97static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
     98/* Activity counter to indicate that a swapon or swapoff has occurred */
     99static atomic_t proc_poll_event = ATOMIC_INIT(0);
    100
    101atomic_t nr_rotate_swap = ATOMIC_INIT(0);
    102
    103static struct swap_info_struct *swap_type_to_swap_info(int type)
    104{
    105	if (type >= MAX_SWAPFILES)
    106		return NULL;
    107
    108	return READ_ONCE(swap_info[type]); /* rcu_dereference() */
    109}
    110
    111static inline unsigned char swap_count(unsigned char ent)
    112{
    113	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
    114}
    115
    116/* Reclaim the swap entry anyway if possible */
    117#define TTRS_ANYWAY		0x1
    118/*
    119 * Reclaim the swap entry if there are no more mappings of the
    120 * corresponding page
    121 */
    122#define TTRS_UNMAPPED		0x2
    123/* Reclaim the swap entry if swap is getting full */
    124#define TTRS_FULL		0x4
    125
    126/* returns 1 if swap entry is freed */
    127static int __try_to_reclaim_swap(struct swap_info_struct *si,
    128				 unsigned long offset, unsigned long flags)
    129{
    130	swp_entry_t entry = swp_entry(si->type, offset);
    131	struct page *page;
    132	int ret = 0;
    133
    134	page = find_get_page(swap_address_space(entry), offset);
    135	if (!page)
    136		return 0;
    137	/*
    138	 * This function is called from scan_swap_map_slots(), which is called
    139	 * by vmscan.c while reclaiming pages, so a lock may already be held on a
    140	 * page here. We have to use trylock to avoid deadlock. This is a special
    141	 * case; in usual operations use try_to_free_swap() with an explicit
    142	 * lock_page().
    143	 */
    144	if (trylock_page(page)) {
    145		if ((flags & TTRS_ANYWAY) ||
    146		    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
    147		    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
    148			ret = try_to_free_swap(page);
    149		unlock_page(page);
    150	}
    151	put_page(page);
    152	return ret;
    153}
    154
    155static inline struct swap_extent *first_se(struct swap_info_struct *sis)
    156{
    157	struct rb_node *rb = rb_first(&sis->swap_extent_root);
    158	return rb_entry(rb, struct swap_extent, rb_node);
    159}
    160
    161static inline struct swap_extent *next_se(struct swap_extent *se)
    162{
    163	struct rb_node *rb = rb_next(&se->rb_node);
    164	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
    165}
    166
    167/*
    168 * swapon tells the device that all the old swap contents can be discarded,
    169 * to allow the swap device to optimize its wear-levelling.
    170 */
    171static int discard_swap(struct swap_info_struct *si)
    172{
    173	struct swap_extent *se;
    174	sector_t start_block;
    175	sector_t nr_blocks;
    176	int err = 0;
    177
    178	/* Do not discard the swap header page! */
    179	se = first_se(si);
    180	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
    181	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
    182	if (nr_blocks) {
    183		err = blkdev_issue_discard(si->bdev, start_block,
    184				nr_blocks, GFP_KERNEL);
    185		if (err)
    186			return err;
    187		cond_resched();
    188	}
    189
    190	for (se = next_se(se); se; se = next_se(se)) {
    191		start_block = se->start_block << (PAGE_SHIFT - 9);
    192		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
    193
    194		err = blkdev_issue_discard(si->bdev, start_block,
    195				nr_blocks, GFP_KERNEL);
    196		if (err)
    197			break;
    198
    199		cond_resched();
    200	}
    201	return err;		/* That will often be -EOPNOTSUPP */
    202}
    203
    204static struct swap_extent *
    205offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
    206{
    207	struct swap_extent *se;
    208	struct rb_node *rb;
    209
    210	rb = sis->swap_extent_root.rb_node;
    211	while (rb) {
    212		se = rb_entry(rb, struct swap_extent, rb_node);
    213		if (offset < se->start_page)
    214			rb = rb->rb_left;
    215		else if (offset >= se->start_page + se->nr_pages)
    216			rb = rb->rb_right;
    217		else
    218			return se;
    219	}
    220	/* It *must* be present */
    221	BUG();
    222}
    223
    224sector_t swap_page_sector(struct page *page)
    225{
    226	struct swap_info_struct *sis = page_swap_info(page);
    227	struct swap_extent *se;
    228	sector_t sector;
    229	pgoff_t offset;
    230
    231	offset = __page_file_index(page);
    232	se = offset_to_swap_extent(sis, offset);
    233	sector = se->start_block + (offset - se->start_page);
    234	return sector << (PAGE_SHIFT - 9);
    235}
    236
    237/*
    238 * swap allocation tells the device that a cluster of swap can now be discarded,
    239 * to allow the swap device to optimize its wear-levelling.
    240 */
    241static void discard_swap_cluster(struct swap_info_struct *si,
    242				 pgoff_t start_page, pgoff_t nr_pages)
    243{
    244	struct swap_extent *se = offset_to_swap_extent(si, start_page);
    245
    246	while (nr_pages) {
    247		pgoff_t offset = start_page - se->start_page;
    248		sector_t start_block = se->start_block + offset;
    249		sector_t nr_blocks = se->nr_pages - offset;
    250
    251		if (nr_blocks > nr_pages)
    252			nr_blocks = nr_pages;
    253		start_page += nr_blocks;
    254		nr_pages -= nr_blocks;
    255
    256		start_block <<= PAGE_SHIFT - 9;
    257		nr_blocks <<= PAGE_SHIFT - 9;
    258		if (blkdev_issue_discard(si->bdev, start_block,
    259					nr_blocks, GFP_NOIO))
    260			break;
    261
    262		se = next_se(se);
    263	}
    264}
    265
    266#ifdef CONFIG_THP_SWAP
    267#define SWAPFILE_CLUSTER	HPAGE_PMD_NR
    268
    269#define swap_entry_size(size)	(size)
    270#else
    271#define SWAPFILE_CLUSTER	256
    272
    273/*
    274 * Define swap_entry_size() as a constant to let the compiler optimize
    275 * out some code if !CONFIG_THP_SWAP
    276 */
    277#define swap_entry_size(size)	1
    278#endif
    279#define LATENCY_LIMIT		256
    280
    281static inline void cluster_set_flag(struct swap_cluster_info *info,
    282	unsigned int flag)
    283{
    284	info->flags = flag;
    285}
    286
    287static inline unsigned int cluster_count(struct swap_cluster_info *info)
    288{
    289	return info->data;
    290}
    291
    292static inline void cluster_set_count(struct swap_cluster_info *info,
    293				     unsigned int c)
    294{
    295	info->data = c;
    296}
    297
    298static inline void cluster_set_count_flag(struct swap_cluster_info *info,
    299					 unsigned int c, unsigned int f)
    300{
    301	info->flags = f;
    302	info->data = c;
    303}
    304
    305static inline unsigned int cluster_next(struct swap_cluster_info *info)
    306{
    307	return info->data;
    308}
    309
    310static inline void cluster_set_next(struct swap_cluster_info *info,
    311				    unsigned int n)
    312{
    313	info->data = n;
    314}
    315
    316static inline void cluster_set_next_flag(struct swap_cluster_info *info,
    317					 unsigned int n, unsigned int f)
    318{
    319	info->flags = f;
    320	info->data = n;
    321}
    322
    323static inline bool cluster_is_free(struct swap_cluster_info *info)
    324{
    325	return info->flags & CLUSTER_FLAG_FREE;
    326}
    327
    328static inline bool cluster_is_null(struct swap_cluster_info *info)
    329{
    330	return info->flags & CLUSTER_FLAG_NEXT_NULL;
    331}
    332
    333static inline void cluster_set_null(struct swap_cluster_info *info)
    334{
    335	info->flags = CLUSTER_FLAG_NEXT_NULL;
    336	info->data = 0;
    337}
    338
    339static inline bool cluster_is_huge(struct swap_cluster_info *info)
    340{
    341	if (IS_ENABLED(CONFIG_THP_SWAP))
    342		return info->flags & CLUSTER_FLAG_HUGE;
    343	return false;
    344}
    345
    346static inline void cluster_clear_huge(struct swap_cluster_info *info)
    347{
    348	info->flags &= ~CLUSTER_FLAG_HUGE;
    349}
    350
    351static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
    352						     unsigned long offset)
    353{
    354	struct swap_cluster_info *ci;
    355
    356	ci = si->cluster_info;
    357	if (ci) {
    358		ci += offset / SWAPFILE_CLUSTER;
    359		spin_lock(&ci->lock);
    360	}
    361	return ci;
    362}
    363
    364static inline void unlock_cluster(struct swap_cluster_info *ci)
    365{
    366	if (ci)
    367		spin_unlock(&ci->lock);
    368}
    369
    370/*
    371 * Determine the locking method in use for this device.  Return
    372 * swap_cluster_info if SSD-style cluster-based locking is in place.
    373 */
    374static inline struct swap_cluster_info *lock_cluster_or_swap_info(
    375		struct swap_info_struct *si, unsigned long offset)
    376{
    377	struct swap_cluster_info *ci;
    378
    379	/* Try to use fine-grained SSD-style locking if available: */
    380	ci = lock_cluster(si, offset);
    381	/* Otherwise, fall back to traditional, coarse locking: */
    382	if (!ci)
    383		spin_lock(&si->lock);
    384
    385	return ci;
    386}
    387
    388static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
    389					       struct swap_cluster_info *ci)
    390{
    391	if (ci)
    392		unlock_cluster(ci);
    393	else
    394		spin_unlock(&si->lock);
    395}
    396
    397static inline bool cluster_list_empty(struct swap_cluster_list *list)
    398{
    399	return cluster_is_null(&list->head);
    400}
    401
    402static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
    403{
    404	return cluster_next(&list->head);
    405}
    406
    407static void cluster_list_init(struct swap_cluster_list *list)
    408{
    409	cluster_set_null(&list->head);
    410	cluster_set_null(&list->tail);
    411}
    412
    413static void cluster_list_add_tail(struct swap_cluster_list *list,
    414				  struct swap_cluster_info *ci,
    415				  unsigned int idx)
    416{
    417	if (cluster_list_empty(list)) {
    418		cluster_set_next_flag(&list->head, idx, 0);
    419		cluster_set_next_flag(&list->tail, idx, 0);
    420	} else {
    421		struct swap_cluster_info *ci_tail;
    422		unsigned int tail = cluster_next(&list->tail);
    423
    424		/*
    425		 * Nested cluster lock, but both cluster locks are
    426		 * only acquired while we hold swap_info_struct->lock.
    427		 */
    428		ci_tail = ci + tail;
    429		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
    430		cluster_set_next(ci_tail, idx);
    431		spin_unlock(&ci_tail->lock);
    432		cluster_set_next_flag(&list->tail, idx, 0);
    433	}
    434}
    435
    436static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
    437					   struct swap_cluster_info *ci)
    438{
    439	unsigned int idx;
    440
    441	idx = cluster_next(&list->head);
    442	if (cluster_next(&list->tail) == idx) {
    443		cluster_set_null(&list->head);
    444		cluster_set_null(&list->tail);
    445	} else
    446		cluster_set_next_flag(&list->head,
    447				      cluster_next(&ci[idx]), 0);
    448
    449	return idx;
    450}
    451
    452/* Add a cluster to the discard list and schedule the discard work */
    453static void swap_cluster_schedule_discard(struct swap_info_struct *si,
    454		unsigned int idx)
    455{
    456	/*
    457	 * If scan_swap_map_slots() can't find a free cluster, it will check
    458	 * si->swap_map directly. To make sure the cluster being discarded isn't
    459	 * taken by scan_swap_map_slots(), mark its swap entries bad (occupied).
    460	 * They will be cleared after the discard.
    461	 */
    462	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
    463			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
    464
    465	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
    466
    467	schedule_work(&si->discard_work);
    468}
    469
    470static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
    471{
    472	struct swap_cluster_info *ci = si->cluster_info;
    473
    474	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
    475	cluster_list_add_tail(&si->free_clusters, ci, idx);
    476}
    477
    478/*
    479 * Actually do the discard. After a cluster discard is finished, the cluster
    480 * will be added to the free cluster list. The caller should hold si->lock.
    481 */
    482static void swap_do_scheduled_discard(struct swap_info_struct *si)
    483{
    484	struct swap_cluster_info *info, *ci;
    485	unsigned int idx;
    486
    487	info = si->cluster_info;
    488
    489	while (!cluster_list_empty(&si->discard_clusters)) {
    490		idx = cluster_list_del_first(&si->discard_clusters, info);
    491		spin_unlock(&si->lock);
    492
    493		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
    494				SWAPFILE_CLUSTER);
    495
    496		spin_lock(&si->lock);
    497		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
    498		__free_cluster(si, idx);
    499		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
    500				0, SWAPFILE_CLUSTER);
    501		unlock_cluster(ci);
    502	}
    503}
    504
    505static void swap_discard_work(struct work_struct *work)
    506{
    507	struct swap_info_struct *si;
    508
    509	si = container_of(work, struct swap_info_struct, discard_work);
    510
    511	spin_lock(&si->lock);
    512	swap_do_scheduled_discard(si);
    513	spin_unlock(&si->lock);
    514}
    515
    516static void swap_users_ref_free(struct percpu_ref *ref)
    517{
    518	struct swap_info_struct *si;
    519
    520	si = container_of(ref, struct swap_info_struct, users);
    521	complete(&si->comp);
    522}
    523
    524static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
    525{
    526	struct swap_cluster_info *ci = si->cluster_info;
    527
    528	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
    529	cluster_list_del_first(&si->free_clusters, ci);
    530	cluster_set_count_flag(ci + idx, 0, 0);
    531}
    532
    533static void free_cluster(struct swap_info_struct *si, unsigned long idx)
    534{
    535	struct swap_cluster_info *ci = si->cluster_info + idx;
    536
    537	VM_BUG_ON(cluster_count(ci) != 0);
    538	/*
    539	 * If the swap is discardable, prepare to discard the cluster
    540	 * instead of freeing it immediately. The cluster will be freed
    541	 * after the discard.
    542	 */
    543	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
    544	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
    545		swap_cluster_schedule_discard(si, idx);
    546		return;
    547	}
    548
    549	__free_cluster(si, idx);
    550}
    551
    552/*
    553 * The cluster corresponding to page_nr will be used. The cluster will be
    554 * removed from the free cluster list and its usage counter will be increased.
    555 */
    556static void inc_cluster_info_page(struct swap_info_struct *p,
    557	struct swap_cluster_info *cluster_info, unsigned long page_nr)
    558{
    559	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
    560
    561	if (!cluster_info)
    562		return;
    563	if (cluster_is_free(&cluster_info[idx]))
    564		alloc_cluster(p, idx);
    565
    566	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
    567	cluster_set_count(&cluster_info[idx],
    568		cluster_count(&cluster_info[idx]) + 1);
    569}
    570
    571/*
    572 * Decrease the usage count of the cluster corresponding to page_nr. If the
    573 * usage counter becomes 0, meaning no page in the cluster is in use, we can
    574 * optionally discard the cluster and add it to the free cluster list.
    575 */
    576static void dec_cluster_info_page(struct swap_info_struct *p,
    577	struct swap_cluster_info *cluster_info, unsigned long page_nr)
    578{
    579	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
    580
    581	if (!cluster_info)
    582		return;
    583
    584	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
    585	cluster_set_count(&cluster_info[idx],
    586		cluster_count(&cluster_info[idx]) - 1);
    587
    588	if (cluster_count(&cluster_info[idx]) == 0)
    589		free_cluster(p, idx);
    590}
    591
    592/*
    593 * It's possible that scan_swap_map_slots() uses a free cluster in the middle
    594 * of the free cluster list. Avoid such abuse to prevent list corruption.
    595 */
    596static bool
    597scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
    598	unsigned long offset)
    599{
    600	struct percpu_cluster *percpu_cluster;
    601	bool conflict;
    602
    603	offset /= SWAPFILE_CLUSTER;
    604	conflict = !cluster_list_empty(&si->free_clusters) &&
    605		offset != cluster_list_first(&si->free_clusters) &&
    606		cluster_is_free(&si->cluster_info[offset]);
    607
    608	if (!conflict)
    609		return false;
    610
    611	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
    612	cluster_set_null(&percpu_cluster->index);
    613	return true;
    614}
    615
    616/*
    617 * Try to get a swap entry from the current CPU's swap entry pool (a cluster).
    618 * This might involve allocating a new cluster for the current CPU too.
    619 */
    620static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
    621	unsigned long *offset, unsigned long *scan_base)
    622{
    623	struct percpu_cluster *cluster;
    624	struct swap_cluster_info *ci;
    625	unsigned long tmp, max;
    626
    627new_cluster:
    628	cluster = this_cpu_ptr(si->percpu_cluster);
    629	if (cluster_is_null(&cluster->index)) {
    630		if (!cluster_list_empty(&si->free_clusters)) {
    631			cluster->index = si->free_clusters.head;
    632			cluster->next = cluster_next(&cluster->index) *
    633					SWAPFILE_CLUSTER;
    634		} else if (!cluster_list_empty(&si->discard_clusters)) {
    635			/*
    636			 * We don't have a free cluster but have some clusters being
    637			 * discarded; do the discard now and reclaim them, then
    638			 * reread cluster_next_cpu since we dropped si->lock.
    639			 */
    640			swap_do_scheduled_discard(si);
    641			*scan_base = this_cpu_read(*si->cluster_next_cpu);
    642			*offset = *scan_base;
    643			goto new_cluster;
    644		} else
    645			return false;
    646	}
    647
    648	/*
    649	 * Other CPUs can use our cluster if they can't find a free cluster;
    650	 * check if there is still a free entry in the cluster.
    651	 */
    652	tmp = cluster->next;
    653	max = min_t(unsigned long, si->max,
    654		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
    655	if (tmp < max) {
    656		ci = lock_cluster(si, tmp);
    657		while (tmp < max) {
    658			if (!si->swap_map[tmp])
    659				break;
    660			tmp++;
    661		}
    662		unlock_cluster(ci);
    663	}
    664	if (tmp >= max) {
    665		cluster_set_null(&cluster->index);
    666		goto new_cluster;
    667	}
    668	cluster->next = tmp + 1;
    669	*offset = tmp;
    670	*scan_base = tmp;
    671	return true;
    672}
    673
    674static void __del_from_avail_list(struct swap_info_struct *p)
    675{
    676	int nid;
    677
    678	for_each_node(nid)
    679		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
    680}
    681
    682static void del_from_avail_list(struct swap_info_struct *p)
    683{
    684	spin_lock(&swap_avail_lock);
    685	__del_from_avail_list(p);
    686	spin_unlock(&swap_avail_lock);
    687}
    688
    689static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
    690			     unsigned int nr_entries)
    691{
    692	unsigned int end = offset + nr_entries - 1;
    693
    694	if (offset == si->lowest_bit)
    695		si->lowest_bit += nr_entries;
    696	if (end == si->highest_bit)
    697		WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
    698	si->inuse_pages += nr_entries;
    699	if (si->inuse_pages == si->pages) {
    700		si->lowest_bit = si->max;
    701		si->highest_bit = 0;
    702		del_from_avail_list(si);
    703	}
    704}
    705
    706static void add_to_avail_list(struct swap_info_struct *p)
    707{
    708	int nid;
    709
    710	spin_lock(&swap_avail_lock);
    711	for_each_node(nid) {
    712		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
    713		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
    714	}
    715	spin_unlock(&swap_avail_lock);
    716}
    717
    718static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
    719			    unsigned int nr_entries)
    720{
    721	unsigned long begin = offset;
    722	unsigned long end = offset + nr_entries - 1;
    723	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
    724
    725	if (offset < si->lowest_bit)
    726		si->lowest_bit = offset;
    727	if (end > si->highest_bit) {
    728		bool was_full = !si->highest_bit;
    729
    730		WRITE_ONCE(si->highest_bit, end);
    731		if (was_full && (si->flags & SWP_WRITEOK))
    732			add_to_avail_list(si);
    733	}
    734	atomic_long_add(nr_entries, &nr_swap_pages);
    735	si->inuse_pages -= nr_entries;
    736	if (si->flags & SWP_BLKDEV)
    737		swap_slot_free_notify =
    738			si->bdev->bd_disk->fops->swap_slot_free_notify;
    739	else
    740		swap_slot_free_notify = NULL;
    741	while (offset <= end) {
    742		arch_swap_invalidate_page(si->type, offset);
    743		frontswap_invalidate_page(si->type, offset);
    744		if (swap_slot_free_notify)
    745			swap_slot_free_notify(si->bdev, offset);
    746		offset++;
    747	}
    748	clear_shadow_from_swap_cache(si->type, begin, end);
    749}
    750
    751static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
    752{
    753	unsigned long prev;
    754
    755	if (!(si->flags & SWP_SOLIDSTATE)) {
    756		si->cluster_next = next;
    757		return;
    758	}
    759
    760	prev = this_cpu_read(*si->cluster_next_cpu);
    761	/*
    762	 * Cross the swap address space size aligned trunk, choose
    763	 * another trunk randomly to avoid lock contention on swap
    764	 * address space if possible.
    765	 */
    766	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
    767	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
    768		/* No free swap slots available */
    769		if (si->highest_bit <= si->lowest_bit)
    770			return;
    771		next = si->lowest_bit +
    772			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
    773		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
    774		next = max_t(unsigned int, next, si->lowest_bit);
    775	}
    776	this_cpu_write(*si->cluster_next_cpu, next);
    777}
    778
    779static bool swap_offset_available_and_locked(struct swap_info_struct *si,
    780					     unsigned long offset)
    781{
    782	if (data_race(!si->swap_map[offset])) {
    783		spin_lock(&si->lock);
    784		return true;
    785	}
    786
    787	if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
    788		spin_lock(&si->lock);
    789		return true;
    790	}
    791
    792	return false;
    793}
    794
    795static int scan_swap_map_slots(struct swap_info_struct *si,
    796			       unsigned char usage, int nr,
    797			       swp_entry_t slots[])
    798{
    799	struct swap_cluster_info *ci;
    800	unsigned long offset;
    801	unsigned long scan_base;
    802	unsigned long last_in_cluster = 0;
    803	int latency_ration = LATENCY_LIMIT;
    804	int n_ret = 0;
    805	bool scanned_many = false;
    806
    807	/*
    808	 * We try to cluster swap pages by allocating them sequentially
    809	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
    810	 * way, however, we resort to first-free allocation, starting
    811	 * a new cluster.  This prevents us from scattering swap pages
    812	 * all over the entire swap partition, so that we reduce
    813	 * overall disk seek times between swap pages.  -- sct
    814	 * But we do now try to find an empty cluster.  -Andrea
    815	 * And we let swap pages go all over an SSD partition.  Hugh
    816	 */
    817
    818	si->flags += SWP_SCANNING;
    819	/*
    820	 * Use percpu scan base for SSD to reduce lock contention on
    821	 * cluster and swap cache.  For HDD, sequential access is more
    822	 * important.
    823	 */
    824	if (si->flags & SWP_SOLIDSTATE)
    825		scan_base = this_cpu_read(*si->cluster_next_cpu);
    826	else
    827		scan_base = si->cluster_next;
    828	offset = scan_base;
    829
    830	/* SSD algorithm */
    831	if (si->cluster_info) {
    832		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
    833			goto scan;
    834	} else if (unlikely(!si->cluster_nr--)) {
    835		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
    836			si->cluster_nr = SWAPFILE_CLUSTER - 1;
    837			goto checks;
    838		}
    839
    840		spin_unlock(&si->lock);
    841
    842		/*
    843		 * If seek is expensive, start searching for new cluster from
    844		 * start of partition, to minimize the span of allocated swap.
    845		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
    846		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
    847		 */
    848		scan_base = offset = si->lowest_bit;
    849		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
    850
    851		/* Locate the first empty (unaligned) cluster */
    852		for (; last_in_cluster <= si->highest_bit; offset++) {
    853			if (si->swap_map[offset])
    854				last_in_cluster = offset + SWAPFILE_CLUSTER;
    855			else if (offset == last_in_cluster) {
    856				spin_lock(&si->lock);
    857				offset -= SWAPFILE_CLUSTER - 1;
    858				si->cluster_next = offset;
    859				si->cluster_nr = SWAPFILE_CLUSTER - 1;
    860				goto checks;
    861			}
    862			if (unlikely(--latency_ration < 0)) {
    863				cond_resched();
    864				latency_ration = LATENCY_LIMIT;
    865			}
    866		}
    867
    868		offset = scan_base;
    869		spin_lock(&si->lock);
    870		si->cluster_nr = SWAPFILE_CLUSTER - 1;
    871	}
    872
    873checks:
    874	if (si->cluster_info) {
    875		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
    876		/* take a break if we already got some slots */
    877			if (n_ret)
    878				goto done;
    879			if (!scan_swap_map_try_ssd_cluster(si, &offset,
    880							&scan_base))
    881				goto scan;
    882		}
    883	}
    884	if (!(si->flags & SWP_WRITEOK))
    885		goto no_page;
    886	if (!si->highest_bit)
    887		goto no_page;
    888	if (offset > si->highest_bit)
    889		scan_base = offset = si->lowest_bit;
    890
    891	ci = lock_cluster(si, offset);
    892	/* reuse swap entry of cache-only swap if not busy. */
    893	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
    894		int swap_was_freed;
    895		unlock_cluster(ci);
    896		spin_unlock(&si->lock);
    897		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
    898		spin_lock(&si->lock);
    899		/* entry was freed successfully, try to use this again */
    900		if (swap_was_freed)
    901			goto checks;
    902		goto scan; /* check next one */
    903	}
    904
    905	if (si->swap_map[offset]) {
    906		unlock_cluster(ci);
    907		if (!n_ret)
    908			goto scan;
    909		else
    910			goto done;
    911	}
    912	WRITE_ONCE(si->swap_map[offset], usage);
    913	inc_cluster_info_page(si, si->cluster_info, offset);
    914	unlock_cluster(ci);
    915
    916	swap_range_alloc(si, offset, 1);
    917	slots[n_ret++] = swp_entry(si->type, offset);
    918
    919	/* got enough slots or reached max slots? */
    920	if ((n_ret == nr) || (offset >= si->highest_bit))
    921		goto done;
    922
    923	/* search for next available slot */
    924
    925	/* time to take a break? */
    926	if (unlikely(--latency_ration < 0)) {
    927		if (n_ret)
    928			goto done;
    929		spin_unlock(&si->lock);
    930		cond_resched();
    931		spin_lock(&si->lock);
    932		latency_ration = LATENCY_LIMIT;
    933	}
    934
    935	/* try to get more slots in cluster */
    936	if (si->cluster_info) {
    937		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
    938			goto checks;
    939	} else if (si->cluster_nr && !si->swap_map[++offset]) {
    940		/* non-ssd case, still more slots in cluster? */
    941		--si->cluster_nr;
    942		goto checks;
    943	}
    944
    945	/*
    946	 * Even if there are no free clusters available (fragmented),
    947	 * try to scan a little more quickly with lock held unless we
    948	 * have scanned too many slots already.
    949	 */
    950	if (!scanned_many) {
    951		unsigned long scan_limit;
    952
    953		if (offset < scan_base)
    954			scan_limit = scan_base;
    955		else
    956			scan_limit = si->highest_bit;
    957		for (; offset <= scan_limit && --latency_ration > 0;
    958		     offset++) {
    959			if (!si->swap_map[offset])
    960				goto checks;
    961		}
    962	}
    963
    964done:
    965	set_cluster_next(si, offset + 1);
    966	si->flags -= SWP_SCANNING;
    967	return n_ret;
    968
    969scan:
    970	spin_unlock(&si->lock);
    971	while (++offset <= READ_ONCE(si->highest_bit)) {
    972		if (swap_offset_available_and_locked(si, offset))
    973			goto checks;
    974		if (unlikely(--latency_ration < 0)) {
    975			cond_resched();
    976			latency_ration = LATENCY_LIMIT;
    977			scanned_many = true;
    978		}
    979	}
    980	offset = si->lowest_bit;
    981	while (offset < scan_base) {
    982		if (swap_offset_available_and_locked(si, offset))
    983			goto checks;
    984		if (unlikely(--latency_ration < 0)) {
    985			cond_resched();
    986			latency_ration = LATENCY_LIMIT;
    987			scanned_many = true;
    988		}
    989		offset++;
    990	}
    991	spin_lock(&si->lock);
    992
    993no_page:
    994	si->flags -= SWP_SCANNING;
    995	return n_ret;
    996}
    997
    998static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
    999{
   1000	unsigned long idx;
   1001	struct swap_cluster_info *ci;
   1002	unsigned long offset;
   1003
   1004	/*
   1005	 * Should not even be attempting cluster allocations when huge
   1006	 * page swap is disabled.  Warn and fail the allocation.
   1007	 */
   1008	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
   1009		VM_WARN_ON_ONCE(1);
   1010		return 0;
   1011	}
   1012
   1013	if (cluster_list_empty(&si->free_clusters))
   1014		return 0;
   1015
   1016	idx = cluster_list_first(&si->free_clusters);
   1017	offset = idx * SWAPFILE_CLUSTER;
   1018	ci = lock_cluster(si, offset);
   1019	alloc_cluster(si, idx);
   1020	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
   1021
   1022	memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
   1023	unlock_cluster(ci);
   1024	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
   1025	*slot = swp_entry(si->type, offset);
   1026
   1027	return 1;
   1028}
   1029
   1030static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
   1031{
   1032	unsigned long offset = idx * SWAPFILE_CLUSTER;
   1033	struct swap_cluster_info *ci;
   1034
   1035	ci = lock_cluster(si, offset);
   1036	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
   1037	cluster_set_count_flag(ci, 0, 0);
   1038	free_cluster(si, idx);
   1039	unlock_cluster(ci);
   1040	swap_range_free(si, offset, SWAPFILE_CLUSTER);
   1041}
   1042
   1043int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
   1044{
   1045	unsigned long size = swap_entry_size(entry_size);
   1046	struct swap_info_struct *si, *next;
   1047	long avail_pgs;
   1048	int n_ret = 0;
   1049	int node;
   1050
   1051	/* Only single cluster request supported */
   1052	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
   1053
   1054	spin_lock(&swap_avail_lock);
   1055
   1056	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
   1057	if (avail_pgs <= 0) {
   1058		spin_unlock(&swap_avail_lock);
   1059		goto noswap;
   1060	}
   1061
   1062	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
   1063
   1064	atomic_long_sub(n_goal * size, &nr_swap_pages);
   1065
   1066start_over:
   1067	node = numa_node_id();
   1068	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
   1069		/* requeue si to after same-priority siblings */
   1070		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
   1071		spin_unlock(&swap_avail_lock);
   1072		spin_lock(&si->lock);
   1073		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
   1074			spin_lock(&swap_avail_lock);
   1075			if (plist_node_empty(&si->avail_lists[node])) {
   1076				spin_unlock(&si->lock);
   1077				goto nextsi;
   1078			}
   1079			WARN(!si->highest_bit,
   1080			     "swap_info %d in list but !highest_bit\n",
   1081			     si->type);
   1082			WARN(!(si->flags & SWP_WRITEOK),
   1083			     "swap_info %d in list but !SWP_WRITEOK\n",
   1084			     si->type);
   1085			__del_from_avail_list(si);
   1086			spin_unlock(&si->lock);
   1087			goto nextsi;
   1088		}
   1089		if (size == SWAPFILE_CLUSTER) {
   1090			if (si->flags & SWP_BLKDEV)
   1091				n_ret = swap_alloc_cluster(si, swp_entries);
   1092		} else
   1093			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
   1094						    n_goal, swp_entries);
   1095		spin_unlock(&si->lock);
   1096		if (n_ret || size == SWAPFILE_CLUSTER)
   1097			goto check_out;
   1098		pr_debug("scan_swap_map of si %d failed to find offset\n",
   1099			si->type);
   1100
   1101		spin_lock(&swap_avail_lock);
   1102nextsi:
   1103		/*
   1104		 * if we got here, it's likely that si was almost full before,
   1105		 * and since scan_swap_map_slots() can drop the si->lock,
   1106		 * multiple callers probably all tried to get a page from the
   1107		 * same si and it filled up before we could get one; or, the si
   1108		 * filled up between us dropping swap_avail_lock and taking
   1109		 * si->lock. Since we dropped the swap_avail_lock, the
   1110		 * swap_avail_head list may have been modified; so if next is
   1111		 * still in the swap_avail_head list then try it, otherwise
   1112		 * start over if we have not gotten any slots.
   1113		 */
   1114		if (plist_node_empty(&next->avail_lists[node]))
   1115			goto start_over;
   1116	}
   1117
   1118	spin_unlock(&swap_avail_lock);
   1119
   1120check_out:
   1121	if (n_ret < n_goal)
   1122		atomic_long_add((long)(n_goal - n_ret) * size,
   1123				&nr_swap_pages);
   1124noswap:
   1125	return n_ret;
   1126}
   1127
   1128static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
   1129{
   1130	struct swap_info_struct *p;
   1131	unsigned long offset;
   1132
   1133	if (!entry.val)
   1134		goto out;
   1135	p = swp_swap_info(entry);
   1136	if (!p)
   1137		goto bad_nofile;
   1138	if (data_race(!(p->flags & SWP_USED)))
   1139		goto bad_device;
   1140	offset = swp_offset(entry);
   1141	if (offset >= p->max)
   1142		goto bad_offset;
   1143	if (data_race(!p->swap_map[swp_offset(entry)]))
   1144		goto bad_free;
   1145	return p;
   1146
   1147bad_free:
   1148	pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
   1149	goto out;
   1150bad_offset:
   1151	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
   1152	goto out;
   1153bad_device:
   1154	pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
   1155	goto out;
   1156bad_nofile:
   1157	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
   1158out:
   1159	return NULL;
   1160}
   1161
   1162static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
   1163					struct swap_info_struct *q)
   1164{
   1165	struct swap_info_struct *p;
   1166
   1167	p = _swap_info_get(entry);
   1168
   1169	if (p != q) {
   1170		if (q != NULL)
   1171			spin_unlock(&q->lock);
   1172		if (p != NULL)
   1173			spin_lock(&p->lock);
   1174	}
   1175	return p;
   1176}
   1177
   1178static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
   1179					      unsigned long offset,
   1180					      unsigned char usage)
   1181{
   1182	unsigned char count;
   1183	unsigned char has_cache;
   1184
   1185	count = p->swap_map[offset];
   1186
   1187	has_cache = count & SWAP_HAS_CACHE;
   1188	count &= ~SWAP_HAS_CACHE;
   1189
   1190	if (usage == SWAP_HAS_CACHE) {
   1191		VM_BUG_ON(!has_cache);
   1192		has_cache = 0;
   1193	} else if (count == SWAP_MAP_SHMEM) {
   1194		/*
   1195		 * Or we could insist on shmem.c using a special
   1196		 * swap_shmem_free() and free_shmem_swap_and_cache()...
   1197		 */
   1198		count = 0;
   1199	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
   1200		if (count == COUNT_CONTINUED) {
   1201			if (swap_count_continued(p, offset, count))
   1202				count = SWAP_MAP_MAX | COUNT_CONTINUED;
   1203			else
   1204				count = SWAP_MAP_MAX;
   1205		} else
   1206			count--;
   1207	}
   1208
   1209	usage = count | has_cache;
   1210	if (usage)
   1211		WRITE_ONCE(p->swap_map[offset], usage);
   1212	else
   1213		WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
   1214
   1215	return usage;
   1216}
   1217
   1218/*
   1219 * Check whether swap entry is valid in the swap device.  If so,
   1220 * return pointer to swap_info_struct, and keep the swap entry valid
   1221 * by preventing the swap device from being swapped off, until
   1222 * put_swap_device() is called.  Otherwise return NULL.
   1223 *
   1224 * Notice that swapoff or swapoff+swapon can still happen before the
   1225 * percpu_ref_tryget_live() in get_swap_device() or after the
   1226 * percpu_ref_put() in put_swap_device() if there isn't any other way
   1227 * to prevent swapoff, such as page lock, page table lock, etc.  The
   1228 * caller must be prepared for that.  For example, the following
   1229 * situation is possible.
   1230 *
   1231 *   CPU1				CPU2
   1232 *   do_swap_page()
   1233 *     ...				swapoff+swapon
   1234 *     __read_swap_cache_async()
   1235 *       swapcache_prepare()
   1236 *         __swap_duplicate()
   1237 *           // check swap_map
   1238 *     // verify PTE not changed
   1239 *
   1240 * In __swap_duplicate(), the swap_map needs to be checked before
   1241 * being changed, partly because the specified swap entry may be for another
   1242 * swap device which has been swapped off.  And in do_swap_page(), after
   1243 * the page is read from the swap device, the PTE is verified not
   1244 * changed with the page table locked to check whether the swap device
   1245 * has been swapoff or swapoff+swapon.
   1246 */
   1247struct swap_info_struct *get_swap_device(swp_entry_t entry)
   1248{
   1249	struct swap_info_struct *si;
   1250	unsigned long offset;
   1251
   1252	if (!entry.val)
   1253		goto out;
   1254	si = swp_swap_info(entry);
   1255	if (!si)
   1256		goto bad_nofile;
   1257	if (!percpu_ref_tryget_live(&si->users))
   1258		goto out;
   1259	/*
   1260	 * Guarantee the si->users are checked before accessing other
   1261	 * fields of swap_info_struct.
   1262	 *
   1263	 * Paired with the spin_unlock() after setup_swap_info() in
   1264	 * enable_swap_info().
   1265	 */
   1266	smp_rmb();
   1267	offset = swp_offset(entry);
   1268	if (offset >= si->max)
   1269		goto put_out;
   1270
   1271	return si;
   1272bad_nofile:
   1273	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
   1274out:
   1275	return NULL;
   1276put_out:
   1277	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
   1278	percpu_ref_put(&si->users);
   1279	return NULL;
   1280}
   1281
   1282static unsigned char __swap_entry_free(struct swap_info_struct *p,
   1283				       swp_entry_t entry)
   1284{
   1285	struct swap_cluster_info *ci;
   1286	unsigned long offset = swp_offset(entry);
   1287	unsigned char usage;
   1288
   1289	ci = lock_cluster_or_swap_info(p, offset);
   1290	usage = __swap_entry_free_locked(p, offset, 1);
   1291	unlock_cluster_or_swap_info(p, ci);
   1292	if (!usage)
   1293		free_swap_slot(entry);
   1294
   1295	return usage;
   1296}
   1297
   1298static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
   1299{
   1300	struct swap_cluster_info *ci;
   1301	unsigned long offset = swp_offset(entry);
   1302	unsigned char count;
   1303
   1304	ci = lock_cluster(p, offset);
   1305	count = p->swap_map[offset];
   1306	VM_BUG_ON(count != SWAP_HAS_CACHE);
   1307	p->swap_map[offset] = 0;
   1308	dec_cluster_info_page(p, p->cluster_info, offset);
   1309	unlock_cluster(ci);
   1310
   1311	mem_cgroup_uncharge_swap(entry, 1);
   1312	swap_range_free(p, offset, 1);
   1313}
   1314
   1315/*
   1316 * Caller has made sure that the swap device corresponding to entry
   1317 * is still around or has not been recycled.
   1318 */
   1319void swap_free(swp_entry_t entry)
   1320{
   1321	struct swap_info_struct *p;
   1322
   1323	p = _swap_info_get(entry);
   1324	if (p)
   1325		__swap_entry_free(p, entry);
   1326}
   1327
   1328/*
   1329 * Called after dropping swapcache to decrease refcnt to swap entries.
   1330 */
   1331void put_swap_page(struct page *page, swp_entry_t entry)
   1332{
   1333	unsigned long offset = swp_offset(entry);
   1334	unsigned long idx = offset / SWAPFILE_CLUSTER;
   1335	struct swap_cluster_info *ci;
   1336	struct swap_info_struct *si;
   1337	unsigned char *map;
   1338	unsigned int i, free_entries = 0;
   1339	unsigned char val;
   1340	int size = swap_entry_size(thp_nr_pages(page));
   1341
   1342	si = _swap_info_get(entry);
   1343	if (!si)
   1344		return;
   1345
   1346	ci = lock_cluster_or_swap_info(si, offset);
   1347	if (size == SWAPFILE_CLUSTER) {
   1348		VM_BUG_ON(!cluster_is_huge(ci));
   1349		map = si->swap_map + offset;
   1350		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
   1351			val = map[i];
   1352			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
   1353			if (val == SWAP_HAS_CACHE)
   1354				free_entries++;
   1355		}
   1356		cluster_clear_huge(ci);
   1357		if (free_entries == SWAPFILE_CLUSTER) {
   1358			unlock_cluster_or_swap_info(si, ci);
   1359			spin_lock(&si->lock);
   1360			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
   1361			swap_free_cluster(si, idx);
   1362			spin_unlock(&si->lock);
   1363			return;
   1364		}
   1365	}
   1366	for (i = 0; i < size; i++, entry.val++) {
   1367		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
   1368			unlock_cluster_or_swap_info(si, ci);
   1369			free_swap_slot(entry);
   1370			if (i == size - 1)
   1371				return;
   1372			lock_cluster_or_swap_info(si, offset);
   1373		}
   1374	}
   1375	unlock_cluster_or_swap_info(si, ci);
   1376}
   1377
   1378#ifdef CONFIG_THP_SWAP
   1379int split_swap_cluster(swp_entry_t entry)
   1380{
   1381	struct swap_info_struct *si;
   1382	struct swap_cluster_info *ci;
   1383	unsigned long offset = swp_offset(entry);
   1384
   1385	si = _swap_info_get(entry);
   1386	if (!si)
   1387		return -EBUSY;
   1388	ci = lock_cluster(si, offset);
   1389	cluster_clear_huge(ci);
   1390	unlock_cluster(ci);
   1391	return 0;
   1392}
   1393#endif
   1394
   1395static int swp_entry_cmp(const void *ent1, const void *ent2)
   1396{
   1397	const swp_entry_t *e1 = ent1, *e2 = ent2;
   1398
   1399	return (int)swp_type(*e1) - (int)swp_type(*e2);
   1400}
   1401
   1402void swapcache_free_entries(swp_entry_t *entries, int n)
   1403{
   1404	struct swap_info_struct *p, *prev;
   1405	int i;
   1406
   1407	if (n <= 0)
   1408		return;
   1409
   1410	prev = NULL;
   1411	p = NULL;
   1412
   1413	/*
   1414	 * Sort swap entries by swap device, so each lock is only taken once.
   1415	 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
   1416	 * so low that it isn't necessary to optimize further.
   1417	 */
   1418	if (nr_swapfiles > 1)
   1419		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
   1420	for (i = 0; i < n; ++i) {
   1421		p = swap_info_get_cont(entries[i], prev);
   1422		if (p)
   1423			swap_entry_free(p, entries[i]);
   1424		prev = p;
   1425	}
   1426	if (p)
   1427		spin_unlock(&p->lock);
   1428}
   1429
   1430/*
   1431 * How many references to page are currently swapped out?
   1432 * This does not give an exact answer when swap count is continued,
   1433 * but does include the high COUNT_CONTINUED flag to allow for that.
   1434 */
   1435static int page_swapcount(struct page *page)
   1436{
   1437	int count = 0;
   1438	struct swap_info_struct *p;
   1439	struct swap_cluster_info *ci;
   1440	swp_entry_t entry;
   1441	unsigned long offset;
   1442
   1443	entry.val = page_private(page);
   1444	p = _swap_info_get(entry);
   1445	if (p) {
   1446		offset = swp_offset(entry);
   1447		ci = lock_cluster_or_swap_info(p, offset);
   1448		count = swap_count(p->swap_map[offset]);
   1449		unlock_cluster_or_swap_info(p, ci);
   1450	}
   1451	return count;
   1452}
   1453
   1454int __swap_count(swp_entry_t entry)
   1455{
   1456	struct swap_info_struct *si;
   1457	pgoff_t offset = swp_offset(entry);
   1458	int count = 0;
   1459
   1460	si = get_swap_device(entry);
   1461	if (si) {
   1462		count = swap_count(si->swap_map[offset]);
   1463		put_swap_device(si);
   1464	}
   1465	return count;
   1466}
   1467
   1468static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
   1469{
   1470	int count = 0;
   1471	pgoff_t offset = swp_offset(entry);
   1472	struct swap_cluster_info *ci;
   1473
   1474	ci = lock_cluster_or_swap_info(si, offset);
   1475	count = swap_count(si->swap_map[offset]);
   1476	unlock_cluster_or_swap_info(si, ci);
   1477	return count;
   1478}
   1479
   1480/*
   1481 * How many references to @entry are currently swapped out?
   1482 * This does not give an exact answer when swap count is continued,
   1483 * but does include the high COUNT_CONTINUED flag to allow for that.
   1484 */
   1485int __swp_swapcount(swp_entry_t entry)
   1486{
   1487	int count = 0;
   1488	struct swap_info_struct *si;
   1489
   1490	si = get_swap_device(entry);
   1491	if (si) {
   1492		count = swap_swapcount(si, entry);
   1493		put_swap_device(si);
   1494	}
   1495	return count;
   1496}
   1497
   1498/*
   1499 * How many references to @entry are currently swapped out?
   1500 * This considers COUNT_CONTINUED so it returns an exact answer.
   1501 */
   1502int swp_swapcount(swp_entry_t entry)
   1503{
   1504	int count, tmp_count, n;
   1505	struct swap_info_struct *p;
   1506	struct swap_cluster_info *ci;
   1507	struct page *page;
   1508	pgoff_t offset;
   1509	unsigned char *map;
   1510
   1511	p = _swap_info_get(entry);
   1512	if (!p)
   1513		return 0;
   1514
   1515	offset = swp_offset(entry);
   1516
   1517	ci = lock_cluster_or_swap_info(p, offset);
   1518
   1519	count = swap_count(p->swap_map[offset]);
   1520	if (!(count & COUNT_CONTINUED))
   1521		goto out;
   1522
   1523	count &= ~COUNT_CONTINUED;
   1524	n = SWAP_MAP_MAX + 1;
   1525
   1526	page = vmalloc_to_page(p->swap_map + offset);
   1527	offset &= ~PAGE_MASK;
   1528	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
   1529
   1530	do {
   1531		page = list_next_entry(page, lru);
   1532		map = kmap_atomic(page);
   1533		tmp_count = map[offset];
   1534		kunmap_atomic(map);
   1535
   1536		count += (tmp_count & ~COUNT_CONTINUED) * n;
   1537		n *= (SWAP_CONT_MAX + 1);
   1538	} while (tmp_count & COUNT_CONTINUED);
   1539out:
   1540	unlock_cluster_or_swap_info(p, ci);
   1541	return count;
   1542}
   1543
   1544static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
   1545					 swp_entry_t entry)
   1546{
   1547	struct swap_cluster_info *ci;
   1548	unsigned char *map = si->swap_map;
   1549	unsigned long roffset = swp_offset(entry);
   1550	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
   1551	int i;
   1552	bool ret = false;
   1553
   1554	ci = lock_cluster_or_swap_info(si, offset);
   1555	if (!ci || !cluster_is_huge(ci)) {
   1556		if (swap_count(map[roffset]))
   1557			ret = true;
   1558		goto unlock_out;
   1559	}
   1560	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
   1561		if (swap_count(map[offset + i])) {
   1562			ret = true;
   1563			break;
   1564		}
   1565	}
   1566unlock_out:
   1567	unlock_cluster_or_swap_info(si, ci);
   1568	return ret;
   1569}
   1570
   1571static bool page_swapped(struct page *page)
   1572{
   1573	swp_entry_t entry;
   1574	struct swap_info_struct *si;
   1575
   1576	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
   1577		return page_swapcount(page) != 0;
   1578
   1579	page = compound_head(page);
   1580	entry.val = page_private(page);
   1581	si = _swap_info_get(entry);
   1582	if (si)
   1583		return swap_page_trans_huge_swapped(si, entry);
   1584	return false;
   1585}
   1586
   1587/*
   1588 * If swap is getting full, or if there are no more mappings of this page,
   1589 * then try_to_free_swap is called to free its swap space.
   1590 */
   1591int try_to_free_swap(struct page *page)
   1592{
   1593	VM_BUG_ON_PAGE(!PageLocked(page), page);
   1594
   1595	if (!PageSwapCache(page))
   1596		return 0;
   1597	if (PageWriteback(page))
   1598		return 0;
   1599	if (page_swapped(page))
   1600		return 0;
   1601
   1602	/*
   1603	 * Once hibernation has begun to create its image of memory,
   1604	 * there's a danger that one of the calls to try_to_free_swap()
   1605	 * - most probably a call from __try_to_reclaim_swap() while
   1606	 * hibernation is allocating its own swap pages for the image,
   1607	 * but conceivably even a call from memory reclaim - will free
   1608	 * the swap from a page which has already been recorded in the
   1609	 * image as a clean swapcache page, and then reuse its swap for
   1610	 * another page of the image.  On waking from hibernation, the
   1611	 * original page might be freed under memory pressure, then
   1612	 * later read back in from swap, now with the wrong data.
   1613	 *
   1614	 * Hibernation suspends storage while it is writing the image
   1615	 * to disk so check that here.
   1616	 */
   1617	if (pm_suspended_storage())
   1618		return 0;
   1619
   1620	page = compound_head(page);
   1621	delete_from_swap_cache(page);
   1622	SetPageDirty(page);
   1623	return 1;
   1624}
   1625
   1626/*
   1627 * Free the swap entry like above, but also try to
   1628 * free the page cache entry if it is the last user.
   1629 */
   1630int free_swap_and_cache(swp_entry_t entry)
   1631{
   1632	struct swap_info_struct *p;
   1633	unsigned char count;
   1634
   1635	if (non_swap_entry(entry))
   1636		return 1;
   1637
   1638	p = _swap_info_get(entry);
   1639	if (p) {
   1640		count = __swap_entry_free(p, entry);
   1641		if (count == SWAP_HAS_CACHE &&
   1642		    !swap_page_trans_huge_swapped(p, entry))
   1643			__try_to_reclaim_swap(p, swp_offset(entry),
   1644					      TTRS_UNMAPPED | TTRS_FULL);
   1645	}
   1646	return p != NULL;
   1647}
   1648
   1649#ifdef CONFIG_HIBERNATION
   1650
   1651swp_entry_t get_swap_page_of_type(int type)
   1652{
   1653	struct swap_info_struct *si = swap_type_to_swap_info(type);
   1654	swp_entry_t entry = {0};
   1655
   1656	if (!si)
   1657		goto fail;
   1658
   1659	/* This is called for allocating swap entry, not cache */
   1660	spin_lock(&si->lock);
   1661	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
   1662		atomic_long_dec(&nr_swap_pages);
   1663	spin_unlock(&si->lock);
   1664fail:
   1665	return entry;
   1666}
   1667
   1668/*
   1669 * Find the swap type that corresponds to the given device (if any).
   1670 *
   1671 * @offset - number of the PAGE_SIZE-sized block of the device, starting
   1672 * from 0, in which the swap header is expected to be located.
   1673 *
   1674 * This is needed for the suspend to disk (aka swsusp).
   1675 */
   1676int swap_type_of(dev_t device, sector_t offset)
   1677{
   1678	int type;
   1679
   1680	if (!device)
   1681		return -1;
   1682
   1683	spin_lock(&swap_lock);
   1684	for (type = 0; type < nr_swapfiles; type++) {
   1685		struct swap_info_struct *sis = swap_info[type];
   1686
   1687		if (!(sis->flags & SWP_WRITEOK))
   1688			continue;
   1689
   1690		if (device == sis->bdev->bd_dev) {
   1691			struct swap_extent *se = first_se(sis);
   1692
   1693			if (se->start_block == offset) {
   1694				spin_unlock(&swap_lock);
   1695				return type;
   1696			}
   1697		}
   1698	}
   1699	spin_unlock(&swap_lock);
   1700	return -ENODEV;
   1701}
   1702
   1703int find_first_swap(dev_t *device)
   1704{
   1705	int type;
   1706
   1707	spin_lock(&swap_lock);
   1708	for (type = 0; type < nr_swapfiles; type++) {
   1709		struct swap_info_struct *sis = swap_info[type];
   1710
   1711		if (!(sis->flags & SWP_WRITEOK))
   1712			continue;
   1713		*device = sis->bdev->bd_dev;
   1714		spin_unlock(&swap_lock);
   1715		return type;
   1716	}
   1717	spin_unlock(&swap_lock);
   1718	return -ENODEV;
   1719}
   1720
   1721/*
   1722 * Get the (PAGE_SIZE) block corresponding to the given offset on the swapdev
   1723 * corresponding to the given index in swap_info (swap type).
   1724 */
   1725sector_t swapdev_block(int type, pgoff_t offset)
   1726{
   1727	struct swap_info_struct *si = swap_type_to_swap_info(type);
   1728	struct swap_extent *se;
   1729
   1730	if (!si || !(si->flags & SWP_WRITEOK))
   1731		return 0;
   1732	se = offset_to_swap_extent(si, offset);
   1733	return se->start_block + (offset - se->start_page);
   1734}
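
/*
 * Editor's sketch (not part of the original source, numbers made up):
 * a worked example of the extent lookup above.  Suppose the extent
 * returned for @offset has
 *
 *	se->start_page  = 100
 *	se->nr_pages    = 50
 *	se->start_block = 2000
 *
 * so swap page offsets 100..149 map linearly onto disk blocks
 * 2000..2049.  Then swapdev_block(type, 120) returns
 * 2000 + (120 - 100) = 2020.
 */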
   1735
   1736/*
   1737 * Return either the total number of swap pages of given type, or the number
   1738 * of free pages of that type (depending on @free)
   1739 *
   1740 * This is needed for software suspend
   1741 */
   1742unsigned int count_swap_pages(int type, int free)
   1743{
   1744	unsigned int n = 0;
   1745
   1746	spin_lock(&swap_lock);
   1747	if ((unsigned int)type < nr_swapfiles) {
   1748		struct swap_info_struct *sis = swap_info[type];
   1749
   1750		spin_lock(&sis->lock);
   1751		if (sis->flags & SWP_WRITEOK) {
   1752			n = sis->pages;
   1753			if (free)
   1754				n -= sis->inuse_pages;
   1755		}
   1756		spin_unlock(&sis->lock);
   1757	}
   1758	spin_unlock(&swap_lock);
   1759	return n;
   1760}
   1761#endif /* CONFIG_HIBERNATION */
   1762
   1763static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
   1764{
   1765	return pte_same(pte_swp_clear_flags(pte), swp_pte);
   1766}
   1767
   1768/*
   1769 * No need to decide whether this PTE shares the swap entry with others,
   1770 * just let do_wp_page work it out if a write is requested later - to
   1771 * force COW, vm_page_prot omits write permission from any private vma.
   1772 */
   1773static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
   1774		unsigned long addr, swp_entry_t entry, struct page *page)
   1775{
   1776	struct page *swapcache;
   1777	spinlock_t *ptl;
   1778	pte_t *pte, new_pte;
   1779	int ret = 1;
   1780
   1781	swapcache = page;
   1782	page = ksm_might_need_to_copy(page, vma, addr);
   1783	if (unlikely(!page))
   1784		return -ENOMEM;
   1785
   1786	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
   1787	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
   1788		ret = 0;
   1789		goto out;
   1790	}
   1791
   1792	if (unlikely(!PageUptodate(page))) {
   1793		pte_t pteval;
   1794
   1795		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
   1796		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
   1797		set_pte_at(vma->vm_mm, addr, pte, pteval);
   1798		swap_free(entry);
   1799		ret = 0;
   1800		goto out;
   1801	}
   1802
   1803	/* See do_swap_page() */
   1804	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
   1805	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
   1806
   1807	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
   1808	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
   1809	get_page(page);
   1810	if (page == swapcache) {
   1811		rmap_t rmap_flags = RMAP_NONE;
   1812
   1813		/*
   1814		 * See do_swap_page(): PageWriteback() would be problematic.
   1815		 * However, we do a wait_on_page_writeback() just before this
   1816		 * call and have the page locked.
   1817		 */
   1818		VM_BUG_ON_PAGE(PageWriteback(page), page);
   1819		if (pte_swp_exclusive(*pte))
   1820			rmap_flags |= RMAP_EXCLUSIVE;
   1821
   1822		page_add_anon_rmap(page, vma, addr, rmap_flags);
   1823	} else { /* ksm created a completely new copy */
   1824		page_add_new_anon_rmap(page, vma, addr);
   1825		lru_cache_add_inactive_or_unevictable(page, vma);
   1826	}
   1827	new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
   1828	if (pte_swp_soft_dirty(*pte))
   1829		new_pte = pte_mksoft_dirty(new_pte);
   1830	if (pte_swp_uffd_wp(*pte))
   1831		new_pte = pte_mkuffd_wp(new_pte);
   1832	set_pte_at(vma->vm_mm, addr, pte, new_pte);
   1833	swap_free(entry);
   1834out:
   1835	pte_unmap_unlock(pte, ptl);
   1836	if (page != swapcache) {
   1837		unlock_page(page);
   1838		put_page(page);
   1839	}
   1840	return ret;
   1841}
   1842
   1843static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
   1844			unsigned long addr, unsigned long end,
   1845			unsigned int type)
   1846{
   1847	struct page *page;
   1848	swp_entry_t entry;
   1849	pte_t *pte;
   1850	struct swap_info_struct *si;
   1851	unsigned long offset;
   1852	int ret = 0;
   1853	volatile unsigned char *swap_map;
   1854
   1855	si = swap_info[type];
   1856	pte = pte_offset_map(pmd, addr);
   1857	do {
   1858		if (!is_swap_pte(*pte))
   1859			continue;
   1860
   1861		entry = pte_to_swp_entry(*pte);
   1862		if (swp_type(entry) != type)
   1863			continue;
   1864
   1865		offset = swp_offset(entry);
   1866		pte_unmap(pte);
   1867		swap_map = &si->swap_map[offset];
   1868		page = lookup_swap_cache(entry, vma, addr);
   1869		if (!page) {
   1870			struct vm_fault vmf = {
   1871				.vma = vma,
   1872				.address = addr,
   1873				.real_address = addr,
   1874				.pmd = pmd,
   1875			};
   1876
   1877			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
   1878						&vmf);
   1879		}
   1880		if (!page) {
   1881			if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
   1882				goto try_next;
   1883			return -ENOMEM;
   1884		}
   1885
   1886		lock_page(page);
   1887		wait_on_page_writeback(page);
   1888		ret = unuse_pte(vma, pmd, addr, entry, page);
   1889		if (ret < 0) {
   1890			unlock_page(page);
   1891			put_page(page);
   1892			goto out;
   1893		}
   1894
   1895		try_to_free_swap(page);
   1896		unlock_page(page);
   1897		put_page(page);
   1898try_next:
   1899		pte = pte_offset_map(pmd, addr);
   1900	} while (pte++, addr += PAGE_SIZE, addr != end);
   1901	pte_unmap(pte - 1);
   1902
   1903	ret = 0;
   1904out:
   1905	return ret;
   1906}
   1907
   1908static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
   1909				unsigned long addr, unsigned long end,
   1910				unsigned int type)
   1911{
   1912	pmd_t *pmd;
   1913	unsigned long next;
   1914	int ret;
   1915
   1916	pmd = pmd_offset(pud, addr);
   1917	do {
   1918		cond_resched();
   1919		next = pmd_addr_end(addr, end);
   1920		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
   1921			continue;
   1922		ret = unuse_pte_range(vma, pmd, addr, next, type);
   1923		if (ret)
   1924			return ret;
   1925	} while (pmd++, addr = next, addr != end);
   1926	return 0;
   1927}
   1928
   1929static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
   1930				unsigned long addr, unsigned long end,
   1931				unsigned int type)
   1932{
   1933	pud_t *pud;
   1934	unsigned long next;
   1935	int ret;
   1936
   1937	pud = pud_offset(p4d, addr);
   1938	do {
   1939		next = pud_addr_end(addr, end);
   1940		if (pud_none_or_clear_bad(pud))
   1941			continue;
   1942		ret = unuse_pmd_range(vma, pud, addr, next, type);
   1943		if (ret)
   1944			return ret;
   1945	} while (pud++, addr = next, addr != end);
   1946	return 0;
   1947}
   1948
   1949static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
   1950				unsigned long addr, unsigned long end,
   1951				unsigned int type)
   1952{
   1953	p4d_t *p4d;
   1954	unsigned long next;
   1955	int ret;
   1956
   1957	p4d = p4d_offset(pgd, addr);
   1958	do {
   1959		next = p4d_addr_end(addr, end);
   1960		if (p4d_none_or_clear_bad(p4d))
   1961			continue;
   1962		ret = unuse_pud_range(vma, p4d, addr, next, type);
   1963		if (ret)
   1964			return ret;
   1965	} while (p4d++, addr = next, addr != end);
   1966	return 0;
   1967}
   1968
   1969static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
   1970{
   1971	pgd_t *pgd;
   1972	unsigned long addr, end, next;
   1973	int ret;
   1974
   1975	addr = vma->vm_start;
   1976	end = vma->vm_end;
   1977
   1978	pgd = pgd_offset(vma->vm_mm, addr);
   1979	do {
   1980		next = pgd_addr_end(addr, end);
   1981		if (pgd_none_or_clear_bad(pgd))
   1982			continue;
   1983		ret = unuse_p4d_range(vma, pgd, addr, next, type);
   1984		if (ret)
   1985			return ret;
   1986	} while (pgd++, addr = next, addr != end);
   1987	return 0;
   1988}
   1989
   1990static int unuse_mm(struct mm_struct *mm, unsigned int type)
   1991{
   1992	struct vm_area_struct *vma;
   1993	int ret = 0;
   1994
   1995	mmap_read_lock(mm);
   1996	for (vma = mm->mmap; vma; vma = vma->vm_next) {
   1997		if (vma->anon_vma) {
   1998			ret = unuse_vma(vma, type);
   1999			if (ret)
   2000				break;
   2001		}
   2002		cond_resched();
   2003	}
   2004	mmap_read_unlock(mm);
   2005	return ret;
   2006}
   2007
   2008/*
   2009 * Scan swap_map from current position to next entry still in use.
   2010 * Return 0 if there are no inuse entries after prev till end of
   2011 * the map.
   2012 */
   2013static unsigned int find_next_to_unuse(struct swap_info_struct *si,
   2014					unsigned int prev)
   2015{
   2016	unsigned int i;
   2017	unsigned char count;
   2018
   2019	/*
   2020	 * No need for swap_lock here: we're just looking
   2021	 * for whether an entry is in use, not modifying it; false
   2022	 * hits are okay, and sys_swapoff() has already prevented new
   2023	 * allocations from this area (while holding swap_lock).
   2024	 */
   2025	for (i = prev + 1; i < si->max; i++) {
   2026		count = READ_ONCE(si->swap_map[i]);
   2027		if (count && swap_count(count) != SWAP_MAP_BAD)
   2028			break;
   2029		if ((i % LATENCY_LIMIT) == 0)
   2030			cond_resched();
   2031	}
   2032
   2033	if (i == si->max)
   2034		i = 0;
   2035
   2036	return i;
   2037}
   2038
   2039static int try_to_unuse(unsigned int type)
   2040{
   2041	struct mm_struct *prev_mm;
   2042	struct mm_struct *mm;
   2043	struct list_head *p;
   2044	int retval = 0;
   2045	struct swap_info_struct *si = swap_info[type];
   2046	struct page *page;
   2047	swp_entry_t entry;
   2048	unsigned int i;
   2049
   2050	if (!READ_ONCE(si->inuse_pages))
   2051		return 0;
   2052
   2053retry:
   2054	retval = shmem_unuse(type);
   2055	if (retval)
   2056		return retval;
   2057
   2058	prev_mm = &init_mm;
   2059	mmget(prev_mm);
   2060
   2061	spin_lock(&mmlist_lock);
   2062	p = &init_mm.mmlist;
   2063	while (READ_ONCE(si->inuse_pages) &&
   2064	       !signal_pending(current) &&
   2065	       (p = p->next) != &init_mm.mmlist) {
   2066
   2067		mm = list_entry(p, struct mm_struct, mmlist);
   2068		if (!mmget_not_zero(mm))
   2069			continue;
   2070		spin_unlock(&mmlist_lock);
   2071		mmput(prev_mm);
   2072		prev_mm = mm;
   2073		retval = unuse_mm(mm, type);
   2074		if (retval) {
   2075			mmput(prev_mm);
   2076			return retval;
   2077		}
   2078
   2079		/*
   2080		 * Make sure that we aren't completely killing
   2081		 * interactive performance.
   2082		 */
   2083		cond_resched();
   2084		spin_lock(&mmlist_lock);
   2085	}
   2086	spin_unlock(&mmlist_lock);
   2087
   2088	mmput(prev_mm);
   2089
   2090	i = 0;
   2091	while (READ_ONCE(si->inuse_pages) &&
   2092	       !signal_pending(current) &&
   2093	       (i = find_next_to_unuse(si, i)) != 0) {
   2094
   2095		entry = swp_entry(type, i);
   2096		page = find_get_page(swap_address_space(entry), i);
   2097		if (!page)
   2098			continue;
   2099
   2100		/*
   2101		 * It is conceivable that a racing task removed this page from
   2102		 * swap cache just before we acquired the page lock. The page
   2103		 * might even be back in swap cache on another swap area. But
   2104		 * that is okay, try_to_free_swap() only removes stale pages.
   2105		 */
   2106		lock_page(page);
   2107		wait_on_page_writeback(page);
   2108		try_to_free_swap(page);
   2109		unlock_page(page);
   2110		put_page(page);
   2111	}
   2112
   2113	/*
   2114	 * Let's check again to see if there are still swap entries in the map.
   2115	 * If yes, we need to retry the unuse logic again.
   2116	 * Under global memory pressure, swap entries can be reinserted back
   2117	 * into process space after the mmlist loop above passes over them.
   2118	 *
   2119	 * Limit the number of retries? No: when mmget_not_zero()
   2120	 * above fails, that mm is likely to be freeing swap from
   2121	 * exit_mmap(), which proceeds at its own independent pace;
   2122	 * and even shmem_writepage() could have been preempted after
   2123	 * folio_alloc_swap(), temporarily hiding that swap.  It's easy
   2124	 * and robust (though cpu-intensive) just to keep retrying.
   2125	 */
   2126	if (READ_ONCE(si->inuse_pages)) {
   2127		if (!signal_pending(current))
   2128			goto retry;
   2129		return -EINTR;
   2130	}
   2131
   2132	return 0;
   2133}
   2134
   2135/*
   2136 * After a successful try_to_unuse, if no swap is now in use, we know
   2137 * we can empty the mmlist.  swap_lock must be held on entry and exit.
   2138 * Note that mmlist_lock nests inside swap_lock, and an mm must be
   2139 * added to the mmlist just after page_duplicate - before would be racy.
   2140 */
   2141static void drain_mmlist(void)
   2142{
   2143	struct list_head *p, *next;
   2144	unsigned int type;
   2145
   2146	for (type = 0; type < nr_swapfiles; type++)
   2147		if (swap_info[type]->inuse_pages)
   2148			return;
   2149	spin_lock(&mmlist_lock);
   2150	list_for_each_safe(p, next, &init_mm.mmlist)
   2151		list_del_init(p);
   2152	spin_unlock(&mmlist_lock);
   2153}
   2154
   2155/*
   2156 * Free all of a swapdev's extent information
   2157 */
   2158static void destroy_swap_extents(struct swap_info_struct *sis)
   2159{
   2160	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
   2161		struct rb_node *rb = sis->swap_extent_root.rb_node;
   2162		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
   2163
   2164		rb_erase(rb, &sis->swap_extent_root);
   2165		kfree(se);
   2166	}
   2167
   2168	if (sis->flags & SWP_ACTIVATED) {
   2169		struct file *swap_file = sis->swap_file;
   2170		struct address_space *mapping = swap_file->f_mapping;
   2171
   2172		sis->flags &= ~SWP_ACTIVATED;
   2173		if (mapping->a_ops->swap_deactivate)
   2174			mapping->a_ops->swap_deactivate(swap_file);
   2175	}
   2176}
   2177
   2178/*
   2179 * Add a block range (and the corresponding page range) into this swapdev's
   2180 * extent tree.
   2181 *
   2182 * This function rather assumes that it is called in ascending page order.
   2183 */
   2184int
   2185add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
   2186		unsigned long nr_pages, sector_t start_block)
   2187{
   2188	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
   2189	struct swap_extent *se;
   2190	struct swap_extent *new_se;
   2191
   2192	/*
   2193	 * Place the new node at the rightmost position, since the
   2194	 * function is called in ascending page order.
   2195	 */
   2196	while (*link) {
   2197		parent = *link;
   2198		link = &parent->rb_right;
   2199	}
   2200
   2201	if (parent) {
   2202		se = rb_entry(parent, struct swap_extent, rb_node);
   2203		BUG_ON(se->start_page + se->nr_pages != start_page);
   2204		if (se->start_block + se->nr_pages == start_block) {
   2205			/* Merge it */
   2206			se->nr_pages += nr_pages;
   2207			return 0;
   2208		}
   2209	}
   2210
   2211	/* No merge, insert a new extent. */
   2212	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
   2213	if (new_se == NULL)
   2214		return -ENOMEM;
   2215	new_se->start_page = start_page;
   2216	new_se->nr_pages = nr_pages;
   2217	new_se->start_block = start_block;
   2218
   2219	rb_link_node(&new_se->rb_node, parent, link);
   2220	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
   2221	return 1;
   2222}
   2223EXPORT_SYMBOL_GPL(add_swap_extent);
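
/*
 * Editor's sketch (hypothetical numbers): because callers pass ranges
 * in ascending page order, a physically contiguous swap file collapses
 * into a single extent through the merge check above.  For example:
 *
 *	add_swap_extent(sis,  0, 16, 1000);	// new rightmost extent
 *	add_swap_extent(sis, 16, 16, 1016);	// 1000 + 16 == 1016: merged
 *
 * After the second call the rbtree still holds one extent with
 * start_page = 0, nr_pages = 32, start_block = 1000.  Had the second
 * range started at block 1020 instead, a new node would have been
 * inserted to the right of the first.
 */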
   2224
   2225/*
   2226 * A `swap extent' is a simple thing which maps a contiguous range of pages
   2227 * onto a contiguous range of disk blocks.  A rbtree of swap extents is
   2228 * built at swapon time and is then used at swap_writepage/swap_readpage
   2229 * time for locating where on disk a page belongs.
   2230 *
   2231 * If the swapfile is an S_ISBLK block device, a single extent is installed.
   2232 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
   2233 * swap files identically.
   2234 *
   2235 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
   2236 * extent rbtree operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
   2237 * swapfiles are handled *identically* after swapon time.
   2238 *
   2239 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
   2240 * and will parse them into a rbtree, in PAGE_SIZE chunks.  If some stray
   2241 * blocks are found which do not fall within the PAGE_SIZE alignment
   2242 * requirements, they are simply tossed out - we will never use those blocks
   2243 * for swapping.
   2244 *
   2245 * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
   2246 * prevents users from writing to the swap device, which will corrupt memory.
   2247 *
   2248 * The amount of disk space which a single swap extent represents varies.
   2249 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
   2250 * extents in the rbtree. - akpm.
   2251 */
   2252static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
   2253{
   2254	struct file *swap_file = sis->swap_file;
   2255	struct address_space *mapping = swap_file->f_mapping;
   2256	struct inode *inode = mapping->host;
   2257	int ret;
   2258
   2259	if (S_ISBLK(inode->i_mode)) {
   2260		ret = add_swap_extent(sis, 0, sis->max, 0);
   2261		*span = sis->pages;
   2262		return ret;
   2263	}
   2264
   2265	if (mapping->a_ops->swap_activate) {
   2266		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
   2267		if (ret < 0)
   2268			return ret;
   2269		sis->flags |= SWP_ACTIVATED;
   2270		if ((sis->flags & SWP_FS_OPS) &&
   2271		    sio_pool_init() != 0) {
   2272			destroy_swap_extents(sis);
   2273			return -ENOMEM;
   2274		}
   2275		return ret;
   2276	}
   2277
   2278	return generic_swapfile_activate(sis, swap_file, span);
   2279}
   2280
   2281static int swap_node(struct swap_info_struct *p)
   2282{
   2283	struct block_device *bdev;
   2284
   2285	if (p->bdev)
   2286		bdev = p->bdev;
   2287	else
   2288		bdev = p->swap_file->f_inode->i_sb->s_bdev;
   2289
   2290	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
   2291}
   2292
   2293static void setup_swap_info(struct swap_info_struct *p, int prio,
   2294			    unsigned char *swap_map,
   2295			    struct swap_cluster_info *cluster_info)
   2296{
   2297	int i;
   2298
   2299	if (prio >= 0)
   2300		p->prio = prio;
   2301	else
   2302		p->prio = --least_priority;
   2303	/*
   2304	 * the plist prio is negated because plist ordering is
   2305	 * low-to-high, while swap ordering is high-to-low
   2306	 */
   2307	p->list.prio = -p->prio;
   2308	for_each_node(i) {
   2309		if (p->prio >= 0)
   2310			p->avail_lists[i].prio = -p->prio;
   2311		else {
   2312			if (swap_node(p) == i)
   2313				p->avail_lists[i].prio = 1;
   2314			else
   2315				p->avail_lists[i].prio = -p->prio;
   2316		}
   2317	}
   2318	p->swap_map = swap_map;
   2319	p->cluster_info = cluster_info;
   2320}
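
/*
 * Editor's note, illustrating the priority negation above: two swap
 * areas with user-visible priorities 10 and 5 get plist priorities
 * -10 and -5.  plists sort low-to-high, so the priority-10 area sorts
 * first and is allocated from first.
 */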
   2321
   2322static void _enable_swap_info(struct swap_info_struct *p)
   2323{
   2324	p->flags |= SWP_WRITEOK;
   2325	atomic_long_add(p->pages, &nr_swap_pages);
   2326	total_swap_pages += p->pages;
   2327
   2328	assert_spin_locked(&swap_lock);
   2329	/*
   2330	 * both lists are plists, and thus priority ordered.
   2331	 * swap_active_head needs to be priority ordered for swapoff(),
   2332	 * which on removal of any swap_info_struct with an auto-assigned
   2333	 * (i.e. negative) priority increments the auto-assigned priority
   2334	 * of any lower-priority swap_info_structs.
   2335	 * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
   2336	 * which allocates swap pages from the highest available priority
   2337	 * swap_info_struct.
   2338	 */
   2339	plist_add(&p->list, &swap_active_head);
   2340	add_to_avail_list(p);
   2341}
   2342
   2343static void enable_swap_info(struct swap_info_struct *p, int prio,
   2344				unsigned char *swap_map,
   2345				struct swap_cluster_info *cluster_info,
   2346				unsigned long *frontswap_map)
   2347{
   2348	if (IS_ENABLED(CONFIG_FRONTSWAP))
   2349		frontswap_init(p->type, frontswap_map);
   2350	spin_lock(&swap_lock);
   2351	spin_lock(&p->lock);
   2352	setup_swap_info(p, prio, swap_map, cluster_info);
   2353	spin_unlock(&p->lock);
   2354	spin_unlock(&swap_lock);
   2355	/*
   2356	 * Finished initializing swap device, now it's safe to reference it.
   2357	 */
   2358	percpu_ref_resurrect(&p->users);
   2359	spin_lock(&swap_lock);
   2360	spin_lock(&p->lock);
   2361	_enable_swap_info(p);
   2362	spin_unlock(&p->lock);
   2363	spin_unlock(&swap_lock);
   2364}
   2365
   2366static void reinsert_swap_info(struct swap_info_struct *p)
   2367{
   2368	spin_lock(&swap_lock);
   2369	spin_lock(&p->lock);
   2370	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
   2371	_enable_swap_info(p);
   2372	spin_unlock(&p->lock);
   2373	spin_unlock(&swap_lock);
   2374}
   2375
   2376bool has_usable_swap(void)
   2377{
   2378	bool ret = true;
   2379
   2380	spin_lock(&swap_lock);
   2381	if (plist_head_empty(&swap_active_head))
   2382		ret = false;
   2383	spin_unlock(&swap_lock);
   2384	return ret;
   2385}
   2386
   2387SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
   2388{
   2389	struct swap_info_struct *p = NULL;
   2390	unsigned char *swap_map;
   2391	struct swap_cluster_info *cluster_info;
   2392	unsigned long *frontswap_map;
   2393	struct file *swap_file, *victim;
   2394	struct address_space *mapping;
   2395	struct inode *inode;
   2396	struct filename *pathname;
   2397	int err, found = 0;
   2398	unsigned int old_block_size;
   2399
   2400	if (!capable(CAP_SYS_ADMIN))
   2401		return -EPERM;
   2402
   2403	BUG_ON(!current->mm);
   2404
   2405	pathname = getname(specialfile);
   2406	if (IS_ERR(pathname))
   2407		return PTR_ERR(pathname);
   2408
   2409	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
   2410	err = PTR_ERR(victim);
   2411	if (IS_ERR(victim))
   2412		goto out;
   2413
   2414	mapping = victim->f_mapping;
   2415	spin_lock(&swap_lock);
   2416	plist_for_each_entry(p, &swap_active_head, list) {
   2417		if (p->flags & SWP_WRITEOK) {
   2418			if (p->swap_file->f_mapping == mapping) {
   2419				found = 1;
   2420				break;
   2421			}
   2422		}
   2423	}
   2424	if (!found) {
   2425		err = -EINVAL;
   2426		spin_unlock(&swap_lock);
   2427		goto out_dput;
   2428	}
   2429	if (!security_vm_enough_memory_mm(current->mm, p->pages))
   2430		vm_unacct_memory(p->pages);
   2431	else {
   2432		err = -ENOMEM;
   2433		spin_unlock(&swap_lock);
   2434		goto out_dput;
   2435	}
   2436	del_from_avail_list(p);
   2437	spin_lock(&p->lock);
   2438	if (p->prio < 0) {
   2439		struct swap_info_struct *si = p;
   2440		int nid;
   2441
   2442		plist_for_each_entry_continue(si, &swap_active_head, list) {
   2443			si->prio++;
   2444			si->list.prio--;
   2445			for_each_node(nid) {
   2446				if (si->avail_lists[nid].prio != 1)
   2447					si->avail_lists[nid].prio--;
   2448			}
   2449		}
   2450		least_priority++;
   2451	}
   2452	plist_del(&p->list, &swap_active_head);
   2453	atomic_long_sub(p->pages, &nr_swap_pages);
   2454	total_swap_pages -= p->pages;
   2455	p->flags &= ~SWP_WRITEOK;
   2456	spin_unlock(&p->lock);
   2457	spin_unlock(&swap_lock);
   2458
   2459	disable_swap_slots_cache_lock();
   2460
   2461	set_current_oom_origin();
   2462	err = try_to_unuse(p->type);
   2463	clear_current_oom_origin();
   2464
   2465	if (err) {
   2466		/* re-insert swap space back into swap_list */
   2467		reinsert_swap_info(p);
   2468		reenable_swap_slots_cache_unlock();
   2469		goto out_dput;
   2470	}
   2471
   2472	reenable_swap_slots_cache_unlock();
   2473
   2474	/*
   2475	 * Wait for swap operations protected by get/put_swap_device()
   2476	 * to complete.
   2477	 *
   2478	 * We need synchronize_rcu() here to protect concurrent accesses
   2479	 * to the swap cache data structures.
   2480	 */
   2481	percpu_ref_kill(&p->users);
   2482	synchronize_rcu();
   2483	wait_for_completion(&p->comp);
   2484
   2485	flush_work(&p->discard_work);
   2486
   2487	destroy_swap_extents(p);
   2488	if (p->flags & SWP_CONTINUED)
   2489		free_swap_count_continuations(p);
   2490
   2491	if (!p->bdev || !bdev_nonrot(p->bdev))
   2492		atomic_dec(&nr_rotate_swap);
   2493
   2494	mutex_lock(&swapon_mutex);
   2495	spin_lock(&swap_lock);
   2496	spin_lock(&p->lock);
   2497	drain_mmlist();
   2498
   2499	/* wait for anyone still in scan_swap_map_slots */
   2500	p->highest_bit = 0;		/* cuts scans short */
   2501	while (p->flags >= SWP_SCANNING) {
   2502		spin_unlock(&p->lock);
   2503		spin_unlock(&swap_lock);
   2504		schedule_timeout_uninterruptible(1);
   2505		spin_lock(&swap_lock);
   2506		spin_lock(&p->lock);
   2507	}
   2508
   2509	swap_file = p->swap_file;
   2510	old_block_size = p->old_block_size;
   2511	p->swap_file = NULL;
   2512	p->max = 0;
   2513	swap_map = p->swap_map;
   2514	p->swap_map = NULL;
   2515	cluster_info = p->cluster_info;
   2516	p->cluster_info = NULL;
   2517	frontswap_map = frontswap_map_get(p);
   2518	spin_unlock(&p->lock);
   2519	spin_unlock(&swap_lock);
   2520	arch_swap_invalidate_area(p->type);
   2521	frontswap_invalidate_area(p->type);
   2522	frontswap_map_set(p, NULL);
   2523	mutex_unlock(&swapon_mutex);
   2524	free_percpu(p->percpu_cluster);
   2525	p->percpu_cluster = NULL;
   2526	free_percpu(p->cluster_next_cpu);
   2527	p->cluster_next_cpu = NULL;
   2528	vfree(swap_map);
   2529	kvfree(cluster_info);
   2530	kvfree(frontswap_map);
   2531	/* Destroy swap account information */
   2532	swap_cgroup_swapoff(p->type);
   2533	exit_swap_address_space(p->type);
   2534
   2535	inode = mapping->host;
   2536	if (S_ISBLK(inode->i_mode)) {
   2537		struct block_device *bdev = I_BDEV(inode);
   2538
   2539		set_blocksize(bdev, old_block_size);
   2540		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
   2541	}
   2542
   2543	inode_lock(inode);
   2544	inode->i_flags &= ~S_SWAPFILE;
   2545	inode_unlock(inode);
   2546	filp_close(swap_file, NULL);
   2547
   2548	/*
   2549	 * Clear the SWP_USED flag after all resources are freed so that swapon
   2550	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
   2551	 * not hold p->lock after we cleared its SWP_WRITEOK.
   2552	 */
   2553	spin_lock(&swap_lock);
   2554	p->flags = 0;
   2555	spin_unlock(&swap_lock);
   2556
   2557	err = 0;
   2558	atomic_inc(&proc_poll_event);
   2559	wake_up_interruptible(&proc_poll_wait);
   2560
   2561out_dput:
   2562	filp_close(victim, NULL);
   2563out:
   2564	putname(pathname);
   2565	return err;
   2566}
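
/*
 * Editor's sketch: how the handler above is typically reached from user
 * space through the libc wrapper (the device path below is made up).
 * The caller needs CAP_SYS_ADMIN, mirroring the capable() check at the
 * top of the handler.
 *
 *	#include <sys/swap.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (swapoff("/dev/sdb2") != 0) {
 *			perror("swapoff");	// e.g. EINVAL if not active
 *			return 1;
 *		}
 *		return 0;
 *	}
 */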
   2567
   2568#ifdef CONFIG_PROC_FS
   2569static __poll_t swaps_poll(struct file *file, poll_table *wait)
   2570{
   2571	struct seq_file *seq = file->private_data;
   2572
   2573	poll_wait(file, &proc_poll_wait, wait);
   2574
   2575	if (seq->poll_event != atomic_read(&proc_poll_event)) {
   2576		seq->poll_event = atomic_read(&proc_poll_event);
   2577		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
   2578	}
   2579
   2580	return EPOLLIN | EPOLLRDNORM;
   2581}
   2582
   2583/* iterator */
   2584static void *swap_start(struct seq_file *swap, loff_t *pos)
   2585{
   2586	struct swap_info_struct *si;
   2587	int type;
   2588	loff_t l = *pos;
   2589
   2590	mutex_lock(&swapon_mutex);
   2591
   2592	if (!l)
   2593		return SEQ_START_TOKEN;
   2594
   2595	for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
   2596		if (!(si->flags & SWP_USED) || !si->swap_map)
   2597			continue;
   2598		if (!--l)
   2599			return si;
   2600	}
   2601
   2602	return NULL;
   2603}
   2604
   2605static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
   2606{
   2607	struct swap_info_struct *si = v;
   2608	int type;
   2609
   2610	if (v == SEQ_START_TOKEN)
   2611		type = 0;
   2612	else
   2613		type = si->type + 1;
   2614
   2615	++(*pos);
   2616	for (; (si = swap_type_to_swap_info(type)); type++) {
   2617		if (!(si->flags & SWP_USED) || !si->swap_map)
   2618			continue;
   2619		return si;
   2620	}
   2621
   2622	return NULL;
   2623}
   2624
   2625static void swap_stop(struct seq_file *swap, void *v)
   2626{
   2627	mutex_unlock(&swapon_mutex);
   2628}
   2629
   2630static int swap_show(struct seq_file *swap, void *v)
   2631{
   2632	struct swap_info_struct *si = v;
   2633	struct file *file;
   2634	int len;
   2635	unsigned long bytes, inuse;
   2636
   2637	if (si == SEQ_START_TOKEN) {
   2638		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
   2639		return 0;
   2640	}
   2641
   2642	bytes = si->pages << (PAGE_SHIFT - 10);
   2643	inuse = si->inuse_pages << (PAGE_SHIFT - 10);
   2644
   2645	file = si->swap_file;
   2646	len = seq_file_path(swap, file, " \t\n\\");
   2647	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
   2648			len < 40 ? 40 - len : 1, " ",
   2649			S_ISBLK(file_inode(file)->i_mode) ?
   2650				"partition" : "file\t",
   2651			bytes, bytes < 10000000 ? "\t" : "",
   2652			inuse, inuse < 10000000 ? "\t" : "",
   2653			si->prio);
   2654	return 0;
   2655}
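
/*
 * Editor's note: with one 8 GiB swap partition enabled, the seq_printf()
 * above yields /proc/swaps output roughly like this (sizes in KiB, the
 * numbers are illustrative):
 *
 *	Filename                                Type            Size    Used    Priority
 *	/dev/sdb2                               partition       8388604 1024    -2
 *
 * The "%*s" padding keeps the Type column aligned for paths shorter
 * than 40 characters.
 */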
   2656
   2657static const struct seq_operations swaps_op = {
   2658	.start =	swap_start,
   2659	.next =		swap_next,
   2660	.stop =		swap_stop,
   2661	.show =		swap_show
   2662};
   2663
   2664static int swaps_open(struct inode *inode, struct file *file)
   2665{
   2666	struct seq_file *seq;
   2667	int ret;
   2668
   2669	ret = seq_open(file, &swaps_op);
   2670	if (ret)
   2671		return ret;
   2672
   2673	seq = file->private_data;
   2674	seq->poll_event = atomic_read(&proc_poll_event);
   2675	return 0;
   2676}
   2677
   2678static const struct proc_ops swaps_proc_ops = {
   2679	.proc_flags	= PROC_ENTRY_PERMANENT,
   2680	.proc_open	= swaps_open,
   2681	.proc_read	= seq_read,
   2682	.proc_lseek	= seq_lseek,
   2683	.proc_release	= seq_release,
   2684	.proc_poll	= swaps_poll,
   2685};
   2686
   2687static int __init procswaps_init(void)
   2688{
   2689	proc_create("swaps", 0, NULL, &swaps_proc_ops);
   2690	return 0;
   2691}
   2692__initcall(procswaps_init);
   2693#endif /* CONFIG_PROC_FS */
   2694
   2695#ifdef MAX_SWAPFILES_CHECK
   2696static int __init max_swapfiles_check(void)
   2697{
   2698	MAX_SWAPFILES_CHECK();
   2699	return 0;
   2700}
   2701late_initcall(max_swapfiles_check);
   2702#endif
   2703
   2704static struct swap_info_struct *alloc_swap_info(void)
   2705{
   2706	struct swap_info_struct *p;
   2707	struct swap_info_struct *defer = NULL;
   2708	unsigned int type;
   2709	int i;
   2710
   2711	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
   2712	if (!p)
   2713		return ERR_PTR(-ENOMEM);
   2714
   2715	if (percpu_ref_init(&p->users, swap_users_ref_free,
   2716			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
   2717		kvfree(p);
   2718		return ERR_PTR(-ENOMEM);
   2719	}
   2720
   2721	spin_lock(&swap_lock);
   2722	for (type = 0; type < nr_swapfiles; type++) {
   2723		if (!(swap_info[type]->flags & SWP_USED))
   2724			break;
   2725	}
   2726	if (type >= MAX_SWAPFILES) {
   2727		spin_unlock(&swap_lock);
   2728		percpu_ref_exit(&p->users);
   2729		kvfree(p);
   2730		return ERR_PTR(-EPERM);
   2731	}
   2732	if (type >= nr_swapfiles) {
   2733		p->type = type;
   2734		/*
   2735		 * Publish the swap_info_struct after initializing it.
   2736		 * Note that kvzalloc() above zeroes all its fields.
   2737		 */
   2738		smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
   2739		nr_swapfiles++;
   2740	} else {
   2741		defer = p;
   2742		p = swap_info[type];
   2743		/*
   2744		 * Do not memset this entry: a racing procfs swap_next()
   2745		 * would be relying on p->type to remain valid.
   2746		 */
   2747	}
   2748	p->swap_extent_root = RB_ROOT;
   2749	plist_node_init(&p->list, 0);
   2750	for_each_node(i)
   2751		plist_node_init(&p->avail_lists[i], 0);
   2752	p->flags = SWP_USED;
   2753	spin_unlock(&swap_lock);
   2754	if (defer) {
   2755		percpu_ref_exit(&defer->users);
   2756		kvfree(defer);
   2757	}
   2758	spin_lock_init(&p->lock);
   2759	spin_lock_init(&p->cont_lock);
   2760	init_completion(&p->comp);
   2761
   2762	return p;
   2763}
   2764
   2765static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
   2766{
   2767	int error;
   2768
   2769	if (S_ISBLK(inode->i_mode)) {
   2770		p->bdev = blkdev_get_by_dev(inode->i_rdev,
   2771				   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
   2772		if (IS_ERR(p->bdev)) {
   2773			error = PTR_ERR(p->bdev);
   2774			p->bdev = NULL;
   2775			return error;
   2776		}
   2777		p->old_block_size = block_size(p->bdev);
   2778		error = set_blocksize(p->bdev, PAGE_SIZE);
   2779		if (error < 0)
   2780			return error;
   2781		/*
   2782		 * Zoned block devices contain zones that have a sequential
   2783		 * write only restriction.  Hence zoned block devices are not
   2784		 * suitable for swapping.  Disallow them here.
   2785		 */
   2786		if (bdev_is_zoned(p->bdev))
   2787			return -EINVAL;
   2788		p->flags |= SWP_BLKDEV;
   2789	} else if (S_ISREG(inode->i_mode)) {
   2790		p->bdev = inode->i_sb->s_bdev;
   2791	}
   2792
   2793	return 0;
   2794}
   2795
   2796
   2797/*
   2798 * Find out how many pages are allowed for a single swap device. There
   2799 * are two limiting factors:
   2800 * 1) the number of bits for the swap offset in the swp_entry_t type, and
   2801 * 2) the number of bits in the swap pte, as defined by the different
   2802 * architectures.
   2803 *
   2804 * In order to find the largest possible bit mask, a swap entry with
   2805 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
   2806 * decoded to a swp_entry_t again, and finally the swap offset is
   2807 * extracted.
   2808 *
   2809 * This will mask all the bits from the initial ~0UL mask that can't
   2810 * be encoded in either the swp_entry_t or the architecture definition
   2811 * of a swap pte.
   2812 */
   2813unsigned long generic_max_swapfile_size(void)
   2814{
   2815	return swp_offset(pte_to_swp_entry(
   2816			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
   2817}
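
/*
 * Editor's note (illustrative width): if an architecture's swap pte can
 * encode, say, a 50-bit offset, then the ~0UL offset above survives the
 * round trip as 2^50 - 1 and this function reports a limit of 2^50
 * pages for a single swap device.
 */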
   2818
   2819/* Can be overridden by an architecture for additional checks. */
   2820__weak unsigned long max_swapfile_size(void)
   2821{
   2822	return generic_max_swapfile_size();
   2823}
   2824
   2825static unsigned long read_swap_header(struct swap_info_struct *p,
   2826					union swap_header *swap_header,
   2827					struct inode *inode)
   2828{
   2829	int i;
   2830	unsigned long maxpages;
   2831	unsigned long swapfilepages;
   2832	unsigned long last_page;
   2833
   2834	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
   2835		pr_err("Unable to find swap-space signature\n");
   2836		return 0;
   2837	}
   2838
   2839	/* swap partition endianness hack... */
   2840	if (swab32(swap_header->info.version) == 1) {
   2841		swab32s(&swap_header->info.version);
   2842		swab32s(&swap_header->info.last_page);
   2843		swab32s(&swap_header->info.nr_badpages);
   2844		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
   2845			return 0;
   2846		for (i = 0; i < swap_header->info.nr_badpages; i++)
   2847			swab32s(&swap_header->info.badpages[i]);
   2848	}
   2849	/* Check the swap header's sub-version */
   2850	if (swap_header->info.version != 1) {
   2851		pr_warn("Unable to handle swap header version %d\n",
   2852			swap_header->info.version);
   2853		return 0;
   2854	}
   2855
   2856	p->lowest_bit  = 1;
   2857	p->cluster_next = 1;
   2858	p->cluster_nr = 0;
   2859
   2860	maxpages = max_swapfile_size();
   2861	last_page = swap_header->info.last_page;
   2862	if (!last_page) {
   2863		pr_warn("Empty swap-file\n");
   2864		return 0;
   2865	}
   2866	if (last_page > maxpages) {
   2867		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
   2868			maxpages << (PAGE_SHIFT - 10),
   2869			last_page << (PAGE_SHIFT - 10));
   2870	}
   2871	if (maxpages > last_page) {
   2872		maxpages = last_page + 1;
   2873		/* p->max is an unsigned int: don't overflow it */
   2874		if ((unsigned int)maxpages == 0)
   2875			maxpages = UINT_MAX;
   2876	}
   2877	p->highest_bit = maxpages - 1;
   2878
   2879	if (!maxpages)
   2880		return 0;
   2881	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
   2882	if (swapfilepages && maxpages > swapfilepages) {
   2883		pr_warn("Swap area shorter than signature indicates\n");
   2884		return 0;
   2885	}
   2886	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
   2887		return 0;
   2888	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
   2889		return 0;
   2890
   2891	return maxpages;
   2892}
   2893
   2894#define SWAP_CLUSTER_INFO_COLS						\
   2895	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
   2896#define SWAP_CLUSTER_SPACE_COLS						\
   2897	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
   2898#define SWAP_CLUSTER_COLS						\
   2899	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
   2900
   2901static int setup_swap_map_and_extents(struct swap_info_struct *p,
   2902					union swap_header *swap_header,
   2903					unsigned char *swap_map,
   2904					struct swap_cluster_info *cluster_info,
   2905					unsigned long maxpages,
   2906					sector_t *span)
   2907{
   2908	unsigned int j, k;
   2909	unsigned int nr_good_pages;
   2910	int nr_extents;
   2911	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
   2912	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
   2913	unsigned long i, idx;
   2914
   2915	nr_good_pages = maxpages - 1;	/* omit header page */
   2916
   2917	cluster_list_init(&p->free_clusters);
   2918	cluster_list_init(&p->discard_clusters);
   2919
   2920	for (i = 0; i < swap_header->info.nr_badpages; i++) {
   2921		unsigned int page_nr = swap_header->info.badpages[i];
   2922		if (page_nr == 0 || page_nr > swap_header->info.last_page)
   2923			return -EINVAL;
   2924		if (page_nr < maxpages) {
   2925			swap_map[page_nr] = SWAP_MAP_BAD;
   2926			nr_good_pages--;
   2927			/*
   2928			 * Haven't marked the cluster free yet, no list
   2929			 * operation involved
   2930			 */
   2931			inc_cluster_info_page(p, cluster_info, page_nr);
   2932		}
   2933	}
   2934
   2935	/* Haven't marked the cluster free yet, no list operation involved */
   2936	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
   2937		inc_cluster_info_page(p, cluster_info, i);
   2938
   2939	if (nr_good_pages) {
   2940		swap_map[0] = SWAP_MAP_BAD;
   2941		/*
   2942		 * Haven't marked the cluster free yet, no list
   2943		 * operation involved
   2944		 */
   2945		inc_cluster_info_page(p, cluster_info, 0);
   2946		p->max = maxpages;
   2947		p->pages = nr_good_pages;
   2948		nr_extents = setup_swap_extents(p, span);
   2949		if (nr_extents < 0)
   2950			return nr_extents;
   2951		nr_good_pages = p->pages;
   2952	}
   2953	if (!nr_good_pages) {
   2954		pr_warn("Empty swap-file\n");
   2955		return -EINVAL;
   2956	}
   2957
   2958	if (!cluster_info)
   2959		return nr_extents;
   2960
   2961
   2962	/*
   2963	 * Reduce false cache line sharing between cluster_info entries and
   2964	 * contention between clusters sharing the same swap address space.
   2965	 */
   2966	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
   2967		j = (k + col) % SWAP_CLUSTER_COLS;
   2968		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
   2969			idx = i * SWAP_CLUSTER_COLS + j;
   2970			if (idx >= nr_clusters)
   2971				continue;
   2972			if (cluster_count(&cluster_info[idx]))
   2973				continue;
   2974			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
   2975			cluster_list_add_tail(&p->free_clusters, cluster_info,
   2976					      idx);
   2977		}
   2978	}
   2979	return nr_extents;
   2980}
   2981
   2982SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
   2983{
   2984	struct swap_info_struct *p;
   2985	struct filename *name;
   2986	struct file *swap_file = NULL;
   2987	struct address_space *mapping;
   2988	struct dentry *dentry;
   2989	int prio;
   2990	int error;
   2991	union swap_header *swap_header;
   2992	int nr_extents;
   2993	sector_t span;
   2994	unsigned long maxpages;
   2995	unsigned char *swap_map = NULL;
   2996	struct swap_cluster_info *cluster_info = NULL;
   2997	unsigned long *frontswap_map = NULL;
   2998	struct page *page = NULL;
   2999	struct inode *inode = NULL;
   3000	bool inced_nr_rotate_swap = false;
   3001
   3002	if (swap_flags & ~SWAP_FLAGS_VALID)
   3003		return -EINVAL;
   3004
   3005	if (!capable(CAP_SYS_ADMIN))
   3006		return -EPERM;
   3007
   3008	if (!swap_avail_heads)
   3009		return -ENOMEM;
   3010
   3011	p = alloc_swap_info();
   3012	if (IS_ERR(p))
   3013		return PTR_ERR(p);
   3014
   3015	INIT_WORK(&p->discard_work, swap_discard_work);
   3016
   3017	name = getname(specialfile);
   3018	if (IS_ERR(name)) {
   3019		error = PTR_ERR(name);
   3020		name = NULL;
   3021		goto bad_swap;
   3022	}
   3023	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
   3024	if (IS_ERR(swap_file)) {
   3025		error = PTR_ERR(swap_file);
   3026		swap_file = NULL;
   3027		goto bad_swap;
   3028	}
   3029
   3030	p->swap_file = swap_file;
   3031	mapping = swap_file->f_mapping;
   3032	dentry = swap_file->f_path.dentry;
   3033	inode = mapping->host;
   3034
   3035	error = claim_swapfile(p, inode);
   3036	if (unlikely(error))
   3037		goto bad_swap;
   3038
   3039	inode_lock(inode);
   3040	if (d_unlinked(dentry) || cant_mount(dentry)) {
   3041		error = -ENOENT;
   3042		goto bad_swap_unlock_inode;
   3043	}
   3044	if (IS_SWAPFILE(inode)) {
   3045		error = -EBUSY;
   3046		goto bad_swap_unlock_inode;
   3047	}
   3048
   3049	/*
   3050	 * Read the swap header.
   3051	 */
   3052	if (!mapping->a_ops->read_folio) {
   3053		error = -EINVAL;
   3054		goto bad_swap_unlock_inode;
   3055	}
   3056	page = read_mapping_page(mapping, 0, swap_file);
   3057	if (IS_ERR(page)) {
   3058		error = PTR_ERR(page);
   3059		goto bad_swap_unlock_inode;
   3060	}
   3061	swap_header = kmap(page);
   3062
   3063	maxpages = read_swap_header(p, swap_header, inode);
   3064	if (unlikely(!maxpages)) {
   3065		error = -EINVAL;
   3066		goto bad_swap_unlock_inode;
   3067	}
   3068
   3069	/* OK, set up the swap map and apply the bad block list */
   3070	swap_map = vzalloc(maxpages);
   3071	if (!swap_map) {
   3072		error = -ENOMEM;
   3073		goto bad_swap_unlock_inode;
   3074	}
   3075
   3076	if (p->bdev && bdev_stable_writes(p->bdev))
   3077		p->flags |= SWP_STABLE_WRITES;
   3078
   3079	if (p->bdev && p->bdev->bd_disk->fops->rw_page)
   3080		p->flags |= SWP_SYNCHRONOUS_IO;
   3081
   3082	if (p->bdev && bdev_nonrot(p->bdev)) {
   3083		int cpu;
   3084		unsigned long ci, nr_cluster;
   3085
   3086		p->flags |= SWP_SOLIDSTATE;
   3087		p->cluster_next_cpu = alloc_percpu(unsigned int);
   3088		if (!p->cluster_next_cpu) {
   3089			error = -ENOMEM;
   3090			goto bad_swap_unlock_inode;
   3091		}
   3092		/*
   3093		 * select a random position to start with to help with
   3094		 * SSD wear leveling
   3095		 */
   3096		for_each_possible_cpu(cpu) {
   3097			per_cpu(*p->cluster_next_cpu, cpu) =
   3098				1 + prandom_u32_max(p->highest_bit);
   3099		}
   3100		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
   3101
   3102		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
   3103					GFP_KERNEL);
   3104		if (!cluster_info) {
   3105			error = -ENOMEM;
   3106			goto bad_swap_unlock_inode;
   3107		}
   3108
   3109		for (ci = 0; ci < nr_cluster; ci++)
   3110			spin_lock_init(&((cluster_info + ci)->lock));
   3111
   3112		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
   3113		if (!p->percpu_cluster) {
   3114			error = -ENOMEM;
   3115			goto bad_swap_unlock_inode;
   3116		}
   3117		for_each_possible_cpu(cpu) {
   3118			struct percpu_cluster *cluster;
   3119			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
   3120			cluster_set_null(&cluster->index);
   3121		}
   3122	} else {
   3123		atomic_inc(&nr_rotate_swap);
   3124		inced_nr_rotate_swap = true;
   3125	}
   3126
   3127	error = swap_cgroup_swapon(p->type, maxpages);
   3128	if (error)
   3129		goto bad_swap_unlock_inode;
   3130
   3131	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
   3132		cluster_info, maxpages, &span);
   3133	if (unlikely(nr_extents < 0)) {
   3134		error = nr_extents;
   3135		goto bad_swap_unlock_inode;
   3136	}
   3137	/* frontswap enabled? set up bit-per-page map for frontswap */
   3138	if (IS_ENABLED(CONFIG_FRONTSWAP))
   3139		frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
   3140					 sizeof(long),
   3141					 GFP_KERNEL);
   3142
   3143	if ((swap_flags & SWAP_FLAG_DISCARD) &&
   3144	    p->bdev && bdev_max_discard_sectors(p->bdev)) {
   3145		/*
   3146		 * When discard is enabled for swap with no particular
   3147		 * policy flagged, we set all swap discard flags here in
   3148		 * order to sustain backward compatibility with older
   3149		 * swapon(8) releases.
   3150		 */
   3151		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
   3152			     SWP_PAGE_DISCARD);
   3153
   3154		/*
   3155		 * By flagging sys_swapon, a sysadmin can tell us to
   3156		 * either do single-time area discards only, or to just
   3157		 * perform discards for released swap page-clusters.
   3158		 * Now it's time to adjust the p->flags accordingly.
   3159		 */
   3160		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
   3161			p->flags &= ~SWP_PAGE_DISCARD;
   3162		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
   3163			p->flags &= ~SWP_AREA_DISCARD;
   3164
   3165		/* issue a swapon-time discard if it's still required */
   3166		if (p->flags & SWP_AREA_DISCARD) {
   3167			int err = discard_swap(p);
   3168			if (unlikely(err))
   3169				pr_err("swapon: discard_swap(%p): %d\n",
   3170					p, err);
   3171		}
   3172	}
   3173
   3174	error = init_swap_address_space(p->type, maxpages);
   3175	if (error)
   3176		goto bad_swap_unlock_inode;
   3177
   3178	/*
   3179	 * Flush any pending IO and dirty mappings before we start using this
   3180	 * swap device.
   3181	 */
   3182	inode->i_flags |= S_SWAPFILE;
   3183	error = inode_drain_writes(inode);
   3184	if (error) {
   3185		inode->i_flags &= ~S_SWAPFILE;
   3186		goto free_swap_address_space;
   3187	}
   3188
   3189	mutex_lock(&swapon_mutex);
   3190	prio = -1;
   3191	if (swap_flags & SWAP_FLAG_PREFER)
   3192		prio =
   3193		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
   3194	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
   3195
   3196	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
   3197		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
   3198		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
   3199		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
   3200		(p->flags & SWP_DISCARDABLE) ? "D" : "",
   3201		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
   3202		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
   3203		(frontswap_map) ? "FS" : "");
   3204
   3205	mutex_unlock(&swapon_mutex);
   3206	atomic_inc(&proc_poll_event);
   3207	wake_up_interruptible(&proc_poll_wait);
   3208
   3209	error = 0;
   3210	goto out;
   3211free_swap_address_space:
   3212	exit_swap_address_space(p->type);
   3213bad_swap_unlock_inode:
   3214	inode_unlock(inode);
   3215bad_swap:
   3216	free_percpu(p->percpu_cluster);
   3217	p->percpu_cluster = NULL;
   3218	free_percpu(p->cluster_next_cpu);
   3219	p->cluster_next_cpu = NULL;
   3220	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
   3221		set_blocksize(p->bdev, p->old_block_size);
   3222		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
   3223	}
   3224	inode = NULL;
   3225	destroy_swap_extents(p);
   3226	swap_cgroup_swapoff(p->type);
   3227	spin_lock(&swap_lock);
   3228	p->swap_file = NULL;
   3229	p->flags = 0;
   3230	spin_unlock(&swap_lock);
   3231	vfree(swap_map);
   3232	kvfree(cluster_info);
   3233	kvfree(frontswap_map);
   3234	if (inced_nr_rotate_swap)
   3235		atomic_dec(&nr_rotate_swap);
   3236	if (swap_file)
   3237		filp_close(swap_file, NULL);
   3238out:
   3239	if (page && !IS_ERR(page)) {
   3240		kunmap(page);
   3241		put_page(page);
   3242	}
   3243	if (name)
   3244		putname(name);
   3245	if (inode)
   3246		inode_unlock(inode);
   3247	if (!error)
   3248		enable_swap_slots_cache();
   3249	return error;
   3250}
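
/*
 * Editor's sketch: invoking the handler above from user space with an
 * explicit priority, matching the SWAP_FLAG_PREFER/SWAP_FLAG_PRIO_MASK
 * decoding done just before enable_swap_info().  The path is made up.
 *
 *	#include <sys/swap.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int flags = SWAP_FLAG_PREFER |
 *			    (5 & SWAP_FLAG_PRIO_MASK);	// priority 5
 *
 *		if (swapon("/dev/sdb2", flags) != 0) {
 *			perror("swapon");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */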
   3251
   3252void si_swapinfo(struct sysinfo *val)
   3253{
   3254	unsigned int type;
   3255	unsigned long nr_to_be_unused = 0;
   3256
   3257	spin_lock(&swap_lock);
   3258	for (type = 0; type < nr_swapfiles; type++) {
   3259		struct swap_info_struct *si = swap_info[type];
   3260
   3261		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
   3262			nr_to_be_unused += si->inuse_pages;
   3263	}
   3264	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
   3265	val->totalswap = total_swap_pages + nr_to_be_unused;
   3266	spin_unlock(&swap_lock);
   3267}
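
/*
 * Editor's sketch: the totals filled in above reach user space via
 * sysinfo(2) (and, scaled to kB, as SwapTotal/SwapFree in
 * /proc/meminfo).  sysinfo reports them in units of si.mem_unit bytes.
 *
 *	#include <sys/sysinfo.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		struct sysinfo si;
 *
 *		if (sysinfo(&si) == 0)
 *			printf("swap free: %llu of %llu bytes\n",
 *			       (unsigned long long)si.freeswap * si.mem_unit,
 *			       (unsigned long long)si.totalswap * si.mem_unit);
 *		return 0;
 *	}
 */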
   3268
   3269/*
   3270 * Verify that a swap entry is valid and increment its swap map count.
   3271 *
   3272 * Returns error code in following case.
   3273 * - success -> 0
   3274 * - swp_entry is invalid -> EINVAL
   3275 * - swp_entry is migration entry -> EINVAL
   3276 * - swap-cache reference is requested but there is already one. -> EEXIST
   3277 * - swap-cache reference is requested but the entry is not used. -> ENOENT
   3278 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
   3279 */
   3280static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
   3281{
   3282	struct swap_info_struct *p;
   3283	struct swap_cluster_info *ci;
   3284	unsigned long offset;
   3285	unsigned char count;
   3286	unsigned char has_cache;
   3287	int err;
   3288
   3289	p = get_swap_device(entry);
   3290	if (!p)
   3291		return -EINVAL;
   3292
   3293	offset = swp_offset(entry);
   3294	ci = lock_cluster_or_swap_info(p, offset);
   3295
   3296	count = p->swap_map[offset];
   3297
   3298	/*
   3299	 * swapin_readahead() doesn't check if a swap entry is valid, so the
   3300	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
   3301	 */
   3302	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
   3303		err = -ENOENT;
   3304		goto unlock_out;
   3305	}
   3306
   3307	has_cache = count & SWAP_HAS_CACHE;
   3308	count &= ~SWAP_HAS_CACHE;
   3309	err = 0;
   3310
   3311	if (usage == SWAP_HAS_CACHE) {
   3312
   3313		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
   3314		if (!has_cache && count)
   3315			has_cache = SWAP_HAS_CACHE;
   3316		else if (has_cache)		/* someone else added cache */
   3317			err = -EEXIST;
   3318		else				/* no users remaining */
   3319			err = -ENOENT;
   3320
   3321	} else if (count || has_cache) {
   3322
   3323		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
   3324			count += usage;
   3325		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
   3326			err = -EINVAL;
   3327		else if (swap_count_continued(p, offset, count))
   3328			count = COUNT_CONTINUED;
   3329		else
   3330			err = -ENOMEM;
   3331	} else
   3332		err = -ENOENT;			/* unused swap entry */
   3333
   3334	WRITE_ONCE(p->swap_map[offset], count | has_cache);
   3335
   3336unlock_out:
   3337	unlock_cluster_or_swap_info(p, ci);
   3338	put_swap_device(p);
   3339	return err;
   3340}
   3341
   3342/*
   3343 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
   3344 * (in which case its reference count is never incremented).
   3345 */
   3346void swap_shmem_alloc(swp_entry_t entry)
   3347{
   3348	__swap_duplicate(entry, SWAP_MAP_SHMEM);
   3349}
   3350
   3351/*
   3352 * Increase reference count of swap entry by 1.
   3353 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
   3354 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
   3355 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
   3356 * might occur if a page table entry has got corrupted.
   3357 */
   3358int swap_duplicate(swp_entry_t entry)
   3359{
   3360	int err = 0;
   3361
   3362	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
   3363		err = add_swap_count_continuation(entry, GFP_ATOMIC);
   3364	return err;
   3365}
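
/*
 * Editor's sketch of the caller pattern implied by the comment above
 * (abbreviated, not a verbatim caller): if swap_duplicate() returns
 * -ENOMEM while page table locks are held, the caller drops the locks,
 * preallocates a continuation with GFP_KERNEL, and retries.
 *
 *	if (swap_duplicate(entry) == -ENOMEM) {
 *		// ... drop the page table lock ...
 *		if (add_swap_count_continuation(entry, GFP_KERNEL))
 *			return -ENOMEM;
 *		// ... retake the lock and retry the copy ...
 *	}
 */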
   3366
   3367/*
   3368 * @entry: swap entry for which we allocate swap cache.
   3369 *
   3370 * Called when allocating swap cache for an existing swap entry.
   3371 * This can return error codes; it returns 0 on success.
   3372 * -EEXIST means there is already a swap cache for this entry.
   3373 * Note: return code is different from swap_duplicate().
   3374 */
   3375int swapcache_prepare(swp_entry_t entry)
   3376{
   3377	return __swap_duplicate(entry, SWAP_HAS_CACHE);
   3378}
   3379
   3380struct swap_info_struct *swp_swap_info(swp_entry_t entry)
   3381{
   3382	return swap_type_to_swap_info(swp_type(entry));
   3383}
   3384
   3385struct swap_info_struct *page_swap_info(struct page *page)
   3386{
   3387	swp_entry_t entry = { .val = page_private(page) };
   3388	return swp_swap_info(entry);
   3389}
   3390
   3391/*
   3392 * out-of-line methods to avoid include hell.
   3393 */
   3394struct address_space *swapcache_mapping(struct folio *folio)
   3395{
   3396	return page_swap_info(&folio->page)->swap_file->f_mapping;
   3397}
   3398EXPORT_SYMBOL_GPL(swapcache_mapping);
   3399
   3400pgoff_t __page_file_index(struct page *page)
   3401{
   3402	swp_entry_t swap = { .val = page_private(page) };
   3403	return swp_offset(swap);
   3404}
   3405EXPORT_SYMBOL_GPL(__page_file_index);
   3406
   3407/*
   3408 * add_swap_count_continuation - called when a swap count is duplicated
   3409 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
   3410 * page of the original vmalloc'ed swap_map, to hold the continuation count
   3411 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
   3412 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
   3413 *
   3414 * These continuation pages are seldom referenced: the common paths all work
   3415 * on the original swap_map, only referring to a continuation page when the
   3416 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
   3417 *
   3418 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
   3419 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
   3420 * can be called after dropping locks.
   3421 */
   3422int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
   3423{
   3424	struct swap_info_struct *si;
   3425	struct swap_cluster_info *ci;
   3426	struct page *head;
   3427	struct page *page;
   3428	struct page *list_page;
   3429	pgoff_t offset;
   3430	unsigned char count;
   3431	int ret = 0;
   3432
   3433	/*
   3434	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
   3435	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
   3436	 */
   3437	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
   3438
   3439	si = get_swap_device(entry);
   3440	if (!si) {
   3441		/*
   3442		 * An acceptable race has occurred since the failing
   3443		 * __swap_duplicate(): the swap device may have been swapped off
   3444		 */
   3445		goto outer;
   3446	}
   3447	spin_lock(&si->lock);
   3448
   3449	offset = swp_offset(entry);
   3450
   3451	ci = lock_cluster(si, offset);
   3452
   3453	count = swap_count(si->swap_map[offset]);
   3454
   3455	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
   3456		/*
   3457		 * The higher the swap count, the more likely it is that tasks
   3458		 * will race to add swap count continuation: we need to avoid
   3459		 * over-provisioning.
   3460		 */
   3461		goto out;
   3462	}
   3463
   3464	if (!page) {
   3465		ret = -ENOMEM;
   3466		goto out;
   3467	}
   3468
   3469	/*
   3470	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
   3471	 * no architecture is using highmem pages for kernel page tables: so it
   3472	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
   3473	 */
   3474	head = vmalloc_to_page(si->swap_map + offset);
   3475	offset &= ~PAGE_MASK;
   3476
   3477	spin_lock(&si->cont_lock);
   3478	/*
   3479	 * Page allocation does not initialize the page's lru field,
   3480	 * but it does always reset its private field.
   3481	 */
   3482	if (!page_private(head)) {
   3483		BUG_ON(count & COUNT_CONTINUED);
   3484		INIT_LIST_HEAD(&head->lru);
   3485		set_page_private(head, SWP_CONTINUED);
   3486		si->flags |= SWP_CONTINUED;
   3487	}
   3488
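        	/*
        	 * Look through the continuation pages already chained to head for
        	 * one whose digit at this offset is not yet needed or still has
        	 * room; only if they are all full is the freshly allocated page
        	 * linked in below.
        	 */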
   3489	list_for_each_entry(list_page, &head->lru, lru) {
   3490		unsigned char *map;
   3491
   3492		/*
   3493		 * If the previous map said no continuation, but we've found
   3494		 * a continuation page, free our allocation and use this one.
   3495		 */
   3496		if (!(count & COUNT_CONTINUED))
   3497			goto out_unlock_cont;
   3498
   3499		map = kmap_atomic(list_page) + offset;
   3500		count = *map;
   3501		kunmap_atomic(map);
   3502
   3503		/*
   3504		 * If this continuation count now has some space in it,
   3505		 * free our allocation and use this one.
   3506		 */
   3507		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
   3508			goto out_unlock_cont;
   3509	}
   3510
   3511	list_add_tail(&page->lru, &head->lru);
   3512	page = NULL;			/* now it's attached, don't free it */
   3513out_unlock_cont:
   3514	spin_unlock(&si->cont_lock);
   3515out:
   3516	unlock_cluster(ci);
   3517	spin_unlock(&si->lock);
   3518	put_swap_device(si);
   3519outer:
   3520	if (page)
   3521		__free_page(page);
   3522	return ret;
   3523}
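        /*
         * Caller-side sketch (illustrative only, loosely modelled on how
         * swap_duplicate() callers behave; 'ptl' and the 'again' label are
         * hypothetical): the GFP_ATOMIC attempt happens under the page table
         * lock inside swap_duplicate(), and only on failure does the caller
         * drop the lock and retry with GFP_KERNEL as described above.
         *
         *	if (swap_duplicate(entry) == -ENOMEM) {
         *		spin_unlock(ptl);
         *		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
         *			return -ENOMEM;
         *		spin_lock(ptl);
         *		goto again;
         *	}
         */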
   3524
   3525/*
   3526 * swap_count_continued - when the original swap_map count is incremented
   3527 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
   3528 * into, carry if so, or else fail until a new continuation page is allocated;
   3529 * when the original swap_map count is decremented from 0 with continuation,
   3530 * borrow from the continuation and report whether it still holds more.
   3531 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
   3532 * lock.
   3533 */
   3534static bool swap_count_continued(struct swap_info_struct *si,
   3535				 pgoff_t offset, unsigned char count)
   3536{
   3537	struct page *head;
   3538	struct page *page;
   3539	unsigned char *map;
   3540	bool ret;
   3541
   3542	head = vmalloc_to_page(si->swap_map + offset);
   3543	if (page_private(head) != SWP_CONTINUED) {
   3544		BUG_ON(count & COUNT_CONTINUED);
   3545		return false;		/* need to add count continuation */
   3546	}
   3547
   3548	spin_lock(&si->cont_lock);
   3549	offset &= ~PAGE_MASK;
   3550	page = list_next_entry(head, lru);
   3551	map = kmap_atomic(page) + offset;
   3552
   3553	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
   3554		goto init_map;		/* jump over SWAP_CONT_MAX checks */
   3555
   3556	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
   3557		/*
   3558		 * Think of how you add 1 to 999
   3559		 */
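        		/*
        		 * In that analogy a digit of SWAP_CONT_MAX plays the role of
        		 * a 9: skip past full digits, bump the first one with room (or
        		 * fail so that a new continuation page gets added), then reset
        		 * the lower digits on the way back, as 999 + 1 becomes 1000.
        		 */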
   3560		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
   3561			kunmap_atomic(map);
   3562			page = list_next_entry(page, lru);
   3563			BUG_ON(page == head);
   3564			map = kmap_atomic(page) + offset;
   3565		}
   3566		if (*map == SWAP_CONT_MAX) {
   3567			kunmap_atomic(map);
   3568			page = list_next_entry(page, lru);
   3569			if (page == head) {
   3570				ret = false;	/* add count continuation */
   3571				goto out;
   3572			}
   3573			map = kmap_atomic(page) + offset;
   3574init_map:		*map = 0;		/* we didn't zero the page */
   3575		}
   3576		*map += 1;
   3577		kunmap_atomic(map);
   3578		while ((page = list_prev_entry(page, lru)) != head) {
   3579			map = kmap_atomic(page) + offset;
   3580			*map = COUNT_CONTINUED;
   3581			kunmap_atomic(map);
   3582		}
   3583		ret = true;			/* incremented */
   3584
   3585	} else {				/* decrementing */
   3586		/*
   3587		 * Think of how you subtract 1 from 1000
   3588		 */
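        		/*
        		 * Mirror image of the carry above: digits reading
        		 * COUNT_CONTINUED (0, with a higher digit) are skipped, the
        		 * first non-zero digit is decremented, and lower digits are
        		 * refilled to SWAP_CONT_MAX on the way back, as 1000 - 1
        		 * becomes 999.
        		 */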
   3589		BUG_ON(count != COUNT_CONTINUED);
   3590		while (*map == COUNT_CONTINUED) {
   3591			kunmap_atomic(map);
   3592			page = list_next_entry(page, lru);
   3593			BUG_ON(page == head);
   3594			map = kmap_atomic(page) + offset;
   3595		}
   3596		BUG_ON(*map == 0);
   3597		*map -= 1;
   3598		if (*map == 0)
   3599			count = 0;
   3600		kunmap_atomic(map);
   3601		while ((page = list_prev_entry(page, lru)) != head) {
   3602			map = kmap_atomic(page) + offset;
   3603			*map = SWAP_CONT_MAX | count;
   3604			count = COUNT_CONTINUED;
   3605			kunmap_atomic(map);
   3606		}
   3607		ret = count == COUNT_CONTINUED;
   3608	}
   3609out:
   3610	spin_unlock(&si->cont_lock);
   3611	return ret;
   3612}
   3613
   3614/*
    3615 * free_swap_count_continuations - called at swapoff time to free all the
    3616 * continuation pages appended to the quiesced swap_map, before vfree'ing it.
   3617 */
   3618static void free_swap_count_continuations(struct swap_info_struct *si)
   3619{
   3620	pgoff_t offset;
   3621
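        	/*
        	 * Each page of the vmalloc'ed swap_map covers PAGE_SIZE one-byte
        	 * counts, so stepping offset by PAGE_SIZE visits every swap_map
        	 * page (and so every potential continuation list head) once.
        	 */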
   3622	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
   3623		struct page *head;
   3624		head = vmalloc_to_page(si->swap_map + offset);
   3625		if (page_private(head)) {
   3626			struct page *page, *next;
   3627
   3628			list_for_each_entry_safe(page, next, &head->lru, lru) {
   3629				list_del(&page->lru);
   3630				__free_page(page);
   3631			}
   3632		}
   3633	}
   3634}
   3635
   3636#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
   3637void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
   3638{
   3639	struct swap_info_struct *si, *next;
   3640	int nid = page_to_nid(page);
   3641
   3642	if (!(gfp_mask & __GFP_IO))
   3643		return;
   3644
   3645	if (!blk_cgroup_congested())
   3646		return;
   3647
   3648	/*
    3649	 * We've already scheduled a throttle; avoid taking the global swap
   3650	 * lock.
   3651	 */
   3652	if (current->throttle_queue)
   3653		return;
   3654
   3655	spin_lock(&swap_avail_lock);
   3656	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
   3657				  avail_lists[nid]) {
   3658		if (si->bdev) {
   3659			blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
   3660			break;
   3661		}
   3662	}
   3663	spin_unlock(&swap_avail_lock);
   3664}
   3665#endif
   3666
   3667static int __init swapfile_init(void)
   3668{
   3669	int nid;
   3670
   3671	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
   3672					 GFP_KERNEL);
   3673	if (!swap_avail_heads) {
   3674		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
   3675		return -ENOMEM;
   3676	}
   3677
   3678	for_each_node(nid)
   3679		plist_head_init(&swap_avail_heads[nid]);
   3680
   3681	return 0;
   3682}
   3683subsys_initcall(swapfile_init);