shmem.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
shmem.c (111153B)
      1/*
      2 * Resizable virtual memory filesystem for Linux.
      3 *
      4 * Copyright (C) 2000 Linus Torvalds.
      5 *		 2000 Transmeta Corp.
      6 *		 2000-2001 Christoph Rohland
      7 *		 2000-2001 SAP AG
      8 *		 2002 Red Hat Inc.
      9 * Copyright (C) 2002-2011 Hugh Dickins.
     10 * Copyright (C) 2011 Google Inc.
     11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
     12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
     13 *
     14 * Extended attribute support for tmpfs:
     15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
     16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
     17 *
     18 * tiny-shmem:
     19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
     20 *
     21 * This file is released under the GPL.
     22 */
     23
     24#include <linux/fs.h>
     25#include <linux/init.h>
     26#include <linux/vfs.h>
     27#include <linux/mount.h>
     28#include <linux/ramfs.h>
     29#include <linux/pagemap.h>
     30#include <linux/file.h>
     31#include <linux/mm.h>
     32#include <linux/random.h>
     33#include <linux/sched/signal.h>
     34#include <linux/export.h>
     35#include <linux/swap.h>
     36#include <linux/uio.h>
     37#include <linux/hugetlb.h>
     38#include <linux/fs_parser.h>
     39#include <linux/swapfile.h>
     40#include "swap.h"
     41
     42static struct vfsmount *shm_mnt;
     43
     44#ifdef CONFIG_SHMEM
     45/*
     46 * This virtual memory filesystem is heavily based on the ramfs. It
     47 * extends ramfs by the ability to use swap and honor resource limits
     48 * which makes it a completely usable filesystem.
     49 */
     50
     51#include <linux/xattr.h>
     52#include <linux/exportfs.h>
     53#include <linux/posix_acl.h>
     54#include <linux/posix_acl_xattr.h>
     55#include <linux/mman.h>
     56#include <linux/string.h>
     57#include <linux/slab.h>
     58#include <linux/backing-dev.h>
     59#include <linux/shmem_fs.h>
     60#include <linux/writeback.h>
     61#include <linux/pagevec.h>
     62#include <linux/percpu_counter.h>
     63#include <linux/falloc.h>
     64#include <linux/splice.h>
     65#include <linux/security.h>
     66#include <linux/swapops.h>
     67#include <linux/mempolicy.h>
     68#include <linux/namei.h>
     69#include <linux/ctype.h>
     70#include <linux/migrate.h>
     71#include <linux/highmem.h>
     72#include <linux/seq_file.h>
     73#include <linux/magic.h>
     74#include <linux/syscalls.h>
     75#include <linux/fcntl.h>
     76#include <uapi/linux/memfd.h>
     77#include <linux/userfaultfd_k.h>
     78#include <linux/rmap.h>
     79#include <linux/uuid.h>
     80
     81#include <linux/uaccess.h>
     82
     83#include "internal.h"
     84
     85#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
     86#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
     87
     88/* Pretend that each entry is of this size in directory's i_size */
     89#define BOGO_DIRENT_SIZE 20
     90
     91/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
     92#define SHORT_SYMLINK_LEN 128
     93
     94/*
     95 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
     96 * inode->i_private (with i_rwsem making sure that it has only one user at
     97 * a time): we would prefer not to enlarge the shmem inode just for that.
     98 */
     99struct shmem_falloc {
    100	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
    101	pgoff_t start;		/* start of range currently being fallocated */
    102	pgoff_t next;		/* the next page offset to be fallocated */
    103	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
    104	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
    105};
    106
    107struct shmem_options {
    108	unsigned long long blocks;
    109	unsigned long long inodes;
    110	struct mempolicy *mpol;
    111	kuid_t uid;
    112	kgid_t gid;
    113	umode_t mode;
    114	bool full_inums;
    115	int huge;
    116	int seen;
    117#define SHMEM_SEEN_BLOCKS 1
    118#define SHMEM_SEEN_INODES 2
    119#define SHMEM_SEEN_HUGE 4
    120#define SHMEM_SEEN_INUMS 8
    121};
    122
    123#ifdef CONFIG_TMPFS
    124static unsigned long shmem_default_max_blocks(void)
    125{
    126	return totalram_pages() / 2;
    127}
    128
    129static unsigned long shmem_default_max_inodes(void)
    130{
    131	unsigned long nr_pages = totalram_pages();
    132
    133	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
    134}
    135#endif
    136
    137static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
    138			     struct folio **foliop, enum sgp_type sgp,
    139			     gfp_t gfp, struct vm_area_struct *vma,
    140			     vm_fault_t *fault_type);
    141static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
    142		struct page **pagep, enum sgp_type sgp,
    143		gfp_t gfp, struct vm_area_struct *vma,
    144		struct vm_fault *vmf, vm_fault_t *fault_type);
    145
    146int shmem_getpage(struct inode *inode, pgoff_t index,
    147		struct page **pagep, enum sgp_type sgp)
    148{
    149	return shmem_getpage_gfp(inode, index, pagep, sgp,
    150		mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
    151}
    152
    153static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
    154{
    155	return sb->s_fs_info;
    156}
    157
    158/*
    159 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
    160 * for shared memory and for shared anonymous (/dev/zero) mappings
    161 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
    162 * consistent with the pre-accounting of private mappings ...
    163 */
    164static inline int shmem_acct_size(unsigned long flags, loff_t size)
    165{
    166	return (flags & VM_NORESERVE) ?
    167		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
    168}
    169
    170static inline void shmem_unacct_size(unsigned long flags, loff_t size)
    171{
    172	if (!(flags & VM_NORESERVE))
    173		vm_unacct_memory(VM_ACCT(size));
    174}
    175
    176static inline int shmem_reacct_size(unsigned long flags,
    177		loff_t oldsize, loff_t newsize)
    178{
    179	if (!(flags & VM_NORESERVE)) {
    180		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
    181			return security_vm_enough_memory_mm(current->mm,
    182					VM_ACCT(newsize) - VM_ACCT(oldsize));
    183		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
    184			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
    185	}
    186	return 0;
    187}
    188
    189/*
    190 * ... whereas tmpfs objects are accounted incrementally as
    191 * pages are allocated, in order to allow large sparse files.
    192 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
    193 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
    194 */
    195static inline int shmem_acct_block(unsigned long flags, long pages)
    196{
    197	if (!(flags & VM_NORESERVE))
    198		return 0;
    199
    200	return security_vm_enough_memory_mm(current->mm,
    201			pages * VM_ACCT(PAGE_SIZE));
    202}
    203
    204static inline void shmem_unacct_blocks(unsigned long flags, long pages)
    205{
    206	if (flags & VM_NORESERVE)
    207		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
    208}
    209
    210static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
    211{
    212	struct shmem_inode_info *info = SHMEM_I(inode);
    213	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
    214
    215	if (shmem_acct_block(info->flags, pages))
    216		return false;
    217
    218	if (sbinfo->max_blocks) {
    219		if (percpu_counter_compare(&sbinfo->used_blocks,
    220					   sbinfo->max_blocks - pages) > 0)
    221			goto unacct;
    222		percpu_counter_add(&sbinfo->used_blocks, pages);
    223	}
    224
    225	return true;
    226
    227unacct:
    228	shmem_unacct_blocks(info->flags, pages);
    229	return false;
    230}
    231
    232static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
    233{
    234	struct shmem_inode_info *info = SHMEM_I(inode);
    235	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
    236
    237	if (sbinfo->max_blocks)
    238		percpu_counter_sub(&sbinfo->used_blocks, pages);
    239	shmem_unacct_blocks(info->flags, pages);
    240}
    241
    242static const struct super_operations shmem_ops;
    243const struct address_space_operations shmem_aops;
    244static const struct file_operations shmem_file_operations;
    245static const struct inode_operations shmem_inode_operations;
    246static const struct inode_operations shmem_dir_inode_operations;
    247static const struct inode_operations shmem_special_inode_operations;
    248static const struct vm_operations_struct shmem_vm_ops;
    249static struct file_system_type shmem_fs_type;
    250
    251bool vma_is_shmem(struct vm_area_struct *vma)
    252{
    253	return vma->vm_ops == &shmem_vm_ops;
    254}
    255
    256static LIST_HEAD(shmem_swaplist);
    257static DEFINE_MUTEX(shmem_swaplist_mutex);
    258
    259/*
    260 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
    261 * produces a novel ino for the newly allocated inode.
    262 *
    263 * It may also be called when making a hard link to permit the space needed by
    264 * each dentry. However, in that case, no new inode number is needed since that
    265 * internally draws from another pool of inode numbers (currently global
    266 * get_next_ino()). This case is indicated by passing NULL as inop.
    267 */
    268#define SHMEM_INO_BATCH 1024
    269static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
    270{
    271	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
    272	ino_t ino;
    273
    274	if (!(sb->s_flags & SB_KERNMOUNT)) {
    275		raw_spin_lock(&sbinfo->stat_lock);
    276		if (sbinfo->max_inodes) {
    277			if (!sbinfo->free_inodes) {
    278				raw_spin_unlock(&sbinfo->stat_lock);
    279				return -ENOSPC;
    280			}
    281			sbinfo->free_inodes--;
    282		}
    283		if (inop) {
    284			ino = sbinfo->next_ino++;
    285			if (unlikely(is_zero_ino(ino)))
    286				ino = sbinfo->next_ino++;
    287			if (unlikely(!sbinfo->full_inums &&
    288				     ino > UINT_MAX)) {
    289				/*
    290				 * Emulate get_next_ino uint wraparound for
    291				 * compatibility
    292				 */
    293				if (IS_ENABLED(CONFIG_64BIT))
    294					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
    295						__func__, MINOR(sb->s_dev));
    296				sbinfo->next_ino = 1;
    297				ino = sbinfo->next_ino++;
    298			}
    299			*inop = ino;
    300		}
    301		raw_spin_unlock(&sbinfo->stat_lock);
    302	} else if (inop) {
    303		/*
    304		 * __shmem_file_setup, one of our callers, is lock-free: it
    305		 * doesn't hold stat_lock in shmem_reserve_inode since
    306		 * max_inodes is always 0, and is called from potentially
    307		 * unknown contexts. As such, use a per-cpu batched allocator
    308		 * which doesn't require the per-sb stat_lock unless we are at
    309		 * the batch boundary.
    310		 *
    311		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
    312		 * shmem mounts are not exposed to userspace, so we don't need
    313		 * to worry about things like glibc compatibility.
    314		 */
    315		ino_t *next_ino;
    316
    317		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
    318		ino = *next_ino;
    319		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
    320			raw_spin_lock(&sbinfo->stat_lock);
    321			ino = sbinfo->next_ino;
    322			sbinfo->next_ino += SHMEM_INO_BATCH;
    323			raw_spin_unlock(&sbinfo->stat_lock);
    324			if (unlikely(is_zero_ino(ino)))
    325				ino++;
    326		}
    327		*inop = ino;
    328		*next_ino = ++ino;
    329		put_cpu();
    330	}
    331
    332	return 0;
    333}
    334
    335static void shmem_free_inode(struct super_block *sb)
    336{
    337	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
    338	if (sbinfo->max_inodes) {
    339		raw_spin_lock(&sbinfo->stat_lock);
    340		sbinfo->free_inodes++;
    341		raw_spin_unlock(&sbinfo->stat_lock);
    342	}
    343}
    344
    345/**
    346 * shmem_recalc_inode - recalculate the block usage of an inode
    347 * @inode: inode to recalc
    348 *
    349 * We have to calculate the free blocks since the mm can drop
    350 * undirtied hole pages behind our back.
    351 *
    352 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
    353 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
    354 *
    355 * It has to be called with the spinlock held.
    356 */
    357static void shmem_recalc_inode(struct inode *inode)
    358{
    359	struct shmem_inode_info *info = SHMEM_I(inode);
    360	long freed;
    361
    362	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
    363	if (freed > 0) {
    364		info->alloced -= freed;
    365		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
    366		shmem_inode_unacct_blocks(inode, freed);
    367	}
    368}
    369
    370bool shmem_charge(struct inode *inode, long pages)
    371{
    372	struct shmem_inode_info *info = SHMEM_I(inode);
    373	unsigned long flags;
    374
    375	if (!shmem_inode_acct_block(inode, pages))
    376		return false;
    377
    378	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
    379	inode->i_mapping->nrpages += pages;
    380
    381	spin_lock_irqsave(&info->lock, flags);
    382	info->alloced += pages;
    383	inode->i_blocks += pages * BLOCKS_PER_PAGE;
    384	shmem_recalc_inode(inode);
    385	spin_unlock_irqrestore(&info->lock, flags);
    386
    387	return true;
    388}
    389
    390void shmem_uncharge(struct inode *inode, long pages)
    391{
    392	struct shmem_inode_info *info = SHMEM_I(inode);
    393	unsigned long flags;
    394
    395	/* nrpages adjustment done by __delete_from_page_cache() or caller */
    396
    397	spin_lock_irqsave(&info->lock, flags);
    398	info->alloced -= pages;
    399	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
    400	shmem_recalc_inode(inode);
    401	spin_unlock_irqrestore(&info->lock, flags);
    402
    403	shmem_inode_unacct_blocks(inode, pages);
    404}
    405
    406/*
    407 * Replace item expected in xarray by a new item, while holding xa_lock.
    408 */
    409static int shmem_replace_entry(struct address_space *mapping,
    410			pgoff_t index, void *expected, void *replacement)
    411{
    412	XA_STATE(xas, &mapping->i_pages, index);
    413	void *item;
    414
    415	VM_BUG_ON(!expected);
    416	VM_BUG_ON(!replacement);
    417	item = xas_load(&xas);
    418	if (item != expected)
    419		return -ENOENT;
    420	xas_store(&xas, replacement);
    421	return 0;
    422}
    423
    424/*
    425 * Sometimes, before we decide whether to proceed or to fail, we must check
    426 * that an entry was not already brought back from swap by a racing thread.
    427 *
    428 * Checking page is not enough: by the time a SwapCache page is locked, it
    429 * might be reused, and again be SwapCache, using the same swap as before.
    430 */
    431static bool shmem_confirm_swap(struct address_space *mapping,
    432			       pgoff_t index, swp_entry_t swap)
    433{
    434	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
    435}
    436
    437/*
    438 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
    439 *
    440 * SHMEM_HUGE_NEVER:
    441 *	disables huge pages for the mount;
    442 * SHMEM_HUGE_ALWAYS:
    443 *	enables huge pages for the mount;
    444 * SHMEM_HUGE_WITHIN_SIZE:
    445 *	only allocate huge pages if the page will be fully within i_size,
    446 *	also respect fadvise()/madvise() hints;
    447 * SHMEM_HUGE_ADVISE:
    448 *	only allocate huge pages if requested with fadvise()/madvise();
    449 */
    450
    451#define SHMEM_HUGE_NEVER	0
    452#define SHMEM_HUGE_ALWAYS	1
    453#define SHMEM_HUGE_WITHIN_SIZE	2
    454#define SHMEM_HUGE_ADVISE	3
    455
    456/*
    457 * Special values.
    458 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
    459 *
    460 * SHMEM_HUGE_DENY:
    461 *	disables huge on shm_mnt and all mounts, for emergency use;
    462 * SHMEM_HUGE_FORCE:
    463 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
    464 *
    465 */
    466#define SHMEM_HUGE_DENY		(-1)
    467#define SHMEM_HUGE_FORCE	(-2)
    468
    469#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    470/* ifdef here to avoid bloating shmem.o when not necessary */
    471
    472static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
    473
    474bool shmem_is_huge(struct vm_area_struct *vma,
    475		   struct inode *inode, pgoff_t index)
    476{
    477	loff_t i_size;
    478
    479	if (!S_ISREG(inode->i_mode))
    480		return false;
    481	if (shmem_huge == SHMEM_HUGE_DENY)
    482		return false;
    483	if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
    484	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
    485		return false;
    486	if (shmem_huge == SHMEM_HUGE_FORCE)
    487		return true;
    488
    489	switch (SHMEM_SB(inode->i_sb)->huge) {
    490	case SHMEM_HUGE_ALWAYS:
    491		return true;
    492	case SHMEM_HUGE_WITHIN_SIZE:
    493		index = round_up(index + 1, HPAGE_PMD_NR);
    494		i_size = round_up(i_size_read(inode), PAGE_SIZE);
    495		if (i_size >> PAGE_SHIFT >= index)
    496			return true;
    497		fallthrough;
    498	case SHMEM_HUGE_ADVISE:
    499		if (vma && (vma->vm_flags & VM_HUGEPAGE))
    500			return true;
    501		fallthrough;
    502	default:
    503		return false;
    504	}
    505}
    506
    507#if defined(CONFIG_SYSFS)
    508static int shmem_parse_huge(const char *str)
    509{
    510	if (!strcmp(str, "never"))
    511		return SHMEM_HUGE_NEVER;
    512	if (!strcmp(str, "always"))
    513		return SHMEM_HUGE_ALWAYS;
    514	if (!strcmp(str, "within_size"))
    515		return SHMEM_HUGE_WITHIN_SIZE;
    516	if (!strcmp(str, "advise"))
    517		return SHMEM_HUGE_ADVISE;
    518	if (!strcmp(str, "deny"))
    519		return SHMEM_HUGE_DENY;
    520	if (!strcmp(str, "force"))
    521		return SHMEM_HUGE_FORCE;
    522	return -EINVAL;
    523}
    524#endif
    525
    526#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
    527static const char *shmem_format_huge(int huge)
    528{
    529	switch (huge) {
    530	case SHMEM_HUGE_NEVER:
    531		return "never";
    532	case SHMEM_HUGE_ALWAYS:
    533		return "always";
    534	case SHMEM_HUGE_WITHIN_SIZE:
    535		return "within_size";
    536	case SHMEM_HUGE_ADVISE:
    537		return "advise";
    538	case SHMEM_HUGE_DENY:
    539		return "deny";
    540	case SHMEM_HUGE_FORCE:
    541		return "force";
    542	default:
    543		VM_BUG_ON(1);
    544		return "bad_val";
    545	}
    546}
    547#endif
    548
    549static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
    550		struct shrink_control *sc, unsigned long nr_to_split)
    551{
    552	LIST_HEAD(list), *pos, *next;
    553	LIST_HEAD(to_remove);
    554	struct inode *inode;
    555	struct shmem_inode_info *info;
    556	struct folio *folio;
    557	unsigned long batch = sc ? sc->nr_to_scan : 128;
    558	int split = 0;
    559
    560	if (list_empty(&sbinfo->shrinklist))
    561		return SHRINK_STOP;
    562
    563	spin_lock(&sbinfo->shrinklist_lock);
    564	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
    565		info = list_entry(pos, struct shmem_inode_info, shrinklist);
    566
    567		/* pin the inode */
    568		inode = igrab(&info->vfs_inode);
    569
    570		/* inode is about to be evicted */
    571		if (!inode) {
    572			list_del_init(&info->shrinklist);
    573			goto next;
    574		}
    575
    576		/* Check if there's anything to gain */
    577		if (round_up(inode->i_size, PAGE_SIZE) ==
    578				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
    579			list_move(&info->shrinklist, &to_remove);
    580			goto next;
    581		}
    582
    583		list_move(&info->shrinklist, &list);
    584next:
    585		sbinfo->shrinklist_len--;
    586		if (!--batch)
    587			break;
    588	}
    589	spin_unlock(&sbinfo->shrinklist_lock);
    590
    591	list_for_each_safe(pos, next, &to_remove) {
    592		info = list_entry(pos, struct shmem_inode_info, shrinklist);
    593		inode = &info->vfs_inode;
    594		list_del_init(&info->shrinklist);
    595		iput(inode);
    596	}
    597
    598	list_for_each_safe(pos, next, &list) {
    599		int ret;
    600		pgoff_t index;
    601
    602		info = list_entry(pos, struct shmem_inode_info, shrinklist);
    603		inode = &info->vfs_inode;
    604
    605		if (nr_to_split && split >= nr_to_split)
    606			goto move_back;
    607
    608		index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
    609		folio = filemap_get_folio(inode->i_mapping, index);
    610		if (!folio)
    611			goto drop;
    612
    613		/* No huge page at the end of the file: nothing to split */
    614		if (!folio_test_large(folio)) {
    615			folio_put(folio);
    616			goto drop;
    617		}
    618
    619		/*
    620		 * Move the inode on the list back to shrinklist if we failed
    621		 * to lock the page at this time.
    622		 *
    623		 * Waiting for the lock may lead to deadlock in the
    624		 * reclaim path.
    625		 */
    626		if (!folio_trylock(folio)) {
    627			folio_put(folio);
    628			goto move_back;
    629		}
    630
    631		ret = split_huge_page(&folio->page);
    632		folio_unlock(folio);
    633		folio_put(folio);
    634
    635		/* If split failed move the inode on the list back to shrinklist */
    636		if (ret)
    637			goto move_back;
    638
    639		split++;
    640drop:
    641		list_del_init(&info->shrinklist);
    642		goto put;
    643move_back:
    644		/*
    645		 * Make sure the inode is either on the global list or deleted
    646		 * from any local list before iput() since it could be deleted
    647		 * in another thread once we put the inode (then the local list
    648		 * is corrupted).
    649		 */
    650		spin_lock(&sbinfo->shrinklist_lock);
    651		list_move(&info->shrinklist, &sbinfo->shrinklist);
    652		sbinfo->shrinklist_len++;
    653		spin_unlock(&sbinfo->shrinklist_lock);
    654put:
    655		iput(inode);
    656	}
    657
    658	return split;
    659}
    660
    661static long shmem_unused_huge_scan(struct super_block *sb,
    662		struct shrink_control *sc)
    663{
    664	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
    665
    666	if (!READ_ONCE(sbinfo->shrinklist_len))
    667		return SHRINK_STOP;
    668
    669	return shmem_unused_huge_shrink(sbinfo, sc, 0);
    670}
    671
    672static long shmem_unused_huge_count(struct super_block *sb,
    673		struct shrink_control *sc)
    674{
    675	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
    676	return READ_ONCE(sbinfo->shrinklist_len);
    677}
    678#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
    679
    680#define shmem_huge SHMEM_HUGE_DENY
    681
    682bool shmem_is_huge(struct vm_area_struct *vma,
    683		   struct inode *inode, pgoff_t index)
    684{
    685	return false;
    686}
    687
    688static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
    689		struct shrink_control *sc, unsigned long nr_to_split)
    690{
    691	return 0;
    692}
    693#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    694
    695/*
    696 * Like add_to_page_cache_locked, but error if expected item has gone.
    697 */
    698static int shmem_add_to_page_cache(struct folio *folio,
    699				   struct address_space *mapping,
    700				   pgoff_t index, void *expected, gfp_t gfp,
    701				   struct mm_struct *charge_mm)
    702{
    703	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
    704	long nr = folio_nr_pages(folio);
    705	int error;
    706
    707	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
    708	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    709	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
    710	VM_BUG_ON(expected && folio_test_large(folio));
    711
    712	folio_ref_add(folio, nr);
    713	folio->mapping = mapping;
    714	folio->index = index;
    715
    716	if (!folio_test_swapcache(folio)) {
    717		error = mem_cgroup_charge(folio, charge_mm, gfp);
    718		if (error) {
    719			if (folio_test_pmd_mappable(folio)) {
    720				count_vm_event(THP_FILE_FALLBACK);
    721				count_vm_event(THP_FILE_FALLBACK_CHARGE);
    722			}
    723			goto error;
    724		}
    725	}
    726	folio_throttle_swaprate(folio, gfp);
    727
    728	do {
    729		xas_lock_irq(&xas);
    730		if (expected != xas_find_conflict(&xas)) {
    731			xas_set_err(&xas, -EEXIST);
    732			goto unlock;
    733		}
    734		if (expected && xas_find_conflict(&xas)) {
    735			xas_set_err(&xas, -EEXIST);
    736			goto unlock;
    737		}
    738		xas_store(&xas, folio);
    739		if (xas_error(&xas))
    740			goto unlock;
    741		if (folio_test_pmd_mappable(folio)) {
    742			count_vm_event(THP_FILE_ALLOC);
    743			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
    744		}
    745		mapping->nrpages += nr;
    746		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
    747		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
    748unlock:
    749		xas_unlock_irq(&xas);
    750	} while (xas_nomem(&xas, gfp));
    751
    752	if (xas_error(&xas)) {
    753		error = xas_error(&xas);
    754		goto error;
    755	}
    756
    757	return 0;
    758error:
    759	folio->mapping = NULL;
    760	folio_ref_sub(folio, nr);
    761	return error;
    762}
    763
    764/*
    765 * Like delete_from_page_cache, but substitutes swap for page.
    766 */
    767static void shmem_delete_from_page_cache(struct page *page, void *radswap)
    768{
    769	struct address_space *mapping = page->mapping;
    770	int error;
    771
    772	VM_BUG_ON_PAGE(PageCompound(page), page);
    773
    774	xa_lock_irq(&mapping->i_pages);
    775	error = shmem_replace_entry(mapping, page->index, page, radswap);
    776	page->mapping = NULL;
    777	mapping->nrpages--;
    778	__dec_lruvec_page_state(page, NR_FILE_PAGES);
    779	__dec_lruvec_page_state(page, NR_SHMEM);
    780	xa_unlock_irq(&mapping->i_pages);
    781	put_page(page);
    782	BUG_ON(error);
    783}
    784
    785/*
    786 * Remove swap entry from page cache, free the swap and its page cache.
    787 */
    788static int shmem_free_swap(struct address_space *mapping,
    789			   pgoff_t index, void *radswap)
    790{
    791	void *old;
    792
    793	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
    794	if (old != radswap)
    795		return -ENOENT;
    796	free_swap_and_cache(radix_to_swp_entry(radswap));
    797	return 0;
    798}
    799
    800/*
    801 * Determine (in bytes) how many of the shmem object's pages mapped by the
    802 * given offsets are swapped out.
    803 *
    804 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
    805 * as long as the inode doesn't go away and racy results are not a problem.
    806 */
    807unsigned long shmem_partial_swap_usage(struct address_space *mapping,
    808						pgoff_t start, pgoff_t end)
    809{
    810	XA_STATE(xas, &mapping->i_pages, start);
    811	struct page *page;
    812	unsigned long swapped = 0;
    813
    814	rcu_read_lock();
    815	xas_for_each(&xas, page, end - 1) {
    816		if (xas_retry(&xas, page))
    817			continue;
    818		if (xa_is_value(page))
    819			swapped++;
    820
    821		if (need_resched()) {
    822			xas_pause(&xas);
    823			cond_resched_rcu();
    824		}
    825	}
    826
    827	rcu_read_unlock();
    828
    829	return swapped << PAGE_SHIFT;
    830}
    831
    832/*
    833 * Determine (in bytes) how many of the shmem object's pages mapped by the
    834 * given vma is swapped out.
    835 *
    836 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
    837 * as long as the inode doesn't go away and racy results are not a problem.
    838 */
    839unsigned long shmem_swap_usage(struct vm_area_struct *vma)
    840{
    841	struct inode *inode = file_inode(vma->vm_file);
    842	struct shmem_inode_info *info = SHMEM_I(inode);
    843	struct address_space *mapping = inode->i_mapping;
    844	unsigned long swapped;
    845
    846	/* Be careful as we don't hold info->lock */
    847	swapped = READ_ONCE(info->swapped);
    848
    849	/*
    850	 * The easier cases are when the shmem object has nothing in swap, or
    851	 * the vma maps it whole. Then we can simply use the stats that we
    852	 * already track.
    853	 */
    854	if (!swapped)
    855		return 0;
    856
    857	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
    858		return swapped << PAGE_SHIFT;
    859
    860	/* Here comes the more involved part */
    861	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
    862					vma->vm_pgoff + vma_pages(vma));
    863}
    864
    865/*
    866 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
    867 */
    868void shmem_unlock_mapping(struct address_space *mapping)
    869{
    870	struct pagevec pvec;
    871	pgoff_t index = 0;
    872
    873	pagevec_init(&pvec);
    874	/*
    875	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
    876	 */
    877	while (!mapping_unevictable(mapping)) {
    878		if (!pagevec_lookup(&pvec, mapping, &index))
    879			break;
    880		check_move_unevictable_pages(&pvec);
    881		pagevec_release(&pvec);
    882		cond_resched();
    883	}
    884}
    885
    886static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
    887{
    888	struct folio *folio;
    889	struct page *page;
    890
    891	/*
    892	 * At first avoid shmem_getpage(,,,SGP_READ): that fails
    893	 * beyond i_size, and reports fallocated pages as holes.
    894	 */
    895	folio = __filemap_get_folio(inode->i_mapping, index,
    896					FGP_ENTRY | FGP_LOCK, 0);
    897	if (!xa_is_value(folio))
    898		return folio;
    899	/*
    900	 * But read a page back from swap if any of it is within i_size
    901	 * (although in some cases this is just a waste of time).
    902	 */
    903	page = NULL;
    904	shmem_getpage(inode, index, &page, SGP_READ);
    905	return page ? page_folio(page) : NULL;
    906}
    907
    908/*
    909 * Remove range of pages and swap entries from page cache, and free them.
    910 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
    911 */
    912static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
    913								 bool unfalloc)
    914{
    915	struct address_space *mapping = inode->i_mapping;
    916	struct shmem_inode_info *info = SHMEM_I(inode);
    917	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
    918	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
    919	struct folio_batch fbatch;
    920	pgoff_t indices[PAGEVEC_SIZE];
    921	struct folio *folio;
    922	bool same_folio;
    923	long nr_swaps_freed = 0;
    924	pgoff_t index;
    925	int i;
    926
    927	if (lend == -1)
    928		end = -1;	/* unsigned, so actually very big */
    929
    930	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
    931		info->fallocend = start;
    932
    933	folio_batch_init(&fbatch);
    934	index = start;
    935	while (index < end && find_lock_entries(mapping, index, end - 1,
    936			&fbatch, indices)) {
    937		for (i = 0; i < folio_batch_count(&fbatch); i++) {
    938			folio = fbatch.folios[i];
    939
    940			index = indices[i];
    941
    942			if (xa_is_value(folio)) {
    943				if (unfalloc)
    944					continue;
    945				nr_swaps_freed += !shmem_free_swap(mapping,
    946								index, folio);
    947				continue;
    948			}
    949			index += folio_nr_pages(folio) - 1;
    950
    951			if (!unfalloc || !folio_test_uptodate(folio))
    952				truncate_inode_folio(mapping, folio);
    953			folio_unlock(folio);
    954		}
    955		folio_batch_remove_exceptionals(&fbatch);
    956		folio_batch_release(&fbatch);
    957		cond_resched();
    958		index++;
    959	}
    960
    961	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
    962	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
    963	if (folio) {
    964		same_folio = lend < folio_pos(folio) + folio_size(folio);
    965		folio_mark_dirty(folio);
    966		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
    967			start = folio->index + folio_nr_pages(folio);
    968			if (same_folio)
    969				end = folio->index;
    970		}
    971		folio_unlock(folio);
    972		folio_put(folio);
    973		folio = NULL;
    974	}
    975
    976	if (!same_folio)
    977		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
    978	if (folio) {
    979		folio_mark_dirty(folio);
    980		if (!truncate_inode_partial_folio(folio, lstart, lend))
    981			end = folio->index;
    982		folio_unlock(folio);
    983		folio_put(folio);
    984	}
    985
    986	index = start;
    987	while (index < end) {
    988		cond_resched();
    989
    990		if (!find_get_entries(mapping, index, end - 1, &fbatch,
    991				indices)) {
    992			/* If all gone or hole-punch or unfalloc, we're done */
    993			if (index == start || end != -1)
    994				break;
    995			/* But if truncating, restart to make sure all gone */
    996			index = start;
    997			continue;
    998		}
    999		for (i = 0; i < folio_batch_count(&fbatch); i++) {
   1000			folio = fbatch.folios[i];
   1001
   1002			index = indices[i];
   1003			if (xa_is_value(folio)) {
   1004				if (unfalloc)
   1005					continue;
   1006				if (shmem_free_swap(mapping, index, folio)) {
   1007					/* Swap was replaced by page: retry */
   1008					index--;
   1009					break;
   1010				}
   1011				nr_swaps_freed++;
   1012				continue;
   1013			}
   1014
   1015			folio_lock(folio);
   1016
   1017			if (!unfalloc || !folio_test_uptodate(folio)) {
   1018				if (folio_mapping(folio) != mapping) {
   1019					/* Page was replaced by swap: retry */
   1020					folio_unlock(folio);
   1021					index--;
   1022					break;
   1023				}
   1024				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
   1025						folio);
   1026				truncate_inode_folio(mapping, folio);
   1027			}
   1028			index = folio->index + folio_nr_pages(folio) - 1;
   1029			folio_unlock(folio);
   1030		}
   1031		folio_batch_remove_exceptionals(&fbatch);
   1032		folio_batch_release(&fbatch);
   1033		index++;
   1034	}
   1035
   1036	spin_lock_irq(&info->lock);
   1037	info->swapped -= nr_swaps_freed;
   1038	shmem_recalc_inode(inode);
   1039	spin_unlock_irq(&info->lock);
   1040}
   1041
   1042void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
   1043{
   1044	shmem_undo_range(inode, lstart, lend, false);
   1045	inode->i_ctime = inode->i_mtime = current_time(inode);
   1046}
   1047EXPORT_SYMBOL_GPL(shmem_truncate_range);
   1048
   1049static int shmem_getattr(struct user_namespace *mnt_userns,
   1050			 const struct path *path, struct kstat *stat,
   1051			 u32 request_mask, unsigned int query_flags)
   1052{
   1053	struct inode *inode = path->dentry->d_inode;
   1054	struct shmem_inode_info *info = SHMEM_I(inode);
   1055
   1056	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
   1057		spin_lock_irq(&info->lock);
   1058		shmem_recalc_inode(inode);
   1059		spin_unlock_irq(&info->lock);
   1060	}
   1061	generic_fillattr(&init_user_ns, inode, stat);
   1062
   1063	if (shmem_is_huge(NULL, inode, 0))
   1064		stat->blksize = HPAGE_PMD_SIZE;
   1065
   1066	if (request_mask & STATX_BTIME) {
   1067		stat->result_mask |= STATX_BTIME;
   1068		stat->btime.tv_sec = info->i_crtime.tv_sec;
   1069		stat->btime.tv_nsec = info->i_crtime.tv_nsec;
   1070	}
   1071
   1072	return 0;
   1073}
   1074
   1075static int shmem_setattr(struct user_namespace *mnt_userns,
   1076			 struct dentry *dentry, struct iattr *attr)
   1077{
   1078	struct inode *inode = d_inode(dentry);
   1079	struct shmem_inode_info *info = SHMEM_I(inode);
   1080	int error;
   1081
   1082	error = setattr_prepare(&init_user_ns, dentry, attr);
   1083	if (error)
   1084		return error;
   1085
   1086	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
   1087		loff_t oldsize = inode->i_size;
   1088		loff_t newsize = attr->ia_size;
   1089
   1090		/* protected by i_rwsem */
   1091		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
   1092		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
   1093			return -EPERM;
   1094
   1095		if (newsize != oldsize) {
   1096			error = shmem_reacct_size(SHMEM_I(inode)->flags,
   1097					oldsize, newsize);
   1098			if (error)
   1099				return error;
   1100			i_size_write(inode, newsize);
   1101			inode->i_ctime = inode->i_mtime = current_time(inode);
   1102		}
   1103		if (newsize <= oldsize) {
   1104			loff_t holebegin = round_up(newsize, PAGE_SIZE);
   1105			if (oldsize > holebegin)
   1106				unmap_mapping_range(inode->i_mapping,
   1107							holebegin, 0, 1);
   1108			if (info->alloced)
   1109				shmem_truncate_range(inode,
   1110							newsize, (loff_t)-1);
   1111			/* unmap again to remove racily COWed private pages */
   1112			if (oldsize > holebegin)
   1113				unmap_mapping_range(inode->i_mapping,
   1114							holebegin, 0, 1);
   1115		}
   1116	}
   1117
   1118	setattr_copy(&init_user_ns, inode, attr);
   1119	if (attr->ia_valid & ATTR_MODE)
   1120		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
   1121	return error;
   1122}
   1123
   1124static void shmem_evict_inode(struct inode *inode)
   1125{
   1126	struct shmem_inode_info *info = SHMEM_I(inode);
   1127	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
   1128
   1129	if (shmem_mapping(inode->i_mapping)) {
   1130		shmem_unacct_size(info->flags, inode->i_size);
   1131		inode->i_size = 0;
   1132		mapping_set_exiting(inode->i_mapping);
   1133		shmem_truncate_range(inode, 0, (loff_t)-1);
   1134		if (!list_empty(&info->shrinklist)) {
   1135			spin_lock(&sbinfo->shrinklist_lock);
   1136			if (!list_empty(&info->shrinklist)) {
   1137				list_del_init(&info->shrinklist);
   1138				sbinfo->shrinklist_len--;
   1139			}
   1140			spin_unlock(&sbinfo->shrinklist_lock);
   1141		}
   1142		while (!list_empty(&info->swaplist)) {
   1143			/* Wait while shmem_unuse() is scanning this inode... */
   1144			wait_var_event(&info->stop_eviction,
   1145				       !atomic_read(&info->stop_eviction));
   1146			mutex_lock(&shmem_swaplist_mutex);
   1147			/* ...but beware of the race if we peeked too early */
   1148			if (!atomic_read(&info->stop_eviction))
   1149				list_del_init(&info->swaplist);
   1150			mutex_unlock(&shmem_swaplist_mutex);
   1151		}
   1152	}
   1153
   1154	simple_xattrs_free(&info->xattrs);
   1155	WARN_ON(inode->i_blocks);
   1156	shmem_free_inode(inode->i_sb);
   1157	clear_inode(inode);
   1158}
   1159
   1160static int shmem_find_swap_entries(struct address_space *mapping,
   1161				   pgoff_t start, struct folio_batch *fbatch,
   1162				   pgoff_t *indices, unsigned int type)
   1163{
   1164	XA_STATE(xas, &mapping->i_pages, start);
   1165	struct folio *folio;
   1166	swp_entry_t entry;
   1167
   1168	rcu_read_lock();
   1169	xas_for_each(&xas, folio, ULONG_MAX) {
   1170		if (xas_retry(&xas, folio))
   1171			continue;
   1172
   1173		if (!xa_is_value(folio))
   1174			continue;
   1175
   1176		entry = radix_to_swp_entry(folio);
   1177		/*
   1178		 * swapin error entries can be found in the mapping. But they're
   1179		 * deliberately ignored here as we've done everything we can do.
   1180		 */
   1181		if (swp_type(entry) != type)
   1182			continue;
   1183
   1184		indices[folio_batch_count(fbatch)] = xas.xa_index;
   1185		if (!folio_batch_add(fbatch, folio))
   1186			break;
   1187
   1188		if (need_resched()) {
   1189			xas_pause(&xas);
   1190			cond_resched_rcu();
   1191		}
   1192	}
   1193	rcu_read_unlock();
   1194
   1195	return xas.xa_index;
   1196}
   1197
   1198/*
   1199 * Move the swapped pages for an inode to page cache. Returns the count
   1200 * of pages swapped in, or the error in case of failure.
   1201 */
   1202static int shmem_unuse_swap_entries(struct inode *inode,
   1203		struct folio_batch *fbatch, pgoff_t *indices)
   1204{
   1205	int i = 0;
   1206	int ret = 0;
   1207	int error = 0;
   1208	struct address_space *mapping = inode->i_mapping;
   1209
   1210	for (i = 0; i < folio_batch_count(fbatch); i++) {
   1211		struct folio *folio = fbatch->folios[i];
   1212
   1213		if (!xa_is_value(folio))
   1214			continue;
   1215		error = shmem_swapin_folio(inode, indices[i],
   1216					  &folio, SGP_CACHE,
   1217					  mapping_gfp_mask(mapping),
   1218					  NULL, NULL);
   1219		if (error == 0) {
   1220			folio_unlock(folio);
   1221			folio_put(folio);
   1222			ret++;
   1223		}
   1224		if (error == -ENOMEM)
   1225			break;
   1226		error = 0;
   1227	}
   1228	return error ? error : ret;
   1229}
   1230
   1231/*
   1232 * If swap found in inode, free it and move page from swapcache to filecache.
   1233 */
   1234static int shmem_unuse_inode(struct inode *inode, unsigned int type)
   1235{
   1236	struct address_space *mapping = inode->i_mapping;
   1237	pgoff_t start = 0;
   1238	struct folio_batch fbatch;
   1239	pgoff_t indices[PAGEVEC_SIZE];
   1240	int ret = 0;
   1241
   1242	do {
   1243		folio_batch_init(&fbatch);
   1244		shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
   1245		if (folio_batch_count(&fbatch) == 0) {
   1246			ret = 0;
   1247			break;
   1248		}
   1249
   1250		ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
   1251		if (ret < 0)
   1252			break;
   1253
   1254		start = indices[folio_batch_count(&fbatch) - 1];
   1255	} while (true);
   1256
   1257	return ret;
   1258}
   1259
   1260/*
   1261 * Read all the shared memory data that resides in the swap
   1262 * device 'type' back into memory, so the swap device can be
   1263 * unused.
   1264 */
   1265int shmem_unuse(unsigned int type)
   1266{
   1267	struct shmem_inode_info *info, *next;
   1268	int error = 0;
   1269
   1270	if (list_empty(&shmem_swaplist))
   1271		return 0;
   1272
   1273	mutex_lock(&shmem_swaplist_mutex);
   1274	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
   1275		if (!info->swapped) {
   1276			list_del_init(&info->swaplist);
   1277			continue;
   1278		}
   1279		/*
   1280		 * Drop the swaplist mutex while searching the inode for swap;
   1281		 * but before doing so, make sure shmem_evict_inode() will not
   1282		 * remove placeholder inode from swaplist, nor let it be freed
   1283		 * (igrab() would protect from unlink, but not from unmount).
   1284		 */
   1285		atomic_inc(&info->stop_eviction);
   1286		mutex_unlock(&shmem_swaplist_mutex);
   1287
   1288		error = shmem_unuse_inode(&info->vfs_inode, type);
   1289		cond_resched();
   1290
   1291		mutex_lock(&shmem_swaplist_mutex);
   1292		next = list_next_entry(info, swaplist);
   1293		if (!info->swapped)
   1294			list_del_init(&info->swaplist);
   1295		if (atomic_dec_and_test(&info->stop_eviction))
   1296			wake_up_var(&info->stop_eviction);
   1297		if (error)
   1298			break;
   1299	}
   1300	mutex_unlock(&shmem_swaplist_mutex);
   1301
   1302	return error;
   1303}
   1304
   1305/*
   1306 * Move the page from the page cache to the swap cache.
   1307 */
   1308static int shmem_writepage(struct page *page, struct writeback_control *wbc)
   1309{
   1310	struct folio *folio = page_folio(page);
   1311	struct shmem_inode_info *info;
   1312	struct address_space *mapping;
   1313	struct inode *inode;
   1314	swp_entry_t swap;
   1315	pgoff_t index;
   1316
   1317	/*
   1318	 * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
   1319	 * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
   1320	 * and its shmem_writeback() needs them to be split when swapping.
   1321	 */
   1322	if (PageTransCompound(page)) {
   1323		/* Ensure the subpages are still dirty */
   1324		SetPageDirty(page);
   1325		if (split_huge_page(page) < 0)
   1326			goto redirty;
   1327		ClearPageDirty(page);
   1328	}
   1329
   1330	BUG_ON(!PageLocked(page));
   1331	mapping = page->mapping;
   1332	index = page->index;
   1333	inode = mapping->host;
   1334	info = SHMEM_I(inode);
   1335	if (info->flags & VM_LOCKED)
   1336		goto redirty;
   1337	if (!total_swap_pages)
   1338		goto redirty;
   1339
   1340	/*
   1341	 * Our capabilities prevent regular writeback or sync from ever calling
   1342	 * shmem_writepage; but a stacking filesystem might use ->writepage of
   1343	 * its underlying filesystem, in which case tmpfs should write out to
   1344	 * swap only in response to memory pressure, and not for the writeback
   1345	 * threads or sync.
   1346	 */
   1347	if (!wbc->for_reclaim) {
   1348		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
   1349		goto redirty;
   1350	}
   1351
   1352	/*
   1353	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
   1354	 * value into swapfile.c, the only way we can correctly account for a
   1355	 * fallocated page arriving here is now to initialize it and write it.
   1356	 *
   1357	 * That's okay for a page already fallocated earlier, but if we have
   1358	 * not yet completed the fallocation, then (a) we want to keep track
   1359	 * of this page in case we have to undo it, and (b) it may not be a
   1360	 * good idea to continue anyway, once we're pushing into swap.  So
   1361	 * reactivate the page, and let shmem_fallocate() quit when too many.
   1362	 */
   1363	if (!PageUptodate(page)) {
   1364		if (inode->i_private) {
   1365			struct shmem_falloc *shmem_falloc;
   1366			spin_lock(&inode->i_lock);
   1367			shmem_falloc = inode->i_private;
   1368			if (shmem_falloc &&
   1369			    !shmem_falloc->waitq &&
   1370			    index >= shmem_falloc->start &&
   1371			    index < shmem_falloc->next)
   1372				shmem_falloc->nr_unswapped++;
   1373			else
   1374				shmem_falloc = NULL;
   1375			spin_unlock(&inode->i_lock);
   1376			if (shmem_falloc)
   1377				goto redirty;
   1378		}
   1379		clear_highpage(page);
   1380		flush_dcache_page(page);
   1381		SetPageUptodate(page);
   1382	}
   1383
   1384	swap = folio_alloc_swap(folio);
   1385	if (!swap.val)
   1386		goto redirty;
   1387
   1388	/*
   1389	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
   1390	 * if it's not already there.  Do it now before the page is
   1391	 * moved to swap cache, when its pagelock no longer protects
   1392	 * the inode from eviction.  But don't unlock the mutex until
   1393	 * we've incremented swapped, because shmem_unuse_inode() will
   1394	 * prune a !swapped inode from the swaplist under this mutex.
   1395	 */
   1396	mutex_lock(&shmem_swaplist_mutex);
   1397	if (list_empty(&info->swaplist))
   1398		list_add(&info->swaplist, &shmem_swaplist);
   1399
   1400	if (add_to_swap_cache(page, swap,
   1401			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
   1402			NULL) == 0) {
   1403		spin_lock_irq(&info->lock);
   1404		shmem_recalc_inode(inode);
   1405		info->swapped++;
   1406		spin_unlock_irq(&info->lock);
   1407
   1408		swap_shmem_alloc(swap);
   1409		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
   1410
   1411		mutex_unlock(&shmem_swaplist_mutex);
   1412		BUG_ON(page_mapped(page));
   1413		swap_writepage(page, wbc);
   1414		return 0;
   1415	}
   1416
   1417	mutex_unlock(&shmem_swaplist_mutex);
   1418	put_swap_page(page, swap);
   1419redirty:
   1420	set_page_dirty(page);
   1421	if (wbc->for_reclaim)
   1422		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
   1423	unlock_page(page);
   1424	return 0;
   1425}
   1426
   1427#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
   1428static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
   1429{
   1430	char buffer[64];
   1431
   1432	if (!mpol || mpol->mode == MPOL_DEFAULT)
   1433		return;		/* show nothing */
   1434
   1435	mpol_to_str(buffer, sizeof(buffer), mpol);
   1436
   1437	seq_printf(seq, ",mpol=%s", buffer);
   1438}
   1439
   1440static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
   1441{
   1442	struct mempolicy *mpol = NULL;
   1443	if (sbinfo->mpol) {
   1444		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
   1445		mpol = sbinfo->mpol;
   1446		mpol_get(mpol);
   1447		raw_spin_unlock(&sbinfo->stat_lock);
   1448	}
   1449	return mpol;
   1450}
   1451#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
   1452static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
   1453{
   1454}
   1455static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
   1456{
   1457	return NULL;
   1458}
   1459#endif /* CONFIG_NUMA && CONFIG_TMPFS */
/*
 * Without CONFIG_NUMA, spell vm_policy as vm_private_data so that code in
 * this file which touches vma->vm_policy on its pseudo vmas still builds.
 * NOTE(review): presumably vm_area_struct has no vm_policy member in that
 * configuration - confirm against the struct definition.
 */
#ifndef CONFIG_NUMA
#define vm_policy vm_private_data
#endif
   1463
   1464static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
   1465		struct shmem_inode_info *info, pgoff_t index)
   1466{
   1467	/* Create a pseudo vma that just contains the policy */
   1468	vma_init(vma, NULL);
   1469	/* Bias interleave by inode number to distribute better across nodes */
   1470	vma->vm_pgoff = index + info->vfs_inode.i_ino;
   1471	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
   1472}
   1473
   1474static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
   1475{
   1476	/* Drop reference taken by mpol_shared_policy_lookup() */
   1477	mpol_cond_put(vma->vm_policy);
   1478}
   1479
   1480static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
   1481			struct shmem_inode_info *info, pgoff_t index)
   1482{
   1483	struct vm_area_struct pvma;
   1484	struct page *page;
   1485	struct vm_fault vmf = {
   1486		.vma = &pvma,
   1487	};
   1488
   1489	shmem_pseudo_vma_init(&pvma, info, index);
   1490	page = swap_cluster_readahead(swap, gfp, &vmf);
   1491	shmem_pseudo_vma_destroy(&pvma);
   1492
   1493	return page;
   1494}
   1495
   1496/*
   1497 * Make sure huge_gfp is always more limited than limit_gfp.
   1498 * Some of the flags set permissions, while others set limitations.
   1499 */
   1500static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
   1501{
   1502	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
   1503	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
   1504	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
   1505	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
   1506
   1507	/* Allow allocations only from the originally specified zones. */
   1508	result |= zoneflags;
   1509
   1510	/*
   1511	 * Minimize the result gfp by taking the union with the deny flags,
   1512	 * and the intersection of the allow flags.
   1513	 */
   1514	result |= (limit_gfp & denyflags);
   1515	result |= (huge_gfp & limit_gfp) & allowflags;
   1516
   1517	return result;
   1518}
   1519
   1520static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
   1521		struct shmem_inode_info *info, pgoff_t index)
   1522{
   1523	struct vm_area_struct pvma;
   1524	struct address_space *mapping = info->vfs_inode.i_mapping;
   1525	pgoff_t hindex;
   1526	struct folio *folio;
   1527
   1528	hindex = round_down(index, HPAGE_PMD_NR);
   1529	if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
   1530								XA_PRESENT))
   1531		return NULL;
   1532
   1533	shmem_pseudo_vma_init(&pvma, info, hindex);
   1534	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
   1535	shmem_pseudo_vma_destroy(&pvma);
   1536	if (!folio)
   1537		count_vm_event(THP_FILE_FALLBACK);
   1538	return folio;
   1539}
   1540
   1541static struct folio *shmem_alloc_folio(gfp_t gfp,
   1542			struct shmem_inode_info *info, pgoff_t index)
   1543{
   1544	struct vm_area_struct pvma;
   1545	struct folio *folio;
   1546
   1547	shmem_pseudo_vma_init(&pvma, info, index);
   1548	folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
   1549	shmem_pseudo_vma_destroy(&pvma);
   1550
   1551	return folio;
   1552}
   1553
   1554static struct page *shmem_alloc_page(gfp_t gfp,
   1555			struct shmem_inode_info *info, pgoff_t index)
   1556{
   1557	return &shmem_alloc_folio(gfp, info, index)->page;
   1558}
   1559
   1560static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
   1561		pgoff_t index, bool huge)
   1562{
   1563	struct shmem_inode_info *info = SHMEM_I(inode);
   1564	struct folio *folio;
   1565	int nr;
   1566	int err = -ENOSPC;
   1567
   1568	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
   1569		huge = false;
   1570	nr = huge ? HPAGE_PMD_NR : 1;
   1571
   1572	if (!shmem_inode_acct_block(inode, nr))
   1573		goto failed;
   1574
   1575	if (huge)
   1576		folio = shmem_alloc_hugefolio(gfp, info, index);
   1577	else
   1578		folio = shmem_alloc_folio(gfp, info, index);
   1579	if (folio) {
   1580		__folio_set_locked(folio);
   1581		__folio_set_swapbacked(folio);
   1582		return folio;
   1583	}
   1584
   1585	err = -ENOMEM;
   1586	shmem_inode_unacct_blocks(inode, nr);
   1587failed:
   1588	return ERR_PTR(err);
   1589}
   1590
   1591/*
   1592 * When a page is moved from swapcache to shmem filecache (either by the
   1593 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
   1594 * shmem_unuse_inode()), it may have been read in earlier from swap, in
   1595 * ignorance of the mapping it belongs to.  If that mapping has special
   1596 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
   1597 * we may need to copy to a suitable page before moving to filecache.
   1598 *
   1599 * In a future release, this may well be extended to respect cpuset and
   1600 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
   1601 * but for now it is a simple matter of zone.
   1602 */
   1603static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
   1604{
   1605	return folio_zonenum(folio) > gfp_zone(gfp);
   1606}
   1607
   1608static int shmem_replace_page(struct page **pagep, gfp_t gfp,
   1609				struct shmem_inode_info *info, pgoff_t index)
   1610{
   1611	struct page *oldpage, *newpage;
   1612	struct folio *old, *new;
   1613	struct address_space *swap_mapping;
   1614	swp_entry_t entry;
   1615	pgoff_t swap_index;
   1616	int error;
   1617
   1618	oldpage = *pagep;
   1619	entry.val = page_private(oldpage);
   1620	swap_index = swp_offset(entry);
   1621	swap_mapping = page_mapping(oldpage);
   1622
   1623	/*
   1624	 * We have arrived here because our zones are constrained, so don't
   1625	 * limit chance of success by further cpuset and node constraints.
   1626	 */
   1627	gfp &= ~GFP_CONSTRAINT_MASK;
   1628	newpage = shmem_alloc_page(gfp, info, index);
   1629	if (!newpage)
   1630		return -ENOMEM;
   1631
   1632	get_page(newpage);
   1633	copy_highpage(newpage, oldpage);
   1634	flush_dcache_page(newpage);
   1635
   1636	__SetPageLocked(newpage);
   1637	__SetPageSwapBacked(newpage);
   1638	SetPageUptodate(newpage);
   1639	set_page_private(newpage, entry.val);
   1640	SetPageSwapCache(newpage);
   1641
   1642	/*
   1643	 * Our caller will very soon move newpage out of swapcache, but it's
   1644	 * a nice clean interface for us to replace oldpage by newpage there.
   1645	 */
   1646	xa_lock_irq(&swap_mapping->i_pages);
   1647	error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
   1648	if (!error) {
   1649		old = page_folio(oldpage);
   1650		new = page_folio(newpage);
   1651		mem_cgroup_migrate(old, new);
   1652		__inc_lruvec_page_state(newpage, NR_FILE_PAGES);
   1653		__dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
   1654	}
   1655	xa_unlock_irq(&swap_mapping->i_pages);
   1656
   1657	if (unlikely(error)) {
   1658		/*
   1659		 * Is this possible?  I think not, now that our callers check
   1660		 * both PageSwapCache and page_private after getting page lock;
   1661		 * but be defensive.  Reverse old to newpage for clear and free.
   1662		 */
   1663		oldpage = newpage;
   1664	} else {
   1665		lru_cache_add(newpage);
   1666		*pagep = newpage;
   1667	}
   1668
   1669	ClearPageSwapCache(oldpage);
   1670	set_page_private(oldpage, 0);
   1671
   1672	unlock_page(oldpage);
   1673	put_page(oldpage);
   1674	put_page(oldpage);
   1675	return error;
   1676}
   1677
   1678static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
   1679					 struct folio *folio, swp_entry_t swap)
   1680{
   1681	struct address_space *mapping = inode->i_mapping;
   1682	struct shmem_inode_info *info = SHMEM_I(inode);
   1683	swp_entry_t swapin_error;
   1684	void *old;
   1685
   1686	swapin_error = make_swapin_error_entry(&folio->page);
   1687	old = xa_cmpxchg_irq(&mapping->i_pages, index,
   1688			     swp_to_radix_entry(swap),
   1689			     swp_to_radix_entry(swapin_error), 0);
   1690	if (old != swp_to_radix_entry(swap))
   1691		return;
   1692
   1693	folio_wait_writeback(folio);
   1694	delete_from_swap_cache(&folio->page);
   1695	spin_lock_irq(&info->lock);
   1696	/*
   1697	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
   1698	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
   1699	 * shmem_evict_inode.
   1700	 */
   1701	info->alloced--;
   1702	info->swapped--;
   1703	shmem_recalc_inode(inode);
   1704	spin_unlock_irq(&info->lock);
   1705	swap_free(swap);
   1706}
   1707
   1708/*
   1709 * Swap in the page pointed to by *pagep.
   1710 * Caller has to make sure that *pagep contains a valid swapped page.
   1711 * Returns 0 and the page in pagep if success. On failure, returns the
   1712 * error code and NULL in *pagep.
   1713 */
   1714static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
   1715			     struct folio **foliop, enum sgp_type sgp,
   1716			     gfp_t gfp, struct vm_area_struct *vma,
   1717			     vm_fault_t *fault_type)
   1718{
   1719	struct address_space *mapping = inode->i_mapping;
   1720	struct shmem_inode_info *info = SHMEM_I(inode);
   1721	struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
   1722	struct page *page;
   1723	struct folio *folio = NULL;
   1724	swp_entry_t swap;
   1725	int error;
   1726
   1727	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
   1728	swap = radix_to_swp_entry(*foliop);
   1729	*foliop = NULL;
   1730
   1731	if (is_swapin_error_entry(swap))
   1732		return -EIO;
   1733
   1734	/* Look it up and read it in.. */
   1735	page = lookup_swap_cache(swap, NULL, 0);
   1736	if (!page) {
   1737		/* Or update major stats only when swapin succeeds?? */
   1738		if (fault_type) {
   1739			*fault_type |= VM_FAULT_MAJOR;
   1740			count_vm_event(PGMAJFAULT);
   1741			count_memcg_event_mm(charge_mm, PGMAJFAULT);
   1742		}
   1743		/* Here we actually start the io */
   1744		page = shmem_swapin(swap, gfp, info, index);
   1745		if (!page) {
   1746			error = -ENOMEM;
   1747			goto failed;
   1748		}
   1749	}
   1750	folio = page_folio(page);
   1751
   1752	/* We have to do this with page locked to prevent races */
   1753	folio_lock(folio);
   1754	if (!folio_test_swapcache(folio) ||
   1755	    folio_swap_entry(folio).val != swap.val ||
   1756	    !shmem_confirm_swap(mapping, index, swap)) {
   1757		error = -EEXIST;
   1758		goto unlock;
   1759	}
   1760	if (!folio_test_uptodate(folio)) {
   1761		error = -EIO;
   1762		goto failed;
   1763	}
   1764	folio_wait_writeback(folio);
   1765
   1766	/*
   1767	 * Some architectures may have to restore extra metadata to the
   1768	 * folio after reading from swap.
   1769	 */
   1770	arch_swap_restore(swap, folio);
   1771
   1772	if (shmem_should_replace_folio(folio, gfp)) {
   1773		error = shmem_replace_page(&page, gfp, info, index);
   1774		if (error)
   1775			goto failed;
   1776	}
   1777
   1778	error = shmem_add_to_page_cache(folio, mapping, index,
   1779					swp_to_radix_entry(swap), gfp,
   1780					charge_mm);
   1781	if (error)
   1782		goto failed;
   1783
   1784	spin_lock_irq(&info->lock);
   1785	info->swapped--;
   1786	shmem_recalc_inode(inode);
   1787	spin_unlock_irq(&info->lock);
   1788
   1789	if (sgp == SGP_WRITE)
   1790		folio_mark_accessed(folio);
   1791
   1792	delete_from_swap_cache(&folio->page);
   1793	folio_mark_dirty(folio);
   1794	swap_free(swap);
   1795
   1796	*foliop = folio;
   1797	return 0;
   1798failed:
   1799	if (!shmem_confirm_swap(mapping, index, swap))
   1800		error = -EEXIST;
   1801	if (error == -EIO)
   1802		shmem_set_folio_swapin_error(inode, index, folio, swap);
   1803unlock:
   1804	if (folio) {
   1805		folio_unlock(folio);
   1806		folio_put(folio);
   1807	}
   1808
   1809	return error;
   1810}
   1811
   1812/*
   1813 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
   1814 *
   1815 * If we allocate a new one we do not mark it dirty. That's up to the
   1816 * vm. If we swap it in we mark it dirty since we also free the swap
   1817 * entry since a page cannot live in both the swap and page cache.
   1818 *
   1819 * vma, vmf, and fault_type are only supplied by shmem_fault:
   1820 * otherwise they are NULL.
   1821 */
   1822static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
   1823	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
   1824	struct vm_area_struct *vma, struct vm_fault *vmf,
   1825			vm_fault_t *fault_type)
   1826{
   1827	struct address_space *mapping = inode->i_mapping;
   1828	struct shmem_inode_info *info = SHMEM_I(inode);
   1829	struct shmem_sb_info *sbinfo;
   1830	struct mm_struct *charge_mm;
   1831	struct folio *folio;
   1832	pgoff_t hindex = index;
   1833	gfp_t huge_gfp;
   1834	int error;
   1835	int once = 0;
   1836	int alloced = 0;
   1837
   1838	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
   1839		return -EFBIG;
   1840repeat:
   1841	if (sgp <= SGP_CACHE &&
   1842	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
   1843		return -EINVAL;
   1844	}
   1845
   1846	sbinfo = SHMEM_SB(inode->i_sb);
   1847	charge_mm = vma ? vma->vm_mm : NULL;
   1848
   1849	folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
   1850	if (folio && vma && userfaultfd_minor(vma)) {
   1851		if (!xa_is_value(folio)) {
   1852			folio_unlock(folio);
   1853			folio_put(folio);
   1854		}
   1855		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
   1856		return 0;
   1857	}
   1858
   1859	if (xa_is_value(folio)) {
   1860		error = shmem_swapin_folio(inode, index, &folio,
   1861					  sgp, gfp, vma, fault_type);
   1862		if (error == -EEXIST)
   1863			goto repeat;
   1864
   1865		*pagep = &folio->page;
   1866		return error;
   1867	}
   1868
   1869	if (folio) {
   1870		hindex = folio->index;
   1871		if (sgp == SGP_WRITE)
   1872			folio_mark_accessed(folio);
   1873		if (folio_test_uptodate(folio))
   1874			goto out;
   1875		/* fallocated page */
   1876		if (sgp != SGP_READ)
   1877			goto clear;
   1878		folio_unlock(folio);
   1879		folio_put(folio);
   1880	}
   1881
   1882	/*
   1883	 * SGP_READ: succeed on hole, with NULL page, letting caller zero.
   1884	 * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail.
   1885	 */
   1886	*pagep = NULL;
   1887	if (sgp == SGP_READ)
   1888		return 0;
   1889	if (sgp == SGP_NOALLOC)
   1890		return -ENOENT;
   1891
   1892	/*
   1893	 * Fast cache lookup and swap lookup did not find it: allocate.
   1894	 */
   1895
   1896	if (vma && userfaultfd_missing(vma)) {
   1897		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
   1898		return 0;
   1899	}
   1900
   1901	if (!shmem_is_huge(vma, inode, index))
   1902		goto alloc_nohuge;
   1903
   1904	huge_gfp = vma_thp_gfp_mask(vma);
   1905	huge_gfp = limit_gfp_mask(huge_gfp, gfp);
   1906	folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
   1907	if (IS_ERR(folio)) {
   1908alloc_nohuge:
   1909		folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
   1910	}
   1911	if (IS_ERR(folio)) {
   1912		int retry = 5;
   1913
   1914		error = PTR_ERR(folio);
   1915		folio = NULL;
   1916		if (error != -ENOSPC)
   1917			goto unlock;
   1918		/*
   1919		 * Try to reclaim some space by splitting a huge page
   1920		 * beyond i_size on the filesystem.
   1921		 */
   1922		while (retry--) {
   1923			int ret;
   1924
   1925			ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
   1926			if (ret == SHRINK_STOP)
   1927				break;
   1928			if (ret)
   1929				goto alloc_nohuge;
   1930		}
   1931		goto unlock;
   1932	}
   1933
   1934	hindex = round_down(index, folio_nr_pages(folio));
   1935
   1936	if (sgp == SGP_WRITE)
   1937		__folio_set_referenced(folio);
   1938
   1939	error = shmem_add_to_page_cache(folio, mapping, hindex,
   1940					NULL, gfp & GFP_RECLAIM_MASK,
   1941					charge_mm);
   1942	if (error)
   1943		goto unacct;
   1944	folio_add_lru(folio);
   1945
   1946	spin_lock_irq(&info->lock);
   1947	info->alloced += folio_nr_pages(folio);
   1948	inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
   1949	shmem_recalc_inode(inode);
   1950	spin_unlock_irq(&info->lock);
   1951	alloced = true;
   1952
   1953	if (folio_test_pmd_mappable(folio) &&
   1954	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
   1955			hindex + HPAGE_PMD_NR - 1) {
   1956		/*
   1957		 * Part of the huge page is beyond i_size: subject
   1958		 * to shrink under memory pressure.
   1959		 */
   1960		spin_lock(&sbinfo->shrinklist_lock);
   1961		/*
   1962		 * _careful to defend against unlocked access to
   1963		 * ->shrink_list in shmem_unused_huge_shrink()
   1964		 */
   1965		if (list_empty_careful(&info->shrinklist)) {
   1966			list_add_tail(&info->shrinklist,
   1967				      &sbinfo->shrinklist);
   1968			sbinfo->shrinklist_len++;
   1969		}
   1970		spin_unlock(&sbinfo->shrinklist_lock);
   1971	}
   1972
   1973	/*
   1974	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
   1975	 */
   1976	if (sgp == SGP_FALLOC)
   1977		sgp = SGP_WRITE;
   1978clear:
   1979	/*
   1980	 * Let SGP_WRITE caller clear ends if write does not fill page;
   1981	 * but SGP_FALLOC on a page fallocated earlier must initialize
   1982	 * it now, lest undo on failure cancel our earlier guarantee.
   1983	 */
   1984	if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
   1985		long i, n = folio_nr_pages(folio);
   1986
   1987		for (i = 0; i < n; i++)
   1988			clear_highpage(folio_page(folio, i));
   1989		flush_dcache_folio(folio);
   1990		folio_mark_uptodate(folio);
   1991	}
   1992
   1993	/* Perhaps the file has been truncated since we checked */
   1994	if (sgp <= SGP_CACHE &&
   1995	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
   1996		if (alloced) {
   1997			folio_clear_dirty(folio);
   1998			filemap_remove_folio(folio);
   1999			spin_lock_irq(&info->lock);
   2000			shmem_recalc_inode(inode);
   2001			spin_unlock_irq(&info->lock);
   2002		}
   2003		error = -EINVAL;
   2004		goto unlock;
   2005	}
   2006out:
   2007	*pagep = folio_page(folio, index - hindex);
   2008	return 0;
   2009
   2010	/*
   2011	 * Error recovery.
   2012	 */
   2013unacct:
   2014	shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
   2015
   2016	if (folio_test_large(folio)) {
   2017		folio_unlock(folio);
   2018		folio_put(folio);
   2019		goto alloc_nohuge;
   2020	}
   2021unlock:
   2022	if (folio) {
   2023		folio_unlock(folio);
   2024		folio_put(folio);
   2025	}
   2026	if (error == -ENOSPC && !once++) {
   2027		spin_lock_irq(&info->lock);
   2028		shmem_recalc_inode(inode);
   2029		spin_unlock_irq(&info->lock);
   2030		goto repeat;
   2031	}
   2032	if (error == -EEXIST)
   2033		goto repeat;
   2034	return error;
   2035}
   2036
   2037/*
   2038 * This is like autoremove_wake_function, but it removes the wait queue
   2039 * entry unconditionally - even if something else had already woken the
   2040 * target.
   2041 */
   2042static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
   2043{
   2044	int ret = default_wake_function(wait, mode, sync, key);
   2045	list_del_init(&wait->entry);
   2046	return ret;
   2047}
   2048
/*
 * shmem_fault - handle a page fault on a vma backed by a shmem file.
 * @vmf: fault descriptor; vmf->vma, vmf->pgoff and vmf->page are used here.
 *
 * When inode->i_private points at a shmem_falloc that has a waitq and whose
 * [start, next) range covers vmf->pgoff, the fault is not served: the task
 * sleeps on that waitq and then returns VM_FAULT_NOPAGE, or VM_FAULT_RETRY
 * if maybe_unlock_mmap_for_io() handed back a pinned file.
 *
 * Otherwise the page is obtained through shmem_getpage_gfp() with SGP_CACHE.
 * A nonzero error from that call is returned as vmf_error(err); on success
 * the return value is @ret, which starts out as VM_FAULT_LOCKED and which
 * shmem_getpage_gfp() may overwrite through its fault_type pointer.
 */
static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = file_inode(vma->vm_file);
	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
	int err;
	vm_fault_t ret = VM_FAULT_LOCKED;

	/*
	 * Trinity finds that probing a hole which tmpfs is punching can
	 * prevent the hole-punch from ever completing: which in turn
	 * locks writers out with its hold on i_rwsem.  So refrain from
	 * faulting pages into the hole while it's being punched.  Although
	 * shmem_undo_range() does remove the additions, it may be unable to
	 * keep up, as each new page needs its own unmap_mapping_range() call,
	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
	 *
	 * It does not matter if we sometimes reach this check just before the
	 * hole-punch begins, so that one fault then races with the punch:
	 * we just need to make racing faults a rare case.
	 *
	 * The implementation below would be much simpler if we just used a
	 * standard mutex or completion: but we cannot take i_rwsem in fault,
	 * and bloating every shmem inode for this unlikely case would be sad.
	 */
	if (unlikely(inode->i_private)) {
		struct shmem_falloc *shmem_falloc;

		/* The unlocked test above was only a hint: re-read under i_lock. */
		spin_lock(&inode->i_lock);
		shmem_falloc = inode->i_private;
		/*
		 * shmem_fallocate() sets a non-NULL waitq only on its
		 * hole-punch path; its preallocation path stores NULL there.
		 */
		if (shmem_falloc &&
		    shmem_falloc->waitq &&
		    vmf->pgoff >= shmem_falloc->start &&
		    vmf->pgoff < shmem_falloc->next) {
			struct file *fpin;
			wait_queue_head_t *shmem_falloc_waitq;
			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

			ret = VM_FAULT_NOPAGE;
			fpin = maybe_unlock_mmap_for_io(vmf, NULL);
			if (fpin)
				ret = VM_FAULT_RETRY;

			/* Queue ourselves while still holding i_lock, then sleep. */
			shmem_falloc_waitq = shmem_falloc->waitq;
			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock(&inode->i_lock);
			schedule();

			/*
			 * shmem_falloc_waitq points into the shmem_fallocate()
			 * stack of the hole-punching task: shmem_falloc_waitq
			 * is usually invalid by the time we reach here, but
			 * finish_wait() does not dereference it in that case;
			 * though i_lock needed lest racing with wake_up_all().
			 */
			spin_lock(&inode->i_lock);
			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
			spin_unlock(&inode->i_lock);

			/* Drop the file reference taken by maybe_unlock_mmap_for_io(). */
			if (fpin)
				fput(fpin);
			return ret;
		}
		spin_unlock(&inode->i_lock);
	}

	err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
				  gfp, vma, vmf, &ret);
	if (err)
		return vmf_error(err);
	return ret;
}
   2122
   2123unsigned long shmem_get_unmapped_area(struct file *file,
   2124				      unsigned long uaddr, unsigned long len,
   2125				      unsigned long pgoff, unsigned long flags)
   2126{
   2127	unsigned long (*get_area)(struct file *,
   2128		unsigned long, unsigned long, unsigned long, unsigned long);
   2129	unsigned long addr;
   2130	unsigned long offset;
   2131	unsigned long inflated_len;
   2132	unsigned long inflated_addr;
   2133	unsigned long inflated_offset;
   2134
   2135	if (len > TASK_SIZE)
   2136		return -ENOMEM;
   2137
   2138	get_area = current->mm->get_unmapped_area;
   2139	addr = get_area(file, uaddr, len, pgoff, flags);
   2140
   2141	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
   2142		return addr;
   2143	if (IS_ERR_VALUE(addr))
   2144		return addr;
   2145	if (addr & ~PAGE_MASK)
   2146		return addr;
   2147	if (addr > TASK_SIZE - len)
   2148		return addr;
   2149
   2150	if (shmem_huge == SHMEM_HUGE_DENY)
   2151		return addr;
   2152	if (len < HPAGE_PMD_SIZE)
   2153		return addr;
   2154	if (flags & MAP_FIXED)
   2155		return addr;
   2156	/*
   2157	 * Our priority is to support MAP_SHARED mapped hugely;
   2158	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
   2159	 * But if caller specified an address hint and we allocated area there
   2160	 * successfully, respect that as before.
   2161	 */
   2162	if (uaddr == addr)
   2163		return addr;
   2164
   2165	if (shmem_huge != SHMEM_HUGE_FORCE) {
   2166		struct super_block *sb;
   2167
   2168		if (file) {
   2169			VM_BUG_ON(file->f_op != &shmem_file_operations);
   2170			sb = file_inode(file)->i_sb;
   2171		} else {
   2172			/*
   2173			 * Called directly from mm/mmap.c, or drivers/char/mem.c
   2174			 * for "/dev/zero", to create a shared anonymous object.
   2175			 */
   2176			if (IS_ERR(shm_mnt))
   2177				return addr;
   2178			sb = shm_mnt->mnt_sb;
   2179		}
   2180		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
   2181			return addr;
   2182	}
   2183
   2184	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
   2185	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
   2186		return addr;
   2187	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
   2188		return addr;
   2189
   2190	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
   2191	if (inflated_len > TASK_SIZE)
   2192		return addr;
   2193	if (inflated_len < len)
   2194		return addr;
   2195
   2196	inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
   2197	if (IS_ERR_VALUE(inflated_addr))
   2198		return addr;
   2199	if (inflated_addr & ~PAGE_MASK)
   2200		return addr;
   2201
   2202	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
   2203	inflated_addr += offset - inflated_offset;
   2204	if (inflated_offset > offset)
   2205		inflated_addr += HPAGE_PMD_SIZE;
   2206
   2207	if (inflated_addr > TASK_SIZE - len)
   2208		return addr;
   2209	return inflated_addr;
   2210}
   2211
   2212#ifdef CONFIG_NUMA
   2213static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
   2214{
   2215	struct inode *inode = file_inode(vma->vm_file);
   2216	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
   2217}
   2218
   2219static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
   2220					  unsigned long addr)
   2221{
   2222	struct inode *inode = file_inode(vma->vm_file);
   2223	pgoff_t index;
   2224
   2225	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
   2226	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
   2227}
   2228#endif
   2229
   2230int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
   2231{
   2232	struct inode *inode = file_inode(file);
   2233	struct shmem_inode_info *info = SHMEM_I(inode);
   2234	int retval = -ENOMEM;
   2235
   2236	/*
   2237	 * What serializes the accesses to info->flags?
   2238	 * ipc_lock_object() when called from shmctl_do_lock(),
   2239	 * no serialization needed when called from shm_destroy().
   2240	 */
   2241	if (lock && !(info->flags & VM_LOCKED)) {
   2242		if (!user_shm_lock(inode->i_size, ucounts))
   2243			goto out_nomem;
   2244		info->flags |= VM_LOCKED;
   2245		mapping_set_unevictable(file->f_mapping);
   2246	}
   2247	if (!lock && (info->flags & VM_LOCKED) && ucounts) {
   2248		user_shm_unlock(inode->i_size, ucounts);
   2249		info->flags &= ~VM_LOCKED;
   2250		mapping_clear_unevictable(file->f_mapping);
   2251	}
   2252	retval = 0;
   2253
   2254out_nomem:
   2255	return retval;
   2256}
   2257
   2258static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
   2259{
   2260	struct shmem_inode_info *info = SHMEM_I(file_inode(file));
   2261	int ret;
   2262
   2263	ret = seal_check_future_write(info->seals, vma);
   2264	if (ret)
   2265		return ret;
   2266
   2267	/* arm64 - allow memory tagging on RAM-based files */
   2268	vma->vm_flags |= VM_MTE_ALLOWED;
   2269
   2270	file_accessed(file);
   2271	vma->vm_ops = &shmem_vm_ops;
   2272	return 0;
   2273}
   2274
   2275static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
   2276				     umode_t mode, dev_t dev, unsigned long flags)
   2277{
   2278	struct inode *inode;
   2279	struct shmem_inode_info *info;
   2280	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
   2281	ino_t ino;
   2282
   2283	if (shmem_reserve_inode(sb, &ino))
   2284		return NULL;
   2285
   2286	inode = new_inode(sb);
   2287	if (inode) {
   2288		inode->i_ino = ino;
   2289		inode_init_owner(&init_user_ns, inode, dir, mode);
   2290		inode->i_blocks = 0;
   2291		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
   2292		inode->i_generation = prandom_u32();
   2293		info = SHMEM_I(inode);
   2294		memset(info, 0, (char *)inode - (char *)info);
   2295		spin_lock_init(&info->lock);
   2296		atomic_set(&info->stop_eviction, 0);
   2297		info->seals = F_SEAL_SEAL;
   2298		info->flags = flags & VM_NORESERVE;
   2299		info->i_crtime = inode->i_mtime;
   2300		INIT_LIST_HEAD(&info->shrinklist);
   2301		INIT_LIST_HEAD(&info->swaplist);
   2302		simple_xattrs_init(&info->xattrs);
   2303		cache_no_acl(inode);
   2304		mapping_set_large_folios(inode->i_mapping);
   2305
   2306		switch (mode & S_IFMT) {
   2307		default:
   2308			inode->i_op = &shmem_special_inode_operations;
   2309			init_special_inode(inode, mode, dev);
   2310			break;
   2311		case S_IFREG:
   2312			inode->i_mapping->a_ops = &shmem_aops;
   2313			inode->i_op = &shmem_inode_operations;
   2314			inode->i_fop = &shmem_file_operations;
   2315			mpol_shared_policy_init(&info->policy,
   2316						 shmem_get_sbmpol(sbinfo));
   2317			break;
   2318		case S_IFDIR:
   2319			inc_nlink(inode);
   2320			/* Some things misbehave if size == 0 on a directory */
   2321			inode->i_size = 2 * BOGO_DIRENT_SIZE;
   2322			inode->i_op = &shmem_dir_inode_operations;
   2323			inode->i_fop = &simple_dir_operations;
   2324			break;
   2325		case S_IFLNK:
   2326			/*
   2327			 * Must not load anything in the rbtree,
   2328			 * mpol_free_shared_policy will not be called.
   2329			 */
   2330			mpol_shared_policy_init(&info->policy, NULL);
   2331			break;
   2332		}
   2333
   2334		lockdep_annotate_inode_mutex_key(inode);
   2335	} else
   2336		shmem_free_inode(sb);
   2337	return inode;
   2338}
   2339
   2340#ifdef CONFIG_USERFAULTFD
/*
 * shmem_mfill_atomic_pte - add one page to a shmem file and map it.
 * @dst_mm:   mm passed to shmem_add_to_page_cache() and to
 *            mfill_atomic_install_pte()
 * @dst_pmd:  passed through to mfill_atomic_install_pte()
 * @dst_vma:  destination vma; its vm_file supplies the shmem inode
 * @dst_addr: destination user address, converted to a file page offset
 *            with linear_page_index()
 * @src_addr: user address the page contents are copied from when !@zeropage
 * @zeropage: true to install a cleared page instead of copying
 * @wp_copy:  passed through to mfill_atomic_install_pte()
 * @pagep:    in/out.  If *@pagep is non-NULL on entry, that page is used
 *            as it is (no allocation, no copy) and *@pagep is cleared.
 *            If copy_from_user() fails here, the newly allocated page is
 *            handed back through *@pagep.
 *
 * One block is accounted to the inode on entry and un-accounted again on
 * every failure path, including the -ENOENT one.
 *
 * Returns 0 on success; -ENOMEM if the block cannot be accounted or the
 * page cannot be allocated; -ENOENT if copy_from_user() did not copy
 * everything (page left in *@pagep); -EFAULT if the page offset is not
 * below i_size rounded up to pages; otherwise the error returned by
 * shmem_add_to_page_cache() or mfill_atomic_install_pte().
 */
int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
			   pmd_t *dst_pmd,
			   struct vm_area_struct *dst_vma,
			   unsigned long dst_addr,
			   unsigned long src_addr,
			   bool zeropage, bool wp_copy,
			   struct page **pagep)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	void *page_kaddr;
	struct folio *folio;
	struct page *page;
	int ret;
	pgoff_t max_off;

	if (!shmem_inode_acct_block(inode, 1)) {
		/*
		 * We may have got a page, returned -ENOENT triggering a retry,
		 * and now we find ourselves with -ENOMEM. Release the page, to
		 * avoid a BUG_ON in our caller.
		 */
		if (unlikely(*pagep)) {
			put_page(*pagep);
			*pagep = NULL;
		}
		return -ENOMEM;
	}

	if (!*pagep) {
		/* First attempt: allocate the page and fill it here. */
		ret = -ENOMEM;
		page = shmem_alloc_page(gfp, info, pgoff);
		if (!page)
			goto out_unacct_blocks;

		if (!zeropage) {	/* COPY */
			page_kaddr = kmap_atomic(page);
			ret = copy_from_user(page_kaddr,
					     (const void __user *)src_addr,
					     PAGE_SIZE);
			kunmap_atomic(page_kaddr);

			/* fallback to copy_from_user outside mmap_lock */
			if (unlikely(ret)) {
				*pagep = page;
				ret = -ENOENT;
				/* don't free the page */
				goto out_unacct_blocks;
			}

			flush_dcache_page(page);
		} else {		/* ZEROPAGE */
			clear_user_highpage(page, dst_addr);
		}
	} else {
		/* Retry: take over the page the caller handed back to us. */
		page = *pagep;
		*pagep = NULL;
	}

	VM_BUG_ON(PageLocked(page));
	VM_BUG_ON(PageSwapBacked(page));
	__SetPageLocked(page);
	__SetPageSwapBacked(page);
	__SetPageUptodate(page);

	/* Refuse to add a page at or beyond the last page of the file. */
	ret = -EFAULT;
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(pgoff >= max_off))
		goto out_release;

	folio = page_folio(page);
	ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
				      gfp & GFP_RECLAIM_MASK, dst_mm);
	if (ret)
		goto out_release;

	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       page, true, wp_copy);
	if (ret)
		goto out_delete_from_cache;

	/* Success: account the new page to the inode under info->lock. */
	spin_lock_irq(&info->lock);
	info->alloced++;
	inode->i_blocks += BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);

	unlock_page(page);
	return 0;
	/* Error unwinding: each label undoes one step and falls through. */
out_delete_from_cache:
	delete_from_page_cache(page);
out_release:
	unlock_page(page);
	put_page(page);
out_unacct_blocks:
	shmem_inode_unacct_blocks(inode, 1);
	return ret;
}
   2442#endif /* CONFIG_USERFAULTFD */
   2443
   2444#ifdef CONFIG_TMPFS
/*
 * Forward declarations of the two symlink inode_operations tables
 * (presumably defined further down in this file - not visible here).
 */
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;

/*
 * shmem_initxattrs is handed to security_inode_init_security() as its
 * initxattrs callback by shmem_mknod() and shmem_tmpfile(); when
 * CONFIG_TMPFS_XATTR is not set it is defined to NULL instead.
 */
#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
#else
#define shmem_initxattrs NULL
#endif
   2453
   2454static int
   2455shmem_write_begin(struct file *file, struct address_space *mapping,
   2456			loff_t pos, unsigned len,
   2457			struct page **pagep, void **fsdata)
   2458{
   2459	struct inode *inode = mapping->host;
   2460	struct shmem_inode_info *info = SHMEM_I(inode);
   2461	pgoff_t index = pos >> PAGE_SHIFT;
   2462	int ret = 0;
   2463
   2464	/* i_rwsem is held by caller */
   2465	if (unlikely(info->seals & (F_SEAL_GROW |
   2466				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
   2467		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
   2468			return -EPERM;
   2469		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
   2470			return -EPERM;
   2471	}
   2472
   2473	ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
   2474
   2475	if (ret)
   2476		return ret;
   2477
   2478	if (PageHWPoison(*pagep)) {
   2479		unlock_page(*pagep);
   2480		put_page(*pagep);
   2481		*pagep = NULL;
   2482		return -EIO;
   2483	}
   2484
   2485	return 0;
   2486}
   2487
   2488static int
   2489shmem_write_end(struct file *file, struct address_space *mapping,
   2490			loff_t pos, unsigned len, unsigned copied,
   2491			struct page *page, void *fsdata)
   2492{
   2493	struct inode *inode = mapping->host;
   2494
   2495	if (pos + copied > inode->i_size)
   2496		i_size_write(inode, pos + copied);
   2497
   2498	if (!PageUptodate(page)) {
   2499		struct page *head = compound_head(page);
   2500		if (PageTransCompound(page)) {
   2501			int i;
   2502
   2503			for (i = 0; i < HPAGE_PMD_NR; i++) {
   2504				if (head + i == page)
   2505					continue;
   2506				clear_highpage(head + i);
   2507				flush_dcache_page(head + i);
   2508			}
   2509		}
   2510		if (copied < PAGE_SIZE) {
   2511			unsigned from = pos & (PAGE_SIZE - 1);
   2512			zero_user_segments(page, 0, from,
   2513					from + copied, PAGE_SIZE);
   2514		}
   2515		SetPageUptodate(head);
   2516	}
   2517	set_page_dirty(page);
   2518	unlock_page(page);
   2519	put_page(page);
   2520
   2521	return copied;
   2522}
   2523
/*
 * shmem_file_read_iter - copy file data starting at iocb->ki_pos into @to.
 * @iocb: supplies the file (ki_filp) and the position (ki_pos); the
 *        position is updated to just past the bytes copied
 * @to:   destination iterator
 *
 * The file is walked one page at a time with shmem_getpage(SGP_READ).
 * When that lookup succeeds but returns no page, zeroes are copied instead.
 * The loop stops at i_size, when @to is full, on a short copy (-EFAULT),
 * on a hardware-poisoned page (-EIO) or on an error from shmem_getpage();
 * -EINVAL from shmem_getpage() is not reported as an error.
 *
 * Returns the number of bytes copied if that is nonzero, otherwise the
 * error (which is 0 when nothing was read and nothing went wrong).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	unsigned long offset;
	int error = 0;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;

	/* Split the position into a page index and an offset in that page. */
	index = *ppos >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

	for (;;) {
		struct page *page = NULL;
		pgoff_t end_index;
		unsigned long nr, ret;
		loff_t i_size = i_size_read(inode);

		/* Stop before the lookup if the position is at or past i_size. */
		end_index = i_size >> PAGE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset)
				break;
		}

		error = shmem_getpage(inode, index, &page, SGP_READ);
		if (error) {
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (page) {
			/* The page reference is kept; only the lock is dropped. */
			unlock_page(page);

			if (PageHWPoison(page)) {
				put_page(page);
				error = -EIO;
				break;
			}
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_rwsem protection against truncate
		 */
		nr = PAGE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset) {
				if (page)
					put_page(page);
				break;
			}
		}
		/* nr becomes the number of bytes to copy from this page. */
		nr -= offset;

		if (page) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
			/*
			 * Ok, we have the page, and it's up-to-date, so
			 * now we can copy it to user space...
			 */
			ret = copy_page_to_iter(page, offset, nr, to);
			put_page(page);

		} else if (iter_is_iovec(to)) {
			/*
			 * Copy to user tends to be so well optimized, but
			 * clear_user() not so much, that it is noticeably
			 * faster to copy the zero page instead of clearing.
			 */
			ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
		} else {
			/*
			 * But submitting the same page twice in a row to
			 * splice() - or others? - can result in confusion:
			 * so don't attempt that optimization on pipes etc.
			 */
			ret = iov_iter_zero(nr, to);
		}

		/* Advance by what was copied, carrying offset into index. */
		retval += ret;
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;

		if (!iov_iter_count(to))
			break;
		/* Short copy although the destination still has room. */
		if (ret < nr) {
			error = -EFAULT;
			break;
		}
		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
	file_accessed(file);
	return retval ? retval : error;
}
   2640
   2641static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
   2642{
   2643	struct address_space *mapping = file->f_mapping;
   2644	struct inode *inode = mapping->host;
   2645
   2646	if (whence != SEEK_DATA && whence != SEEK_HOLE)
   2647		return generic_file_llseek_size(file, offset, whence,
   2648					MAX_LFS_FILESIZE, i_size_read(inode));
   2649	if (offset < 0)
   2650		return -ENXIO;
   2651
   2652	inode_lock(inode);
   2653	/* We're holding i_rwsem so we can access i_size directly */
   2654	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
   2655	if (offset >= 0)
   2656		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
   2657	inode_unlock(inode);
   2658	return offset;
   2659}
   2660
/*
 * shmem_fallocate - preallocate pages in, or punch a hole out of, a file.
 * @file:   shmem file to operate on
 * @mode:   only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted;
 *          any other bit gives -EOPNOTSUPP
 * @offset: byte offset where the range starts
 * @len:    length of the range in bytes
 *
 * Everything after the mode check runs under inode_lock().
 *
 * Hole punch: refused with -EPERM if F_SEAL_WRITE or F_SEAL_FUTURE_WRITE is
 * set.  Otherwise an on-stack shmem_falloc with a wait queue is published
 * in inode->i_private for the duration of the punch (shmem_fault() waits
 * on it), the whole pages of the range are unmapped, the range is
 * truncated, and the waiters are woken.
 *
 * Preallocation: fails with the error from inode_newsize_ok(), with -EPERM
 * if F_SEAL_GROW is set and the range ends beyond i_size, or with -ENOSPC
 * if the range spans more pages than sbinfo->max_blocks.  Pages are then
 * obtained one by one with shmem_getpage(SGP_FALLOC).  The loop gives up
 * with -EINTR on a pending signal, with -ENOMEM once nr_unswapped exceeds
 * nr_falloced, or with the error from shmem_getpage(); in each case
 * info->fallocend is restored and shmem_undo_range() is called on the
 * part of the range already processed.  On success i_size is extended
 * (unless FALLOC_FL_KEEP_SIZE) and i_ctime is updated.
 *
 * Returns 0 on success or a negative error as described above.
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
							 loff_t len)
{
	struct inode *inode = file_inode(file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_falloc shmem_falloc;
	pgoff_t start, index, end, undo_fallocend;
	int error;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	inode_lock(inode);

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		struct address_space *mapping = file->f_mapping;
		/* Only pages lying wholly inside the range are unmapped. */
		loff_t unmap_start = round_up(offset, PAGE_SIZE);
		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

		/* protected by i_rwsem */
		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
			error = -EPERM;
			goto out;
		}

		/* Publish the range being punched so that faults can wait. */
		shmem_falloc.waitq = &shmem_falloc_waitq;
		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
		spin_lock(&inode->i_lock);
		inode->i_private = &shmem_falloc;
		spin_unlock(&inode->i_lock);

		if ((u64)unmap_end > (u64)unmap_start)
			unmap_mapping_range(mapping, unmap_start,
					    1 + unmap_end - unmap_start, 0);
		shmem_truncate_range(inode, offset, offset + len - 1);
		/* No need to unmap again: hole-punching leaves COWed pages */

		/*
		 * Unpublish and wake under i_lock: the wait queue lives on
		 * this stack frame and must be empty before we return.
		 */
		spin_lock(&inode->i_lock);
		inode->i_private = NULL;
		wake_up_all(&shmem_falloc_waitq);
		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
		spin_unlock(&inode->i_lock);
		error = 0;
		goto out;
	}

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/* Page range [start, end): end is rounded up to a whole page. */
	start = offset >> PAGE_SHIFT;
	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Try to avoid a swapstorm if len is impossible to satisfy */
	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
		error = -ENOSPC;
		goto out;
	}

	/* A NULL waitq marks this as preallocation, not a hole punch. */
	shmem_falloc.waitq = NULL;
	shmem_falloc.start = start;
	shmem_falloc.next  = start;
	shmem_falloc.nr_falloced = 0;
	shmem_falloc.nr_unswapped = 0;
	spin_lock(&inode->i_lock);
	inode->i_private = &shmem_falloc;
	spin_unlock(&inode->i_lock);

	/*
	 * info->fallocend is only relevant when huge pages might be
	 * involved: to prevent split_huge_page() freeing fallocated
	 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
	 */
	undo_fallocend = info->fallocend;
	if (info->fallocend < end)
		info->fallocend = end;

	for (index = start; index < end; ) {
		struct page *page;

		/*
		 * Good, the fallocate(2) manpage permits EINTR: we may have
		 * been interrupted because we are using up too much memory.
		 */
		if (signal_pending(current))
			error = -EINTR;
		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
			error = -ENOMEM;
		else
			error = shmem_getpage(inode, index, &page, SGP_FALLOC);
		if (error) {
			info->fallocend = undo_fallocend;
			/* Remove the !PageUptodate pages we added */
			if (index > start) {
				shmem_undo_range(inode,
				    (loff_t)start << PAGE_SHIFT,
				    ((loff_t)index << PAGE_SHIFT) - 1, true);
			}
			goto undone;
		}

		index++;
		/*
		 * Here is a more important optimization than it appears:
		 * a second SGP_FALLOC on the same huge page will clear it,
		 * making it PageUptodate and un-undoable if we fail later.
		 */
		if (PageTransCompound(page)) {
			index = round_up(index, HPAGE_PMD_NR);
			/* Beware 32-bit wraparound */
			if (!index)
				index--;
		}

		/*
		 * Inform shmem_writepage() how far we have reached.
		 * No need for lock or barrier: we have the page lock.
		 */
		if (!PageUptodate(page))
			shmem_falloc.nr_falloced += index - shmem_falloc.next;
		shmem_falloc.next = index;

		/*
		 * If !PageUptodate, leave it that way so that freeable pages
		 * can be recognized if we need to rollback on error later.
		 * But set_page_dirty so that memory pressure will swap rather
		 * than free the pages we are allocating (and SGP_CACHE pages
		 * might still be clean: we now need to mark those dirty too).
		 */
		set_page_dirty(page);
		unlock_page(page);
		put_page(page);
		cond_resched();
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = current_time(inode);
undone:
	/* Unpublish the on-stack shmem_falloc before it goes out of scope. */
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	inode_unlock(inode);
	return error;
}
   2815
   2816static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
   2817{
   2818	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
   2819
   2820	buf->f_type = TMPFS_MAGIC;
   2821	buf->f_bsize = PAGE_SIZE;
   2822	buf->f_namelen = NAME_MAX;
   2823	if (sbinfo->max_blocks) {
   2824		buf->f_blocks = sbinfo->max_blocks;
   2825		buf->f_bavail =
   2826		buf->f_bfree  = sbinfo->max_blocks -
   2827				percpu_counter_sum(&sbinfo->used_blocks);
   2828	}
   2829	if (sbinfo->max_inodes) {
   2830		buf->f_files = sbinfo->max_inodes;
   2831		buf->f_ffree = sbinfo->free_inodes;
   2832	}
   2833	/* else leave those fields 0 like simple_statfs */
   2834
   2835	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
   2836
   2837	return 0;
   2838}
   2839
   2840/*
   2841 * File creation. Allocate an inode, and we're done..
   2842 */
   2843static int
   2844shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
   2845	    struct dentry *dentry, umode_t mode, dev_t dev)
   2846{
   2847	struct inode *inode;
   2848	int error = -ENOSPC;
   2849
   2850	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
   2851	if (inode) {
   2852		error = simple_acl_create(dir, inode);
   2853		if (error)
   2854			goto out_iput;
   2855		error = security_inode_init_security(inode, dir,
   2856						     &dentry->d_name,
   2857						     shmem_initxattrs, NULL);
   2858		if (error && error != -EOPNOTSUPP)
   2859			goto out_iput;
   2860
   2861		error = 0;
   2862		dir->i_size += BOGO_DIRENT_SIZE;
   2863		dir->i_ctime = dir->i_mtime = current_time(dir);
   2864		d_instantiate(dentry, inode);
   2865		dget(dentry); /* Extra count - pin the dentry in core */
   2866	}
   2867	return error;
   2868out_iput:
   2869	iput(inode);
   2870	return error;
   2871}
   2872
   2873static int
   2874shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
   2875	      struct dentry *dentry, umode_t mode)
   2876{
   2877	struct inode *inode;
   2878	int error = -ENOSPC;
   2879
   2880	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
   2881	if (inode) {
   2882		error = security_inode_init_security(inode, dir,
   2883						     NULL,
   2884						     shmem_initxattrs, NULL);
   2885		if (error && error != -EOPNOTSUPP)
   2886			goto out_iput;
   2887		error = simple_acl_create(dir, inode);
   2888		if (error)
   2889			goto out_iput;
   2890		d_tmpfile(dentry, inode);
   2891	}
   2892	return error;
   2893out_iput:
   2894	iput(inode);
   2895	return error;
   2896}
   2897
   2898static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
   2899		       struct dentry *dentry, umode_t mode)
   2900{
   2901	int error;
   2902
   2903	if ((error = shmem_mknod(&init_user_ns, dir, dentry,
   2904				 mode | S_IFDIR, 0)))
   2905		return error;
   2906	inc_nlink(dir);
   2907	return 0;
   2908}
   2909
   2910static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
   2911			struct dentry *dentry, umode_t mode, bool excl)
   2912{
   2913	return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
   2914}
   2915
   2916/*
   2917 * Link a file..
   2918 */
   2919static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
   2920{
   2921	struct inode *inode = d_inode(old_dentry);
   2922	int ret = 0;
   2923
   2924	/*
   2925	 * No ordinary (disk based) filesystem counts links as inodes;
   2926	 * but each new link needs a new dentry, pinning lowmem, and
   2927	 * tmpfs dentries cannot be pruned until they are unlinked.
   2928	 * But if an O_TMPFILE file is linked into the tmpfs, the
   2929	 * first link must skip that, to get the accounting right.
   2930	 */
   2931	if (inode->i_nlink) {
   2932		ret = shmem_reserve_inode(inode->i_sb, NULL);
   2933		if (ret)
   2934			goto out;
   2935	}
   2936
   2937	dir->i_size += BOGO_DIRENT_SIZE;
   2938	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
   2939	inc_nlink(inode);
   2940	ihold(inode);	/* New dentry reference */
   2941	dget(dentry);		/* Extra pinning count for the created dentry */
   2942	d_instantiate(dentry, inode);
   2943out:
   2944	return ret;
   2945}
   2946
   2947static int shmem_unlink(struct inode *dir, struct dentry *dentry)
   2948{
   2949	struct inode *inode = d_inode(dentry);
   2950
   2951	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
   2952		shmem_free_inode(inode->i_sb);
   2953
   2954	dir->i_size -= BOGO_DIRENT_SIZE;
   2955	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
   2956	drop_nlink(inode);
   2957	dput(dentry);	/* Undo the count from "create" - this does all the work */
   2958	return 0;
   2959}
   2960
   2961static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
   2962{
   2963	if (!simple_empty(dentry))
   2964		return -ENOTEMPTY;
   2965
   2966	drop_nlink(d_inode(dentry));
   2967	drop_nlink(dir);
   2968	return shmem_unlink(dir, dentry);
   2969}
   2970
   2971static int shmem_whiteout(struct user_namespace *mnt_userns,
   2972			  struct inode *old_dir, struct dentry *old_dentry)
   2973{
   2974	struct dentry *whiteout;
   2975	int error;
   2976
   2977	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
   2978	if (!whiteout)
   2979		return -ENOMEM;
   2980
   2981	error = shmem_mknod(&init_user_ns, old_dir, whiteout,
   2982			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
   2983	dput(whiteout);
   2984	if (error)
   2985		return error;
   2986
   2987	/*
   2988	 * Cheat and hash the whiteout while the old dentry is still in
   2989	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
   2990	 *
   2991	 * d_lookup() will consistently find one of them at this point,
   2992	 * not sure which one, but that isn't even important.
   2993	 */
   2994	d_rehash(whiteout);
   2995	return 0;
   2996}
   2997
   2998/*
   2999 * The VFS layer already does all the dentry stuff for rename,
   3000 * we just have to decrement the usage count for the target if
   3001 * it exists so that the VFS layer correctly free's it when it
   3002 * gets overwritten.
   3003 */
   3004static int shmem_rename2(struct user_namespace *mnt_userns,
   3005			 struct inode *old_dir, struct dentry *old_dentry,
   3006			 struct inode *new_dir, struct dentry *new_dentry,
   3007			 unsigned int flags)
   3008{
   3009	struct inode *inode = d_inode(old_dentry);
   3010	int they_are_dirs = S_ISDIR(inode->i_mode);
   3011
   3012	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
   3013		return -EINVAL;
   3014
   3015	if (flags & RENAME_EXCHANGE)
   3016		return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
   3017
   3018	if (!simple_empty(new_dentry))
   3019		return -ENOTEMPTY;
   3020
   3021	if (flags & RENAME_WHITEOUT) {
   3022		int error;
   3023
   3024		error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
   3025		if (error)
   3026			return error;
   3027	}
   3028
   3029	if (d_really_is_positive(new_dentry)) {
   3030		(void) shmem_unlink(new_dir, new_dentry);
   3031		if (they_are_dirs) {
   3032			drop_nlink(d_inode(new_dentry));
   3033			drop_nlink(old_dir);
   3034		}
   3035	} else if (they_are_dirs) {
   3036		drop_nlink(old_dir);
   3037		inc_nlink(new_dir);
   3038	}
   3039
   3040	old_dir->i_size -= BOGO_DIRENT_SIZE;
   3041	new_dir->i_size += BOGO_DIRENT_SIZE;
   3042	old_dir->i_ctime = old_dir->i_mtime =
   3043	new_dir->i_ctime = new_dir->i_mtime =
   3044	inode->i_ctime = current_time(old_dir);
   3045	return 0;
   3046}
   3047
   3048static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
   3049			 struct dentry *dentry, const char *symname)
   3050{
   3051	int error;
   3052	int len;
   3053	struct inode *inode;
   3054	struct page *page;
   3055
   3056	len = strlen(symname) + 1;
   3057	if (len > PAGE_SIZE)
   3058		return -ENAMETOOLONG;
   3059
   3060	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
   3061				VM_NORESERVE);
   3062	if (!inode)
   3063		return -ENOSPC;
   3064
   3065	error = security_inode_init_security(inode, dir, &dentry->d_name,
   3066					     shmem_initxattrs, NULL);
   3067	if (error && error != -EOPNOTSUPP) {
   3068		iput(inode);
   3069		return error;
   3070	}
   3071
   3072	inode->i_size = len-1;
   3073	if (len <= SHORT_SYMLINK_LEN) {
   3074		inode->i_link = kmemdup(symname, len, GFP_KERNEL);
   3075		if (!inode->i_link) {
   3076			iput(inode);
   3077			return -ENOMEM;
   3078		}
   3079		inode->i_op = &shmem_short_symlink_operations;
   3080	} else {
   3081		inode_nohighmem(inode);
   3082		error = shmem_getpage(inode, 0, &page, SGP_WRITE);
   3083		if (error) {
   3084			iput(inode);
   3085			return error;
   3086		}
   3087		inode->i_mapping->a_ops = &shmem_aops;
   3088		inode->i_op = &shmem_symlink_inode_operations;
   3089		memcpy(page_address(page), symname, len);
   3090		SetPageUptodate(page);
   3091		set_page_dirty(page);
   3092		unlock_page(page);
   3093		put_page(page);
   3094	}
   3095	dir->i_size += BOGO_DIRENT_SIZE;
   3096	dir->i_ctime = dir->i_mtime = current_time(dir);
   3097	d_instantiate(dentry, inode);
   3098	dget(dentry);
   3099	return 0;
   3100}
   3101
   3102static void shmem_put_link(void *arg)
   3103{
   3104	mark_page_accessed(arg);
   3105	put_page(arg);
   3106}
   3107
   3108static const char *shmem_get_link(struct dentry *dentry,
   3109				  struct inode *inode,
   3110				  struct delayed_call *done)
   3111{
   3112	struct page *page = NULL;
   3113	int error;
   3114	if (!dentry) {
   3115		page = find_get_page(inode->i_mapping, 0);
   3116		if (!page)
   3117			return ERR_PTR(-ECHILD);
   3118		if (PageHWPoison(page) ||
   3119		    !PageUptodate(page)) {
   3120			put_page(page);
   3121			return ERR_PTR(-ECHILD);
   3122		}
   3123	} else {
   3124		error = shmem_getpage(inode, 0, &page, SGP_READ);
   3125		if (error)
   3126			return ERR_PTR(error);
   3127		if (!page)
   3128			return ERR_PTR(-ECHILD);
   3129		if (PageHWPoison(page)) {
   3130			unlock_page(page);
   3131			put_page(page);
   3132			return ERR_PTR(-ECHILD);
   3133		}
   3134		unlock_page(page);
   3135	}
   3136	set_delayed_call(done, shmem_put_link, page);
   3137	return page_address(page);
   3138}
   3139
   3140#ifdef CONFIG_TMPFS_XATTR
   3141/*
   3142 * Superblocks without xattr inode operations may get some security.* xattr
   3143 * support from the LSM "for free". As soon as we have any other xattrs
   3144 * like ACLs, we also need to implement the security.* handlers at
   3145 * filesystem level, though.
   3146 */
   3147
   3148/*
   3149 * Callback for security_inode_init_security() for acquiring xattrs.
   3150 */
   3151static int shmem_initxattrs(struct inode *inode,
   3152			    const struct xattr *xattr_array,
   3153			    void *fs_info)
   3154{
   3155	struct shmem_inode_info *info = SHMEM_I(inode);
   3156	const struct xattr *xattr;
   3157	struct simple_xattr *new_xattr;
   3158	size_t len;
   3159
   3160	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
   3161		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
   3162		if (!new_xattr)
   3163			return -ENOMEM;
   3164
   3165		len = strlen(xattr->name) + 1;
   3166		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
   3167					  GFP_KERNEL);
   3168		if (!new_xattr->name) {
   3169			kvfree(new_xattr);
   3170			return -ENOMEM;
   3171		}
   3172
   3173		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
   3174		       XATTR_SECURITY_PREFIX_LEN);
   3175		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
   3176		       xattr->name, len);
   3177
   3178		simple_xattr_list_add(&info->xattrs, new_xattr);
   3179	}
   3180
   3181	return 0;
   3182}
   3183
   3184static int shmem_xattr_handler_get(const struct xattr_handler *handler,
   3185				   struct dentry *unused, struct inode *inode,
   3186				   const char *name, void *buffer, size_t size)
   3187{
   3188	struct shmem_inode_info *info = SHMEM_I(inode);
   3189
   3190	name = xattr_full_name(handler, name);
   3191	return simple_xattr_get(&info->xattrs, name, buffer, size);
   3192}
   3193
   3194static int shmem_xattr_handler_set(const struct xattr_handler *handler,
   3195				   struct user_namespace *mnt_userns,
   3196				   struct dentry *unused, struct inode *inode,
   3197				   const char *name, const void *value,
   3198				   size_t size, int flags)
   3199{
   3200	struct shmem_inode_info *info = SHMEM_I(inode);
   3201
   3202	name = xattr_full_name(handler, name);
   3203	return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
   3204}
   3205
/* Handler for the security.* xattr namespace, backed by the shared get/set helpers. */
static const struct xattr_handler shmem_security_xattr_handler = {
	.prefix = XATTR_SECURITY_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};
   3211
/* Handler for the trusted.* xattr namespace, backed by the shared get/set helpers. */
static const struct xattr_handler shmem_trusted_xattr_handler = {
	.prefix = XATTR_TRUSTED_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};
   3217
/*
 * NULL-terminated xattr handler table, installed as sb->s_xattr by
 * shmem_fill_super().  The POSIX ACL handlers are only present when
 * CONFIG_TMPFS_POSIX_ACL is enabled.
 */
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
	&posix_acl_access_xattr_handler,
	&posix_acl_default_xattr_handler,
#endif
	&shmem_security_xattr_handler,
	&shmem_trusted_xattr_handler,
	NULL
};
   3227
   3228static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
   3229{
   3230	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
   3231	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
   3232}
   3233#endif /* CONFIG_TMPFS_XATTR */
   3234
/*
 * Inode operations for short symlinks, i.e. those whose target
 * shmem_symlink() stored in inode->i_link; simple_get_link serves the
 * target.
 */
static const struct inode_operations shmem_short_symlink_operations = {
	.getattr	= shmem_getattr,
	.get_link	= simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
   3242
/*
 * Inode operations for long symlinks, i.e. those whose target
 * shmem_symlink() copied into page 0 of the mapping; shmem_get_link reads
 * it back from there.
 */
static const struct inode_operations shmem_symlink_inode_operations = {
	.getattr	= shmem_getattr,
	.get_link	= shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
   3250
/*
 * export_operations ->get_parent: parent lookup is not supported, so every
 * request is answered with -ESTALE.
 */
static struct dentry *shmem_get_parent(struct dentry *child)
{
	return ERR_PTR(-ESTALE);
}
   3255
   3256static int shmem_match(struct inode *ino, void *vfh)
   3257{
   3258	__u32 *fh = vfh;
   3259	__u64 inum = fh[2];
   3260	inum = (inum << 32) | fh[1];
   3261	return ino->i_ino == inum && fh[0] == ino->i_generation;
   3262}
   3263
   3264/* Find any alias of inode, but prefer a hashed alias */
   3265static struct dentry *shmem_find_alias(struct inode *inode)
   3266{
   3267	struct dentry *alias = d_find_alias(inode);
   3268
   3269	return alias ?: d_find_any_alias(inode);
   3270}
   3271
   3272
   3273static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
   3274		struct fid *fid, int fh_len, int fh_type)
   3275{
   3276	struct inode *inode;
   3277	struct dentry *dentry = NULL;
   3278	u64 inum;
   3279
   3280	if (fh_len < 3)
   3281		return NULL;
   3282
   3283	inum = fid->raw[2];
   3284	inum = (inum << 32) | fid->raw[1];
   3285
   3286	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
   3287			shmem_match, fid->raw);
   3288	if (inode) {
   3289		dentry = shmem_find_alias(inode);
   3290		iput(inode);
   3291	}
   3292
   3293	return dentry;
   3294}
   3295
   3296static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
   3297				struct inode *parent)
   3298{
   3299	if (*len < 3) {
   3300		*len = 3;
   3301		return FILEID_INVALID;
   3302	}
   3303
   3304	if (inode_unhashed(inode)) {
   3305		/* Unfortunately insert_inode_hash is not idempotent,
   3306		 * so as we hash inodes here rather than at creation
   3307		 * time, we need a lock to ensure we only try
   3308		 * to do it once
   3309		 */
   3310		static DEFINE_SPINLOCK(lock);
   3311		spin_lock(&lock);
   3312		if (inode_unhashed(inode))
   3313			__insert_inode_hash(inode,
   3314					    inode->i_ino + inode->i_generation);
   3315		spin_unlock(&lock);
   3316	}
   3317
   3318	fh[0] = inode->i_generation;
   3319	fh[1] = inode->i_ino;
   3320	fh[2] = ((__u64)inode->i_ino) >> 32;
   3321
   3322	*len = 3;
   3323	return 1;
   3324}
   3325
/* File-handle export operations, installed as sb->s_export_op by shmem_fill_super(). */
static const struct export_operations shmem_export_ops = {
	.get_parent     = shmem_get_parent,
	.encode_fh      = shmem_encode_fh,
	.fh_to_dentry	= shmem_fh_to_dentry,
};
   3331
/*
 * Identifiers for the mount options listed in shmem_fs_parameters[];
 * shmem_parse_one() switches on these after fs_parse().
 */
enum shmem_param {
	Opt_gid,
	Opt_huge,
	Opt_mode,
	Opt_mpol,
	Opt_nr_blocks,
	Opt_nr_inodes,
	Opt_size,
	Opt_uid,
	Opt_inode32,
	Opt_inode64,
};
   3344
/* Accepted values of the "huge=" mount option and the SHMEM_HUGE_* constant each maps to. */
static const struct constant_table shmem_param_enums_huge[] = {
	{"never",	SHMEM_HUGE_NEVER },
	{"always",	SHMEM_HUGE_ALWAYS },
	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
	{"advise",	SHMEM_HUGE_ADVISE },
	{}
};
   3352
/*
 * Mount option specification handed to fs_parse() by shmem_parse_one().
 * size, nr_blocks and nr_inodes are taken as strings because
 * shmem_parse_one() runs them through memparse() itself.
 */
const struct fs_parameter_spec shmem_fs_parameters[] = {
	fsparam_u32   ("gid",		Opt_gid),
	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("mpol",		Opt_mpol),
	fsparam_string("nr_blocks",	Opt_nr_blocks),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("size",		Opt_size),
	fsparam_u32   ("uid",		Opt_uid),
	fsparam_flag  ("inode32",	Opt_inode32),
	fsparam_flag  ("inode64",	Opt_inode64),
	{}
};
   3366
   3367static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
   3368{
   3369	struct shmem_options *ctx = fc->fs_private;
   3370	struct fs_parse_result result;
   3371	unsigned long long size;
   3372	char *rest;
   3373	int opt;
   3374
   3375	opt = fs_parse(fc, shmem_fs_parameters, param, &result);
   3376	if (opt < 0)
   3377		return opt;
   3378
   3379	switch (opt) {
   3380	case Opt_size:
   3381		size = memparse(param->string, &rest);
   3382		if (*rest == '%') {
   3383			size <<= PAGE_SHIFT;
   3384			size *= totalram_pages();
   3385			do_div(size, 100);
   3386			rest++;
   3387		}
   3388		if (*rest)
   3389			goto bad_value;
   3390		ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
   3391		ctx->seen |= SHMEM_SEEN_BLOCKS;
   3392		break;
   3393	case Opt_nr_blocks:
   3394		ctx->blocks = memparse(param->string, &rest);
   3395		if (*rest)
   3396			goto bad_value;
   3397		ctx->seen |= SHMEM_SEEN_BLOCKS;
   3398		break;
   3399	case Opt_nr_inodes:
   3400		ctx->inodes = memparse(param->string, &rest);
   3401		if (*rest)
   3402			goto bad_value;
   3403		ctx->seen |= SHMEM_SEEN_INODES;
   3404		break;
   3405	case Opt_mode:
   3406		ctx->mode = result.uint_32 & 07777;
   3407		break;
   3408	case Opt_uid:
   3409		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
   3410		if (!uid_valid(ctx->uid))
   3411			goto bad_value;
   3412		break;
   3413	case Opt_gid:
   3414		ctx->gid = make_kgid(current_user_ns(), result.uint_32);
   3415		if (!gid_valid(ctx->gid))
   3416			goto bad_value;
   3417		break;
   3418	case Opt_huge:
   3419		ctx->huge = result.uint_32;
   3420		if (ctx->huge != SHMEM_HUGE_NEVER &&
   3421		    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
   3422		      has_transparent_hugepage()))
   3423			goto unsupported_parameter;
   3424		ctx->seen |= SHMEM_SEEN_HUGE;
   3425		break;
   3426	case Opt_mpol:
   3427		if (IS_ENABLED(CONFIG_NUMA)) {
   3428			mpol_put(ctx->mpol);
   3429			ctx->mpol = NULL;
   3430			if (mpol_parse_str(param->string, &ctx->mpol))
   3431				goto bad_value;
   3432			break;
   3433		}
   3434		goto unsupported_parameter;
   3435	case Opt_inode32:
   3436		ctx->full_inums = false;
   3437		ctx->seen |= SHMEM_SEEN_INUMS;
   3438		break;
   3439	case Opt_inode64:
   3440		if (sizeof(ino_t) < 8) {
   3441			return invalfc(fc,
   3442				       "Cannot use inode64 with <64bit inums in kernel\n");
   3443		}
   3444		ctx->full_inums = true;
   3445		ctx->seen |= SHMEM_SEEN_INUMS;
   3446		break;
   3447	}
   3448	return 0;
   3449
   3450unsupported_parameter:
   3451	return invalfc(fc, "Unsupported parameter '%s'", param->key);
   3452bad_value:
   3453	return invalfc(fc, "Bad value for '%s'", param->key);
   3454}
   3455
   3456static int shmem_parse_options(struct fs_context *fc, void *data)
   3457{
   3458	char *options = data;
   3459
   3460	if (options) {
   3461		int err = security_sb_eat_lsm_opts(options, &fc->security);
   3462		if (err)
   3463			return err;
   3464	}
   3465
   3466	while (options != NULL) {
   3467		char *this_char = options;
   3468		for (;;) {
   3469			/*
   3470			 * NUL-terminate this option: unfortunately,
   3471			 * mount options form a comma-separated list,
   3472			 * but mpol's nodelist may also contain commas.
   3473			 */
   3474			options = strchr(options, ',');
   3475			if (options == NULL)
   3476				break;
   3477			options++;
   3478			if (!isdigit(*options)) {
   3479				options[-1] = '\0';
   3480				break;
   3481			}
   3482		}
   3483		if (*this_char) {
   3484			char *value = strchr(this_char, '=');
   3485			size_t len = 0;
   3486			int err;
   3487
   3488			if (value) {
   3489				*value++ = '\0';
   3490				len = strlen(value);
   3491			}
   3492			err = vfs_parse_fs_string(fc, this_char, value, len);
   3493			if (err < 0)
   3494				return err;
   3495		}
   3496	}
   3497	return 0;
   3498}
   3499
   3500/*
   3501 * Reconfigure a shmem filesystem.
   3502 *
   3503 * Note that we disallow change from limited->unlimited blocks/inodes while any
   3504 * are in use; but we must separately disallow unlimited->limited, because in
   3505 * that case we have no record of how much is already in use.
   3506 */
   3507static int shmem_reconfigure(struct fs_context *fc)
   3508{
   3509	struct shmem_options *ctx = fc->fs_private;
   3510	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
   3511	unsigned long inodes;
   3512	struct mempolicy *mpol = NULL;
   3513	const char *err;
   3514
   3515	raw_spin_lock(&sbinfo->stat_lock);
   3516	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
   3517	if (ctx->blocks > S64_MAX) {
   3518		err = "Number of blocks too large";
   3519		goto out;
   3520	}
   3521	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
   3522		if (!sbinfo->max_blocks) {
   3523			err = "Cannot retroactively limit size";
   3524			goto out;
   3525		}
   3526		if (percpu_counter_compare(&sbinfo->used_blocks,
   3527					   ctx->blocks) > 0) {
   3528			err = "Too small a size for current use";
   3529			goto out;
   3530		}
   3531	}
   3532	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
   3533		if (!sbinfo->max_inodes) {
   3534			err = "Cannot retroactively limit inodes";
   3535			goto out;
   3536		}
   3537		if (ctx->inodes < inodes) {
   3538			err = "Too few inodes for current use";
   3539			goto out;
   3540		}
   3541	}
   3542
   3543	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
   3544	    sbinfo->next_ino > UINT_MAX) {
   3545		err = "Current inum too high to switch to 32-bit inums";
   3546		goto out;
   3547	}
   3548
   3549	if (ctx->seen & SHMEM_SEEN_HUGE)
   3550		sbinfo->huge = ctx->huge;
   3551	if (ctx->seen & SHMEM_SEEN_INUMS)
   3552		sbinfo->full_inums = ctx->full_inums;
   3553	if (ctx->seen & SHMEM_SEEN_BLOCKS)
   3554		sbinfo->max_blocks  = ctx->blocks;
   3555	if (ctx->seen & SHMEM_SEEN_INODES) {
   3556		sbinfo->max_inodes  = ctx->inodes;
   3557		sbinfo->free_inodes = ctx->inodes - inodes;
   3558	}
   3559
   3560	/*
   3561	 * Preserve previous mempolicy unless mpol remount option was specified.
   3562	 */
   3563	if (ctx->mpol) {
   3564		mpol = sbinfo->mpol;
   3565		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
   3566		ctx->mpol = NULL;
   3567	}
   3568	raw_spin_unlock(&sbinfo->stat_lock);
   3569	mpol_put(mpol);
   3570	return 0;
   3571out:
   3572	raw_spin_unlock(&sbinfo->stat_lock);
   3573	return invalfc(fc, "%s", err);
   3574}
   3575
   3576static int shmem_show_options(struct seq_file *seq, struct dentry *root)
   3577{
   3578	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
   3579
   3580	if (sbinfo->max_blocks != shmem_default_max_blocks())
   3581		seq_printf(seq, ",size=%luk",
   3582			sbinfo->max_blocks << (PAGE_SHIFT - 10));
   3583	if (sbinfo->max_inodes != shmem_default_max_inodes())
   3584		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
   3585	if (sbinfo->mode != (0777 | S_ISVTX))
   3586		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
   3587	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
   3588		seq_printf(seq, ",uid=%u",
   3589				from_kuid_munged(&init_user_ns, sbinfo->uid));
   3590	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
   3591		seq_printf(seq, ",gid=%u",
   3592				from_kgid_munged(&init_user_ns, sbinfo->gid));
   3593
   3594	/*
   3595	 * Showing inode{64,32} might be useful even if it's the system default,
   3596	 * since then people don't have to resort to checking both here and
   3597	 * /proc/config.gz to confirm 64-bit inums were successfully applied
   3598	 * (which may not even exist if IKCONFIG_PROC isn't enabled).
   3599	 *
   3600	 * We hide it when inode64 isn't the default and we are using 32-bit
   3601	 * inodes, since that probably just means the feature isn't even under
   3602	 * consideration.
   3603	 *
   3604	 * As such:
   3605	 *
   3606	 *                     +-----------------+-----------------+
   3607	 *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
   3608	 *  +------------------+-----------------+-----------------+
   3609	 *  | full_inums=true  | show            | show            |
   3610	 *  | full_inums=false | show            | hide            |
   3611	 *  +------------------+-----------------+-----------------+
   3612	 *
   3613	 */
   3614	if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
   3615		seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
   3616#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   3617	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
   3618	if (sbinfo->huge)
   3619		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
   3620#endif
   3621	shmem_show_mpol(seq, sbinfo->mpol);
   3622	return 0;
   3623}
   3624
   3625#endif /* CONFIG_TMPFS */
   3626
/*
 * Release everything hanging off the superblock's shmem_sb_info and clear
 * sb->s_fs_info.  Also used by shmem_fill_super() as its failure path, so
 * it may run on a partially initialised sbinfo.
 */
static void shmem_put_super(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	free_percpu(sbinfo->ino_batch);
	percpu_counter_destroy(&sbinfo->used_blocks);
	mpol_put(sbinfo->mpol);
	kfree(sbinfo);
	sb->s_fs_info = NULL;
}
   3637
/*
 * Fill in a new superblock from the options collected in fc->fs_private.
 *
 * Returns 0 on success.  If sbinfo itself cannot be allocated, -ENOMEM is
 * returned directly; every later failure goes through the "failed" label,
 * which tears down via shmem_put_super() and also returns -ENOMEM.
 */
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct shmem_options *ctx = fc->fs_private;
	struct inode *inode;
	struct shmem_sb_info *sbinfo;

	/* Round up to L1_CACHE_BYTES to resist false sharing */
	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
				L1_CACHE_BYTES), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;

	/* Attach early: the failure path frees sbinfo through sb->s_fs_info */
	sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
	/*
	 * Per default we only allow half of the physical ram per
	 * tmpfs instance, limiting inodes to one per page of lowmem;
	 * but the internal instance is left unlimited.
	 */
	if (!(sb->s_flags & SB_KERNMOUNT)) {
		if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
			ctx->blocks = shmem_default_max_blocks();
		if (!(ctx->seen & SHMEM_SEEN_INODES))
			ctx->inodes = shmem_default_max_inodes();
		if (!(ctx->seen & SHMEM_SEEN_INUMS))
			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
	} else {
		sb->s_flags |= SB_NOUSER;
	}
	sb->s_export_op = &shmem_export_ops;
	sb->s_flags |= SB_NOSEC;
#else
	sb->s_flags |= SB_NOUSER;
#endif
	sbinfo->max_blocks = ctx->blocks;
	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
	/* Only the kernel-internal mount gets a per-cpu ino_batch */
	if (sb->s_flags & SB_KERNMOUNT) {
		sbinfo->ino_batch = alloc_percpu(ino_t);
		if (!sbinfo->ino_batch)
			goto failed;
	}
	sbinfo->uid = ctx->uid;
	sbinfo->gid = ctx->gid;
	sbinfo->full_inums = ctx->full_inums;
	sbinfo->mode = ctx->mode;
	sbinfo->huge = ctx->huge;
	/* Hand the mempolicy reference over to sbinfo so it is put only once */
	sbinfo->mpol = ctx->mpol;
	ctx->mpol = NULL;

	raw_spin_lock_init(&sbinfo->stat_lock);
	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
		goto failed;
	spin_lock_init(&sbinfo->shrinklist_lock);
	INIT_LIST_HEAD(&sbinfo->shrinklist);

	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;
	sb->s_magic = TMPFS_MAGIC;
	sb->s_op = &shmem_ops;
	sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
	sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	sb->s_flags |= SB_POSIXACL;
#endif
	uuid_gen(&sb->s_uuid);

	/* Root directory, owned by the uid/gid given at mount time */
	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
	if (!inode)
		goto failed;
	inode->i_uid = sbinfo->uid;
	inode->i_gid = sbinfo->gid;
	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		goto failed;
	return 0;

failed:
	shmem_put_super(sb);
	return -ENOMEM;
}
   3722
/* fs_context ->get_tree: obtain a superblock via get_tree_nodev(), filled in by shmem_fill_super(). */
static int shmem_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, shmem_fill_super);
}
   3727
   3728static void shmem_free_fc(struct fs_context *fc)
   3729{
   3730	struct shmem_options *ctx = fc->fs_private;
   3731
   3732	if (ctx) {
   3733		mpol_put(ctx->mpol);
   3734		kfree(ctx);
   3735	}
   3736}
   3737
/*
 * fs_context operations.  Option parsing and reconfiguration are only
 * provided when CONFIG_TMPFS is enabled.
 */
static const struct fs_context_operations shmem_fs_context_ops = {
	.free			= shmem_free_fc,
	.get_tree		= shmem_get_tree,
#ifdef CONFIG_TMPFS
	.parse_monolithic	= shmem_parse_options,
	.parse_param		= shmem_parse_one,
	.reconfigure		= shmem_reconfigure,
#endif
};
   3747
/* Slab cache for struct shmem_inode_info; created by shmem_init_inodecache(). */
static struct kmem_cache *shmem_inode_cachep;
   3749
   3750static struct inode *shmem_alloc_inode(struct super_block *sb)
   3751{
   3752	struct shmem_inode_info *info;
   3753	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
   3754	if (!info)
   3755		return NULL;
   3756	return &info->vfs_inode;
   3757}
   3758
   3759static void shmem_free_in_core_inode(struct inode *inode)
   3760{
   3761	if (S_ISLNK(inode->i_mode))
   3762		kfree(inode->i_link);
   3763	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
   3764}
   3765
   3766static void shmem_destroy_inode(struct inode *inode)
   3767{
   3768	if (S_ISREG(inode->i_mode))
   3769		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
   3770}
   3771
   3772static void shmem_init_inode(void *foo)
   3773{
   3774	struct shmem_inode_info *info = foo;
   3775	inode_init_once(&info->vfs_inode);
   3776}
   3777
/*
 * Create the slab cache backing shmem inodes.  The cache is created with
 * SLAB_PANIC, and the result is stored without a NULL check.
 */
static void shmem_init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}
   3784
/* Destroy the slab cache created by shmem_init_inodecache(). */
static void shmem_destroy_inodecache(void)
{
	kmem_cache_destroy(shmem_inode_cachep);
}
   3789
/*
 * address_space ->error_remove_page: keep the page in page cache instead
 * of truncating it.  Nothing is done to @mapping or @page; always
 * returns 0.
 */
static int shmem_error_remove_page(struct address_space *mapping,
				   struct page *page)
{
	return 0;
}
   3796
/*
 * Address space operations for shmem mappings (exported).  shmem_symlink()
 * installs these on long symlinks.  The write_begin/write_end pair exists
 * only with CONFIG_TMPFS, and page migration only with CONFIG_MIGRATION.
 */
const struct address_space_operations shmem_aops = {
	.writepage	= shmem_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
	.error_remove_page = shmem_error_remove_page,
};
EXPORT_SYMBOL(shmem_aops);
   3810
/*
 * File operations for shmem files.  mmap and get_unmapped_area are always
 * available; seek, read/write, fsync, splice and fallocate are provided
 * only when CONFIG_TMPFS is enabled.
 */
static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};
   3824
/*
 * Inode operations table with getattr/setattr; listxattr and set_acl are
 * added when CONFIG_TMPFS_XATTR is enabled.
 * NOTE(review): presumably installed on regular files - confirm against
 * shmem_get_inode().
 */
static const struct inode_operations shmem_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.set_acl	= simple_set_acl,
#endif
};
   3833
/*
 * Inode operations for directories.  The namespace operations (create,
 * link, unlink, rename, ...) are only present with CONFIG_TMPFS; xattr
 * listing and ACL support depend on their own config options.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.getattr	= shmem_getattr,
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename2,
	.tmpfile	= shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
   3856
   3857static const struct inode_operations shmem_special_inode_operations = {
   3858	.getattr	= shmem_getattr,
   3859#ifdef CONFIG_TMPFS_XATTR
   3860	.listxattr	= shmem_listxattr,
   3861#endif
   3862#ifdef CONFIG_TMPFS_POSIX_ACL
   3863	.setattr	= shmem_setattr,
   3864	.set_acl	= simple_set_acl,
   3865#endif
   3866};
   3867
   3868static const struct super_operations shmem_ops = {
   3869	.alloc_inode	= shmem_alloc_inode,
   3870	.free_inode	= shmem_free_in_core_inode,
   3871	.destroy_inode	= shmem_destroy_inode,
   3872#ifdef CONFIG_TMPFS
   3873	.statfs		= shmem_statfs,
   3874	.show_options	= shmem_show_options,
   3875#endif
   3876	.evict_inode	= shmem_evict_inode,
   3877	.drop_inode	= generic_delete_inode,
   3878	.put_super	= shmem_put_super,
   3879#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   3880	.nr_cached_objects	= shmem_unused_huge_count,
   3881	.free_cached_objects	= shmem_unused_huge_scan,
   3882#endif
   3883};
   3884
   3885static const struct vm_operations_struct shmem_vm_ops = {
   3886	.fault		= shmem_fault,
   3887	.map_pages	= filemap_map_pages,
   3888#ifdef CONFIG_NUMA
   3889	.set_policy     = shmem_set_policy,
   3890	.get_policy     = shmem_get_policy,
   3891#endif
   3892};
   3893
   3894int shmem_init_fs_context(struct fs_context *fc)
   3895{
   3896	struct shmem_options *ctx;
   3897
   3898	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
   3899	if (!ctx)
   3900		return -ENOMEM;
   3901
   3902	ctx->mode = 0777 | S_ISVTX;
   3903	ctx->uid = current_fsuid();
   3904	ctx->gid = current_fsgid();
   3905
   3906	fc->fs_private = ctx;
   3907	fc->ops = &shmem_fs_context_ops;
   3908	return 0;
   3909}
   3910
   3911static struct file_system_type shmem_fs_type = {
   3912	.owner		= THIS_MODULE,
   3913	.name		= "tmpfs",
   3914	.init_fs_context = shmem_init_fs_context,
   3915#ifdef CONFIG_TMPFS
   3916	.parameters	= shmem_fs_parameters,
   3917#endif
   3918	.kill_sb	= kill_litter_super,
   3919	.fs_flags	= FS_USERNS_MOUNT,
   3920};
   3921
   3922void __init shmem_init(void)
   3923{
   3924	int error;
   3925
   3926	shmem_init_inodecache();
   3927
   3928	error = register_filesystem(&shmem_fs_type);
   3929	if (error) {
   3930		pr_err("Could not register tmpfs\n");
   3931		goto out2;
   3932	}
   3933
   3934	shm_mnt = kern_mount(&shmem_fs_type);
   3935	if (IS_ERR(shm_mnt)) {
   3936		error = PTR_ERR(shm_mnt);
   3937		pr_err("Could not kern_mount tmpfs\n");
   3938		goto out1;
   3939	}
   3940
   3941#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   3942	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
   3943		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
   3944	else
   3945		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
   3946#endif
   3947	return;
   3948
   3949out1:
   3950	unregister_filesystem(&shmem_fs_type);
   3951out2:
   3952	shmem_destroy_inodecache();
   3953	shm_mnt = ERR_PTR(error);
   3954}
   3955
   3956#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
   3957static ssize_t shmem_enabled_show(struct kobject *kobj,
   3958				  struct kobj_attribute *attr, char *buf)
   3959{
   3960	static const int values[] = {
   3961		SHMEM_HUGE_ALWAYS,
   3962		SHMEM_HUGE_WITHIN_SIZE,
   3963		SHMEM_HUGE_ADVISE,
   3964		SHMEM_HUGE_NEVER,
   3965		SHMEM_HUGE_DENY,
   3966		SHMEM_HUGE_FORCE,
   3967	};
   3968	int len = 0;
   3969	int i;
   3970
   3971	for (i = 0; i < ARRAY_SIZE(values); i++) {
   3972		len += sysfs_emit_at(buf, len,
   3973				     shmem_huge == values[i] ? "%s[%s]" : "%s%s",
   3974				     i ? " " : "",
   3975				     shmem_format_huge(values[i]));
   3976	}
   3977
   3978	len += sysfs_emit_at(buf, len, "\n");
   3979
   3980	return len;
   3981}
   3982
   3983static ssize_t shmem_enabled_store(struct kobject *kobj,
   3984		struct kobj_attribute *attr, const char *buf, size_t count)
   3985{
   3986	char tmp[16];
   3987	int huge;
   3988
   3989	if (count + 1 > sizeof(tmp))
   3990		return -EINVAL;
   3991	memcpy(tmp, buf, count);
   3992	tmp[count] = '\0';
   3993	if (count && tmp[count - 1] == '\n')
   3994		tmp[count - 1] = '\0';
   3995
   3996	huge = shmem_parse_huge(tmp);
   3997	if (huge == -EINVAL)
   3998		return -EINVAL;
   3999	if (!has_transparent_hugepage() &&
   4000			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
   4001		return -EINVAL;
   4002
   4003	shmem_huge = huge;
   4004	if (shmem_huge > SHMEM_HUGE_DENY)
   4005		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
   4006	return count;
   4007}
   4008
   4009struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
   4010#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
   4011
   4012#else /* !CONFIG_SHMEM */
   4013
/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */
   4022
   4023static struct file_system_type shmem_fs_type = {
   4024	.name		= "tmpfs",
   4025	.init_fs_context = ramfs_init_fs_context,
   4026	.parameters	= ramfs_fs_parameters,
   4027	.kill_sb	= kill_litter_super,
   4028	.fs_flags	= FS_USERNS_MOUNT,
   4029};
   4030
   4031void __init shmem_init(void)
   4032{
   4033	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
   4034
   4035	shm_mnt = kern_mount(&shmem_fs_type);
   4036	BUG_ON(IS_ERR(shm_mnt));
   4037}
   4038
   4039int shmem_unuse(unsigned int type)
   4040{
   4041	return 0;
   4042}
   4043
   4044int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
   4045{
   4046	return 0;
   4047}
   4048
   4049void shmem_unlock_mapping(struct address_space *mapping)
   4050{
   4051}
   4052
   4053#ifdef CONFIG_MMU
   4054unsigned long shmem_get_unmapped_area(struct file *file,
   4055				      unsigned long addr, unsigned long len,
   4056				      unsigned long pgoff, unsigned long flags)
   4057{
   4058	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
   4059}
   4060#endif
   4061
   4062void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
   4063{
   4064	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
   4065}
   4066EXPORT_SYMBOL_GPL(shmem_truncate_range);
   4067
   4068#define shmem_vm_ops				generic_file_vm_ops
   4069#define shmem_file_operations			ramfs_file_operations
   4070#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
   4071#define shmem_acct_size(flags, size)		0
   4072#define shmem_unacct_size(flags, size)		do {} while (0)
   4073
   4074#endif /* CONFIG_SHMEM */
   4075
   4076/* common code */
   4077
   4078static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
   4079				       unsigned long flags, unsigned int i_flags)
   4080{
   4081	struct inode *inode;
   4082	struct file *res;
   4083
   4084	if (IS_ERR(mnt))
   4085		return ERR_CAST(mnt);
   4086
   4087	if (size < 0 || size > MAX_LFS_FILESIZE)
   4088		return ERR_PTR(-EINVAL);
   4089
   4090	if (shmem_acct_size(flags, size))
   4091		return ERR_PTR(-ENOMEM);
   4092
   4093	inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
   4094				flags);
   4095	if (unlikely(!inode)) {
   4096		shmem_unacct_size(flags, size);
   4097		return ERR_PTR(-ENOSPC);
   4098	}
   4099	inode->i_flags |= i_flags;
   4100	inode->i_size = size;
   4101	clear_nlink(inode);	/* It is unlinked */
   4102	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
   4103	if (!IS_ERR(res))
   4104		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
   4105				&shmem_file_operations);
   4106	if (IS_ERR(res))
   4107		iput(inode);
   4108	return res;
   4109}
   4110
   4111/**
   4112 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
   4113 * 	kernel internal.  There will be NO LSM permission checks against the
   4114 * 	underlying inode.  So users of this interface must do LSM checks at a
   4115 *	higher layer.  The users are the big_key and shm implementations.  LSM
   4116 *	checks are provided at the key or shm level rather than the inode.
   4117 * @name: name for dentry (to be seen in /proc/<pid>/maps
   4118 * @size: size to be set for the file
   4119 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
   4120 */
   4121struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
   4122{
   4123	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
   4124}
   4125
   4126/**
   4127 * shmem_file_setup - get an unlinked file living in tmpfs
   4128 * @name: name for dentry (to be seen in /proc/<pid>/maps
   4129 * @size: size to be set for the file
   4130 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
   4131 */
   4132struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
   4133{
   4134	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
   4135}
   4136EXPORT_SYMBOL_GPL(shmem_file_setup);
   4137
   4138/**
   4139 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
   4140 * @mnt: the tmpfs mount where the file will be created
   4141 * @name: name for dentry (to be seen in /proc/<pid>/maps
   4142 * @size: size to be set for the file
   4143 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
   4144 */
   4145struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
   4146				       loff_t size, unsigned long flags)
   4147{
   4148	return __shmem_file_setup(mnt, name, size, flags, 0);
   4149}
   4150EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
   4151
   4152/**
   4153 * shmem_zero_setup - setup a shared anonymous mapping
   4154 * @vma: the vma to be mmapped is prepared by do_mmap
   4155 */
   4156int shmem_zero_setup(struct vm_area_struct *vma)
   4157{
   4158	struct file *file;
   4159	loff_t size = vma->vm_end - vma->vm_start;
   4160
   4161	/*
   4162	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
   4163	 * between XFS directory reading and selinux: since this file is only
   4164	 * accessible to the user through its mapping, use S_PRIVATE flag to
   4165	 * bypass file security, in the same way as shmem_kernel_file_setup().
   4166	 */
   4167	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
   4168	if (IS_ERR(file))
   4169		return PTR_ERR(file);
   4170
   4171	if (vma->vm_file)
   4172		fput(vma->vm_file);
   4173	vma->vm_file = file;
   4174	vma->vm_ops = &shmem_vm_ops;
   4175
   4176	return 0;
   4177}
   4178
   4179/**
   4180 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
   4181 * @mapping:	the page's address_space
   4182 * @index:	the page index
   4183 * @gfp:	the page allocator flags to use if allocating
   4184 *
   4185 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
   4186 * with any new page allocations done using the specified allocation flags.
   4187 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
   4188 * suit tmpfs, since it may have pages in swapcache, and needs to find those
   4189 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
   4190 *
   4191 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
   4192 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
   4193 */
   4194struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
   4195					 pgoff_t index, gfp_t gfp)
   4196{
   4197#ifdef CONFIG_SHMEM
   4198	struct inode *inode = mapping->host;
   4199	struct page *page;
   4200	int error;
   4201
   4202	BUG_ON(!shmem_mapping(mapping));
   4203	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
   4204				  gfp, NULL, NULL, NULL);
   4205	if (error)
   4206		return ERR_PTR(error);
   4207
   4208	unlock_page(page);
   4209	if (PageHWPoison(page)) {
   4210		put_page(page);
   4211		return ERR_PTR(-EIO);
   4212	}
   4213
   4214	return page;
   4215#else
   4216	/*
   4217	 * The tiny !SHMEM case uses ramfs without swap
   4218	 */
   4219	return read_cache_page_gfp(mapping, index, gfp);
   4220#endif
   4221}
   4222EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);