cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

inode.c (43365B)


      1/*
      2 * hugetlbpage-backed filesystem.  Based on ramfs.
      3 *
      4 * Nadia Yvette Chambers, 2002
      5 *
      6 * Copyright (C) 2002 Linus Torvalds.
      7 * License: GPL
      8 */
      9
     10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     11
     12#include <linux/thread_info.h>
     13#include <asm/current.h>
     14#include <linux/sched/signal.h>		/* remove ASAP */
     15#include <linux/falloc.h>
     16#include <linux/fs.h>
     17#include <linux/mount.h>
     18#include <linux/file.h>
     19#include <linux/kernel.h>
     20#include <linux/writeback.h>
     21#include <linux/pagemap.h>
     22#include <linux/highmem.h>
     23#include <linux/init.h>
     24#include <linux/string.h>
     25#include <linux/capability.h>
     26#include <linux/ctype.h>
     27#include <linux/backing-dev.h>
     28#include <linux/hugetlb.h>
     29#include <linux/pagevec.h>
     30#include <linux/fs_parser.h>
     31#include <linux/mman.h>
     32#include <linux/slab.h>
     33#include <linux/dnotify.h>
     34#include <linux/statfs.h>
     35#include <linux/security.h>
     36#include <linux/magic.h>
     37#include <linux/migrate.h>
     38#include <linux/uio.h>
     39
     40#include <linux/uaccess.h>
     41#include <linux/sched/mm.h>
     42
     43static const struct super_operations hugetlbfs_ops;
     44static const struct address_space_operations hugetlbfs_aops;
     45const struct file_operations hugetlbfs_file_operations;
     46static const struct inode_operations hugetlbfs_dir_inode_operations;
     47static const struct inode_operations hugetlbfs_inode_operations;
     48
     49enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
     50
     51struct hugetlbfs_fs_context {
     52	struct hstate		*hstate;
     53	unsigned long long	max_size_opt;
     54	unsigned long long	min_size_opt;
     55	long			max_hpages;
     56	long			nr_inodes;
     57	long			min_hpages;
     58	enum hugetlbfs_size_type max_val_type;
     59	enum hugetlbfs_size_type min_val_type;
     60	kuid_t			uid;
     61	kgid_t			gid;
     62	umode_t			mode;
     63};
     64
     65int sysctl_hugetlb_shm_group;
     66
     67enum hugetlb_param {
     68	Opt_gid,
     69	Opt_min_size,
     70	Opt_mode,
     71	Opt_nr_inodes,
     72	Opt_pagesize,
     73	Opt_size,
     74	Opt_uid,
     75};
     76
     77static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
     78	fsparam_u32   ("gid",		Opt_gid),
     79	fsparam_string("min_size",	Opt_min_size),
     80	fsparam_u32oct("mode",		Opt_mode),
     81	fsparam_string("nr_inodes",	Opt_nr_inodes),
     82	fsparam_string("pagesize",	Opt_pagesize),
     83	fsparam_string("size",		Opt_size),
     84	fsparam_u32   ("uid",		Opt_uid),
     85	{}
     86};
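/*
 * For illustration only (values are arbitrary examples, not defaults from
 * this file): the parameters above correspond to mount options of the form
 *
 *	mount -t hugetlbfs -o uid=1000,gid=1000,mode=1770,pagesize=2M,\
 *		size=1G,min_size=512M,nr_inodes=64 none /mnt/huge
 */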
     87
     88#ifdef CONFIG_NUMA
     89static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
     90					struct inode *inode, pgoff_t index)
     91{
     92	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
     93							index);
     94}
     95
     96static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
     97{
     98	mpol_cond_put(vma->vm_policy);
     99}
    100#else
    101static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
    102					struct inode *inode, pgoff_t index)
    103{
    104}
    105
    106static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
    107{
    108}
    109#endif
    110
    111static void huge_pagevec_release(struct pagevec *pvec)
    112{
    113	int i;
    114
    115	for (i = 0; i < pagevec_count(pvec); ++i)
    116		put_page(pvec->pages[i]);
    117
    118	pagevec_reinit(pvec);
    119}
    120
    121/*
    122 * Mask used when checking the page offset value passed in via system
    123 * calls.  This value will be converted to a loff_t which is signed.
    124 * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
    125 * value.  The extra bit (- 1 in the shift value) is to take the sign
    126 * bit into account.
    127 */
    128#define PGOFF_LOFFT_MAX \
    129	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
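/*
 * Worked example: on a 64-bit arch with 4K base pages (PAGE_SHIFT == 12,
 * BITS_PER_LONG == 64) the mask is ((1UL << 13) - 1) << 51, i.e. the top 13
 * bits of vm_pgoff.  If any of those bits are set, vm_pgoff << PAGE_SHIFT
 * would reach bit 63 of the loff_t and the byte offset would leave the
 * signed range, so hugetlbfs_file_mmap() below rejects such offsets.
 */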
    130
    131static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
    132{
    133	struct inode *inode = file_inode(file);
    134	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
    135	loff_t len, vma_len;
    136	int ret;
    137	struct hstate *h = hstate_file(file);
    138
    139	/*
    140	 * vma address alignment (but not the pgoff alignment) has
    141	 * already been checked by prepare_hugepage_range.  If you add
     142	 * any error returns here, do so after setting VM_HUGETLB, so
     143	 * that the is_vm_hugetlb_page() tests below unmap_region() go
     144	 * the right way when do_mmap() unwinds (may be important on
     145	 * powerpc and ia64).
    146	 */
    147	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
    148	vma->vm_ops = &hugetlb_vm_ops;
    149
    150	ret = seal_check_future_write(info->seals, vma);
    151	if (ret)
    152		return ret;
    153
    154	/*
    155	 * page based offset in vm_pgoff could be sufficiently large to
    156	 * overflow a loff_t when converted to byte offset.  This can
    157	 * only happen on architectures where sizeof(loff_t) ==
    158	 * sizeof(unsigned long).  So, only check in those instances.
    159	 */
    160	if (sizeof(unsigned long) == sizeof(loff_t)) {
    161		if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
    162			return -EINVAL;
    163	}
    164
    165	/* must be huge page aligned */
    166	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
    167		return -EINVAL;
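	/*
	 * Example (assuming a 2MB hstate with 4K base pages):
	 * ~huge_page_mask(h) >> PAGE_SHIFT == 0x1ff, so vm_pgoff must be a
	 * multiple of 512 base pages, i.e. the mapping offset must be 2MB
	 * aligned.
	 */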
    168
    169	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
    170	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
    171	/* check for overflow */
    172	if (len < vma_len)
    173		return -EINVAL;
    174
    175	inode_lock(inode);
    176	file_accessed(file);
    177
    178	ret = -ENOMEM;
    179	if (!hugetlb_reserve_pages(inode,
    180				vma->vm_pgoff >> huge_page_order(h),
    181				len >> huge_page_shift(h), vma,
    182				vma->vm_flags))
    183		goto out;
    184
    185	ret = 0;
    186	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
    187		i_size_write(inode, len);
    188out:
    189	inode_unlock(inode);
    190
    191	return ret;
    192}
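/*
 * Illustrative userspace sketch (path and size are arbitrary, not taken from
 * this file): mapping a file on a mounted hugetlbfs instance goes through
 * hugetlbfs_file_mmap() above; the length and offset must be multiples of
 * the huge page size:
 *
 *	int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);
 *	void *p = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 */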
    193
    194/*
    195 * Called under mmap_write_lock(mm).
    196 */
    197
    198static unsigned long
    199hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
    200		unsigned long len, unsigned long pgoff, unsigned long flags)
    201{
    202	struct hstate *h = hstate_file(file);
    203	struct vm_unmapped_area_info info;
    204
    205	info.flags = 0;
    206	info.length = len;
    207	info.low_limit = current->mm->mmap_base;
    208	info.high_limit = arch_get_mmap_end(addr, len, flags);
    209	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
    210	info.align_offset = 0;
    211	return vm_unmapped_area(&info);
    212}
    213
    214static unsigned long
    215hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
    216		unsigned long len, unsigned long pgoff, unsigned long flags)
    217{
    218	struct hstate *h = hstate_file(file);
    219	struct vm_unmapped_area_info info;
    220
    221	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
    222	info.length = len;
    223	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
    224	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
    225	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
    226	info.align_offset = 0;
    227	addr = vm_unmapped_area(&info);
    228
    229	/*
    230	 * A failed mmap() very likely causes application failure,
    231	 * so fall back to the bottom-up function here. This scenario
    232	 * can happen with large stack limits and large mmap()
    233	 * allocations.
    234	 */
    235	if (unlikely(offset_in_page(addr))) {
    236		VM_BUG_ON(addr != -ENOMEM);
    237		info.flags = 0;
    238		info.low_limit = current->mm->mmap_base;
    239		info.high_limit = arch_get_mmap_end(addr, len, flags);
    240		addr = vm_unmapped_area(&info);
    241	}
    242
    243	return addr;
    244}
    245
    246unsigned long
    247generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
    248				  unsigned long len, unsigned long pgoff,
    249				  unsigned long flags)
    250{
    251	struct mm_struct *mm = current->mm;
    252	struct vm_area_struct *vma;
    253	struct hstate *h = hstate_file(file);
    254	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
    255
    256	if (len & ~huge_page_mask(h))
    257		return -EINVAL;
    258	if (len > TASK_SIZE)
    259		return -ENOMEM;
    260
    261	if (flags & MAP_FIXED) {
    262		if (prepare_hugepage_range(file, addr, len))
    263			return -EINVAL;
    264		return addr;
    265	}
    266
    267	if (addr) {
    268		addr = ALIGN(addr, huge_page_size(h));
    269		vma = find_vma(mm, addr);
    270		if (mmap_end - len >= addr &&
    271		    (!vma || addr + len <= vm_start_gap(vma)))
    272			return addr;
    273	}
    274
    275	/*
    276	 * Use mm->get_unmapped_area value as a hint to use topdown routine.
    277	 * If architectures have special needs, they should define their own
    278	 * version of hugetlb_get_unmapped_area.
    279	 */
    280	if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
    281		return hugetlb_get_unmapped_area_topdown(file, addr, len,
    282				pgoff, flags);
    283	return hugetlb_get_unmapped_area_bottomup(file, addr, len,
    284			pgoff, flags);
    285}
    286
    287#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
    288static unsigned long
    289hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
    290			  unsigned long len, unsigned long pgoff,
    291			  unsigned long flags)
    292{
    293	return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
    294}
    295#endif
    296
    297static size_t
    298hugetlbfs_read_actor(struct page *page, unsigned long offset,
    299			struct iov_iter *to, unsigned long size)
    300{
    301	size_t copied = 0;
    302	int i, chunksize;
    303
     304	/* Find which 4k chunk and offset within that chunk */
    305	i = offset >> PAGE_SHIFT;
    306	offset = offset & ~PAGE_MASK;
    307
    308	while (size) {
    309		size_t n;
    310		chunksize = PAGE_SIZE;
    311		if (offset)
    312			chunksize -= offset;
    313		if (chunksize > size)
    314			chunksize = size;
    315		n = copy_page_to_iter(&page[i], offset, chunksize, to);
    316		copied += n;
    317		if (n != chunksize)
    318			return copied;
    319		offset = 0;
    320		size -= chunksize;
    321		i++;
    322	}
    323	return copied;
    324}
    325
    326/*
    327 * Support for read() - Find the page attached to f_mapping and copy out the
     328 * data. It's *very* similar to generic_file_buffered_read(), but we can't use that
    329 * since it has PAGE_SIZE assumptions.
    330 */
    331static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
    332{
    333	struct file *file = iocb->ki_filp;
    334	struct hstate *h = hstate_file(file);
    335	struct address_space *mapping = file->f_mapping;
    336	struct inode *inode = mapping->host;
    337	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
    338	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
    339	unsigned long end_index;
    340	loff_t isize;
    341	ssize_t retval = 0;
    342
    343	while (iov_iter_count(to)) {
    344		struct page *page;
    345		size_t nr, copied;
    346
    347		/* nr is the maximum number of bytes to copy from this page */
    348		nr = huge_page_size(h);
    349		isize = i_size_read(inode);
    350		if (!isize)
    351			break;
    352		end_index = (isize - 1) >> huge_page_shift(h);
    353		if (index > end_index)
    354			break;
    355		if (index == end_index) {
    356			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
    357			if (nr <= offset)
    358				break;
    359		}
    360		nr = nr - offset;
    361
    362		/* Find the page */
    363		page = find_lock_page(mapping, index);
    364		if (unlikely(page == NULL)) {
    365			/*
    366			 * We have a HOLE, zero out the user-buffer for the
    367			 * length of the hole or request.
    368			 */
    369			copied = iov_iter_zero(nr, to);
    370		} else {
    371			unlock_page(page);
    372
    373			/*
    374			 * We have the page, copy it to user space buffer.
    375			 */
    376			copied = hugetlbfs_read_actor(page, offset, to, nr);
    377			put_page(page);
    378		}
    379		offset += copied;
    380		retval += copied;
    381		if (copied != nr && iov_iter_count(to)) {
    382			if (!retval)
    383				retval = -EFAULT;
    384			break;
    385		}
    386		index += offset >> huge_page_shift(h);
    387		offset &= ~huge_page_mask(h);
    388	}
    389	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
    390	return retval;
    391}
    392
    393static int hugetlbfs_write_begin(struct file *file,
    394			struct address_space *mapping,
    395			loff_t pos, unsigned len,
    396			struct page **pagep, void **fsdata)
    397{
    398	return -EINVAL;
    399}
    400
    401static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
    402			loff_t pos, unsigned len, unsigned copied,
    403			struct page *page, void *fsdata)
    404{
    405	BUG();
    406	return -EINVAL;
    407}
    408
    409static void remove_huge_page(struct page *page)
    410{
    411	ClearPageDirty(page);
    412	ClearPageUptodate(page);
    413	delete_from_page_cache(page);
    414}
    415
    416static void
    417hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
    418		      zap_flags_t zap_flags)
    419{
    420	struct vm_area_struct *vma;
    421
    422	/*
    423	 * end == 0 indicates that the entire range after start should be
    424	 * unmapped.  Note, end is exclusive, whereas the interval tree takes
    425	 * an inclusive "last".
    426	 */
    427	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
    428		unsigned long v_offset;
    429		unsigned long v_end;
    430
    431		/*
    432		 * Can the expression below overflow on 32-bit arches?
    433		 * No, because the interval tree returns us only those vmas
    434		 * which overlap the truncated area starting at pgoff,
     435		 * and no vma on a 32-bit arch can span beyond 4GB.
    436		 */
    437		if (vma->vm_pgoff < start)
    438			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
    439		else
    440			v_offset = 0;
    441
    442		if (!end)
    443			v_end = vma->vm_end;
    444		else {
    445			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
    446							+ vma->vm_start;
    447			if (v_end > vma->vm_end)
    448				v_end = vma->vm_end;
    449		}
    450
    451		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
    452				     NULL, zap_flags);
    453	}
    454}
    455
    456/*
    457 * remove_inode_hugepages handles two distinct cases: truncation and hole
    458 * punch.  There are subtle differences in operation for each case.
    459 *
    460 * truncation is indicated by end of range being LLONG_MAX
    461 *	In this case, we first scan the range and release found pages.
    462 *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
    463 *	maps and global counts.  Page faults can not race with truncation
    464 *	in this routine.  hugetlb_no_page() holds i_mmap_rwsem and prevents
    465 *	page faults in the truncated range by checking i_size.  i_size is
    466 *	modified while holding i_mmap_rwsem.
    467 * hole punch is indicated if end is not LLONG_MAX
    468 *	In the hole punch case we scan the range and release found pages.
    469 *	Only when releasing a page is the associated region/reserve map
    470 *	deleted.  The region/reserve map for ranges without associated
     471 *	pages is not modified.  Page faults can race with hole punch.
    472 *	This is indicated if we find a mapped page.
    473 * Note: If the passed end of range value is beyond the end of file, but
    474 * not LLONG_MAX this routine still performs a hole punch operation.
    475 */
    476static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
    477				   loff_t lend)
    478{
    479	struct hstate *h = hstate_inode(inode);
    480	struct address_space *mapping = &inode->i_data;
    481	const pgoff_t start = lstart >> huge_page_shift(h);
    482	const pgoff_t end = lend >> huge_page_shift(h);
    483	struct pagevec pvec;
    484	pgoff_t next, index;
    485	int i, freed = 0;
    486	bool truncate_op = (lend == LLONG_MAX);
    487
    488	pagevec_init(&pvec);
    489	next = start;
    490	while (next < end) {
    491		/*
    492		 * When no more pages are found, we are done.
    493		 */
    494		if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
    495			break;
    496
    497		for (i = 0; i < pagevec_count(&pvec); ++i) {
    498			struct page *page = pvec.pages[i];
    499			u32 hash = 0;
    500
    501			index = page->index;
    502			if (!truncate_op) {
    503				/*
    504				 * Only need to hold the fault mutex in the
    505				 * hole punch case.  This prevents races with
    506				 * page faults.  Races are not possible in the
    507				 * case of truncation.
    508				 */
    509				hash = hugetlb_fault_mutex_hash(mapping, index);
    510				mutex_lock(&hugetlb_fault_mutex_table[hash]);
    511			}
    512
    513			/*
    514			 * If page is mapped, it was faulted in after being
    515			 * unmapped in caller.  Unmap (again) now after taking
    516			 * the fault mutex.  The mutex will prevent faults
    517			 * until we finish removing the page.
    518			 *
    519			 * This race can only happen in the hole punch case.
    520			 * Getting here in a truncate operation is a bug.
    521			 */
    522			if (unlikely(page_mapped(page))) {
    523				BUG_ON(truncate_op);
    524
    525				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    526				i_mmap_lock_write(mapping);
    527				mutex_lock(&hugetlb_fault_mutex_table[hash]);
    528				hugetlb_vmdelete_list(&mapping->i_mmap,
    529					index * pages_per_huge_page(h),
    530					(index + 1) * pages_per_huge_page(h),
    531					ZAP_FLAG_DROP_MARKER);
    532				i_mmap_unlock_write(mapping);
    533			}
    534
    535			lock_page(page);
    536			/*
    537			 * We must free the huge page and remove from page
    538			 * cache (remove_huge_page) BEFORE removing the
    539			 * region/reserve map (hugetlb_unreserve_pages).  In
    540			 * rare out of memory conditions, removal of the
    541			 * region/reserve map could fail. Correspondingly,
     542			 * the subpool and global reserve usage counts may need
     543			 * to be adjusted.
    544			 */
    545			VM_BUG_ON(HPageRestoreReserve(page));
    546			remove_huge_page(page);
    547			freed++;
    548			if (!truncate_op) {
    549				if (unlikely(hugetlb_unreserve_pages(inode,
    550							index, index + 1, 1)))
    551					hugetlb_fix_reserve_counts(inode);
    552			}
    553
    554			unlock_page(page);
    555			if (!truncate_op)
    556				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    557		}
    558		huge_pagevec_release(&pvec);
    559		cond_resched();
    560	}
    561
    562	if (truncate_op)
    563		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
    564}
    565
    566static void hugetlbfs_evict_inode(struct inode *inode)
    567{
    568	struct resv_map *resv_map;
    569
    570	remove_inode_hugepages(inode, 0, LLONG_MAX);
    571
    572	/*
    573	 * Get the resv_map from the address space embedded in the inode.
    574	 * This is the address space which points to any resv_map allocated
    575	 * at inode creation time.  If this is a device special inode,
    576	 * i_mapping may not point to the original address space.
    577	 */
    578	resv_map = (struct resv_map *)(&inode->i_data)->private_data;
    579	/* Only regular and link inodes have associated reserve maps */
    580	if (resv_map)
    581		resv_map_release(&resv_map->refs);
    582	clear_inode(inode);
    583}
    584
    585static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
    586{
    587	pgoff_t pgoff;
    588	struct address_space *mapping = inode->i_mapping;
    589	struct hstate *h = hstate_inode(inode);
    590
    591	BUG_ON(offset & ~huge_page_mask(h));
    592	pgoff = offset >> PAGE_SHIFT;
    593
    594	i_mmap_lock_write(mapping);
    595	i_size_write(inode, offset);
    596	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
    597		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
    598				      ZAP_FLAG_DROP_MARKER);
    599	i_mmap_unlock_write(mapping);
    600	remove_inode_hugepages(inode, offset, LLONG_MAX);
    601}
    602
    603static void hugetlbfs_zero_partial_page(struct hstate *h,
    604					struct address_space *mapping,
    605					loff_t start,
    606					loff_t end)
    607{
    608	pgoff_t idx = start >> huge_page_shift(h);
    609	struct folio *folio;
    610
    611	folio = filemap_lock_folio(mapping, idx);
    612	if (!folio)
    613		return;
    614
    615	start = start & ~huge_page_mask(h);
    616	end = end & ~huge_page_mask(h);
    617	if (!end)
    618		end = huge_page_size(h);
    619
    620	folio_zero_segment(folio, (size_t)start, (size_t)end);
    621
    622	folio_unlock(folio);
    623	folio_put(folio);
    624}
    625
    626static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
    627{
    628	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
    629	struct address_space *mapping = inode->i_mapping;
    630	struct hstate *h = hstate_inode(inode);
    631	loff_t hpage_size = huge_page_size(h);
    632	loff_t hole_start, hole_end;
    633
    634	/*
    635	 * hole_start and hole_end indicate the full pages within the hole.
    636	 */
    637	hole_start = round_up(offset, hpage_size);
    638	hole_end = round_down(offset + len, hpage_size);
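	/*
	 * Worked example (assuming a 2MB hstate): punching offset = 1MB,
	 * len = 4MB gives hole_start = 2MB and hole_end = 4MB.  The code
	 * below zeroes [1MB, 2MB) in the first page, unmaps and removes the
	 * full page covering [2MB, 4MB), and zeroes [4MB, 5MB) in the last
	 * page.
	 */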
    639
    640	inode_lock(inode);
    641
    642	/* protected by i_rwsem */
    643	if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
    644		inode_unlock(inode);
    645		return -EPERM;
    646	}
    647
    648	i_mmap_lock_write(mapping);
    649
    650	/* If range starts before first full page, zero partial page. */
    651	if (offset < hole_start)
    652		hugetlbfs_zero_partial_page(h, mapping,
    653				offset, min(offset + len, hole_start));
    654
    655	/* Unmap users of full pages in the hole. */
    656	if (hole_end > hole_start) {
    657		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
    658			hugetlb_vmdelete_list(&mapping->i_mmap,
    659					      hole_start >> PAGE_SHIFT,
    660					      hole_end >> PAGE_SHIFT, 0);
    661	}
    662
    663	/* If range extends beyond last full page, zero partial page. */
    664	if ((offset + len) > hole_end && (offset + len) > hole_start)
    665		hugetlbfs_zero_partial_page(h, mapping,
    666				hole_end, offset + len);
    667
    668	i_mmap_unlock_write(mapping);
    669
    670	/* Remove full pages from the file. */
    671	if (hole_end > hole_start)
    672		remove_inode_hugepages(inode, hole_start, hole_end);
    673
    674	inode_unlock(inode);
    675
    676	return 0;
    677}
    678
    679static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
    680				loff_t len)
    681{
    682	struct inode *inode = file_inode(file);
    683	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
    684	struct address_space *mapping = inode->i_mapping;
    685	struct hstate *h = hstate_inode(inode);
    686	struct vm_area_struct pseudo_vma;
    687	struct mm_struct *mm = current->mm;
    688	loff_t hpage_size = huge_page_size(h);
    689	unsigned long hpage_shift = huge_page_shift(h);
    690	pgoff_t start, index, end;
    691	int error;
    692	u32 hash;
    693
    694	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    695		return -EOPNOTSUPP;
    696
    697	if (mode & FALLOC_FL_PUNCH_HOLE)
    698		return hugetlbfs_punch_hole(inode, offset, len);
    699
    700	/*
    701	 * Default preallocate case.
    702	 * For this range, start is rounded down and end is rounded up
    703	 * as well as being converted to page offsets.
    704	 */
    705	start = offset >> hpage_shift;
    706	end = (offset + len + hpage_size - 1) >> hpage_shift;
    707
    708	inode_lock(inode);
    709
    710	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
    711	error = inode_newsize_ok(inode, offset + len);
    712	if (error)
    713		goto out;
    714
    715	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
    716		error = -EPERM;
    717		goto out;
    718	}
    719
    720	/*
    721	 * Initialize a pseudo vma as this is required by the huge page
    722	 * allocation routines.  If NUMA is configured, use page index
    723	 * as input to create an allocation policy.
    724	 */
    725	vma_init(&pseudo_vma, mm);
    726	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
    727	pseudo_vma.vm_file = file;
    728
    729	for (index = start; index < end; index++) {
    730		/*
    731		 * This is supposed to be the vaddr where the page is being
    732		 * faulted in, but we have no vaddr here.
    733		 */
    734		struct page *page;
    735		unsigned long addr;
    736
    737		cond_resched();
    738
    739		/*
    740		 * fallocate(2) manpage permits EINTR; we may have been
    741		 * interrupted because we are using up too much memory.
    742		 */
    743		if (signal_pending(current)) {
    744			error = -EINTR;
    745			break;
    746		}
    747
    748		/* Set numa allocation policy based on index */
    749		hugetlb_set_vma_policy(&pseudo_vma, inode, index);
    750
    751		/* addr is the offset within the file (zero based) */
    752		addr = index * hpage_size;
    753
    754		/*
    755		 * fault mutex taken here, protects against fault path
    756		 * and hole punch.  inode_lock previously taken protects
    757		 * against truncation.
    758		 */
    759		hash = hugetlb_fault_mutex_hash(mapping, index);
    760		mutex_lock(&hugetlb_fault_mutex_table[hash]);
    761
    762		/* See if already present in mapping to avoid alloc/free */
    763		page = find_get_page(mapping, index);
    764		if (page) {
    765			put_page(page);
    766			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    767			hugetlb_drop_vma_policy(&pseudo_vma);
    768			continue;
    769		}
    770
    771		/*
    772		 * Allocate page without setting the avoid_reserve argument.
    773		 * There certainly are no reserves associated with the
    774		 * pseudo_vma.  However, there could be shared mappings with
    775		 * reserves for the file at the inode level.  If we fallocate
    776		 * pages in these areas, we need to consume the reserves
    777		 * to keep reservation accounting consistent.
    778		 */
    779		page = alloc_huge_page(&pseudo_vma, addr, 0);
    780		hugetlb_drop_vma_policy(&pseudo_vma);
    781		if (IS_ERR(page)) {
    782			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    783			error = PTR_ERR(page);
    784			goto out;
    785		}
    786		clear_huge_page(page, addr, pages_per_huge_page(h));
    787		__SetPageUptodate(page);
    788		error = huge_add_to_page_cache(page, mapping, index);
    789		if (unlikely(error)) {
    790			restore_reserve_on_error(h, &pseudo_vma, addr, page);
    791			put_page(page);
    792			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    793			goto out;
    794		}
    795
    796		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    797
    798		SetHPageMigratable(page);
    799		/*
     800		 * unlock_page() because the page was locked when added to the
     801		 * page cache; put_page() drops the reference from alloc_huge_page().
    802		 */
    803		unlock_page(page);
    804		put_page(page);
    805	}
    806
    807	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
    808		i_size_write(inode, offset + len);
    809	inode->i_ctime = current_time(inode);
    810out:
    811	inode_unlock(inode);
    812	return error;
    813}
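/*
 * Illustrative userspace sketch (path and sizes are arbitrary, not taken
 * from this file): preallocating and then hole-punching a file on a mounted
 * hugetlbfs instance exercises the two fallocate paths above:
 *
 *	int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);
 *
 *	fallocate(fd, 0, 0, 4UL << 20);
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  0, 2UL << 20);
 */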
    814
    815static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
    816			     struct dentry *dentry, struct iattr *attr)
    817{
    818	struct inode *inode = d_inode(dentry);
    819	struct hstate *h = hstate_inode(inode);
    820	int error;
    821	unsigned int ia_valid = attr->ia_valid;
    822	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
    823
    824	error = setattr_prepare(&init_user_ns, dentry, attr);
    825	if (error)
    826		return error;
    827
    828	if (ia_valid & ATTR_SIZE) {
    829		loff_t oldsize = inode->i_size;
    830		loff_t newsize = attr->ia_size;
    831
    832		if (newsize & ~huge_page_mask(h))
    833			return -EINVAL;
    834		/* protected by i_rwsem */
    835		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
    836		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
    837			return -EPERM;
    838		hugetlb_vmtruncate(inode, newsize);
    839	}
    840
    841	setattr_copy(&init_user_ns, inode, attr);
    842	mark_inode_dirty(inode);
    843	return 0;
    844}
    845
    846static struct inode *hugetlbfs_get_root(struct super_block *sb,
    847					struct hugetlbfs_fs_context *ctx)
    848{
    849	struct inode *inode;
    850
    851	inode = new_inode(sb);
    852	if (inode) {
    853		inode->i_ino = get_next_ino();
    854		inode->i_mode = S_IFDIR | ctx->mode;
    855		inode->i_uid = ctx->uid;
    856		inode->i_gid = ctx->gid;
    857		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
    858		inode->i_op = &hugetlbfs_dir_inode_operations;
    859		inode->i_fop = &simple_dir_operations;
    860		/* directory inodes start off with i_nlink == 2 (for "." entry) */
    861		inc_nlink(inode);
    862		lockdep_annotate_inode_mutex_key(inode);
    863	}
    864	return inode;
    865}
    866
    867/*
    868 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
    869 * be taken from reclaim -- unlike regular filesystems. This needs an
    870 * annotation because huge_pmd_share() does an allocation under hugetlb's
    871 * i_mmap_rwsem.
    872 */
    873static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
    874
    875static struct inode *hugetlbfs_get_inode(struct super_block *sb,
    876					struct inode *dir,
    877					umode_t mode, dev_t dev)
    878{
    879	struct inode *inode;
    880	struct resv_map *resv_map = NULL;
    881
    882	/*
    883	 * Reserve maps are only needed for inodes that can have associated
    884	 * page allocations.
    885	 */
    886	if (S_ISREG(mode) || S_ISLNK(mode)) {
    887		resv_map = resv_map_alloc();
    888		if (!resv_map)
    889			return NULL;
    890	}
    891
    892	inode = new_inode(sb);
    893	if (inode) {
    894		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
    895
    896		inode->i_ino = get_next_ino();
    897		inode_init_owner(&init_user_ns, inode, dir, mode);
    898		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
    899				&hugetlbfs_i_mmap_rwsem_key);
    900		inode->i_mapping->a_ops = &hugetlbfs_aops;
    901		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
    902		inode->i_mapping->private_data = resv_map;
    903		info->seals = F_SEAL_SEAL;
    904		switch (mode & S_IFMT) {
    905		default:
    906			init_special_inode(inode, mode, dev);
    907			break;
    908		case S_IFREG:
    909			inode->i_op = &hugetlbfs_inode_operations;
    910			inode->i_fop = &hugetlbfs_file_operations;
    911			break;
    912		case S_IFDIR:
    913			inode->i_op = &hugetlbfs_dir_inode_operations;
    914			inode->i_fop = &simple_dir_operations;
    915
    916			/* directory inodes start off with i_nlink == 2 (for "." entry) */
    917			inc_nlink(inode);
    918			break;
    919		case S_IFLNK:
    920			inode->i_op = &page_symlink_inode_operations;
    921			inode_nohighmem(inode);
    922			break;
    923		}
    924		lockdep_annotate_inode_mutex_key(inode);
    925	} else {
    926		if (resv_map)
    927			kref_put(&resv_map->refs, resv_map_release);
    928	}
    929
    930	return inode;
    931}
    932
    933/*
     934 * File creation. Allocate an inode, and we're done.
    935 */
    936static int do_hugetlbfs_mknod(struct inode *dir,
    937			struct dentry *dentry,
    938			umode_t mode,
    939			dev_t dev,
    940			bool tmpfile)
    941{
    942	struct inode *inode;
    943	int error = -ENOSPC;
    944
    945	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
    946	if (inode) {
    947		dir->i_ctime = dir->i_mtime = current_time(dir);
    948		if (tmpfile) {
    949			d_tmpfile(dentry, inode);
    950		} else {
    951			d_instantiate(dentry, inode);
    952			dget(dentry);/* Extra count - pin the dentry in core */
    953		}
    954		error = 0;
    955	}
    956	return error;
    957}
    958
    959static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
    960			   struct dentry *dentry, umode_t mode, dev_t dev)
    961{
    962	return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
    963}
    964
    965static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
    966			   struct dentry *dentry, umode_t mode)
    967{
    968	int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry,
    969				     mode | S_IFDIR, 0);
    970	if (!retval)
    971		inc_nlink(dir);
    972	return retval;
    973}
    974
    975static int hugetlbfs_create(struct user_namespace *mnt_userns,
    976			    struct inode *dir, struct dentry *dentry,
    977			    umode_t mode, bool excl)
    978{
    979	return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
    980}
    981
    982static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
    983			     struct inode *dir, struct dentry *dentry,
    984			     umode_t mode)
    985{
    986	return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
    987}
    988
    989static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
    990			     struct inode *dir, struct dentry *dentry,
    991			     const char *symname)
    992{
    993	struct inode *inode;
    994	int error = -ENOSPC;
    995
    996	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
    997	if (inode) {
    998		int l = strlen(symname)+1;
    999		error = page_symlink(inode, symname, l);
   1000		if (!error) {
   1001			d_instantiate(dentry, inode);
   1002			dget(dentry);
   1003		} else
   1004			iput(inode);
   1005	}
   1006	dir->i_ctime = dir->i_mtime = current_time(dir);
   1007
   1008	return error;
   1009}
   1010
   1011static int hugetlbfs_migrate_page(struct address_space *mapping,
   1012				struct page *newpage, struct page *page,
   1013				enum migrate_mode mode)
   1014{
   1015	int rc;
   1016
   1017	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
   1018	if (rc != MIGRATEPAGE_SUCCESS)
   1019		return rc;
   1020
   1021	if (hugetlb_page_subpool(page)) {
   1022		hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page));
   1023		hugetlb_set_page_subpool(page, NULL);
   1024	}
   1025
   1026	if (mode != MIGRATE_SYNC_NO_COPY)
   1027		migrate_page_copy(newpage, page);
   1028	else
   1029		migrate_page_states(newpage, page);
   1030
   1031	return MIGRATEPAGE_SUCCESS;
   1032}
   1033
   1034static int hugetlbfs_error_remove_page(struct address_space *mapping,
   1035				struct page *page)
   1036{
   1037	struct inode *inode = mapping->host;
   1038	pgoff_t index = page->index;
   1039
   1040	remove_huge_page(page);
   1041	if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
   1042		hugetlb_fix_reserve_counts(inode);
   1043
   1044	return 0;
   1045}
   1046
   1047/*
   1048 * Display the mount options in /proc/mounts.
   1049 */
   1050static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root)
   1051{
   1052	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
   1053	struct hugepage_subpool *spool = sbinfo->spool;
   1054	unsigned long hpage_size = huge_page_size(sbinfo->hstate);
   1055	unsigned hpage_shift = huge_page_shift(sbinfo->hstate);
   1056	char mod;
   1057
   1058	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
   1059		seq_printf(m, ",uid=%u",
   1060			   from_kuid_munged(&init_user_ns, sbinfo->uid));
   1061	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
   1062		seq_printf(m, ",gid=%u",
   1063			   from_kgid_munged(&init_user_ns, sbinfo->gid));
   1064	if (sbinfo->mode != 0755)
   1065		seq_printf(m, ",mode=%o", sbinfo->mode);
   1066	if (sbinfo->max_inodes != -1)
   1067		seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);
   1068
   1069	hpage_size /= 1024;
   1070	mod = 'K';
   1071	if (hpage_size >= 1024) {
   1072		hpage_size /= 1024;
   1073		mod = 'M';
   1074	}
   1075	seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
   1076	if (spool) {
   1077		if (spool->max_hpages != -1)
   1078			seq_printf(m, ",size=%llu",
   1079				   (unsigned long long)spool->max_hpages << hpage_shift);
   1080		if (spool->min_hpages != -1)
   1081			seq_printf(m, ",min_size=%llu",
   1082				   (unsigned long long)spool->min_hpages << hpage_shift);
   1083	}
   1084	return 0;
   1085}
   1086
   1087static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
   1088{
   1089	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
   1090	struct hstate *h = hstate_inode(d_inode(dentry));
   1091
   1092	buf->f_type = HUGETLBFS_MAGIC;
   1093	buf->f_bsize = huge_page_size(h);
   1094	if (sbinfo) {
   1095		spin_lock(&sbinfo->stat_lock);
   1096		/* If no limits set, just report 0 for max/free/used
   1097		 * blocks, like simple_statfs() */
   1098		if (sbinfo->spool) {
   1099			long free_pages;
   1100
   1101			spin_lock_irq(&sbinfo->spool->lock);
   1102			buf->f_blocks = sbinfo->spool->max_hpages;
   1103			free_pages = sbinfo->spool->max_hpages
   1104				- sbinfo->spool->used_hpages;
   1105			buf->f_bavail = buf->f_bfree = free_pages;
   1106			spin_unlock_irq(&sbinfo->spool->lock);
   1107			buf->f_files = sbinfo->max_inodes;
   1108			buf->f_ffree = sbinfo->free_inodes;
   1109		}
   1110		spin_unlock(&sbinfo->stat_lock);
   1111	}
   1112	buf->f_namelen = NAME_MAX;
   1113	return 0;
   1114}
   1115
   1116static void hugetlbfs_put_super(struct super_block *sb)
   1117{
   1118	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
   1119
   1120	if (sbi) {
   1121		sb->s_fs_info = NULL;
   1122
   1123		if (sbi->spool)
   1124			hugepage_put_subpool(sbi->spool);
   1125
   1126		kfree(sbi);
   1127	}
   1128}
   1129
   1130static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
   1131{
   1132	if (sbinfo->free_inodes >= 0) {
   1133		spin_lock(&sbinfo->stat_lock);
   1134		if (unlikely(!sbinfo->free_inodes)) {
   1135			spin_unlock(&sbinfo->stat_lock);
   1136			return 0;
   1137		}
   1138		sbinfo->free_inodes--;
   1139		spin_unlock(&sbinfo->stat_lock);
   1140	}
   1141
   1142	return 1;
   1143}
   1144
   1145static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
   1146{
   1147	if (sbinfo->free_inodes >= 0) {
   1148		spin_lock(&sbinfo->stat_lock);
   1149		sbinfo->free_inodes++;
   1150		spin_unlock(&sbinfo->stat_lock);
   1151	}
   1152}
   1153
   1154
   1155static struct kmem_cache *hugetlbfs_inode_cachep;
   1156
   1157static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
   1158{
   1159	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
   1160	struct hugetlbfs_inode_info *p;
   1161
   1162	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
   1163		return NULL;
   1164	p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
   1165	if (unlikely(!p)) {
   1166		hugetlbfs_inc_free_inodes(sbinfo);
   1167		return NULL;
   1168	}
   1169
   1170	/*
   1171	 * Any time after allocation, hugetlbfs_destroy_inode can be called
   1172	 * for the inode.  mpol_free_shared_policy is unconditionally called
   1173	 * as part of hugetlbfs_destroy_inode.  So, initialize policy here
   1174	 * in case of a quick call to destroy.
   1175	 *
   1176	 * Note that the policy is initialized even if we are creating a
   1177	 * private inode.  This simplifies hugetlbfs_destroy_inode.
   1178	 */
   1179	mpol_shared_policy_init(&p->policy, NULL);
   1180
   1181	return &p->vfs_inode;
   1182}
   1183
   1184static void hugetlbfs_free_inode(struct inode *inode)
   1185{
   1186	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
   1187}
   1188
   1189static void hugetlbfs_destroy_inode(struct inode *inode)
   1190{
   1191	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
   1192	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
   1193}
   1194
   1195static const struct address_space_operations hugetlbfs_aops = {
   1196	.write_begin	= hugetlbfs_write_begin,
   1197	.write_end	= hugetlbfs_write_end,
   1198	.dirty_folio	= noop_dirty_folio,
   1199	.migratepage    = hugetlbfs_migrate_page,
   1200	.error_remove_page	= hugetlbfs_error_remove_page,
   1201};
   1202
   1203
   1204static void init_once(void *foo)
   1205{
   1206	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
   1207
   1208	inode_init_once(&ei->vfs_inode);
   1209}
   1210
   1211const struct file_operations hugetlbfs_file_operations = {
   1212	.read_iter		= hugetlbfs_read_iter,
   1213	.mmap			= hugetlbfs_file_mmap,
   1214	.fsync			= noop_fsync,
   1215	.get_unmapped_area	= hugetlb_get_unmapped_area,
   1216	.llseek			= default_llseek,
   1217	.fallocate		= hugetlbfs_fallocate,
   1218};
   1219
   1220static const struct inode_operations hugetlbfs_dir_inode_operations = {
   1221	.create		= hugetlbfs_create,
   1222	.lookup		= simple_lookup,
   1223	.link		= simple_link,
   1224	.unlink		= simple_unlink,
   1225	.symlink	= hugetlbfs_symlink,
   1226	.mkdir		= hugetlbfs_mkdir,
   1227	.rmdir		= simple_rmdir,
   1228	.mknod		= hugetlbfs_mknod,
   1229	.rename		= simple_rename,
   1230	.setattr	= hugetlbfs_setattr,
   1231	.tmpfile	= hugetlbfs_tmpfile,
   1232};
   1233
   1234static const struct inode_operations hugetlbfs_inode_operations = {
   1235	.setattr	= hugetlbfs_setattr,
   1236};
   1237
   1238static const struct super_operations hugetlbfs_ops = {
   1239	.alloc_inode    = hugetlbfs_alloc_inode,
   1240	.free_inode     = hugetlbfs_free_inode,
   1241	.destroy_inode  = hugetlbfs_destroy_inode,
   1242	.evict_inode	= hugetlbfs_evict_inode,
   1243	.statfs		= hugetlbfs_statfs,
   1244	.put_super	= hugetlbfs_put_super,
   1245	.show_options	= hugetlbfs_show_options,
   1246};
   1247
   1248/*
   1249 * Convert size option passed from command line to number of huge pages
   1250 * in the pool specified by hstate.  Size option could be in bytes
   1251 * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
   1252 */
   1253static long
   1254hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
   1255			 enum hugetlbfs_size_type val_type)
   1256{
   1257	if (val_type == NO_SIZE)
   1258		return -1;
   1259
   1260	if (val_type == SIZE_PERCENT) {
   1261		size_opt <<= huge_page_shift(h);
   1262		size_opt *= h->max_huge_pages;
   1263		do_div(size_opt, 100);
   1264	}
   1265
   1266	size_opt >>= huge_page_shift(h);
   1267	return size_opt;
   1268}
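/*
 * Worked example (assuming a 2MB hstate, huge_page_shift(h) == 21, and
 * h->max_huge_pages == 512): "size=1G" is SIZE_STD and converts to
 * 1G >> 21 = 512 huge pages; "size=50%" is SIZE_PERCENT and converts to
 * ((50 << 21) * 512 / 100) >> 21 = 256 huge pages.
 */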
   1269
   1270/*
   1271 * Parse one mount parameter.
   1272 */
   1273static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
   1274{
   1275	struct hugetlbfs_fs_context *ctx = fc->fs_private;
   1276	struct fs_parse_result result;
   1277	char *rest;
   1278	unsigned long ps;
   1279	int opt;
   1280
   1281	opt = fs_parse(fc, hugetlb_fs_parameters, param, &result);
   1282	if (opt < 0)
   1283		return opt;
   1284
   1285	switch (opt) {
   1286	case Opt_uid:
   1287		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
   1288		if (!uid_valid(ctx->uid))
   1289			goto bad_val;
   1290		return 0;
   1291
   1292	case Opt_gid:
   1293		ctx->gid = make_kgid(current_user_ns(), result.uint_32);
   1294		if (!gid_valid(ctx->gid))
   1295			goto bad_val;
   1296		return 0;
   1297
   1298	case Opt_mode:
   1299		ctx->mode = result.uint_32 & 01777U;
   1300		return 0;
   1301
   1302	case Opt_size:
   1303		/* memparse() will accept a K/M/G without a digit */
   1304		if (!isdigit(param->string[0]))
   1305			goto bad_val;
   1306		ctx->max_size_opt = memparse(param->string, &rest);
   1307		ctx->max_val_type = SIZE_STD;
   1308		if (*rest == '%')
   1309			ctx->max_val_type = SIZE_PERCENT;
   1310		return 0;
   1311
   1312	case Opt_nr_inodes:
   1313		/* memparse() will accept a K/M/G without a digit */
   1314		if (!isdigit(param->string[0]))
   1315			goto bad_val;
   1316		ctx->nr_inodes = memparse(param->string, &rest);
   1317		return 0;
   1318
   1319	case Opt_pagesize:
   1320		ps = memparse(param->string, &rest);
   1321		ctx->hstate = size_to_hstate(ps);
   1322		if (!ctx->hstate) {
   1323			pr_err("Unsupported page size %lu MB\n", ps >> 20);
   1324			return -EINVAL;
   1325		}
   1326		return 0;
   1327
   1328	case Opt_min_size:
   1329		/* memparse() will accept a K/M/G without a digit */
   1330		if (!isdigit(param->string[0]))
   1331			goto bad_val;
   1332		ctx->min_size_opt = memparse(param->string, &rest);
   1333		ctx->min_val_type = SIZE_STD;
   1334		if (*rest == '%')
   1335			ctx->min_val_type = SIZE_PERCENT;
   1336		return 0;
   1337
   1338	default:
   1339		return -EINVAL;
   1340	}
   1341
   1342bad_val:
   1343	return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
   1344		      param->string, param->key);
   1345}
   1346
   1347/*
   1348 * Validate the parsed options.
   1349 */
   1350static int hugetlbfs_validate(struct fs_context *fc)
   1351{
   1352	struct hugetlbfs_fs_context *ctx = fc->fs_private;
   1353
   1354	/*
   1355	 * Use huge page pool size (in hstate) to convert the size
   1356	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
   1357	 */
   1358	ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
   1359						   ctx->max_size_opt,
   1360						   ctx->max_val_type);
   1361	ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
   1362						   ctx->min_size_opt,
   1363						   ctx->min_val_type);
   1364
   1365	/*
   1366	 * If max_size was specified, then min_size must be smaller
   1367	 */
   1368	if (ctx->max_val_type > NO_SIZE &&
   1369	    ctx->min_hpages > ctx->max_hpages) {
   1370		pr_err("Minimum size can not be greater than maximum size\n");
   1371		return -EINVAL;
   1372	}
   1373
   1374	return 0;
   1375}
   1376
   1377static int
   1378hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
   1379{
   1380	struct hugetlbfs_fs_context *ctx = fc->fs_private;
   1381	struct hugetlbfs_sb_info *sbinfo;
   1382
   1383	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
   1384	if (!sbinfo)
   1385		return -ENOMEM;
   1386	sb->s_fs_info = sbinfo;
   1387	spin_lock_init(&sbinfo->stat_lock);
   1388	sbinfo->hstate		= ctx->hstate;
   1389	sbinfo->max_inodes	= ctx->nr_inodes;
   1390	sbinfo->free_inodes	= ctx->nr_inodes;
   1391	sbinfo->spool		= NULL;
   1392	sbinfo->uid		= ctx->uid;
   1393	sbinfo->gid		= ctx->gid;
   1394	sbinfo->mode		= ctx->mode;
   1395
   1396	/*
   1397	 * Allocate and initialize subpool if maximum or minimum size is
   1398	 * specified.  Any needed reservations (for minimum size) are
   1399	 * taken when the subpool is created.
   1400	 */
   1401	if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
   1402		sbinfo->spool = hugepage_new_subpool(ctx->hstate,
   1403						     ctx->max_hpages,
   1404						     ctx->min_hpages);
   1405		if (!sbinfo->spool)
   1406			goto out_free;
   1407	}
   1408	sb->s_maxbytes = MAX_LFS_FILESIZE;
   1409	sb->s_blocksize = huge_page_size(ctx->hstate);
   1410	sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
   1411	sb->s_magic = HUGETLBFS_MAGIC;
   1412	sb->s_op = &hugetlbfs_ops;
   1413	sb->s_time_gran = 1;
   1414
   1415	/*
   1416	 * Due to the special and limited functionality of hugetlbfs, it does
   1417	 * not work well as a stacking filesystem.
   1418	 */
   1419	sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
   1420	sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
   1421	if (!sb->s_root)
   1422		goto out_free;
   1423	return 0;
   1424out_free:
   1425	kfree(sbinfo->spool);
   1426	kfree(sbinfo);
   1427	return -ENOMEM;
   1428}
   1429
   1430static int hugetlbfs_get_tree(struct fs_context *fc)
   1431{
   1432	int err = hugetlbfs_validate(fc);
   1433	if (err)
   1434		return err;
   1435	return get_tree_nodev(fc, hugetlbfs_fill_super);
   1436}
   1437
   1438static void hugetlbfs_fs_context_free(struct fs_context *fc)
   1439{
   1440	kfree(fc->fs_private);
   1441}
   1442
   1443static const struct fs_context_operations hugetlbfs_fs_context_ops = {
   1444	.free		= hugetlbfs_fs_context_free,
   1445	.parse_param	= hugetlbfs_parse_param,
   1446	.get_tree	= hugetlbfs_get_tree,
   1447};
   1448
   1449static int hugetlbfs_init_fs_context(struct fs_context *fc)
   1450{
   1451	struct hugetlbfs_fs_context *ctx;
   1452
   1453	ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
   1454	if (!ctx)
   1455		return -ENOMEM;
   1456
   1457	ctx->max_hpages	= -1; /* No limit on size by default */
   1458	ctx->nr_inodes	= -1; /* No limit on number of inodes by default */
   1459	ctx->uid	= current_fsuid();
   1460	ctx->gid	= current_fsgid();
   1461	ctx->mode	= 0755;
   1462	ctx->hstate	= &default_hstate;
   1463	ctx->min_hpages	= -1; /* No default minimum size */
   1464	ctx->max_val_type = NO_SIZE;
   1465	ctx->min_val_type = NO_SIZE;
   1466	fc->fs_private = ctx;
   1467	fc->ops	= &hugetlbfs_fs_context_ops;
   1468	return 0;
   1469}
   1470
   1471static struct file_system_type hugetlbfs_fs_type = {
   1472	.name			= "hugetlbfs",
   1473	.init_fs_context	= hugetlbfs_init_fs_context,
   1474	.parameters		= hugetlb_fs_parameters,
   1475	.kill_sb		= kill_litter_super,
   1476};
   1477
   1478static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
   1479
   1480static int can_do_hugetlb_shm(void)
   1481{
   1482	kgid_t shm_group;
   1483	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
   1484	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
   1485}
   1486
   1487static int get_hstate_idx(int page_size_log)
   1488{
   1489	struct hstate *h = hstate_sizelog(page_size_log);
   1490
   1491	if (!h)
   1492		return -1;
   1493	return hstate_index(h);
   1494}
   1495
   1496/*
   1497 * Note that size should be aligned to the proper hugepage size on the caller
   1498 * side; otherwise hugetlb_reserve_pages reserves one fewer huge page than intended.
   1499 */
   1500struct file *hugetlb_file_setup(const char *name, size_t size,
   1501				vm_flags_t acctflag, int creat_flags,
   1502				int page_size_log)
   1503{
   1504	struct inode *inode;
   1505	struct vfsmount *mnt;
   1506	int hstate_idx;
   1507	struct file *file;
   1508
   1509	hstate_idx = get_hstate_idx(page_size_log);
   1510	if (hstate_idx < 0)
   1511		return ERR_PTR(-ENODEV);
   1512
   1513	mnt = hugetlbfs_vfsmount[hstate_idx];
   1514	if (!mnt)
   1515		return ERR_PTR(-ENOENT);
   1516
   1517	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
   1518		struct ucounts *ucounts = current_ucounts();
   1519
   1520		if (user_shm_lock(size, ucounts)) {
   1521			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
   1522				current->comm, current->pid);
   1523			user_shm_unlock(size, ucounts);
   1524		}
   1525		return ERR_PTR(-EPERM);
   1526	}
   1527
   1528	file = ERR_PTR(-ENOSPC);
   1529	inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
   1530	if (!inode)
   1531		goto out;
   1532	if (creat_flags == HUGETLB_SHMFS_INODE)
   1533		inode->i_flags |= S_PRIVATE;
   1534
   1535	inode->i_size = size;
   1536	clear_nlink(inode);
   1537
   1538	if (!hugetlb_reserve_pages(inode, 0,
   1539			size >> huge_page_shift(hstate_inode(inode)), NULL,
   1540			acctflag))
   1541		file = ERR_PTR(-ENOMEM);
   1542	else
   1543		file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
   1544					&hugetlbfs_file_operations);
   1545	if (!IS_ERR(file))
   1546		return file;
   1547
   1548	iput(inode);
   1549out:
   1550	return file;
   1551}
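/*
 * Illustrative only: one userspace path that ends up in hugetlb_file_setup()
 * is System V shared memory with SHM_HUGETLB (the size is arbitrary here and
 * should be a multiple of the huge page size, per the comment above):
 *
 *	int id = shmget(IPC_PRIVATE, 8UL << 20,
 *			IPC_CREAT | SHM_HUGETLB | 0600);
 *	void *p = shmat(id, NULL, 0);
 */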
   1552
   1553static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
   1554{
   1555	struct fs_context *fc;
   1556	struct vfsmount *mnt;
   1557
   1558	fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
   1559	if (IS_ERR(fc)) {
   1560		mnt = ERR_CAST(fc);
   1561	} else {
   1562		struct hugetlbfs_fs_context *ctx = fc->fs_private;
   1563		ctx->hstate = h;
   1564		mnt = fc_mount(fc);
   1565		put_fs_context(fc);
   1566	}
   1567	if (IS_ERR(mnt))
   1568		pr_err("Cannot mount internal hugetlbfs for page size %luK",
   1569		       huge_page_size(h) >> 10);
   1570	return mnt;
   1571}
   1572
   1573static int __init init_hugetlbfs_fs(void)
   1574{
   1575	struct vfsmount *mnt;
   1576	struct hstate *h;
   1577	int error;
   1578	int i;
   1579
   1580	if (!hugepages_supported()) {
   1581		pr_info("disabling because there are no supported hugepage sizes\n");
   1582		return -ENOTSUPP;
   1583	}
   1584
   1585	error = -ENOMEM;
   1586	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
   1587					sizeof(struct hugetlbfs_inode_info),
   1588					0, SLAB_ACCOUNT, init_once);
   1589	if (hugetlbfs_inode_cachep == NULL)
   1590		goto out;
   1591
   1592	error = register_filesystem(&hugetlbfs_fs_type);
   1593	if (error)
   1594		goto out_free;
   1595
   1596	/* default hstate mount is required */
   1597	mnt = mount_one_hugetlbfs(&default_hstate);
   1598	if (IS_ERR(mnt)) {
   1599		error = PTR_ERR(mnt);
   1600		goto out_unreg;
   1601	}
   1602	hugetlbfs_vfsmount[default_hstate_idx] = mnt;
   1603
   1604	/* other hstates are optional */
   1605	i = 0;
   1606	for_each_hstate(h) {
   1607		if (i == default_hstate_idx) {
   1608			i++;
   1609			continue;
   1610		}
   1611
   1612		mnt = mount_one_hugetlbfs(h);
   1613		if (IS_ERR(mnt))
   1614			hugetlbfs_vfsmount[i] = NULL;
   1615		else
   1616			hugetlbfs_vfsmount[i] = mnt;
   1617		i++;
   1618	}
   1619
   1620	return 0;
   1621
   1622 out_unreg:
   1623	(void)unregister_filesystem(&hugetlbfs_fs_type);
   1624 out_free:
   1625	kmem_cache_destroy(hugetlbfs_inode_cachep);
   1626 out:
   1627	return error;
   1628}
   1629fs_initcall(init_hugetlbfs_fs)