cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

util.c (31269B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2#include <linux/mm.h>
      3#include <linux/slab.h>
      4#include <linux/string.h>
      5#include <linux/compiler.h>
      6#include <linux/export.h>
      7#include <linux/err.h>
      8#include <linux/sched.h>
      9#include <linux/sched/mm.h>
     10#include <linux/sched/signal.h>
     11#include <linux/sched/task_stack.h>
     12#include <linux/security.h>
     13#include <linux/swap.h>
     14#include <linux/swapops.h>
     15#include <linux/mman.h>
     16#include <linux/hugetlb.h>
     17#include <linux/vmalloc.h>
     18#include <linux/userfaultfd_k.h>
     19#include <linux/elf.h>
     20#include <linux/elf-randomize.h>
     21#include <linux/personality.h>
     22#include <linux/random.h>
     23#include <linux/processor.h>
     24#include <linux/sizes.h>
     25#include <linux/compat.h>
     26
     27#include <linux/uaccess.h>
     28
     29#include "internal.h"
     30#include "swap.h"
     31
     32/**
     33 * kfree_const - conditionally free memory
     34 * @x: pointer to the memory
     35 *
      36 * This function calls kfree() only if @x is not in the .rodata section.
     37 */
     38void kfree_const(const void *x)
     39{
     40	if (!is_kernel_rodata((unsigned long)x))
     41		kfree(x);
     42}
     43EXPORT_SYMBOL(kfree_const);
     44
     45/**
     46 * kstrdup - allocate space for and copy an existing string
     47 * @s: the string to duplicate
     48 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
     49 *
     50 * Return: newly allocated copy of @s or %NULL in case of error
     51 */
     52char *kstrdup(const char *s, gfp_t gfp)
     53{
     54	size_t len;
     55	char *buf;
     56
     57	if (!s)
     58		return NULL;
     59
     60	len = strlen(s) + 1;
     61	buf = kmalloc_track_caller(len, gfp);
     62	if (buf)
     63		memcpy(buf, s, len);
     64	return buf;
     65}
     66EXPORT_SYMBOL(kstrdup);
     67
     68/**
     69 * kstrdup_const - conditionally duplicate an existing const string
     70 * @s: the string to duplicate
     71 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
     72 *
     73 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
     74 * must not be passed to krealloc().
     75 *
      76 * Return: source string if it is in the .rodata section, otherwise a
      77 * newly allocated copy obtained via kstrdup().
     78 */
     79const char *kstrdup_const(const char *s, gfp_t gfp)
     80{
     81	if (is_kernel_rodata((unsigned long)s))
     82		return s;
     83
     84	return kstrdup(s, gfp);
     85}
     86EXPORT_SYMBOL(kstrdup_const);
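/*
 * Illustrative sketch (added annotation, not in the original file): how a
 * caller is expected to pair kstrdup_const() with kfree_const().  The helper
 * names below are hypothetical.
 */
static __maybe_unused const char *example_dup_name(const char *name)
{
	/* Returns @name itself if it lives in .rodata, a kmalloc'ed copy otherwise. */
	return kstrdup_const(name, GFP_KERNEL);
}

static __maybe_unused void example_put_name(const char *name)
{
	/* Correct for both cases above; never pass such a pointer to krealloc(). */
	kfree_const(name);
}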
     87
     88/**
     89 * kstrndup - allocate space for and copy an existing string
     90 * @s: the string to duplicate
     91 * @max: read at most @max chars from @s
     92 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
     93 *
     94 * Note: Use kmemdup_nul() instead if the size is known exactly.
     95 *
     96 * Return: newly allocated copy of @s or %NULL in case of error
     97 */
     98char *kstrndup(const char *s, size_t max, gfp_t gfp)
     99{
    100	size_t len;
    101	char *buf;
    102
    103	if (!s)
    104		return NULL;
    105
    106	len = strnlen(s, max);
    107	buf = kmalloc_track_caller(len+1, gfp);
    108	if (buf) {
    109		memcpy(buf, s, len);
    110		buf[len] = '\0';
    111	}
    112	return buf;
    113}
    114EXPORT_SYMBOL(kstrndup);
    115
    116/**
    117 * kmemdup - duplicate region of memory
    118 *
    119 * @src: memory region to duplicate
    120 * @len: memory region length
    121 * @gfp: GFP mask to use
    122 *
    123 * Return: newly allocated copy of @src or %NULL in case of error
    124 */
    125void *kmemdup(const void *src, size_t len, gfp_t gfp)
    126{
    127	void *p;
    128
    129	p = kmalloc_track_caller(len, gfp);
    130	if (p)
    131		memcpy(p, src, len);
    132	return p;
    133}
    134EXPORT_SYMBOL(kmemdup);
    135
    136/**
    137 * kmemdup_nul - Create a NUL-terminated string from unterminated data
    138 * @s: The data to stringify
    139 * @len: The size of the data
    140 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
    141 *
    142 * Return: newly allocated copy of @s with NUL-termination or %NULL in
    143 * case of error
    144 */
    145char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
    146{
    147	char *buf;
    148
    149	if (!s)
    150		return NULL;
    151
    152	buf = kmalloc_track_caller(len + 1, gfp);
    153	if (buf) {
    154		memcpy(buf, s, len);
    155		buf[len] = '\0';
    156	}
    157	return buf;
    158}
    159EXPORT_SYMBOL(kmemdup_nul);
    160
    161/**
    162 * memdup_user - duplicate memory region from user space
    163 *
    164 * @src: source address in user space
    165 * @len: number of bytes to copy
    166 *
    167 * Return: an ERR_PTR() on failure.  Result is physically
    168 * contiguous, to be freed by kfree().
    169 */
    170void *memdup_user(const void __user *src, size_t len)
    171{
    172	void *p;
    173
    174	p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
    175	if (!p)
    176		return ERR_PTR(-ENOMEM);
    177
    178	if (copy_from_user(p, src, len)) {
    179		kfree(p);
    180		return ERR_PTR(-EFAULT);
    181	}
    182
    183	return p;
    184}
    185EXPORT_SYMBOL(memdup_user);
    186
    187/**
    188 * vmemdup_user - duplicate memory region from user space
    189 *
    190 * @src: source address in user space
    191 * @len: number of bytes to copy
    192 *
     193 * Return: an ERR_PTR() on failure.  Result may not be
    194 * physically contiguous.  Use kvfree() to free.
    195 */
    196void *vmemdup_user(const void __user *src, size_t len)
    197{
    198	void *p;
    199
    200	p = kvmalloc(len, GFP_USER);
    201	if (!p)
    202		return ERR_PTR(-ENOMEM);
    203
    204	if (copy_from_user(p, src, len)) {
    205		kvfree(p);
    206		return ERR_PTR(-EFAULT);
    207	}
    208
    209	return p;
    210}
    211EXPORT_SYMBOL(vmemdup_user);
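/*
 * Illustrative sketch (added annotation, not in the original file): the usual
 * pattern for pulling a user-supplied buffer into the kernel, e.g. from an
 * ioctl handler.  vmemdup_user() may hand back vmalloc memory, so the copy
 * must be released with kvfree(), never plain kfree().  The helper name and
 * the size cap are hypothetical.
 */
static __maybe_unused int example_copy_user_blob(const void __user *uptr, size_t len)
{
	void *buf;

	if (!len || len > SZ_1M)	/* arbitrary sanity limit for the sketch */
		return -EINVAL;

	buf = vmemdup_user(uptr, len);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	/* ... consume @buf ... */

	kvfree(buf);
	return 0;
}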
    212
    213/**
    214 * strndup_user - duplicate an existing string from user space
    215 * @s: The string to duplicate
    216 * @n: Maximum number of bytes to copy, including the trailing NUL.
    217 *
    218 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
    219 */
    220char *strndup_user(const char __user *s, long n)
    221{
    222	char *p;
    223	long length;
    224
    225	length = strnlen_user(s, n);
    226
    227	if (!length)
    228		return ERR_PTR(-EFAULT);
    229
    230	if (length > n)
    231		return ERR_PTR(-EINVAL);
    232
    233	p = memdup_user(s, length);
    234
    235	if (IS_ERR(p))
    236		return p;
    237
    238	p[length - 1] = '\0';
    239
    240	return p;
    241}
    242EXPORT_SYMBOL(strndup_user);
    243
    244/**
    245 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
    246 *
    247 * @src: source address in user space
    248 * @len: number of bytes to copy
    249 *
    250 * Return: an ERR_PTR() on failure.
    251 */
    252void *memdup_user_nul(const void __user *src, size_t len)
    253{
    254	char *p;
    255
    256	/*
    257	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
    258	 * cause pagefault, which makes it pointless to use GFP_NOFS
    259	 * or GFP_ATOMIC.
    260	 */
    261	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
    262	if (!p)
    263		return ERR_PTR(-ENOMEM);
    264
    265	if (copy_from_user(p, src, len)) {
    266		kfree(p);
    267		return ERR_PTR(-EFAULT);
    268	}
    269	p[len] = '\0';
    270
    271	return p;
    272}
    273EXPORT_SYMBOL(memdup_user_nul);
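/*
 * Illustrative sketch (added annotation, not in the original file):
 * memdup_user_nul() is the usual way for a write() handler to turn a user
 * buffer into a NUL-terminated string it can parse.  The handler below is
 * hypothetical.
 */
static __maybe_unused ssize_t example_write(const char __user *ubuf, size_t count)
{
	char *kbuf;

	kbuf = memdup_user_nul(ubuf, count);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/* @kbuf is NUL-terminated, so normal string parsing is safe here. */
	pr_debug("%s: got \"%s\"\n", __func__, strim(kbuf));

	kfree(kbuf);
	return count;
}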
    274
    275void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
    276		struct vm_area_struct *prev)
    277{
    278	struct vm_area_struct *next;
    279
    280	vma->vm_prev = prev;
    281	if (prev) {
    282		next = prev->vm_next;
    283		prev->vm_next = vma;
    284	} else {
    285		next = mm->mmap;
    286		mm->mmap = vma;
    287	}
    288	vma->vm_next = next;
    289	if (next)
    290		next->vm_prev = vma;
    291}
    292
    293void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
    294{
    295	struct vm_area_struct *prev, *next;
    296
    297	next = vma->vm_next;
    298	prev = vma->vm_prev;
    299	if (prev)
    300		prev->vm_next = next;
    301	else
    302		mm->mmap = next;
    303	if (next)
    304		next->vm_prev = prev;
    305}
    306
    307/* Check if the vma is being used as a stack by this task */
    308int vma_is_stack_for_current(struct vm_area_struct *vma)
    309{
    310	struct task_struct * __maybe_unused t = current;
    311
    312	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
    313}
    314
    315/*
    316 * Change backing file, only valid to use during initial VMA setup.
    317 */
    318void vma_set_file(struct vm_area_struct *vma, struct file *file)
    319{
    320	/* Changing an anonymous vma with this is illegal */
    321	get_file(file);
    322	swap(vma->vm_file, file);
    323	fput(file);
    324}
    325EXPORT_SYMBOL(vma_set_file);
    326
    327#ifndef STACK_RND_MASK
    328#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
    329#endif
    330
    331unsigned long randomize_stack_top(unsigned long stack_top)
    332{
    333	unsigned long random_variable = 0;
    334
    335	if (current->flags & PF_RANDOMIZE) {
    336		random_variable = get_random_long();
    337		random_variable &= STACK_RND_MASK;
    338		random_variable <<= PAGE_SHIFT;
    339	}
    340#ifdef CONFIG_STACK_GROWSUP
    341	return PAGE_ALIGN(stack_top) + random_variable;
    342#else
    343	return PAGE_ALIGN(stack_top) - random_variable;
    344#endif
    345}
    346
    347/**
    348 * randomize_page - Generate a random, page aligned address
    349 * @start:	The smallest acceptable address the caller will take.
    350 * @range:	The size of the area, starting at @start, within which the
    351 *		random address must fall.
    352 *
    353 * If @start + @range would overflow, @range is capped.
    354 *
    355 * NOTE: Historical use of randomize_range, which this replaces, presumed that
    356 * @start was already page aligned.  We now align it regardless.
    357 *
    358 * Return: A page aligned address within [start, start + range).  On error,
    359 * @start is returned.
    360 */
    361unsigned long randomize_page(unsigned long start, unsigned long range)
    362{
    363	if (!PAGE_ALIGNED(start)) {
    364		range -= PAGE_ALIGN(start) - start;
    365		start = PAGE_ALIGN(start);
    366	}
    367
    368	if (start > ULONG_MAX - range)
    369		range = ULONG_MAX - start;
    370
    371	range >>= PAGE_SHIFT;
    372
    373	if (range == 0)
    374		return start;
    375
    376	return start + (get_random_long() % range << PAGE_SHIFT);
    377}
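/*
 * Worked example (added annotation, not in the original file), assuming
 * 4 KiB pages: randomize_page(0x10000000, 0x5000) keeps @start as is (it is
 * already page aligned), reduces @range to 0x5000 >> PAGE_SHIFT = 5 page
 * slots, and returns 0x10000000 + ((get_random_long() % 5) << PAGE_SHIFT),
 * i.e. one of the five page-aligned addresses in [0x10000000, 0x10005000).
 */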
    378
    379#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
    380unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
    381{
    382	/* Is the current task 32bit ? */
    383	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
    384		return randomize_page(mm->brk, SZ_32M);
    385
    386	return randomize_page(mm->brk, SZ_1G);
    387}
    388
    389unsigned long arch_mmap_rnd(void)
    390{
    391	unsigned long rnd;
    392
    393#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
    394	if (is_compat_task())
    395		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
    396	else
    397#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
    398		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
    399
    400	return rnd << PAGE_SHIFT;
    401}
    402
    403static int mmap_is_legacy(struct rlimit *rlim_stack)
    404{
    405	if (current->personality & ADDR_COMPAT_LAYOUT)
    406		return 1;
    407
    408	if (rlim_stack->rlim_cur == RLIM_INFINITY)
    409		return 1;
    410
    411	return sysctl_legacy_va_layout;
    412}
    413
    414/*
    415 * Leave enough space between the mmap area and the stack to honour ulimit in
    416 * the face of randomisation.
    417 */
    418#define MIN_GAP		(SZ_128M)
    419#define MAX_GAP		(STACK_TOP / 6 * 5)
    420
    421static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
    422{
    423	unsigned long gap = rlim_stack->rlim_cur;
    424	unsigned long pad = stack_guard_gap;
    425
    426	/* Account for stack randomization if necessary */
    427	if (current->flags & PF_RANDOMIZE)
    428		pad += (STACK_RND_MASK << PAGE_SHIFT);
    429
    430	/* Values close to RLIM_INFINITY can overflow. */
    431	if (gap + pad > gap)
    432		gap += pad;
    433
    434	if (gap < MIN_GAP)
    435		gap = MIN_GAP;
    436	else if (gap > MAX_GAP)
    437		gap = MAX_GAP;
    438
    439	return PAGE_ALIGN(STACK_TOP - gap - rnd);
    440}
    441
    442void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
    443{
    444	unsigned long random_factor = 0UL;
    445
    446	if (current->flags & PF_RANDOMIZE)
    447		random_factor = arch_mmap_rnd();
    448
    449	if (mmap_is_legacy(rlim_stack)) {
    450		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
    451		mm->get_unmapped_area = arch_get_unmapped_area;
    452	} else {
    453		mm->mmap_base = mmap_base(random_factor, rlim_stack);
    454		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
    455	}
    456}
    457#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
    458void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
    459{
    460	mm->mmap_base = TASK_UNMAPPED_BASE;
    461	mm->get_unmapped_area = arch_get_unmapped_area;
    462}
    463#endif
    464
    465/**
    466 * __account_locked_vm - account locked pages to an mm's locked_vm
    467 * @mm:          mm to account against
    468 * @pages:       number of pages to account
    469 * @inc:         %true if @pages should be considered positive, %false if not
    470 * @task:        task used to check RLIMIT_MEMLOCK
    471 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
    472 *
    473 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
    474 * that mmap_lock is held as writer.
    475 *
    476 * Return:
    477 * * 0       on success
    478 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
    479 */
    480int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
    481			struct task_struct *task, bool bypass_rlim)
    482{
    483	unsigned long locked_vm, limit;
    484	int ret = 0;
    485
    486	mmap_assert_write_locked(mm);
    487
    488	locked_vm = mm->locked_vm;
    489	if (inc) {
    490		if (!bypass_rlim) {
    491			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
    492			if (locked_vm + pages > limit)
    493				ret = -ENOMEM;
    494		}
    495		if (!ret)
    496			mm->locked_vm = locked_vm + pages;
    497	} else {
    498		WARN_ON_ONCE(pages > locked_vm);
    499		mm->locked_vm = locked_vm - pages;
    500	}
    501
    502	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
    503		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
    504		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
    505		 ret ? " - exceeded" : "");
    506
    507	return ret;
    508}
    509EXPORT_SYMBOL_GPL(__account_locked_vm);
    510
    511/**
    512 * account_locked_vm - account locked pages to an mm's locked_vm
    513 * @mm:          mm to account against, may be NULL
    514 * @pages:       number of pages to account
    515 * @inc:         %true if @pages should be considered positive, %false if not
    516 *
    517 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
    518 *
    519 * Return:
    520 * * 0       on success, or if mm is NULL
    521 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
    522 */
    523int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
    524{
    525	int ret;
    526
    527	if (pages == 0 || !mm)
    528		return 0;
    529
    530	mmap_write_lock(mm);
    531	ret = __account_locked_vm(mm, pages, inc, current,
    532				  capable(CAP_IPC_LOCK));
    533	mmap_write_unlock(mm);
    534
    535	return ret;
    536}
    537EXPORT_SYMBOL_GPL(account_locked_vm);
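/*
 * Illustrative sketch (added annotation, not in the original file): a driver
 * that pins user pages typically charges them against RLIMIT_MEMLOCK with
 * inc=true and undoes the charge with inc=false on teardown.  The helper
 * below is hypothetical.
 */
static __maybe_unused int example_charge_pinned_pages(struct mm_struct *mm,
						      unsigned long npages)
{
	int ret;

	ret = account_locked_vm(mm, npages, true);
	if (ret)
		return ret;	/* RLIMIT_MEMLOCK would be exceeded */

	/* ... pin the pages here; on failure or on release undo the charge: */
	account_locked_vm(mm, npages, false);
	return 0;
}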
    538
    539unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
    540	unsigned long len, unsigned long prot,
    541	unsigned long flag, unsigned long pgoff)
    542{
    543	unsigned long ret;
    544	struct mm_struct *mm = current->mm;
    545	unsigned long populate;
    546	LIST_HEAD(uf);
    547
    548	ret = security_mmap_file(file, prot, flag);
    549	if (!ret) {
    550		if (mmap_write_lock_killable(mm))
    551			return -EINTR;
    552		ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
    553			      &uf);
    554		mmap_write_unlock(mm);
    555		userfaultfd_unmap_complete(mm, &uf);
    556		if (populate)
    557			mm_populate(ret, populate);
    558	}
    559	return ret;
    560}
    561
    562unsigned long vm_mmap(struct file *file, unsigned long addr,
    563	unsigned long len, unsigned long prot,
    564	unsigned long flag, unsigned long offset)
    565{
    566	if (unlikely(offset + PAGE_ALIGN(len) < offset))
    567		return -EINVAL;
    568	if (unlikely(offset_in_page(offset)))
    569		return -EINVAL;
    570
    571	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
    572}
    573EXPORT_SYMBOL(vm_mmap);
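/*
 * Illustrative sketch (added annotation, not in the original file): in-kernel
 * users map a file into the current process with vm_mmap() and tear the
 * mapping down again with vm_munmap().  A failure is reported as a negative
 * errno encoded in the returned address (IS_ERR_VALUE()).  The wrapper below
 * is hypothetical.
 */
static __maybe_unused unsigned long example_map_readonly(struct file *file,
							 unsigned long size)
{
	/* Let the kernel pick the address; shared, read-only, offset 0. */
	return vm_mmap(file, 0, size, PROT_READ, MAP_SHARED, 0);
}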
    574
    575/**
    576 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
    577 * failure, fall back to non-contiguous (vmalloc) allocation.
    578 * @size: size of the request.
    579 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
    580 * @node: numa node to allocate from
    581 *
    582 * Uses kmalloc to get the memory but if the allocation fails then falls back
    583 * to the vmalloc allocator. Use kvfree for freeing the memory.
    584 *
    585 * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
    586 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
    587 * preferable to the vmalloc fallback, due to visible performance drawbacks.
    588 *
     589 * Return: pointer to the allocated memory or %NULL in case of failure
    590 */
    591void *kvmalloc_node(size_t size, gfp_t flags, int node)
    592{
    593	gfp_t kmalloc_flags = flags;
    594	void *ret;
    595
    596	/*
    597	 * We want to attempt a large physically contiguous block first because
     598	 * it is less likely to fragment multiple larger blocks and therefore
     599	 * contributes less to long-term fragmentation than the vmalloc fallback.
     600	 * However, make sure that larger requests are not too disruptive - no
     601	 * OOM killer and no allocation failure warnings, as we have a fallback.
    602	 */
    603	if (size > PAGE_SIZE) {
    604		kmalloc_flags |= __GFP_NOWARN;
    605
    606		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
    607			kmalloc_flags |= __GFP_NORETRY;
    608
    609		/* nofail semantic is implemented by the vmalloc fallback */
    610		kmalloc_flags &= ~__GFP_NOFAIL;
    611	}
    612
    613	ret = kmalloc_node(size, kmalloc_flags, node);
    614
    615	/*
    616	 * It doesn't really make sense to fallback to vmalloc for sub page
    617	 * requests
    618	 */
    619	if (ret || size <= PAGE_SIZE)
    620		return ret;
    621
    622	/* Don't even allow crazy sizes */
    623	if (unlikely(size > INT_MAX)) {
    624		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
    625		return NULL;
    626	}
    627
    628	/*
    629	 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
    630	 * since the callers already cannot assume anything
    631	 * about the resulting pointer, and cannot play
    632	 * protection games.
    633	 */
    634	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
    635			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
    636			node, __builtin_return_address(0));
    637}
    638EXPORT_SYMBOL(kvmalloc_node);
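/*
 * Illustrative sketch (added annotation, not in the original file): the common
 * kvmalloc()/kvfree() pattern for a table whose size scales with user input
 * and may exceed what kmalloc() can reasonably provide.  Names are
 * hypothetical; kvmalloc_array() adds overflow checking on top of kvmalloc().
 */
static __maybe_unused void *example_alloc_table(size_t nr_entries, size_t entry_size)
{
	/* May be kmalloc- or vmalloc-backed; free with kvfree() either way. */
	return kvmalloc_array(nr_entries, entry_size, GFP_KERNEL | __GFP_ZERO);
}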
    639
    640/**
    641 * kvfree() - Free memory.
    642 * @addr: Pointer to allocated memory.
    643 *
    644 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
    645 * It is slightly more efficient to use kfree() or vfree() if you are certain
    646 * that you know which one to use.
    647 *
    648 * Context: Either preemptible task context or not-NMI interrupt.
    649 */
    650void kvfree(const void *addr)
    651{
    652	if (is_vmalloc_addr(addr))
    653		vfree(addr);
    654	else
    655		kfree(addr);
    656}
    657EXPORT_SYMBOL(kvfree);
    658
    659/**
    660 * kvfree_sensitive - Free a data object containing sensitive information.
    661 * @addr: address of the data object to be freed.
    662 * @len: length of the data object.
    663 *
    664 * Use the special memzero_explicit() function to clear the content of a
    665 * kvmalloc'ed object containing sensitive data to make sure that the
    666 * compiler won't optimize out the data clearing.
    667 */
    668void kvfree_sensitive(const void *addr, size_t len)
    669{
    670	if (likely(!ZERO_OR_NULL_PTR(addr))) {
    671		memzero_explicit((void *)addr, len);
    672		kvfree(addr);
    673	}
    674}
    675EXPORT_SYMBOL(kvfree_sensitive);
    676
    677void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
    678{
    679	void *newp;
    680
    681	if (oldsize >= newsize)
    682		return (void *)p;
    683	newp = kvmalloc(newsize, flags);
    684	if (!newp)
    685		return NULL;
    686	memcpy(newp, p, oldsize);
    687	kvfree(p);
    688	return newp;
    689}
    690EXPORT_SYMBOL(kvrealloc);
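/*
 * Illustrative sketch (added annotation, not in the original file): growing a
 * kvmalloc'ed buffer with kvrealloc().  Unlike krealloc(), the old size must
 * be passed in explicitly, and the old buffer is only freed when the new
 * allocation succeeds.
 */
static __maybe_unused void *example_grow_buffer(void *buf, size_t oldsize,
						size_t newsize)
{
	void *nbuf = kvrealloc(buf, oldsize, newsize, GFP_KERNEL);

	if (!nbuf)
		return buf;	/* @buf is still valid and untouched */
	return nbuf;
}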
    691
    692/**
    693 * __vmalloc_array - allocate memory for a virtually contiguous array.
    694 * @n: number of elements.
    695 * @size: element size.
    696 * @flags: the type of memory to allocate (see kmalloc).
    697 */
    698void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
    699{
    700	size_t bytes;
    701
    702	if (unlikely(check_mul_overflow(n, size, &bytes)))
    703		return NULL;
    704	return __vmalloc(bytes, flags);
    705}
    706EXPORT_SYMBOL(__vmalloc_array);
    707
    708/**
    709 * vmalloc_array - allocate memory for a virtually contiguous array.
    710 * @n: number of elements.
    711 * @size: element size.
    712 */
    713void *vmalloc_array(size_t n, size_t size)
    714{
    715	return __vmalloc_array(n, size, GFP_KERNEL);
    716}
    717EXPORT_SYMBOL(vmalloc_array);
    718
    719/**
    720 * __vcalloc - allocate and zero memory for a virtually contiguous array.
    721 * @n: number of elements.
    722 * @size: element size.
    723 * @flags: the type of memory to allocate (see kmalloc).
    724 */
    725void *__vcalloc(size_t n, size_t size, gfp_t flags)
    726{
    727	return __vmalloc_array(n, size, flags | __GFP_ZERO);
    728}
    729EXPORT_SYMBOL(__vcalloc);
    730
    731/**
    732 * vcalloc - allocate and zero memory for a virtually contiguous array.
    733 * @n: number of elements.
    734 * @size: element size.
    735 */
    736void *vcalloc(size_t n, size_t size)
    737{
    738	return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
    739}
    740EXPORT_SYMBOL(vcalloc);
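/*
 * Illustrative sketch (added annotation, not in the original file): vcalloc()
 * is the vmalloc-backed counterpart of kcalloc() -- overflow-checked and
 * zero-initialized -- and the result is released with vfree() or kvfree().
 */
static __maybe_unused u32 *example_alloc_counters(size_t nr)
{
	return vcalloc(nr, sizeof(u32));
}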
    741
    742/* Neutral page->mapping pointer to address_space or anon_vma or other */
    743void *page_rmapping(struct page *page)
    744{
    745	return folio_raw_mapping(page_folio(page));
    746}
    747
    748/**
    749 * folio_mapped - Is this folio mapped into userspace?
    750 * @folio: The folio.
    751 *
    752 * Return: True if any page in this folio is referenced by user page tables.
    753 */
    754bool folio_mapped(struct folio *folio)
    755{
    756	long i, nr;
    757
    758	if (!folio_test_large(folio))
    759		return atomic_read(&folio->_mapcount) >= 0;
    760	if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
    761		return true;
    762	if (folio_test_hugetlb(folio))
    763		return false;
    764
    765	nr = folio_nr_pages(folio);
    766	for (i = 0; i < nr; i++) {
    767		if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0)
    768			return true;
    769	}
    770	return false;
    771}
    772EXPORT_SYMBOL(folio_mapped);
    773
    774struct anon_vma *folio_anon_vma(struct folio *folio)
    775{
    776	unsigned long mapping = (unsigned long)folio->mapping;
    777
    778	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
    779		return NULL;
    780	return (void *)(mapping - PAGE_MAPPING_ANON);
    781}
    782
    783/**
    784 * folio_mapping - Find the mapping where this folio is stored.
    785 * @folio: The folio.
    786 *
    787 * For folios which are in the page cache, return the mapping that this
    788 * page belongs to.  Folios in the swap cache return the swap mapping
    789 * this page is stored in (which is different from the mapping for the
    790 * swap file or swap device where the data is stored).
    791 *
    792 * You can call this for folios which aren't in the swap cache or page
    793 * cache and it will return NULL.
    794 */
    795struct address_space *folio_mapping(struct folio *folio)
    796{
    797	struct address_space *mapping;
    798
     799	/* This happens if someone calls flush_dcache_page on a slab page */
    800	if (unlikely(folio_test_slab(folio)))
    801		return NULL;
    802
    803	if (unlikely(folio_test_swapcache(folio)))
    804		return swap_address_space(folio_swap_entry(folio));
    805
    806	mapping = folio->mapping;
    807	if ((unsigned long)mapping & PAGE_MAPPING_ANON)
    808		return NULL;
    809
    810	return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
    811}
    812EXPORT_SYMBOL(folio_mapping);
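/*
 * Illustrative sketch (added annotation, not in the original file): a typical
 * revalidation check that a folio still belongs to the address_space it was
 * looked up under; folio_mapping() returns NULL for anonymous folios and for
 * folios that are in neither the page cache nor the swap cache.
 */
static __maybe_unused bool example_folio_still_mapped_to(struct folio *folio,
							 struct address_space *mapping)
{
	return folio_mapping(folio) == mapping;
}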
    813
    814/* Slow path of page_mapcount() for compound pages */
    815int __page_mapcount(struct page *page)
    816{
    817	int ret;
    818
    819	ret = atomic_read(&page->_mapcount) + 1;
    820	/*
     821	 * For file THP, page->_mapcount contains the total number of mappings
     822	 * of the page: no need to look into compound_mapcount.
    823	 */
    824	if (!PageAnon(page) && !PageHuge(page))
    825		return ret;
    826	page = compound_head(page);
    827	ret += atomic_read(compound_mapcount_ptr(page)) + 1;
    828	if (PageDoubleMap(page))
    829		ret--;
    830	return ret;
    831}
    832EXPORT_SYMBOL_GPL(__page_mapcount);
    833
    834/**
    835 * folio_mapcount() - Calculate the number of mappings of this folio.
    836 * @folio: The folio.
    837 *
    838 * A large folio tracks both how many times the entire folio is mapped,
    839 * and how many times each individual page in the folio is mapped.
    840 * This function calculates the total number of times the folio is
    841 * mapped.
    842 *
    843 * Return: The number of times this folio is mapped.
    844 */
    845int folio_mapcount(struct folio *folio)
    846{
    847	int i, compound, nr, ret;
    848
    849	if (likely(!folio_test_large(folio)))
    850		return atomic_read(&folio->_mapcount) + 1;
    851
    852	compound = folio_entire_mapcount(folio);
    853	nr = folio_nr_pages(folio);
    854	if (folio_test_hugetlb(folio))
    855		return compound;
    856	ret = compound;
    857	for (i = 0; i < nr; i++)
    858		ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
     859	/* File pages have compound_mapcount included in _mapcount */
    860	if (!folio_test_anon(folio))
    861		return ret - compound * nr;
    862	if (folio_test_double_map(folio))
    863		ret -= nr;
    864	return ret;
    865}
    866
    867/**
    868 * folio_copy - Copy the contents of one folio to another.
    869 * @dst: Folio to copy to.
    870 * @src: Folio to copy from.
    871 *
    872 * The bytes in the folio represented by @src are copied to @dst.
    873 * Assumes the caller has validated that @dst is at least as large as @src.
    874 * Can be called in atomic context for order-0 folios, but if the folio is
    875 * larger, it may sleep.
    876 */
    877void folio_copy(struct folio *dst, struct folio *src)
    878{
    879	long i = 0;
    880	long nr = folio_nr_pages(src);
    881
    882	for (;;) {
    883		copy_highpage(folio_page(dst, i), folio_page(src, i));
    884		if (++i == nr)
    885			break;
    886		cond_resched();
    887	}
    888}
    889
    890int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
    891int sysctl_overcommit_ratio __read_mostly = 50;
    892unsigned long sysctl_overcommit_kbytes __read_mostly;
    893int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
    894unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
    895unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
    896
    897int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
    898		size_t *lenp, loff_t *ppos)
    899{
    900	int ret;
    901
    902	ret = proc_dointvec(table, write, buffer, lenp, ppos);
    903	if (ret == 0 && write)
    904		sysctl_overcommit_kbytes = 0;
    905	return ret;
    906}
    907
    908static void sync_overcommit_as(struct work_struct *dummy)
    909{
    910	percpu_counter_sync(&vm_committed_as);
    911}
    912
    913int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
    914		size_t *lenp, loff_t *ppos)
    915{
    916	struct ctl_table t;
    917	int new_policy = -1;
    918	int ret;
    919
    920	/*
     921	 * The deviation of the vm_committed_as percpu counter could be big with
     922	 * a loose policy like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing
     923	 * to the strict OVERCOMMIT_NEVER policy, we need to reduce that deviation
     924	 * to comply with the strict "NEVER", and to avoid a possible race (even
     925	 * though users usually won't switch to OVERCOMMIT_NEVER very frequently)
     926	 * the switch is done in the following order:
     927	 *	1. change the batch
     928	 *	2. sync the percpu count on each CPU
     929	 *	3. switch the policy
    930	 */
    931	if (write) {
    932		t = *table;
    933		t.data = &new_policy;
    934		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
    935		if (ret || new_policy == -1)
    936			return ret;
    937
    938		mm_compute_batch(new_policy);
    939		if (new_policy == OVERCOMMIT_NEVER)
    940			schedule_on_each_cpu(sync_overcommit_as);
    941		sysctl_overcommit_memory = new_policy;
    942	} else {
    943		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
    944	}
    945
    946	return ret;
    947}
    948
    949int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
    950		size_t *lenp, loff_t *ppos)
    951{
    952	int ret;
    953
    954	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
    955	if (ret == 0 && write)
    956		sysctl_overcommit_ratio = 0;
    957	return ret;
    958}
    959
    960/*
    961 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
    962 */
    963unsigned long vm_commit_limit(void)
    964{
    965	unsigned long allowed;
    966
    967	if (sysctl_overcommit_kbytes)
    968		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
    969	else
    970		allowed = ((totalram_pages() - hugetlb_total_pages())
    971			   * sysctl_overcommit_ratio / 100);
    972	allowed += total_swap_pages;
    973
    974	return allowed;
    975}
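/*
 * Worked example (added annotation, not in the original file): with 16 GiB of
 * RAM, no hugetlb pages, 4 GiB of swap and the default overcommit_ratio of
 * 50, the OVERCOMMIT_NEVER limit is 16 GiB * 50 / 100 + 4 GiB = 12 GiB worth
 * of pages; a non-zero vm.overcommit_kbytes replaces the ratio-based term.
 */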
    976
    977/*
    978 * Make sure vm_committed_as in one cacheline and not cacheline shared with
    979 * other variables. It can be updated by several CPUs frequently.
    980 */
    981struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
    982
    983/*
    984 * The global memory commitment made in the system can be a metric
    985 * that can be used to drive ballooning decisions when Linux is hosted
    986 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
    987 * balancing memory across competing virtual machines that are hosted.
    988 * Several metrics drive this policy engine including the guest reported
    989 * memory commitment.
    990 *
    991 * The time cost of this is very low for small platforms, and for big
    992 * platform like a 2S/36C/72T Skylake server, in worst case where
    993 * vm_committed_as's spinlock is under severe contention, the time cost
    994 * could be about 30~40 microseconds.
    995 */
    996unsigned long vm_memory_committed(void)
    997{
    998	return percpu_counter_sum_positive(&vm_committed_as);
    999}
   1000EXPORT_SYMBOL_GPL(vm_memory_committed);
   1001
   1002/*
   1003 * Check that a process has enough memory to allocate a new virtual
   1004 * mapping. 0 means there is enough memory for the allocation to
   1005 * succeed and -ENOMEM implies there is not.
   1006 *
   1007 * We currently support three overcommit policies, which are set via the
   1008 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting.rst
   1009 *
   1010 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
   1011 * Additional code 2002 Jul 20 by Robert Love.
   1012 *
   1013 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
   1014 *
   1015 * Note this is a helper function intended to be used by LSMs which
   1016 * wish to use this logic.
   1017 */
   1018int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
   1019{
   1020	long allowed;
   1021
   1022	vm_acct_memory(pages);
   1023
   1024	/*
   1025	 * Sometimes we want to use more memory than we have
   1026	 */
   1027	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
   1028		return 0;
   1029
   1030	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
   1031		if (pages > totalram_pages() + total_swap_pages)
   1032			goto error;
   1033		return 0;
   1034	}
   1035
   1036	allowed = vm_commit_limit();
   1037	/*
   1038	 * Reserve some for root
   1039	 */
   1040	if (!cap_sys_admin)
   1041		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
   1042
   1043	/*
   1044	 * Don't let a single process grow so big a user can't recover
   1045	 */
   1046	if (mm) {
   1047		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
   1048
   1049		allowed -= min_t(long, mm->total_vm / 32, reserve);
   1050	}
   1051
   1052	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
   1053		return 0;
   1054error:
   1055	vm_unacct_memory(pages);
   1056
   1057	return -ENOMEM;
   1058}
   1059
   1060/**
   1061 * get_cmdline() - copy the cmdline value to a buffer.
   1062 * @task:     the task whose cmdline value to copy.
   1063 * @buffer:   the buffer to copy to.
   1064 * @buflen:   the length of the buffer. Larger cmdline values are truncated
   1065 *            to this length.
   1066 *
   1067 * Return: the size of the cmdline field copied. Note that the copy does
   1068 * not guarantee an ending NULL byte.
   1069 */
   1070int get_cmdline(struct task_struct *task, char *buffer, int buflen)
   1071{
   1072	int res = 0;
   1073	unsigned int len;
   1074	struct mm_struct *mm = get_task_mm(task);
   1075	unsigned long arg_start, arg_end, env_start, env_end;
   1076	if (!mm)
   1077		goto out;
   1078	if (!mm->arg_end)
   1079		goto out_mm;	/* Shh! No looking before we're done */
   1080
   1081	spin_lock(&mm->arg_lock);
   1082	arg_start = mm->arg_start;
   1083	arg_end = mm->arg_end;
   1084	env_start = mm->env_start;
   1085	env_end = mm->env_end;
   1086	spin_unlock(&mm->arg_lock);
   1087
   1088	len = arg_end - arg_start;
   1089
   1090	if (len > buflen)
   1091		len = buflen;
   1092
   1093	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
   1094
   1095	/*
   1096	 * If the nul at the end of args has been overwritten, then
    1097	 * assume the application is using setproctitle(3).
   1098	 */
   1099	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
   1100		len = strnlen(buffer, res);
   1101		if (len < res) {
   1102			res = len;
   1103		} else {
   1104			len = env_end - env_start;
   1105			if (len > buflen - res)
   1106				len = buflen - res;
   1107			res += access_process_vm(task, env_start,
   1108						 buffer+res, len,
   1109						 FOLL_FORCE);
   1110			res = strnlen(buffer, res);
   1111		}
   1112	}
   1113out_mm:
   1114	mmput(mm);
   1115out:
   1116	return res;
   1117}
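/*
 * Illustrative sketch (added annotation, not in the original file): a caller
 * of get_cmdline() must terminate the buffer itself, since the copied data is
 * not guaranteed to end in a NUL byte.  The helper and buffer size are
 * hypothetical.
 */
static __maybe_unused void example_print_cmdline(struct task_struct *task)
{
	char buf[128];
	int len;

	len = get_cmdline(task, buf, sizeof(buf) - 1);
	if (len < 0)
		len = 0;
	buf[len] = '\0';
	pr_debug("cmdline of pid %d: %s\n", task->pid, buf);
}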
   1118
   1119int __weak memcmp_pages(struct page *page1, struct page *page2)
   1120{
   1121	char *addr1, *addr2;
   1122	int ret;
   1123
   1124	addr1 = kmap_atomic(page1);
   1125	addr2 = kmap_atomic(page2);
   1126	ret = memcmp(addr1, addr2, PAGE_SIZE);
   1127	kunmap_atomic(addr2);
   1128	kunmap_atomic(addr1);
   1129	return ret;
   1130}
   1131
   1132#ifdef CONFIG_PRINTK
   1133/**
   1134 * mem_dump_obj - Print available provenance information
   1135 * @object: object for which to find provenance information.
   1136 *
    1137 * This function uses pr_cont(), so the caller is expected to have
   1138 * printed out whatever preamble is appropriate.  The provenance information
   1139 * depends on the type of object and on how much debugging is enabled.
   1140 * For example, for a slab-cache object, the slab name is printed, and,
   1141 * if available, the return address and stack trace from the allocation
   1142 * and last free path of that object.
   1143 */
   1144void mem_dump_obj(void *object)
   1145{
   1146	const char *type;
   1147
   1148	if (kmem_valid_obj(object)) {
   1149		kmem_dump_obj(object);
   1150		return;
   1151	}
   1152
   1153	if (vmalloc_dump_obj(object))
   1154		return;
   1155
   1156	if (virt_addr_valid(object))
   1157		type = "non-slab/vmalloc memory";
   1158	else if (object == NULL)
   1159		type = "NULL pointer";
   1160	else if (object == ZERO_SIZE_PTR)
   1161		type = "zero-size pointer";
   1162	else
   1163		type = "non-paged memory";
   1164
   1165	pr_cont(" %s\n", type);
   1166}
   1167EXPORT_SYMBOL_GPL(mem_dump_obj);
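/*
 * Illustrative sketch (added annotation, not in the original file): the caller
 * prints its own preamble and lets mem_dump_obj() append the provenance,
 * since mem_dump_obj() itself prints with pr_cont().
 */
static __maybe_unused void example_report_object(void *obj)
{
	pr_err("unexpected object %px:", obj);
	mem_dump_obj(obj);	/* e.g. slab cache name or vmalloc start address */
}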
   1168#endif
   1169
   1170/*
   1171 * A driver might set a page logically offline -- PageOffline() -- and
   1172 * turn the page inaccessible in the hypervisor; after that, access to page
   1173 * content can be fatal.
   1174 *
   1175 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
   1176 * pages after checking PageOffline(); however, these PFN walkers can race
   1177 * with drivers that set PageOffline().
   1178 *
   1179 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
   1180 * synchronize with such drivers, achieving that a page cannot be set
   1181 * PageOffline() while frozen.
   1182 *
   1183 * page_offline_begin()/page_offline_end() is used by drivers that care about
   1184 * such races when setting a page PageOffline().
   1185 */
   1186static DECLARE_RWSEM(page_offline_rwsem);
   1187
   1188void page_offline_freeze(void)
   1189{
   1190	down_read(&page_offline_rwsem);
   1191}
   1192
   1193void page_offline_thaw(void)
   1194{
   1195	up_read(&page_offline_rwsem);
   1196}
   1197
   1198void page_offline_begin(void)
   1199{
   1200	down_write(&page_offline_rwsem);
   1201}
   1202EXPORT_SYMBOL(page_offline_begin);
   1203
   1204void page_offline_end(void)
   1205{
   1206	up_write(&page_offline_rwsem);
   1207}
   1208EXPORT_SYMBOL(page_offline_end);
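/*
 * Illustrative sketch (added annotation, not in the original file): a PFN
 * walker in the style of /proc/kcore brackets its PageOffline() check and the
 * actual read with page_offline_freeze()/page_offline_thaw(), so the page
 * cannot become PageOffline() while its contents are being copied.  Assumes a
 * page with a kernel direct mapping.
 */
static __maybe_unused void example_read_page_checked(struct page *page, void *buf)
{
	page_offline_freeze();
	if (PageOffline(page))
		memset(buf, 0, PAGE_SIZE);	/* do not touch offline pages */
	else
		memcpy(buf, page_address(page), PAGE_SIZE);
	page_offline_thaw();
}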
   1209
   1210#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
   1211void flush_dcache_folio(struct folio *folio)
   1212{
   1213	long i, nr = folio_nr_pages(folio);
   1214
   1215	for (i = 0; i < nr; i++)
   1216		flush_dcache_page(folio_page(folio, i));
   1217}
   1218EXPORT_SYMBOL(flush_dcache_folio);
   1219#endif