cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

exec.c (52167B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  linux/fs/exec.c
      4 *
      5 *  Copyright (C) 1991, 1992  Linus Torvalds
      6 */
      7
      8/*
      9 * #!-checking implemented by tytso.
     10 */
     11/*
     12 * Demand-loading implemented 01.12.91 - no need to read anything but
     13 * the header into memory. The inode of the executable is put into
     14 * "current->executable", and page faults do the actual loading. Clean.
     15 *
     16 * Once more I can proudly say that linux stood up to being changed: it
     17 * was less than 2 hours work to get demand-loading completely implemented.
     18 *
     19 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
     20 * current->executable is only used by the procfs.  This allows a dispatch
     21 * table to check for several different types  of binary formats.  We keep
     22 * trying until we recognize the file or we run out of supported binary
     23 * formats.
     24 */
     25
     26#include <linux/kernel_read_file.h>
     27#include <linux/slab.h>
     28#include <linux/file.h>
     29#include <linux/fdtable.h>
     30#include <linux/mm.h>
     31#include <linux/vmacache.h>
     32#include <linux/stat.h>
     33#include <linux/fcntl.h>
     34#include <linux/swap.h>
     35#include <linux/string.h>
     36#include <linux/init.h>
     37#include <linux/sched/mm.h>
     38#include <linux/sched/coredump.h>
     39#include <linux/sched/signal.h>
     40#include <linux/sched/numa_balancing.h>
     41#include <linux/sched/task.h>
     42#include <linux/pagemap.h>
     43#include <linux/perf_event.h>
     44#include <linux/highmem.h>
     45#include <linux/spinlock.h>
     46#include <linux/key.h>
     47#include <linux/personality.h>
     48#include <linux/binfmts.h>
     49#include <linux/utsname.h>
     50#include <linux/pid_namespace.h>
     51#include <linux/module.h>
     52#include <linux/namei.h>
     53#include <linux/mount.h>
     54#include <linux/security.h>
     55#include <linux/syscalls.h>
     56#include <linux/tsacct_kern.h>
     57#include <linux/cn_proc.h>
     58#include <linux/audit.h>
     59#include <linux/kmod.h>
     60#include <linux/fsnotify.h>
     61#include <linux/fs_struct.h>
     62#include <linux/oom.h>
     63#include <linux/compat.h>
     64#include <linux/vmalloc.h>
     65#include <linux/io_uring.h>
     66#include <linux/syscall_user_dispatch.h>
     67#include <linux/coredump.h>
     68
     69#include <linux/uaccess.h>
     70#include <asm/mmu_context.h>
     71#include <asm/tlb.h>
     72
     73#include <trace/events/task.h>
     74#include "internal.h"
     75
     76#include <trace/events/sched.h>
     77
     78static int bprm_creds_from_file(struct linux_binprm *bprm);
     79
     80int suid_dumpable = 0;
     81
     82static LIST_HEAD(formats);
     83static DEFINE_RWLOCK(binfmt_lock);
     84
     85void __register_binfmt(struct linux_binfmt * fmt, int insert)
     86{
     87	write_lock(&binfmt_lock);
     88	insert ? list_add(&fmt->lh, &formats) :
     89		 list_add_tail(&fmt->lh, &formats);
     90	write_unlock(&binfmt_lock);
     91}
     92
     93EXPORT_SYMBOL(__register_binfmt);
     94
     95void unregister_binfmt(struct linux_binfmt * fmt)
     96{
     97	write_lock(&binfmt_lock);
     98	list_del(&fmt->lh);
     99	write_unlock(&binfmt_lock);
    100}
    101
    102EXPORT_SYMBOL(unregister_binfmt);
    103
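/*
 * Illustrative sketch, not part of the original exec.c: a hypothetical
 * binary-format handler registering itself on the 'formats' list kept
 * above, via the register_binfmt() wrapper from <linux/binfmts.h>.
 * The load_binary callback here is a placeholder that declines every
 * image so the next handler gets a turn.
 */
static int example_load_binary(struct linux_binprm *bprm)
{
	return -ENOEXEC;	/* not our format; let another handler try */
}

static struct linux_binfmt example_format = {
	.module      = THIS_MODULE,
	.load_binary = example_load_binary,
};

static int __init example_binfmt_init(void)
{
	register_binfmt(&example_format);	/* appends to 'formats' */
	return 0;
}
module_init(example_binfmt_init);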
    104static inline void put_binfmt(struct linux_binfmt * fmt)
    105{
    106	module_put(fmt->module);
    107}
    108
    109bool path_noexec(const struct path *path)
    110{
    111	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
    112	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
    113}
    114
    115#ifdef CONFIG_USELIB
    116/*
     117 * Note that a shared library must be both readable and executable for
    118 * security reasons.
    119 *
    120 * Also note that we take the address to load from the file itself.
    121 */
    122SYSCALL_DEFINE1(uselib, const char __user *, library)
    123{
    124	struct linux_binfmt *fmt;
    125	struct file *file;
    126	struct filename *tmp = getname(library);
    127	int error = PTR_ERR(tmp);
    128	static const struct open_flags uselib_flags = {
    129		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
    130		.acc_mode = MAY_READ | MAY_EXEC,
    131		.intent = LOOKUP_OPEN,
    132		.lookup_flags = LOOKUP_FOLLOW,
    133	};
    134
    135	if (IS_ERR(tmp))
    136		goto out;
    137
    138	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
    139	putname(tmp);
    140	error = PTR_ERR(file);
    141	if (IS_ERR(file))
    142		goto out;
    143
    144	/*
    145	 * may_open() has already checked for this, so it should be
    146	 * impossible to trip now. But we need to be extra cautious
    147	 * and check again at the very end too.
    148	 */
    149	error = -EACCES;
    150	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
    151			 path_noexec(&file->f_path)))
    152		goto exit;
    153
    154	fsnotify_open(file);
    155
    156	error = -ENOEXEC;
    157
    158	read_lock(&binfmt_lock);
    159	list_for_each_entry(fmt, &formats, lh) {
    160		if (!fmt->load_shlib)
    161			continue;
    162		if (!try_module_get(fmt->module))
    163			continue;
    164		read_unlock(&binfmt_lock);
    165		error = fmt->load_shlib(file);
    166		read_lock(&binfmt_lock);
    167		put_binfmt(fmt);
    168		if (error != -ENOEXEC)
    169			break;
    170	}
    171	read_unlock(&binfmt_lock);
    172exit:
    173	fput(file);
    174out:
     175	return error;
    176}
    177#endif /* #ifdef CONFIG_USELIB */
    178
    179#ifdef CONFIG_MMU
    180/*
    181 * The nascent bprm->mm is not visible until exec_mmap() but it can
     182 * use a lot of memory, account these pages in current->mm temporarily
    183 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
    184 * change the counter back via acct_arg_size(0).
    185 */
    186static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
    187{
    188	struct mm_struct *mm = current->mm;
    189	long diff = (long)(pages - bprm->vma_pages);
    190
    191	if (!mm || !diff)
    192		return;
    193
    194	bprm->vma_pages = pages;
    195	add_mm_counter(mm, MM_ANONPAGES, diff);
    196}
    197
    198static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
    199		int write)
    200{
    201	struct page *page;
    202	int ret;
    203	unsigned int gup_flags = FOLL_FORCE;
    204
    205#ifdef CONFIG_STACK_GROWSUP
    206	if (write) {
    207		ret = expand_downwards(bprm->vma, pos);
    208		if (ret < 0)
    209			return NULL;
    210	}
    211#endif
    212
    213	if (write)
    214		gup_flags |= FOLL_WRITE;
    215
    216	/*
    217	 * We are doing an exec().  'current' is the process
    218	 * doing the exec and bprm->mm is the new process's mm.
    219	 */
    220	mmap_read_lock(bprm->mm);
    221	ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
    222			&page, NULL, NULL);
    223	mmap_read_unlock(bprm->mm);
    224	if (ret <= 0)
    225		return NULL;
    226
    227	if (write)
    228		acct_arg_size(bprm, vma_pages(bprm->vma));
    229
    230	return page;
    231}
    232
    233static void put_arg_page(struct page *page)
    234{
    235	put_page(page);
    236}
    237
    238static void free_arg_pages(struct linux_binprm *bprm)
    239{
    240}
    241
    242static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
    243		struct page *page)
    244{
    245	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
    246}
    247
    248static int __bprm_mm_init(struct linux_binprm *bprm)
    249{
    250	int err;
    251	struct vm_area_struct *vma = NULL;
    252	struct mm_struct *mm = bprm->mm;
    253
    254	bprm->vma = vma = vm_area_alloc(mm);
    255	if (!vma)
    256		return -ENOMEM;
    257	vma_set_anonymous(vma);
    258
    259	if (mmap_write_lock_killable(mm)) {
    260		err = -EINTR;
    261		goto err_free;
    262	}
    263
    264	/*
    265	 * Place the stack at the largest stack address the architecture
    266	 * supports. Later, we'll move this to an appropriate place. We don't
    267	 * use STACK_TOP because that can depend on attributes which aren't
    268	 * configured yet.
    269	 */
    270	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
    271	vma->vm_end = STACK_TOP_MAX;
    272	vma->vm_start = vma->vm_end - PAGE_SIZE;
    273	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
    274	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
    275
    276	err = insert_vm_struct(mm, vma);
    277	if (err)
    278		goto err;
    279
    280	mm->stack_vm = mm->total_vm = 1;
    281	mmap_write_unlock(mm);
    282	bprm->p = vma->vm_end - sizeof(void *);
    283	return 0;
    284err:
    285	mmap_write_unlock(mm);
    286err_free:
    287	bprm->vma = NULL;
    288	vm_area_free(vma);
    289	return err;
    290}
    291
    292static bool valid_arg_len(struct linux_binprm *bprm, long len)
    293{
    294	return len <= MAX_ARG_STRLEN;
    295}
    296
    297#else
    298
    299static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
    300{
    301}
    302
    303static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
    304		int write)
    305{
    306	struct page *page;
    307
    308	page = bprm->page[pos / PAGE_SIZE];
    309	if (!page && write) {
    310		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
    311		if (!page)
    312			return NULL;
    313		bprm->page[pos / PAGE_SIZE] = page;
    314	}
    315
    316	return page;
    317}
    318
    319static void put_arg_page(struct page *page)
    320{
    321}
    322
    323static void free_arg_page(struct linux_binprm *bprm, int i)
    324{
    325	if (bprm->page[i]) {
    326		__free_page(bprm->page[i]);
    327		bprm->page[i] = NULL;
    328	}
    329}
    330
    331static void free_arg_pages(struct linux_binprm *bprm)
    332{
    333	int i;
    334
    335	for (i = 0; i < MAX_ARG_PAGES; i++)
    336		free_arg_page(bprm, i);
    337}
    338
    339static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
    340		struct page *page)
    341{
    342}
    343
    344static int __bprm_mm_init(struct linux_binprm *bprm)
    345{
    346	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
    347	return 0;
    348}
    349
    350static bool valid_arg_len(struct linux_binprm *bprm, long len)
    351{
    352	return len <= bprm->p;
    353}
    354
    355#endif /* CONFIG_MMU */
    356
    357/*
    358 * Create a new mm_struct and populate it with a temporary stack
    359 * vm_area_struct.  We don't have enough context at this point to set the stack
    360 * flags, permissions, and offset, so we use temporary values.  We'll update
    361 * them later in setup_arg_pages().
    362 */
    363static int bprm_mm_init(struct linux_binprm *bprm)
    364{
    365	int err;
    366	struct mm_struct *mm = NULL;
    367
    368	bprm->mm = mm = mm_alloc();
    369	err = -ENOMEM;
    370	if (!mm)
    371		goto err;
    372
    373	/* Save current stack limit for all calculations made during exec. */
    374	task_lock(current->group_leader);
    375	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
    376	task_unlock(current->group_leader);
    377
    378	err = __bprm_mm_init(bprm);
    379	if (err)
    380		goto err;
    381
    382	return 0;
    383
    384err:
    385	if (mm) {
    386		bprm->mm = NULL;
    387		mmdrop(mm);
    388	}
    389
    390	return err;
    391}
    392
    393struct user_arg_ptr {
    394#ifdef CONFIG_COMPAT
    395	bool is_compat;
    396#endif
    397	union {
    398		const char __user *const __user *native;
    399#ifdef CONFIG_COMPAT
    400		const compat_uptr_t __user *compat;
    401#endif
    402	} ptr;
    403};
    404
    405static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
    406{
    407	const char __user *native;
    408
    409#ifdef CONFIG_COMPAT
    410	if (unlikely(argv.is_compat)) {
    411		compat_uptr_t compat;
    412
    413		if (get_user(compat, argv.ptr.compat + nr))
    414			return ERR_PTR(-EFAULT);
    415
    416		return compat_ptr(compat);
    417	}
    418#endif
    419
    420	if (get_user(native, argv.ptr.native + nr))
    421		return ERR_PTR(-EFAULT);
    422
    423	return native;
    424}
    425
    426/*
    427 * count() counts the number of strings in array ARGV.
    428 */
    429static int count(struct user_arg_ptr argv, int max)
    430{
    431	int i = 0;
    432
    433	if (argv.ptr.native != NULL) {
    434		for (;;) {
    435			const char __user *p = get_user_arg_ptr(argv, i);
    436
    437			if (!p)
    438				break;
    439
    440			if (IS_ERR(p))
    441				return -EFAULT;
    442
    443			if (i >= max)
    444				return -E2BIG;
    445			++i;
    446
    447			if (fatal_signal_pending(current))
    448				return -ERESTARTNOHAND;
    449			cond_resched();
    450		}
    451	}
    452	return i;
    453}
    454
    455static int count_strings_kernel(const char *const *argv)
    456{
    457	int i;
    458
    459	if (!argv)
    460		return 0;
    461
    462	for (i = 0; argv[i]; ++i) {
    463		if (i >= MAX_ARG_STRINGS)
    464			return -E2BIG;
    465		if (fatal_signal_pending(current))
    466			return -ERESTARTNOHAND;
    467		cond_resched();
    468	}
    469	return i;
    470}
    471
    472static int bprm_stack_limits(struct linux_binprm *bprm)
    473{
    474	unsigned long limit, ptr_size;
    475
    476	/*
    477	 * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
    478	 * (whichever is smaller) for the argv+env strings.
    479	 * This ensures that:
    480	 *  - the remaining binfmt code will not run out of stack space,
    481	 *  - the program will have a reasonable amount of stack left
    482	 *    to work from.
    483	 */
    484	limit = _STK_LIM / 4 * 3;
    485	limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
    486	/*
    487	 * We've historically supported up to 32 pages (ARG_MAX)
    488	 * of argument strings even with small stacks
    489	 */
    490	limit = max_t(unsigned long, limit, ARG_MAX);
    491	/*
    492	 * We must account for the size of all the argv and envp pointers to
    493	 * the argv and envp strings, since they will also take up space in
    494	 * the stack. They aren't stored until much later when we can't
    495	 * signal to the parent that the child has run out of stack space.
    496	 * Instead, calculate it here so it's possible to fail gracefully.
    497	 *
     498	 * In the case of argc = 0, make sure there is space for adding an
    499	 * empty string (which will bump argc to 1), to ensure confused
    500	 * userspace programs don't start processing from argv[1], thinking
    501	 * argc can never be 0, to keep them from walking envp by accident.
    502	 * See do_execveat_common().
    503	 */
    504	ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
    505	if (limit <= ptr_size)
    506		return -E2BIG;
    507	limit -= ptr_size;
    508
    509	bprm->argmin = bprm->p - limit;
    510	return 0;
    511}
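/*
 * Worked example, illustrative and not part of the original source:
 * with the common 8 MiB RLIMIT_STACK on a 64-bit machine, limit starts
 * as min(6 MiB, 2 MiB) = 2 MiB, the ARG_MAX floor (128 KiB) leaves it
 * at 2 MiB, and (max(argc, 1) + envc) * 8 bytes are then reserved for
 * the argv/envp pointer arrays, so bprm->argmin ends up a little under
 * 2 MiB below the initial bprm->p.
 */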
    512
    513/*
    514 * 'copy_strings()' copies argument/environment strings from the old
     515 * process's memory to the new process's stack.  The call to get_user_pages()
    516 * ensures the destination page is created and not swapped out.
    517 */
    518static int copy_strings(int argc, struct user_arg_ptr argv,
    519			struct linux_binprm *bprm)
    520{
    521	struct page *kmapped_page = NULL;
    522	char *kaddr = NULL;
    523	unsigned long kpos = 0;
    524	int ret;
    525
    526	while (argc-- > 0) {
    527		const char __user *str;
    528		int len;
    529		unsigned long pos;
    530
    531		ret = -EFAULT;
    532		str = get_user_arg_ptr(argv, argc);
    533		if (IS_ERR(str))
    534			goto out;
    535
    536		len = strnlen_user(str, MAX_ARG_STRLEN);
    537		if (!len)
    538			goto out;
    539
    540		ret = -E2BIG;
    541		if (!valid_arg_len(bprm, len))
    542			goto out;
    543
    544		/* We're going to work our way backwards. */
    545		pos = bprm->p;
    546		str += len;
    547		bprm->p -= len;
    548#ifdef CONFIG_MMU
    549		if (bprm->p < bprm->argmin)
    550			goto out;
    551#endif
    552
    553		while (len > 0) {
    554			int offset, bytes_to_copy;
    555
    556			if (fatal_signal_pending(current)) {
    557				ret = -ERESTARTNOHAND;
    558				goto out;
    559			}
    560			cond_resched();
    561
    562			offset = pos % PAGE_SIZE;
    563			if (offset == 0)
    564				offset = PAGE_SIZE;
    565
    566			bytes_to_copy = offset;
    567			if (bytes_to_copy > len)
    568				bytes_to_copy = len;
    569
    570			offset -= bytes_to_copy;
    571			pos -= bytes_to_copy;
    572			str -= bytes_to_copy;
    573			len -= bytes_to_copy;
    574
    575			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
    576				struct page *page;
    577
    578				page = get_arg_page(bprm, pos, 1);
    579				if (!page) {
    580					ret = -E2BIG;
    581					goto out;
    582				}
    583
    584				if (kmapped_page) {
    585					flush_dcache_page(kmapped_page);
    586					kunmap(kmapped_page);
    587					put_arg_page(kmapped_page);
    588				}
    589				kmapped_page = page;
    590				kaddr = kmap(kmapped_page);
    591				kpos = pos & PAGE_MASK;
    592				flush_arg_page(bprm, kpos, kmapped_page);
    593			}
    594			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
    595				ret = -EFAULT;
    596				goto out;
    597			}
    598		}
    599	}
    600	ret = 0;
    601out:
    602	if (kmapped_page) {
    603		flush_dcache_page(kmapped_page);
    604		kunmap(kmapped_page);
    605		put_arg_page(kmapped_page);
    606	}
    607	return ret;
    608}
    609
    610/*
     611 * Copy an argument/environment string from the kernel to the process's stack.
    612 */
    613int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
    614{
    615	int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
    616	unsigned long pos = bprm->p;
    617
    618	if (len == 0)
    619		return -EFAULT;
    620	if (!valid_arg_len(bprm, len))
    621		return -E2BIG;
    622
    623	/* We're going to work our way backwards. */
    624	arg += len;
    625	bprm->p -= len;
    626	if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
    627		return -E2BIG;
    628
    629	while (len > 0) {
    630		unsigned int bytes_to_copy = min_t(unsigned int, len,
    631				min_not_zero(offset_in_page(pos), PAGE_SIZE));
    632		struct page *page;
    633		char *kaddr;
    634
    635		pos -= bytes_to_copy;
    636		arg -= bytes_to_copy;
    637		len -= bytes_to_copy;
    638
    639		page = get_arg_page(bprm, pos, 1);
    640		if (!page)
    641			return -E2BIG;
    642		kaddr = kmap_atomic(page);
    643		flush_arg_page(bprm, pos & PAGE_MASK, page);
    644		memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
    645		flush_dcache_page(page);
    646		kunmap_atomic(kaddr);
    647		put_arg_page(page);
    648	}
    649
    650	return 0;
    651}
    652EXPORT_SYMBOL(copy_string_kernel);
    653
    654static int copy_strings_kernel(int argc, const char *const *argv,
    655			       struct linux_binprm *bprm)
    656{
    657	while (argc-- > 0) {
    658		int ret = copy_string_kernel(argv[argc], bprm);
    659		if (ret < 0)
    660			return ret;
    661		if (fatal_signal_pending(current))
    662			return -ERESTARTNOHAND;
    663		cond_resched();
    664	}
    665	return 0;
    666}
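/*
 * Illustrative sketch, not part of the original exec.c, loosely modeled
 * on what a script handler such as fs/binfmt_script.c does with the
 * helpers above: drop the old argv[0], push a replacement string (which,
 * because the strings grow downwards, becomes the new argv[0]), and
 * record the new interpreter path. Error paths and the optional
 * interpreter argument are trimmed; "i_name" is a stand-in for the
 * parsed path.
 */
static int example_push_interp(struct linux_binprm *bprm, const char *i_name)
{
	int ret;

	ret = remove_arg_zero(bprm);		/* consume the old argv[0] */
	if (ret)
		return ret;
	ret = copy_string_kernel(i_name, bprm);	/* new argv[0] at bprm->p */
	if (ret)
		return ret;
	bprm->argc++;
	return bprm_change_interp(i_name, bprm);
}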
    667
    668#ifdef CONFIG_MMU
    669
    670/*
    671 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
    672 * the binfmt code determines where the new stack should reside, we shift it to
    673 * its final location.  The process proceeds as follows:
    674 *
    675 * 1) Use shift to calculate the new vma endpoints.
    676 * 2) Extend vma to cover both the old and new ranges.  This ensures the
    677 *    arguments passed to subsequent functions are consistent.
    678 * 3) Move vma's page tables to the new range.
    679 * 4) Free up any cleared pgd range.
    680 * 5) Shrink the vma to cover only the new range.
    681 */
    682static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
    683{
    684	struct mm_struct *mm = vma->vm_mm;
    685	unsigned long old_start = vma->vm_start;
    686	unsigned long old_end = vma->vm_end;
    687	unsigned long length = old_end - old_start;
    688	unsigned long new_start = old_start - shift;
    689	unsigned long new_end = old_end - shift;
    690	struct mmu_gather tlb;
    691
    692	BUG_ON(new_start > new_end);
    693
    694	/*
    695	 * ensure there are no vmas between where we want to go
    696	 * and where we are
    697	 */
    698	if (vma != find_vma(mm, new_start))
    699		return -EFAULT;
    700
    701	/*
    702	 * cover the whole range: [new_start, old_end)
    703	 */
    704	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
    705		return -ENOMEM;
    706
    707	/*
    708	 * move the page tables downwards, on failure we rely on
    709	 * process cleanup to remove whatever mess we made.
    710	 */
    711	if (length != move_page_tables(vma, old_start,
    712				       vma, new_start, length, false))
    713		return -ENOMEM;
    714
    715	lru_add_drain();
    716	tlb_gather_mmu(&tlb, mm);
    717	if (new_end > old_start) {
    718		/*
    719		 * when the old and new regions overlap clear from new_end.
    720		 */
    721		free_pgd_range(&tlb, new_end, old_end, new_end,
    722			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
    723	} else {
    724		/*
    725		 * otherwise, clean from old_start; this is done to not touch
     726		 * the address space in [new_end, old_start); some architectures
     727		 * have constraints on va-space that make this illegal (IA64) -
     728		 * for the others it's just a little faster.
    729		 */
    730		free_pgd_range(&tlb, old_start, old_end, new_end,
    731			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
    732	}
    733	tlb_finish_mmu(&tlb);
    734
    735	/*
    736	 * Shrink the vma to just the new range.  Always succeeds.
    737	 */
    738	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
    739
    740	return 0;
    741}
    742
    743/*
    744 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
    745 * the stack is optionally relocated, and some extra space is added.
    746 */
    747int setup_arg_pages(struct linux_binprm *bprm,
    748		    unsigned long stack_top,
    749		    int executable_stack)
    750{
    751	unsigned long ret;
    752	unsigned long stack_shift;
    753	struct mm_struct *mm = current->mm;
    754	struct vm_area_struct *vma = bprm->vma;
    755	struct vm_area_struct *prev = NULL;
    756	unsigned long vm_flags;
    757	unsigned long stack_base;
    758	unsigned long stack_size;
    759	unsigned long stack_expand;
    760	unsigned long rlim_stack;
    761	struct mmu_gather tlb;
    762
    763#ifdef CONFIG_STACK_GROWSUP
    764	/* Limit stack size */
    765	stack_base = bprm->rlim_stack.rlim_max;
    766
    767	stack_base = calc_max_stack_size(stack_base);
    768
    769	/* Add space for stack randomization. */
    770	stack_base += (STACK_RND_MASK << PAGE_SHIFT);
    771
    772	/* Make sure we didn't let the argument array grow too large. */
    773	if (vma->vm_end - vma->vm_start > stack_base)
    774		return -ENOMEM;
    775
    776	stack_base = PAGE_ALIGN(stack_top - stack_base);
    777
    778	stack_shift = vma->vm_start - stack_base;
    779	mm->arg_start = bprm->p - stack_shift;
    780	bprm->p = vma->vm_end - stack_shift;
    781#else
    782	stack_top = arch_align_stack(stack_top);
    783	stack_top = PAGE_ALIGN(stack_top);
    784
    785	if (unlikely(stack_top < mmap_min_addr) ||
    786	    unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
    787		return -ENOMEM;
    788
    789	stack_shift = vma->vm_end - stack_top;
    790
    791	bprm->p -= stack_shift;
    792	mm->arg_start = bprm->p;
    793#endif
    794
    795	if (bprm->loader)
    796		bprm->loader -= stack_shift;
    797	bprm->exec -= stack_shift;
    798
    799	if (mmap_write_lock_killable(mm))
    800		return -EINTR;
    801
    802	vm_flags = VM_STACK_FLAGS;
    803
    804	/*
    805	 * Adjust stack execute permissions; explicitly enable for
    806	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
    807	 * (arch default) otherwise.
    808	 */
    809	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
    810		vm_flags |= VM_EXEC;
    811	else if (executable_stack == EXSTACK_DISABLE_X)
    812		vm_flags &= ~VM_EXEC;
    813	vm_flags |= mm->def_flags;
    814	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
    815
    816	tlb_gather_mmu(&tlb, mm);
    817	ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end,
    818			vm_flags);
    819	tlb_finish_mmu(&tlb);
    820
    821	if (ret)
    822		goto out_unlock;
    823	BUG_ON(prev != vma);
    824
    825	if (unlikely(vm_flags & VM_EXEC)) {
    826		pr_warn_once("process '%pD4' started with executable stack\n",
    827			     bprm->file);
    828	}
    829
    830	/* Move stack pages down in memory. */
    831	if (stack_shift) {
    832		ret = shift_arg_pages(vma, stack_shift);
    833		if (ret)
    834			goto out_unlock;
    835	}
    836
    837	/* mprotect_fixup is overkill to remove the temporary stack flags */
    838	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
    839
    840	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
    841	stack_size = vma->vm_end - vma->vm_start;
    842	/*
    843	 * Align this down to a page boundary as expand_stack
    844	 * will align it up.
    845	 */
    846	rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
    847#ifdef CONFIG_STACK_GROWSUP
    848	if (stack_size + stack_expand > rlim_stack)
    849		stack_base = vma->vm_start + rlim_stack;
    850	else
    851		stack_base = vma->vm_end + stack_expand;
    852#else
    853	if (stack_size + stack_expand > rlim_stack)
    854		stack_base = vma->vm_end - rlim_stack;
    855	else
    856		stack_base = vma->vm_start - stack_expand;
    857#endif
    858	current->mm->start_stack = bprm->p;
    859	ret = expand_stack(vma, stack_base);
    860	if (ret)
    861		ret = -EFAULT;
    862
    863out_unlock:
    864	mmap_write_unlock(mm);
    865	return ret;
    866}
    867EXPORT_SYMBOL(setup_arg_pages);
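/*
 * Typical caller, shown for illustration only: a binfmt loader such as
 * fs/binfmt_elf.c invokes this once it knows where the stack should live
 * and whether it must be executable, roughly as
 *
 *	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 *				 executable_stack);
 */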
    868
    869#else
    870
    871/*
    872 * Transfer the program arguments and environment from the holding pages
    873 * onto the stack. The provided stack pointer is adjusted accordingly.
    874 */
    875int transfer_args_to_stack(struct linux_binprm *bprm,
    876			   unsigned long *sp_location)
    877{
    878	unsigned long index, stop, sp;
    879	int ret = 0;
    880
    881	stop = bprm->p >> PAGE_SHIFT;
    882	sp = *sp_location;
    883
    884	for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
    885		unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
    886		char *src = kmap(bprm->page[index]) + offset;
    887		sp -= PAGE_SIZE - offset;
    888		if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
    889			ret = -EFAULT;
    890		kunmap(bprm->page[index]);
    891		if (ret)
    892			goto out;
    893	}
    894
    895	*sp_location = sp;
    896
    897out:
    898	return ret;
    899}
    900EXPORT_SYMBOL(transfer_args_to_stack);
    901
    902#endif /* CONFIG_MMU */
    903
    904static struct file *do_open_execat(int fd, struct filename *name, int flags)
    905{
    906	struct file *file;
    907	int err;
    908	struct open_flags open_exec_flags = {
    909		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
    910		.acc_mode = MAY_EXEC,
    911		.intent = LOOKUP_OPEN,
    912		.lookup_flags = LOOKUP_FOLLOW,
    913	};
    914
    915	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
    916		return ERR_PTR(-EINVAL);
    917	if (flags & AT_SYMLINK_NOFOLLOW)
    918		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
    919	if (flags & AT_EMPTY_PATH)
    920		open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
    921
    922	file = do_filp_open(fd, name, &open_exec_flags);
    923	if (IS_ERR(file))
    924		goto out;
    925
    926	/*
    927	 * may_open() has already checked for this, so it should be
    928	 * impossible to trip now. But we need to be extra cautious
    929	 * and check again at the very end too.
    930	 */
    931	err = -EACCES;
    932	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
    933			 path_noexec(&file->f_path)))
    934		goto exit;
    935
    936	err = deny_write_access(file);
    937	if (err)
    938		goto exit;
    939
    940	if (name->name[0] != '\0')
    941		fsnotify_open(file);
    942
    943out:
    944	return file;
    945
    946exit:
    947	fput(file);
    948	return ERR_PTR(err);
    949}
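/*
 * Illustrative userspace sketch, not part of the original source: the
 * AT_EMPTY_PATH handling above is what lets execveat(2) run a file that
 * was opened earlier, for example via O_PATH:
 *
 *	int fd = open("/usr/bin/true", O_PATH | O_CLOEXEC);
 *	char *argv[] = { "true", NULL }, *envp[] = { NULL };
 *	syscall(SYS_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
 */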
    950
    951struct file *open_exec(const char *name)
    952{
    953	struct filename *filename = getname_kernel(name);
    954	struct file *f = ERR_CAST(filename);
    955
    956	if (!IS_ERR(filename)) {
    957		f = do_open_execat(AT_FDCWD, filename, 0);
    958		putname(filename);
    959	}
    960	return f;
    961}
    962EXPORT_SYMBOL(open_exec);
    963
    964#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
    965    defined(CONFIG_BINFMT_ELF_FDPIC)
    966ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
    967{
    968	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
    969	if (res > 0)
    970		flush_icache_user_range(addr, addr + len);
    971	return res;
    972}
    973EXPORT_SYMBOL(read_code);
    974#endif
    975
    976/*
    977 * Maps the mm_struct mm into the current task struct.
    978 * On success, this function returns with exec_update_lock
    979 * held for writing.
    980 */
    981static int exec_mmap(struct mm_struct *mm)
    982{
    983	struct task_struct *tsk;
    984	struct mm_struct *old_mm, *active_mm;
    985	int ret;
    986
    987	/* Notify parent that we're no longer interested in the old VM */
    988	tsk = current;
    989	old_mm = current->mm;
    990	exec_mm_release(tsk, old_mm);
    991	if (old_mm)
    992		sync_mm_rss(old_mm);
    993
    994	ret = down_write_killable(&tsk->signal->exec_update_lock);
    995	if (ret)
    996		return ret;
    997
    998	if (old_mm) {
    999		/*
    1000		 * If there is a pending fatal signal, perhaps a signal
    1001		 * whose default action is to create a coredump, get
   1002		 * out and die instead of going through with the exec.
   1003		 */
   1004		ret = mmap_read_lock_killable(old_mm);
   1005		if (ret) {
   1006			up_write(&tsk->signal->exec_update_lock);
   1007			return ret;
   1008		}
   1009	}
   1010
   1011	task_lock(tsk);
   1012	membarrier_exec_mmap(mm);
   1013
   1014	local_irq_disable();
   1015	active_mm = tsk->active_mm;
   1016	tsk->active_mm = mm;
   1017	tsk->mm = mm;
   1018	/*
   1019	 * This prevents preemption while active_mm is being loaded and
   1020	 * it and mm are being updated, which could cause problems for
   1021	 * lazy tlb mm refcounting when these are updated by context
   1022	 * switches. Not all architectures can handle irqs off over
   1023	 * activate_mm yet.
   1024	 */
   1025	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
   1026		local_irq_enable();
   1027	activate_mm(active_mm, mm);
   1028	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
   1029		local_irq_enable();
   1030	tsk->mm->vmacache_seqnum = 0;
   1031	vmacache_flush(tsk);
   1032	task_unlock(tsk);
   1033	if (old_mm) {
   1034		mmap_read_unlock(old_mm);
   1035		BUG_ON(active_mm != old_mm);
   1036		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
   1037		mm_update_next_owner(old_mm);
   1038		mmput(old_mm);
   1039		return 0;
   1040	}
   1041	mmdrop(active_mm);
   1042	return 0;
   1043}
   1044
   1045static int de_thread(struct task_struct *tsk)
   1046{
   1047	struct signal_struct *sig = tsk->signal;
   1048	struct sighand_struct *oldsighand = tsk->sighand;
   1049	spinlock_t *lock = &oldsighand->siglock;
   1050
   1051	if (thread_group_empty(tsk))
   1052		goto no_thread_group;
   1053
   1054	/*
   1055	 * Kill all other threads in the thread group.
   1056	 */
   1057	spin_lock_irq(lock);
   1058	if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
   1059		/*
   1060		 * Another group action in progress, just
   1061		 * return so that the signal is processed.
   1062		 */
   1063		spin_unlock_irq(lock);
   1064		return -EAGAIN;
   1065	}
   1066
   1067	sig->group_exec_task = tsk;
   1068	sig->notify_count = zap_other_threads(tsk);
   1069	if (!thread_group_leader(tsk))
   1070		sig->notify_count--;
   1071
   1072	while (sig->notify_count) {
   1073		__set_current_state(TASK_KILLABLE);
   1074		spin_unlock_irq(lock);
   1075		schedule();
   1076		if (__fatal_signal_pending(tsk))
   1077			goto killed;
   1078		spin_lock_irq(lock);
   1079	}
   1080	spin_unlock_irq(lock);
   1081
   1082	/*
   1083	 * At this point all other threads have exited, all we have to
   1084	 * do is to wait for the thread group leader to become inactive,
   1085	 * and to assume its PID:
   1086	 */
   1087	if (!thread_group_leader(tsk)) {
   1088		struct task_struct *leader = tsk->group_leader;
   1089
   1090		for (;;) {
   1091			cgroup_threadgroup_change_begin(tsk);
   1092			write_lock_irq(&tasklist_lock);
   1093			/*
   1094			 * Do this under tasklist_lock to ensure that
   1095			 * exit_notify() can't miss ->group_exec_task
   1096			 */
   1097			sig->notify_count = -1;
   1098			if (likely(leader->exit_state))
   1099				break;
   1100			__set_current_state(TASK_KILLABLE);
   1101			write_unlock_irq(&tasklist_lock);
   1102			cgroup_threadgroup_change_end(tsk);
   1103			schedule();
   1104			if (__fatal_signal_pending(tsk))
   1105				goto killed;
   1106		}
   1107
   1108		/*
   1109		 * The only record we have of the real-time age of a
   1110		 * process, regardless of execs it's done, is start_time.
   1111		 * All the past CPU time is accumulated in signal_struct
   1112		 * from sister threads now dead.  But in this non-leader
   1113		 * exec, nothing survives from the original leader thread,
   1114		 * whose birth marks the true age of this process now.
   1115		 * When we take on its identity by switching to its PID, we
   1116		 * also take its birthdate (always earlier than our own).
   1117		 */
   1118		tsk->start_time = leader->start_time;
   1119		tsk->start_boottime = leader->start_boottime;
   1120
   1121		BUG_ON(!same_thread_group(leader, tsk));
   1122		/*
   1123		 * An exec() starts a new thread group with the
   1124		 * TGID of the previous thread group. Rehash the
   1125		 * two threads with a switched PID, and release
   1126		 * the former thread group leader:
   1127		 */
   1128
   1129		/* Become a process group leader with the old leader's pid.
    1130	 * The old leader becomes a thread of this thread group.
   1131		 */
   1132		exchange_tids(tsk, leader);
   1133		transfer_pid(leader, tsk, PIDTYPE_TGID);
   1134		transfer_pid(leader, tsk, PIDTYPE_PGID);
   1135		transfer_pid(leader, tsk, PIDTYPE_SID);
   1136
   1137		list_replace_rcu(&leader->tasks, &tsk->tasks);
   1138		list_replace_init(&leader->sibling, &tsk->sibling);
   1139
   1140		tsk->group_leader = tsk;
   1141		leader->group_leader = tsk;
   1142
   1143		tsk->exit_signal = SIGCHLD;
   1144		leader->exit_signal = -1;
   1145
   1146		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
   1147		leader->exit_state = EXIT_DEAD;
   1148
   1149		/*
   1150		 * We are going to release_task()->ptrace_unlink() silently,
   1151		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
    1152		 * the tracer won't block again waiting for this thread.
   1153		 */
   1154		if (unlikely(leader->ptrace))
   1155			__wake_up_parent(leader, leader->parent);
   1156		write_unlock_irq(&tasklist_lock);
   1157		cgroup_threadgroup_change_end(tsk);
   1158
   1159		release_task(leader);
   1160	}
   1161
   1162	sig->group_exec_task = NULL;
   1163	sig->notify_count = 0;
   1164
   1165no_thread_group:
   1166	/* we have changed execution domain */
   1167	tsk->exit_signal = SIGCHLD;
   1168
   1169	BUG_ON(!thread_group_leader(tsk));
   1170	return 0;
   1171
   1172killed:
   1173	/* protects against exit_notify() and __exit_signal() */
   1174	read_lock(&tasklist_lock);
   1175	sig->group_exec_task = NULL;
   1176	sig->notify_count = 0;
   1177	read_unlock(&tasklist_lock);
   1178	return -EAGAIN;
   1179}
   1180
   1181
   1182/*
   1183 * This function makes sure the current process has its own signal table,
   1184 * so that flush_signal_handlers can later reset the handlers without
   1185 * disturbing other processes.  (Other processes might share the signal
   1186 * table via the CLONE_SIGHAND option to clone().)
   1187 */
   1188static int unshare_sighand(struct task_struct *me)
   1189{
   1190	struct sighand_struct *oldsighand = me->sighand;
   1191
   1192	if (refcount_read(&oldsighand->count) != 1) {
   1193		struct sighand_struct *newsighand;
   1194		/*
   1195		 * This ->sighand is shared with the CLONE_SIGHAND
   1196		 * but not CLONE_THREAD task, switch to the new one.
   1197		 */
   1198		newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
   1199		if (!newsighand)
   1200			return -ENOMEM;
   1201
   1202		refcount_set(&newsighand->count, 1);
   1203		memcpy(newsighand->action, oldsighand->action,
   1204		       sizeof(newsighand->action));
   1205
   1206		write_lock_irq(&tasklist_lock);
   1207		spin_lock(&oldsighand->siglock);
   1208		rcu_assign_pointer(me->sighand, newsighand);
   1209		spin_unlock(&oldsighand->siglock);
   1210		write_unlock_irq(&tasklist_lock);
   1211
   1212		__cleanup_sighand(oldsighand);
   1213	}
   1214	return 0;
   1215}
   1216
   1217char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
   1218{
   1219	task_lock(tsk);
   1220	/* Always NUL terminated and zero-padded */
   1221	strscpy_pad(buf, tsk->comm, buf_size);
   1222	task_unlock(tsk);
   1223	return buf;
   1224}
   1225EXPORT_SYMBOL_GPL(__get_task_comm);
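/*
 * Illustrative usage sketch, not part of the original source: callers
 * normally go through the get_task_comm() wrapper from <linux/sched.h>,
 * which supplies sizeof(buf) for them.
 */
static void example_log_comm(struct task_struct *tsk)
{
	char comm[TASK_COMM_LEN];

	get_task_comm(comm, tsk);
	pr_info("running task: %s\n", comm);
}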
   1226
   1227/*
    1228 * These functions flush out all traces of the currently running executable
    1229 * so that a new one can be started.
   1230 */
   1231
   1232void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
   1233{
   1234	task_lock(tsk);
   1235	trace_task_rename(tsk, buf);
   1236	strscpy_pad(tsk->comm, buf, sizeof(tsk->comm));
   1237	task_unlock(tsk);
   1238	perf_event_comm(tsk, exec);
   1239}
   1240
   1241/*
   1242 * Calling this is the point of no return. None of the failures will be
   1243 * seen by userspace since either the process is already taking a fatal
   1244 * signal (via de_thread() or coredump), or will have SEGV raised
   1245 * (after exec_mmap()) by search_binary_handler (see below).
   1246 */
   1247int begin_new_exec(struct linux_binprm * bprm)
   1248{
   1249	struct task_struct *me = current;
   1250	int retval;
   1251
   1252	/* Once we are committed compute the creds */
   1253	retval = bprm_creds_from_file(bprm);
   1254	if (retval)
   1255		return retval;
   1256
   1257	/*
   1258	 * Ensure all future errors are fatal.
   1259	 */
   1260	bprm->point_of_no_return = true;
   1261
   1262	/*
   1263	 * Make this the only thread in the thread group.
   1264	 */
   1265	retval = de_thread(me);
   1266	if (retval)
   1267		goto out;
   1268
   1269	/*
   1270	 * Cancel any io_uring activity across execve
   1271	 */
   1272	io_uring_task_cancel();
   1273
   1274	/* Ensure the files table is not shared. */
   1275	retval = unshare_files();
   1276	if (retval)
   1277		goto out;
   1278
   1279	/*
   1280	 * Must be called _before_ exec_mmap() as bprm->mm is
   1281	 * not visible until then. This also enables the update
   1282	 * to be lockless.
   1283	 */
   1284	retval = set_mm_exe_file(bprm->mm, bprm->file);
   1285	if (retval)
   1286		goto out;
   1287
   1288	/* If the binary is not readable then enforce mm->dumpable=0 */
   1289	would_dump(bprm, bprm->file);
   1290	if (bprm->have_execfd)
   1291		would_dump(bprm, bprm->executable);
   1292
   1293	/*
   1294	 * Release all of the old mmap stuff
   1295	 */
   1296	acct_arg_size(bprm, 0);
   1297	retval = exec_mmap(bprm->mm);
   1298	if (retval)
   1299		goto out;
   1300
   1301	bprm->mm = NULL;
   1302
   1303#ifdef CONFIG_POSIX_TIMERS
   1304	exit_itimers(me->signal);
   1305	flush_itimer_signals();
   1306#endif
   1307
   1308	/*
   1309	 * Make the signal table private.
   1310	 */
   1311	retval = unshare_sighand(me);
   1312	if (retval)
   1313		goto out_unlock;
   1314
   1315	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
   1316					PF_NOFREEZE | PF_NO_SETAFFINITY);
   1317	flush_thread();
   1318	me->personality &= ~bprm->per_clear;
   1319
   1320	clear_syscall_work_syscall_user_dispatch(me);
   1321
   1322	/*
   1323	 * We have to apply CLOEXEC before we change whether the process is
   1324	 * dumpable (in setup_new_exec) to avoid a race with a process in userspace
   1325	 * trying to access the should-be-closed file descriptors of a process
   1326	 * undergoing exec(2).
   1327	 */
   1328	do_close_on_exec(me->files);
   1329
   1330	if (bprm->secureexec) {
   1331		/* Make sure parent cannot signal privileged process. */
   1332		me->pdeath_signal = 0;
   1333
   1334		/*
   1335		 * For secureexec, reset the stack limit to sane default to
   1336		 * avoid bad behavior from the prior rlimits. This has to
   1337		 * happen before arch_pick_mmap_layout(), which examines
   1338		 * RLIMIT_STACK, but after the point of no return to avoid
   1339		 * needing to clean up the change on failure.
   1340		 */
   1341		if (bprm->rlim_stack.rlim_cur > _STK_LIM)
   1342			bprm->rlim_stack.rlim_cur = _STK_LIM;
   1343	}
   1344
   1345	me->sas_ss_sp = me->sas_ss_size = 0;
   1346
   1347	/*
    1348	 * Figure out dumpability. Note that checking only current here is
    1349	 * wrong, but userspace depends on it. This should be testing
   1350	 * bprm->secureexec instead.
   1351	 */
   1352	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
   1353	    !(uid_eq(current_euid(), current_uid()) &&
   1354	      gid_eq(current_egid(), current_gid())))
   1355		set_dumpable(current->mm, suid_dumpable);
   1356	else
   1357		set_dumpable(current->mm, SUID_DUMP_USER);
   1358
   1359	perf_event_exec();
   1360	__set_task_comm(me, kbasename(bprm->filename), true);
   1361
   1362	/* An exec changes our domain. We are no longer part of the thread
   1363	   group */
   1364	WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
   1365	flush_signal_handlers(me, 0);
   1366
   1367	retval = set_cred_ucounts(bprm->cred);
   1368	if (retval < 0)
   1369		goto out_unlock;
   1370
   1371	/*
   1372	 * install the new credentials for this executable
   1373	 */
   1374	security_bprm_committing_creds(bprm);
   1375
   1376	commit_creds(bprm->cred);
   1377	bprm->cred = NULL;
   1378
   1379	/*
   1380	 * Disable monitoring for regular users
   1381	 * when executing setuid binaries. Must
   1382	 * wait until new credentials are committed
   1383	 * by commit_creds() above
   1384	 */
   1385	if (get_dumpable(me->mm) != SUID_DUMP_USER)
   1386		perf_event_exit_task(me);
   1387	/*
   1388	 * cred_guard_mutex must be held at least to this point to prevent
   1389	 * ptrace_attach() from altering our determination of the task's
   1390	 * credentials; any time after this it may be unlocked.
   1391	 */
   1392	security_bprm_committed_creds(bprm);
   1393
   1394	/* Pass the opened binary to the interpreter. */
   1395	if (bprm->have_execfd) {
   1396		retval = get_unused_fd_flags(0);
   1397		if (retval < 0)
   1398			goto out_unlock;
   1399		fd_install(retval, bprm->executable);
   1400		bprm->executable = NULL;
   1401		bprm->execfd = retval;
   1402	}
   1403	return 0;
   1404
   1405out_unlock:
   1406	up_write(&me->signal->exec_update_lock);
   1407out:
   1408	return retval;
   1409}
   1410EXPORT_SYMBOL(begin_new_exec);
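/*
 * Ordering note, illustrative and not part of the original source: a
 * binfmt loader typically calls begin_new_exec(), then setup_new_exec(),
 * then setup_arg_pages(), maps the binary, and ends with finalize_exec()
 * just before start_thread(); fs/binfmt_elf.c follows this sequence.
 */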
   1411
   1412void would_dump(struct linux_binprm *bprm, struct file *file)
   1413{
   1414	struct inode *inode = file_inode(file);
   1415	struct user_namespace *mnt_userns = file_mnt_user_ns(file);
   1416	if (inode_permission(mnt_userns, inode, MAY_READ) < 0) {
   1417		struct user_namespace *old, *user_ns;
   1418		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
   1419
   1420		/* Ensure mm->user_ns contains the executable */
   1421		user_ns = old = bprm->mm->user_ns;
   1422		while ((user_ns != &init_user_ns) &&
   1423		       !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode))
   1424			user_ns = user_ns->parent;
   1425
   1426		if (old != user_ns) {
   1427			bprm->mm->user_ns = get_user_ns(user_ns);
   1428			put_user_ns(old);
   1429		}
   1430	}
   1431}
   1432EXPORT_SYMBOL(would_dump);
   1433
   1434void setup_new_exec(struct linux_binprm * bprm)
   1435{
   1436	/* Setup things that can depend upon the personality */
   1437	struct task_struct *me = current;
   1438
   1439	arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
   1440
   1441	arch_setup_new_exec();
   1442
   1443	/* Set the new mm task size. We have to do that late because it may
   1444	 * depend on TIF_32BIT which is only updated in flush_thread() on
   1445	 * some architectures like powerpc
   1446	 */
   1447	me->mm->task_size = TASK_SIZE;
   1448	up_write(&me->signal->exec_update_lock);
   1449	mutex_unlock(&me->signal->cred_guard_mutex);
   1450}
   1451EXPORT_SYMBOL(setup_new_exec);
   1452
   1453/* Runs immediately before start_thread() takes over. */
   1454void finalize_exec(struct linux_binprm *bprm)
   1455{
   1456	/* Store any stack rlimit changes before starting thread. */
   1457	task_lock(current->group_leader);
   1458	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
   1459	task_unlock(current->group_leader);
   1460}
   1461EXPORT_SYMBOL(finalize_exec);
   1462
   1463/*
   1464 * Prepare credentials and lock ->cred_guard_mutex.
   1465 * setup_new_exec() commits the new creds and drops the lock.
   1466 * Or, if exec fails before, free_bprm() should release ->cred
   1467 * and unlock.
   1468 */
   1469static int prepare_bprm_creds(struct linux_binprm *bprm)
   1470{
   1471	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
   1472		return -ERESTARTNOINTR;
   1473
   1474	bprm->cred = prepare_exec_creds();
   1475	if (likely(bprm->cred))
   1476		return 0;
   1477
   1478	mutex_unlock(&current->signal->cred_guard_mutex);
   1479	return -ENOMEM;
   1480}
   1481
   1482static void free_bprm(struct linux_binprm *bprm)
   1483{
   1484	if (bprm->mm) {
   1485		acct_arg_size(bprm, 0);
   1486		mmput(bprm->mm);
   1487	}
   1488	free_arg_pages(bprm);
   1489	if (bprm->cred) {
   1490		mutex_unlock(&current->signal->cred_guard_mutex);
   1491		abort_creds(bprm->cred);
   1492	}
   1493	if (bprm->file) {
   1494		allow_write_access(bprm->file);
   1495		fput(bprm->file);
   1496	}
   1497	if (bprm->executable)
   1498		fput(bprm->executable);
   1499	/* If a binfmt changed the interp, free it. */
   1500	if (bprm->interp != bprm->filename)
   1501		kfree(bprm->interp);
   1502	kfree(bprm->fdpath);
   1503	kfree(bprm);
   1504}
   1505
   1506static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
   1507{
   1508	struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
   1509	int retval = -ENOMEM;
   1510	if (!bprm)
   1511		goto out;
   1512
   1513	if (fd == AT_FDCWD || filename->name[0] == '/') {
   1514		bprm->filename = filename->name;
   1515	} else {
   1516		if (filename->name[0] == '\0')
   1517			bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
   1518		else
   1519			bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
   1520						  fd, filename->name);
   1521		if (!bprm->fdpath)
   1522			goto out_free;
   1523
   1524		bprm->filename = bprm->fdpath;
   1525	}
   1526	bprm->interp = bprm->filename;
   1527
   1528	retval = bprm_mm_init(bprm);
   1529	if (retval)
   1530		goto out_free;
   1531	return bprm;
   1532
   1533out_free:
   1534	free_bprm(bprm);
   1535out:
   1536	return ERR_PTR(retval);
   1537}
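/*
 * Illustrative examples, not part of the original source, of the
 * bprm->filename produced by alloc_bprm() above, assuming fd 7:
 *
 *	execveat(7, "", ..., AT_EMPTY_PATH)        -> "/dev/fd/7"
 *	execveat(7, "bin/tool", ..., 0)            -> "/dev/fd/7/bin/tool"
 *	execveat(AT_FDCWD, "/usr/bin/tool", ...)   -> "/usr/bin/tool"
 */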
   1538
   1539int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
   1540{
   1541	/* If a binfmt changed the interp, free it first. */
   1542	if (bprm->interp != bprm->filename)
   1543		kfree(bprm->interp);
   1544	bprm->interp = kstrdup(interp, GFP_KERNEL);
   1545	if (!bprm->interp)
   1546		return -ENOMEM;
   1547	return 0;
   1548}
   1549EXPORT_SYMBOL(bprm_change_interp);
   1550
   1551/*
   1552 * determine how safe it is to execute the proposed program
   1553 * - the caller must hold ->cred_guard_mutex to protect against
   1554 *   PTRACE_ATTACH or seccomp thread-sync
   1555 */
   1556static void check_unsafe_exec(struct linux_binprm *bprm)
   1557{
   1558	struct task_struct *p = current, *t;
   1559	unsigned n_fs;
   1560
   1561	if (p->ptrace)
   1562		bprm->unsafe |= LSM_UNSAFE_PTRACE;
   1563
   1564	/*
   1565	 * This isn't strictly necessary, but it makes it harder for LSMs to
   1566	 * mess up.
   1567	 */
   1568	if (task_no_new_privs(current))
   1569		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
   1570
   1571	t = p;
   1572	n_fs = 1;
   1573	spin_lock(&p->fs->lock);
   1574	rcu_read_lock();
   1575	while_each_thread(p, t) {
   1576		if (t->fs == p->fs)
   1577			n_fs++;
   1578	}
   1579	rcu_read_unlock();
   1580
   1581	if (p->fs->users > n_fs)
   1582		bprm->unsafe |= LSM_UNSAFE_SHARE;
   1583	else
   1584		p->fs->in_exec = 1;
   1585	spin_unlock(&p->fs->lock);
   1586}
   1587
   1588static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
   1589{
   1590	/* Handle suid and sgid on files */
   1591	struct user_namespace *mnt_userns;
   1592	struct inode *inode;
   1593	unsigned int mode;
   1594	kuid_t uid;
   1595	kgid_t gid;
   1596
   1597	if (!mnt_may_suid(file->f_path.mnt))
   1598		return;
   1599
   1600	if (task_no_new_privs(current))
   1601		return;
   1602
   1603	inode = file->f_path.dentry->d_inode;
   1604	mode = READ_ONCE(inode->i_mode);
   1605	if (!(mode & (S_ISUID|S_ISGID)))
   1606		return;
   1607
   1608	mnt_userns = file_mnt_user_ns(file);
   1609
   1610	/* Be careful if suid/sgid is set */
   1611	inode_lock(inode);
   1612
   1613	/* reload atomically mode/uid/gid now that lock held */
   1614	mode = inode->i_mode;
   1615	uid = i_uid_into_mnt(mnt_userns, inode);
   1616	gid = i_gid_into_mnt(mnt_userns, inode);
   1617	inode_unlock(inode);
   1618
   1619	/* We ignore suid/sgid if there are no mappings for them in the ns */
   1620	if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
   1621		 !kgid_has_mapping(bprm->cred->user_ns, gid))
   1622		return;
   1623
   1624	if (mode & S_ISUID) {
   1625		bprm->per_clear |= PER_CLEAR_ON_SETID;
   1626		bprm->cred->euid = uid;
   1627	}
   1628
   1629	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
   1630		bprm->per_clear |= PER_CLEAR_ON_SETID;
   1631		bprm->cred->egid = gid;
   1632	}
   1633}
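/*
 * Concrete instance, illustrative only: for a binary installed as
 * root:root with mode 04755 on a mount that allows suid, the S_ISUID
 * branch above sets bprm->cred->euid to root and flags the personality
 * bits in PER_CLEAR_ON_SETID for clearing at begin_new_exec() time.
 */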
   1634
   1635/*
    1636 * Compute bprm->cred based upon the final binary.
   1637 */
   1638static int bprm_creds_from_file(struct linux_binprm *bprm)
   1639{
   1640	/* Compute creds based on which file? */
   1641	struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
   1642
   1643	bprm_fill_uid(bprm, file);
   1644	return security_bprm_creds_from_file(bprm, file);
   1645}
   1646
   1647/*
   1648 * Fill the binprm structure from the inode.
   1649 * Read the first BINPRM_BUF_SIZE bytes
   1650 *
   1651 * This may be called multiple times for binary chains (scripts for example).
   1652 */
   1653static int prepare_binprm(struct linux_binprm *bprm)
   1654{
   1655	loff_t pos = 0;
   1656
   1657	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
   1658	return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
   1659}
   1660
   1661/*
   1662 * Arguments are '\0' separated strings found at the location bprm->p
    1663 * points to; chop off the first by relocating bprm->p to right after
   1664 * the first '\0' encountered.
   1665 */
   1666int remove_arg_zero(struct linux_binprm *bprm)
   1667{
   1668	int ret = 0;
   1669	unsigned long offset;
   1670	char *kaddr;
   1671	struct page *page;
   1672
   1673	if (!bprm->argc)
   1674		return 0;
   1675
   1676	do {
   1677		offset = bprm->p & ~PAGE_MASK;
   1678		page = get_arg_page(bprm, bprm->p, 0);
   1679		if (!page) {
   1680			ret = -EFAULT;
   1681			goto out;
   1682		}
   1683		kaddr = kmap_atomic(page);
   1684
   1685		for (; offset < PAGE_SIZE && kaddr[offset];
   1686				offset++, bprm->p++)
   1687			;
   1688
   1689		kunmap_atomic(kaddr);
   1690		put_arg_page(page);
   1691	} while (offset == PAGE_SIZE);
   1692
   1693	bprm->p++;
   1694	bprm->argc--;
   1695	ret = 0;
   1696
   1697out:
   1698	return ret;
   1699}
   1700EXPORT_SYMBOL(remove_arg_zero);
   1701
   1702#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
   1703/*
   1704 * cycle the list of binary formats handler, until one recognizes the image
   1705 */
   1706static int search_binary_handler(struct linux_binprm *bprm)
   1707{
   1708	bool need_retry = IS_ENABLED(CONFIG_MODULES);
   1709	struct linux_binfmt *fmt;
   1710	int retval;
   1711
   1712	retval = prepare_binprm(bprm);
   1713	if (retval < 0)
   1714		return retval;
   1715
   1716	retval = security_bprm_check(bprm);
   1717	if (retval)
   1718		return retval;
   1719
   1720	retval = -ENOENT;
   1721 retry:
   1722	read_lock(&binfmt_lock);
   1723	list_for_each_entry(fmt, &formats, lh) {
   1724		if (!try_module_get(fmt->module))
   1725			continue;
   1726		read_unlock(&binfmt_lock);
   1727
   1728		retval = fmt->load_binary(bprm);
   1729
   1730		read_lock(&binfmt_lock);
   1731		put_binfmt(fmt);
   1732		if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
   1733			read_unlock(&binfmt_lock);
   1734			return retval;
   1735		}
   1736	}
   1737	read_unlock(&binfmt_lock);
   1738
   1739	if (need_retry) {
   1740		if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
   1741		    printable(bprm->buf[2]) && printable(bprm->buf[3]))
   1742			return retval;
   1743		if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
   1744			return retval;
   1745		need_retry = false;
   1746		goto retry;
   1747	}
   1748
   1749	return retval;
   1750}
   1751
   1752static int exec_binprm(struct linux_binprm *bprm)
   1753{
   1754	pid_t old_pid, old_vpid;
   1755	int ret, depth;
   1756
   1757	/* Need to fetch pid before load_binary changes it */
   1758	old_pid = current->pid;
   1759	rcu_read_lock();
   1760	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
   1761	rcu_read_unlock();
   1762
   1763	/* This allows 4 levels of binfmt rewrites before failing hard. */
   1764	for (depth = 0;; depth++) {
   1765		struct file *exec;
   1766		if (depth > 5)
   1767			return -ELOOP;
   1768
   1769		ret = search_binary_handler(bprm);
   1770		if (ret < 0)
   1771			return ret;
   1772		if (!bprm->interpreter)
   1773			break;
   1774
   1775		exec = bprm->file;
   1776		bprm->file = bprm->interpreter;
   1777		bprm->interpreter = NULL;
   1778
   1779		allow_write_access(exec);
   1780		if (unlikely(bprm->have_execfd)) {
   1781			if (bprm->executable) {
   1782				fput(exec);
   1783				return -ENOEXEC;
   1784			}
   1785			bprm->executable = exec;
   1786		} else
   1787			fput(exec);
   1788	}
   1789
   1790	audit_bprm(bprm);
   1791	trace_sched_process_exec(current, old_pid, bprm);
   1792	ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
   1793	proc_exec_connector(current);
   1794	return 0;
   1795}
   1796
   1797/*
   1798 * sys_execve() executes a new program.
   1799 */
   1800static int bprm_execve(struct linux_binprm *bprm,
   1801		       int fd, struct filename *filename, int flags)
   1802{
   1803	struct file *file;
   1804	int retval;
   1805
   1806	retval = prepare_bprm_creds(bprm);
   1807	if (retval)
   1808		return retval;
   1809
   1810	check_unsafe_exec(bprm);
   1811	current->in_execve = 1;
   1812
   1813	file = do_open_execat(fd, filename, flags);
   1814	retval = PTR_ERR(file);
   1815	if (IS_ERR(file))
   1816		goto out_unmark;
   1817
   1818	sched_exec();
   1819
   1820	bprm->file = file;
   1821	/*
   1822	 * Record that a name derived from an O_CLOEXEC fd will be
   1823	 * inaccessible after exec.  This allows the code in exec to
   1824	 * choose to fail when the executable is not mmapped into the
   1825	 * interpreter and an open file descriptor is not passed to
   1826	 * the interpreter.  This makes for a better user experience
   1827	 * than having the interpreter start and then immediately fail
   1828	 * when it finds the executable is inaccessible.
   1829	 */
   1830	if (bprm->fdpath && get_close_on_exec(fd))
   1831		bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
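	/*
	 * Example (illustrative, userspace view): fexecve(3) on a script is
	 * the case this flag covers.  If the script was opened with
	 * O_CLOEXEC, the /dev/fd/N name recorded in bprm->fdpath vanishes
	 * when the fd is closed on exec, so binfmt_script can refuse up
	 * front (ENOENT) instead of letting the interpreter start and then
	 * fail.  A minimal sketch, assuming glibc and an executable #! script
	 * at the hypothetical path "./run.sh":
	 *
	 *	#define _GNU_SOURCE
	 *	#include <fcntl.h>
	 *	#include <unistd.h>
	 *
	 *	int main(void)
	 *	{
	 *		char *argv[] = { "run.sh", NULL };
	 *		char *envp[] = { NULL };
	 *		int fd = open("./run.sh", O_RDONLY | O_CLOEXEC);
	 *
	 *		if (fd < 0)
	 *			return 1;
	 *		// Fails with ENOENT here, before the interpreter runs.
	 *		fexecve(fd, argv, envp);
	 *		return 1;
	 *	}
	 */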
   1832
   1833	/* Set the unchanging part of bprm->cred */
   1834	retval = security_bprm_creds_for_exec(bprm);
   1835	if (retval)
   1836		goto out;
   1837
   1838	retval = exec_binprm(bprm);
   1839	if (retval < 0)
   1840		goto out;
   1841
   1842	/* execve succeeded */
   1843	current->fs->in_exec = 0;
   1844	current->in_execve = 0;
   1845	rseq_execve(current);
   1846	acct_update_integrals(current);
   1847	task_numa_free(current, false);
   1848	return retval;
   1849
   1850out:
   1851	/*
   1852	 * If past the point of no return ensure the code never
   1853	 * returns to the userspace process.  Use an existing fatal
   1854	 * signal if present otherwise terminate the process with
   1855	 * SIGSEGV.
   1856	 */
   1857	if (bprm->point_of_no_return && !fatal_signal_pending(current))
   1858		force_fatal_sig(SIGSEGV);
   1859
   1860out_unmark:
   1861	current->fs->in_exec = 0;
   1862	current->in_execve = 0;
   1863
   1864	return retval;
   1865}
   1866
   1867static int do_execveat_common(int fd, struct filename *filename,
   1868			      struct user_arg_ptr argv,
   1869			      struct user_arg_ptr envp,
   1870			      int flags)
   1871{
   1872	struct linux_binprm *bprm;
   1873	int retval;
   1874
   1875	if (IS_ERR(filename))
   1876		return PTR_ERR(filename);
   1877
   1878	/*
   1879	 * We move the actual failure in case of RLIMIT_NPROC excess from
   1880	 * set*uid() to execve() because too many poorly written programs
   1881	 * don't check setuid() return code.  Here we additionally recheck
   1882	 * whether NPROC limit is still exceeded.
   1883	 */
   1884	if ((current->flags & PF_NPROC_EXCEEDED) &&
   1885	    is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
   1886		retval = -EAGAIN;
   1887		goto out_ret;
   1888	}
   1889
   1890	/* We're below the limit (still or again), so we don't want to make
   1891	 * further execve() calls fail. */
   1892	current->flags &= ~PF_NPROC_EXCEEDED;
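	/*
	 * Userspace-visible effect (illustrative): a privileged process that
	 * calls setuid() to a uid already at its RLIMIT_NPROC limit still
	 * gets a 0 return from setuid(); the failure is deferred to the next
	 * execve(), which then fails with EAGAIN.  A daemon dropping
	 * privileges should therefore check both calls, e.g. (uid, path,
	 * argv and envp are placeholders):
	 *
	 *	if (setuid(uid) < 0)
	 *		err(1, "setuid");
	 *	execve(path, argv, envp);
	 *	err(1, "execve");	// may report EAGAIN due to RLIMIT_NPROC
	 */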
   1893
   1894	bprm = alloc_bprm(fd, filename);
   1895	if (IS_ERR(bprm)) {
   1896		retval = PTR_ERR(bprm);
   1897		goto out_ret;
   1898	}
   1899
   1900	retval = count(argv, MAX_ARG_STRINGS);
   1901	if (retval == 0)
   1902		pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
   1903			     current->comm, bprm->filename);
   1904	if (retval < 0)
   1905		goto out_free;
   1906	bprm->argc = retval;
   1907
   1908	retval = count(envp, MAX_ARG_STRINGS);
   1909	if (retval < 0)
   1910		goto out_free;
   1911	bprm->envc = retval;
   1912
   1913	retval = bprm_stack_limits(bprm);
   1914	if (retval < 0)
   1915		goto out_free;
   1916
   1917	retval = copy_string_kernel(bprm->filename, bprm);
   1918	if (retval < 0)
   1919		goto out_free;
   1920	bprm->exec = bprm->p;
   1921
   1922	retval = copy_strings(bprm->envc, envp, bprm);
   1923	if (retval < 0)
   1924		goto out_free;
   1925
   1926	retval = copy_strings(bprm->argc, argv, bprm);
   1927	if (retval < 0)
   1928		goto out_free;
   1929
   1930	/*
   1931	 * When argv is empty, add an empty string ("") as argv[0] to
   1932	 * ensure confused userspace programs that start processing
   1933	 * from argv[1] won't end up walking envp. See also
   1934	 * bprm_stack_limits().
   1935	 */
   1936	if (bprm->argc == 0) {
   1937		retval = copy_string_kernel("", bprm);
   1938		if (retval < 0)
   1939			goto out_free;
   1940		bprm->argc = 1;
   1941	}
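	/*
	 * Example (illustrative): a caller that passes an empty argv, e.g.
	 *
	 *	char *empty[] = { NULL };
	 *	execve("/usr/bin/some-prog", empty, envp);
	 *
	 * still results in the new program seeing argc == 1 with
	 * argv[0] == "" (and argv[1] == NULL) because of the fixup above,
	 * and the pr_warn_once() above fires the first time this happens.
	 * ("/usr/bin/some-prog" and envp are placeholders.)
	 */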
   1942
   1943	retval = bprm_execve(bprm, fd, filename, flags);
   1944out_free:
   1945	free_bprm(bprm);
   1946
   1947out_ret:
   1948	putname(filename);
   1949	return retval;
   1950}
   1951
   1952int kernel_execve(const char *kernel_filename,
   1953		  const char *const *argv, const char *const *envp)
   1954{
   1955	struct filename *filename;
   1956	struct linux_binprm *bprm;
   1957	int fd = AT_FDCWD;
   1958	int retval;
   1959
   1960	/* It makes no sense for kernel threads to call execve */
   1961	if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
   1962		return -EINVAL;
   1963
   1964	filename = getname_kernel(kernel_filename);
   1965	if (IS_ERR(filename))
   1966		return PTR_ERR(filename);
   1967
   1968	bprm = alloc_bprm(fd, filename);
   1969	if (IS_ERR(bprm)) {
   1970		retval = PTR_ERR(bprm);
   1971		goto out_ret;
   1972	}
   1973
   1974	retval = count_strings_kernel(argv);
   1975	if (WARN_ON_ONCE(retval == 0))
   1976		retval = -EINVAL;
   1977	if (retval < 0)
   1978		goto out_free;
   1979	bprm->argc = retval;
   1980
   1981	retval = count_strings_kernel(envp);
   1982	if (retval < 0)
   1983		goto out_free;
   1984	bprm->envc = retval;
   1985
   1986	retval = bprm_stack_limits(bprm);
   1987	if (retval < 0)
   1988		goto out_free;
   1989
   1990	retval = copy_string_kernel(bprm->filename, bprm);
   1991	if (retval < 0)
   1992		goto out_free;
   1993	bprm->exec = bprm->p;
   1994
   1995	retval = copy_strings_kernel(bprm->envc, envp, bprm);
   1996	if (retval < 0)
   1997		goto out_free;
   1998
   1999	retval = copy_strings_kernel(bprm->argc, argv, bprm);
   2000	if (retval < 0)
   2001		goto out_free;
   2002
   2003	retval = bprm_execve(bprm, fd, filename, 0);
   2004out_free:
   2005	free_bprm(bprm);
   2006out_ret:
   2007	putname(filename);
   2008	return retval;
   2009}
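/*
 * In-kernel usage example (illustrative): the boot path is the main caller;
 * roughly what run_init_process() in init/main.c does to start PID 1:
 *
 *	static const char *argv_init[] = { "/sbin/init", NULL };
 *	static const char *envp_init[] = { "HOME=/", "TERM=linux", NULL };
 *
 *	ret = kernel_execve("/sbin/init", argv_init, envp_init);
 *
 * On success it returns 0 and the calling task begins executing the new
 * program the next time it returns to userspace.
 */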
   2010
   2011static int do_execve(struct filename *filename,
   2012	const char __user *const __user *__argv,
   2013	const char __user *const __user *__envp)
   2014{
   2015	struct user_arg_ptr argv = { .ptr.native = __argv };
   2016	struct user_arg_ptr envp = { .ptr.native = __envp };
   2017	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
   2018}
   2019
   2020static int do_execveat(int fd, struct filename *filename,
   2021		const char __user *const __user *__argv,
   2022		const char __user *const __user *__envp,
   2023		int flags)
   2024{
   2025	struct user_arg_ptr argv = { .ptr.native = __argv };
   2026	struct user_arg_ptr envp = { .ptr.native = __envp };
   2027
   2028	return do_execveat_common(fd, filename, argv, envp, flags);
   2029}
   2030
   2031#ifdef CONFIG_COMPAT
   2032static int compat_do_execve(struct filename *filename,
   2033	const compat_uptr_t __user *__argv,
   2034	const compat_uptr_t __user *__envp)
   2035{
   2036	struct user_arg_ptr argv = {
   2037		.is_compat = true,
   2038		.ptr.compat = __argv,
   2039	};
   2040	struct user_arg_ptr envp = {
   2041		.is_compat = true,
   2042		.ptr.compat = __envp,
   2043	};
   2044	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
   2045}
   2046
   2047static int compat_do_execveat(int fd, struct filename *filename,
   2048			      const compat_uptr_t __user *__argv,
   2049			      const compat_uptr_t __user *__envp,
   2050			      int flags)
   2051{
   2052	struct user_arg_ptr argv = {
   2053		.is_compat = true,
   2054		.ptr.compat = __argv,
   2055	};
   2056	struct user_arg_ptr envp = {
   2057		.is_compat = true,
   2058		.ptr.compat = __envp,
   2059	};
   2060	return do_execveat_common(fd, filename, argv, envp, flags);
   2061}
   2062#endif
   2063
   2064void set_binfmt(struct linux_binfmt *new)
   2065{
   2066	struct mm_struct *mm = current->mm;
   2067
   2068	if (mm->binfmt)
   2069		module_put(mm->binfmt->module);
   2070
   2071	mm->binfmt = new;
   2072	if (new)
   2073		__module_get(new->module);
   2074}
   2075EXPORT_SYMBOL(set_binfmt);
   2076
   2077/*
   2078 * set_dumpable stores the three-valued SUID_DUMP_* state into mm->flags.
   2079 */
   2080void set_dumpable(struct mm_struct *mm, int value)
   2081{
   2082	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
   2083		return;
   2084
   2085	set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
   2086}
   2087
   2088SYSCALL_DEFINE3(execve,
   2089		const char __user *, filename,
   2090		const char __user *const __user *, argv,
   2091		const char __user *const __user *, envp)
   2092{
   2093	return do_execve(getname(filename), argv, envp);
   2094}
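/*
 * Userspace usage example (illustrative): execve() replaces the calling
 * process image and only returns on failure, so error handling follows the
 * call unconditionally:
 *
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	int main(void)
 *	{
 *		char *argv[] = { "ls", "-l", NULL };
 *		char *envp[] = { "PATH=/usr/bin:/bin", NULL };
 *
 *		execve("/bin/ls", argv, envp);
 *		err(1, "execve");	// reached only if execve() failed
 *	}
 */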
   2095
   2096SYSCALL_DEFINE5(execveat,
   2097		int, fd, const char __user *, filename,
   2098		const char __user *const __user *, argv,
   2099		const char __user *const __user *, envp,
   2100		int, flags)
   2101{
   2102	return do_execveat(fd,
   2103			   getname_uflags(filename, flags),
   2104			   argv, envp, flags);
   2105}
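/*
 * Userspace usage example (illustrative): execveat() with AT_EMPTY_PATH
 * executes the file referred to by the descriptor itself, which is how
 * glibc implements fexecve(3) on Linux.  A minimal sketch, assuming a
 * recent glibc that provides an execveat() wrapper (older versions need
 * syscall(2)):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	int main(void)
 *	{
 *		char *argv[] = { "true", NULL };
 *		char *envp[] = { NULL };
 *		int fd = open("/bin/true", O_PATH | O_CLOEXEC);
 *
 *		if (fd < 0)
 *			err(1, "open");
 *		execveat(fd, "", argv, envp, AT_EMPTY_PATH);
 *		err(1, "execveat");
 *	}
 */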
   2106
   2107#ifdef CONFIG_COMPAT
   2108COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
   2109	const compat_uptr_t __user *, argv,
   2110	const compat_uptr_t __user *, envp)
   2111{
   2112	return compat_do_execve(getname(filename), argv, envp);
   2113}
   2114
   2115COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
   2116		       const char __user *, filename,
   2117		       const compat_uptr_t __user *, argv,
   2118		       const compat_uptr_t __user *, envp,
   2119		       int,  flags)
   2120{
   2121	return compat_do_execveat(fd,
   2122				  getname_uflags(filename, flags),
   2123				  argv, envp, flags);
   2124}
   2125#endif
   2126
   2127#ifdef CONFIG_SYSCTL
   2128
   2129static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
   2130		void *buffer, size_t *lenp, loff_t *ppos)
   2131{
   2132	int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
   2133
   2134	if (!error)
   2135		validate_coredump_safety();
   2136	return error;
   2137}
   2138
   2139static struct ctl_table fs_exec_sysctls[] = {
   2140	{
   2141		.procname	= "suid_dumpable",
   2142		.data		= &suid_dumpable,
   2143		.maxlen		= sizeof(int),
   2144		.mode		= 0644,
   2145		.proc_handler	= proc_dointvec_minmax_coredump,
   2146		.extra1		= SYSCTL_ZERO,
   2147		.extra2		= SYSCTL_TWO,
   2148	},
   2149	{ }
   2150};
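/*
 * Admin-facing view (illustrative): the table above exposes the same
 * SUID_DUMP_* range as /proc/sys/fs/suid_dumpable, i.e. 0
 * (SUID_DUMP_DISABLE), 1 (SUID_DUMP_USER) and 2 (SUID_DUMP_ROOT).  A
 * minimal sketch of changing it from userspace, assuming root:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/proc/sys/fs/suid_dumpable", O_WRONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		write(fd, "2\n", 2);	// root-readable dumps of set-id binaries
 *		close(fd);
 *		return 0;
 *	}
 */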
   2151
   2152static int __init init_fs_exec_sysctls(void)
   2153{
   2154	register_sysctl_init("fs", fs_exec_sysctls);
   2155	return 0;
   2156}
   2157
   2158fs_initcall(init_fs_exec_sysctls);
   2159#endif /* CONFIG_SYSCTL */