sys.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
sys.c (66780B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  linux/kernel/sys.c
      4 *
      5 *  Copyright (C) 1991, 1992  Linus Torvalds
      6 */
      7
      8#include <linux/export.h>
      9#include <linux/mm.h>
     10#include <linux/mm_inline.h>
     11#include <linux/utsname.h>
     12#include <linux/mman.h>
     13#include <linux/reboot.h>
     14#include <linux/prctl.h>
     15#include <linux/highuid.h>
     16#include <linux/fs.h>
     17#include <linux/kmod.h>
     18#include <linux/perf_event.h>
     19#include <linux/resource.h>
     20#include <linux/kernel.h>
     21#include <linux/workqueue.h>
     22#include <linux/capability.h>
     23#include <linux/device.h>
     24#include <linux/key.h>
     25#include <linux/times.h>
     26#include <linux/posix-timers.h>
     27#include <linux/security.h>
     28#include <linux/suspend.h>
     29#include <linux/tty.h>
     30#include <linux/signal.h>
     31#include <linux/cn_proc.h>
     32#include <linux/getcpu.h>
     33#include <linux/task_io_accounting_ops.h>
     34#include <linux/seccomp.h>
     35#include <linux/cpu.h>
     36#include <linux/personality.h>
     37#include <linux/ptrace.h>
     38#include <linux/fs_struct.h>
     39#include <linux/file.h>
     40#include <linux/mount.h>
     41#include <linux/gfp.h>
     42#include <linux/syscore_ops.h>
     43#include <linux/version.h>
     44#include <linux/ctype.h>
     45#include <linux/syscall_user_dispatch.h>
     46
     47#include <linux/compat.h>
     48#include <linux/syscalls.h>
     49#include <linux/kprobes.h>
     50#include <linux/user_namespace.h>
     51#include <linux/time_namespace.h>
     52#include <linux/binfmts.h>
     53
     54#include <linux/sched.h>
     55#include <linux/sched/autogroup.h>
     56#include <linux/sched/loadavg.h>
     57#include <linux/sched/stat.h>
     58#include <linux/sched/mm.h>
     59#include <linux/sched/coredump.h>
     60#include <linux/sched/task.h>
     61#include <linux/sched/cputime.h>
     62#include <linux/rcupdate.h>
     63#include <linux/uidgid.h>
     64#include <linux/cred.h>
     65
     66#include <linux/nospec.h>
     67
     68#include <linux/kmsg_dump.h>
     69/* Move somewhere else to avoid recompiling? */
     70#include <generated/utsrelease.h>
     71
     72#include <linux/uaccess.h>
     73#include <asm/io.h>
     74#include <asm/unistd.h>
     75
     76#include "uid16.h"
     77
     78#ifndef SET_UNALIGN_CTL
     79# define SET_UNALIGN_CTL(a, b)	(-EINVAL)
     80#endif
     81#ifndef GET_UNALIGN_CTL
     82# define GET_UNALIGN_CTL(a, b)	(-EINVAL)
     83#endif
     84#ifndef SET_FPEMU_CTL
     85# define SET_FPEMU_CTL(a, b)	(-EINVAL)
     86#endif
     87#ifndef GET_FPEMU_CTL
     88# define GET_FPEMU_CTL(a, b)	(-EINVAL)
     89#endif
     90#ifndef SET_FPEXC_CTL
     91# define SET_FPEXC_CTL(a, b)	(-EINVAL)
     92#endif
     93#ifndef GET_FPEXC_CTL
     94# define GET_FPEXC_CTL(a, b)	(-EINVAL)
     95#endif
     96#ifndef GET_ENDIAN
     97# define GET_ENDIAN(a, b)	(-EINVAL)
     98#endif
     99#ifndef SET_ENDIAN
    100# define SET_ENDIAN(a, b)	(-EINVAL)
    101#endif
    102#ifndef GET_TSC_CTL
    103# define GET_TSC_CTL(a)		(-EINVAL)
    104#endif
    105#ifndef SET_TSC_CTL
    106# define SET_TSC_CTL(a)		(-EINVAL)
    107#endif
    108#ifndef GET_FP_MODE
    109# define GET_FP_MODE(a)		(-EINVAL)
    110#endif
    111#ifndef SET_FP_MODE
    112# define SET_FP_MODE(a,b)	(-EINVAL)
    113#endif
    114#ifndef SVE_SET_VL
    115# define SVE_SET_VL(a)		(-EINVAL)
    116#endif
    117#ifndef SVE_GET_VL
    118# define SVE_GET_VL()		(-EINVAL)
    119#endif
    120#ifndef SME_SET_VL
    121# define SME_SET_VL(a)		(-EINVAL)
    122#endif
    123#ifndef SME_GET_VL
    124# define SME_GET_VL()		(-EINVAL)
    125#endif
    126#ifndef PAC_RESET_KEYS
    127# define PAC_RESET_KEYS(a, b)	(-EINVAL)
    128#endif
    129#ifndef PAC_SET_ENABLED_KEYS
    130# define PAC_SET_ENABLED_KEYS(a, b, c)	(-EINVAL)
    131#endif
    132#ifndef PAC_GET_ENABLED_KEYS
    133# define PAC_GET_ENABLED_KEYS(a)	(-EINVAL)
    134#endif
    135#ifndef SET_TAGGED_ADDR_CTRL
    136# define SET_TAGGED_ADDR_CTRL(a)	(-EINVAL)
    137#endif
    138#ifndef GET_TAGGED_ADDR_CTRL
    139# define GET_TAGGED_ADDR_CTRL()		(-EINVAL)
    140#endif
    141
    142/*
    143 * this is where the system-wide overflow UID and GID are defined, for
    144 * architectures that now have 32-bit UID/GID but didn't in the past
    145 */
    146
    147int overflowuid = DEFAULT_OVERFLOWUID;
    148int overflowgid = DEFAULT_OVERFLOWGID;
    149
    150EXPORT_SYMBOL(overflowuid);
    151EXPORT_SYMBOL(overflowgid);
    152
    153/*
    154 * the same as above, but for filesystems which can only store a 16-bit
    155 * UID and GID. as such, this is needed on all architectures
    156 */
    157
    158int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
    159int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
    160
    161EXPORT_SYMBOL(fs_overflowuid);
    162EXPORT_SYMBOL(fs_overflowgid);
    163
    164/*
    165 * Returns true if current's euid is same as p's uid or euid,
    166 * or has CAP_SYS_NICE to p's user_ns.
    167 *
    168 * Called with rcu_read_lock, creds are safe
    169 */
    170static bool set_one_prio_perm(struct task_struct *p)
    171{
    172	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
    173
    174	if (uid_eq(pcred->uid,  cred->euid) ||
    175	    uid_eq(pcred->euid, cred->euid))
    176		return true;
    177	if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
    178		return true;
    179	return false;
    180}
    181
    182/*
    183 * set the priority of a task
    184 * - the caller must hold the RCU read lock
    185 */
    186static int set_one_prio(struct task_struct *p, int niceval, int error)
    187{
    188	int no_nice;
    189
    190	if (!set_one_prio_perm(p)) {
    191		error = -EPERM;
    192		goto out;
    193	}
    194	if (niceval < task_nice(p) && !can_nice(p, niceval)) {
    195		error = -EACCES;
    196		goto out;
    197	}
    198	no_nice = security_task_setnice(p, niceval);
    199	if (no_nice) {
    200		error = no_nice;
    201		goto out;
    202	}
    203	if (error == -ESRCH)
    204		error = 0;
    205	set_user_nice(p, niceval);
    206out:
    207	return error;
    208}
    209
    210SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
    211{
    212	struct task_struct *g, *p;
    213	struct user_struct *user;
    214	const struct cred *cred = current_cred();
    215	int error = -EINVAL;
    216	struct pid *pgrp;
    217	kuid_t uid;
    218
    219	if (which > PRIO_USER || which < PRIO_PROCESS)
    220		goto out;
    221
    222	/* normalize: avoid signed division (rounding problems) */
    223	error = -ESRCH;
    224	if (niceval < MIN_NICE)
    225		niceval = MIN_NICE;
    226	if (niceval > MAX_NICE)
    227		niceval = MAX_NICE;
    228
    229	rcu_read_lock();
    230	switch (which) {
    231	case PRIO_PROCESS:
    232		if (who)
    233			p = find_task_by_vpid(who);
    234		else
    235			p = current;
    236		if (p)
    237			error = set_one_prio(p, niceval, error);
    238		break;
    239	case PRIO_PGRP:
    240		if (who)
    241			pgrp = find_vpid(who);
    242		else
    243			pgrp = task_pgrp(current);
    244		read_lock(&tasklist_lock);
    245		do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
    246			error = set_one_prio(p, niceval, error);
    247		} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
    248		read_unlock(&tasklist_lock);
    249		break;
    250	case PRIO_USER:
    251		uid = make_kuid(cred->user_ns, who);
    252		user = cred->user;
    253		if (!who)
    254			uid = cred->uid;
    255		else if (!uid_eq(uid, cred->uid)) {
    256			user = find_user(uid);
    257			if (!user)
    258				goto out_unlock;	/* No processes for this user */
    259		}
    260		for_each_process_thread(g, p) {
    261			if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
    262				error = set_one_prio(p, niceval, error);
    263		}
    264		if (!uid_eq(uid, cred->uid))
    265			free_uid(user);		/* For find_user() */
    266		break;
    267	}
    268out_unlock:
    269	rcu_read_unlock();
    270out:
    271	return error;
    272}
    273
    274/*
    275 * Ugh. To avoid negative return values, "getpriority()" will
    276 * not return the normal nice-value, but a negated value that
    277 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
    278 * to stay compatible.
    279 */
    280SYSCALL_DEFINE2(getpriority, int, which, int, who)
    281{
    282	struct task_struct *g, *p;
    283	struct user_struct *user;
    284	const struct cred *cred = current_cred();
    285	long niceval, retval = -ESRCH;
    286	struct pid *pgrp;
    287	kuid_t uid;
    288
    289	if (which > PRIO_USER || which < PRIO_PROCESS)
    290		return -EINVAL;
    291
    292	rcu_read_lock();
    293	switch (which) {
    294	case PRIO_PROCESS:
    295		if (who)
    296			p = find_task_by_vpid(who);
    297		else
    298			p = current;
    299		if (p) {
    300			niceval = nice_to_rlimit(task_nice(p));
    301			if (niceval > retval)
    302				retval = niceval;
    303		}
    304		break;
    305	case PRIO_PGRP:
    306		if (who)
    307			pgrp = find_vpid(who);
    308		else
    309			pgrp = task_pgrp(current);
    310		read_lock(&tasklist_lock);
    311		do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
    312			niceval = nice_to_rlimit(task_nice(p));
    313			if (niceval > retval)
    314				retval = niceval;
    315		} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
    316		read_unlock(&tasklist_lock);
    317		break;
    318	case PRIO_USER:
    319		uid = make_kuid(cred->user_ns, who);
    320		user = cred->user;
    321		if (!who)
    322			uid = cred->uid;
    323		else if (!uid_eq(uid, cred->uid)) {
    324			user = find_user(uid);
    325			if (!user)
    326				goto out_unlock;	/* No processes for this user */
    327		}
    328		for_each_process_thread(g, p) {
    329			if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
    330				niceval = nice_to_rlimit(task_nice(p));
    331				if (niceval > retval)
    332					retval = niceval;
    333			}
    334		}
    335		if (!uid_eq(uid, cred->uid))
    336			free_uid(user);		/* for find_user() */
    337		break;
    338	}
    339out_unlock:
    340	rcu_read_unlock();
    341
    342	return retval;
    343}
    344
    345/*
    346 * Unprivileged users may change the real gid to the effective gid
    347 * or vice versa.  (BSD-style)
    348 *
    349 * If you set the real gid at all, or set the effective gid to a value not
    350 * equal to the real gid, then the saved gid is set to the new effective gid.
    351 *
    352 * This makes it possible for a setgid program to completely drop its
    353 * privileges, which is often a useful assertion to make when you are doing
    354 * a security audit over a program.
    355 *
    356 * The general idea is that a program which uses just setregid() will be
    357 * 100% compatible with BSD.  A program which uses just setgid() will be
    358 * 100% compatible with POSIX with saved IDs.
    359 *
    360 * SMP: There are not races, the GIDs are checked only by filesystem
    361 *      operations (as far as semantic preservation is concerned).
    362 */
    363#ifdef CONFIG_MULTIUSER
    364long __sys_setregid(gid_t rgid, gid_t egid)
    365{
    366	struct user_namespace *ns = current_user_ns();
    367	const struct cred *old;
    368	struct cred *new;
    369	int retval;
    370	kgid_t krgid, kegid;
    371
    372	krgid = make_kgid(ns, rgid);
    373	kegid = make_kgid(ns, egid);
    374
    375	if ((rgid != (gid_t) -1) && !gid_valid(krgid))
    376		return -EINVAL;
    377	if ((egid != (gid_t) -1) && !gid_valid(kegid))
    378		return -EINVAL;
    379
    380	new = prepare_creds();
    381	if (!new)
    382		return -ENOMEM;
    383	old = current_cred();
    384
    385	retval = -EPERM;
    386	if (rgid != (gid_t) -1) {
    387		if (gid_eq(old->gid, krgid) ||
    388		    gid_eq(old->egid, krgid) ||
    389		    ns_capable_setid(old->user_ns, CAP_SETGID))
    390			new->gid = krgid;
    391		else
    392			goto error;
    393	}
    394	if (egid != (gid_t) -1) {
    395		if (gid_eq(old->gid, kegid) ||
    396		    gid_eq(old->egid, kegid) ||
    397		    gid_eq(old->sgid, kegid) ||
    398		    ns_capable_setid(old->user_ns, CAP_SETGID))
    399			new->egid = kegid;
    400		else
    401			goto error;
    402	}
    403
    404	if (rgid != (gid_t) -1 ||
    405	    (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
    406		new->sgid = new->egid;
    407	new->fsgid = new->egid;
    408
    409	retval = security_task_fix_setgid(new, old, LSM_SETID_RE);
    410	if (retval < 0)
    411		goto error;
    412
    413	return commit_creds(new);
    414
    415error:
    416	abort_creds(new);
    417	return retval;
    418}
    419
    420SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
    421{
    422	return __sys_setregid(rgid, egid);
    423}
    424
    425/*
    426 * setgid() is implemented like SysV w/ SAVED_IDS
    427 *
    428 * SMP: Same implicit races as above.
    429 */
    430long __sys_setgid(gid_t gid)
    431{
    432	struct user_namespace *ns = current_user_ns();
    433	const struct cred *old;
    434	struct cred *new;
    435	int retval;
    436	kgid_t kgid;
    437
    438	kgid = make_kgid(ns, gid);
    439	if (!gid_valid(kgid))
    440		return -EINVAL;
    441
    442	new = prepare_creds();
    443	if (!new)
    444		return -ENOMEM;
    445	old = current_cred();
    446
    447	retval = -EPERM;
    448	if (ns_capable_setid(old->user_ns, CAP_SETGID))
    449		new->gid = new->egid = new->sgid = new->fsgid = kgid;
    450	else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
    451		new->egid = new->fsgid = kgid;
    452	else
    453		goto error;
    454
    455	retval = security_task_fix_setgid(new, old, LSM_SETID_ID);
    456	if (retval < 0)
    457		goto error;
    458
    459	return commit_creds(new);
    460
    461error:
    462	abort_creds(new);
    463	return retval;
    464}
    465
    466SYSCALL_DEFINE1(setgid, gid_t, gid)
    467{
    468	return __sys_setgid(gid);
    469}
    470
    471/*
    472 * change the user struct in a credentials set to match the new UID
    473 */
    474static int set_user(struct cred *new)
    475{
    476	struct user_struct *new_user;
    477
    478	new_user = alloc_uid(new->uid);
    479	if (!new_user)
    480		return -EAGAIN;
    481
    482	free_uid(new->user);
    483	new->user = new_user;
    484	return 0;
    485}
    486
    487static void flag_nproc_exceeded(struct cred *new)
    488{
    489	if (new->ucounts == current_ucounts())
    490		return;
    491
    492	/*
    493	 * We don't fail in case of NPROC limit excess here because too many
    494	 * poorly written programs don't check set*uid() return code, assuming
    495	 * it never fails if called by root.  We may still enforce NPROC limit
    496	 * for programs doing set*uid()+execve() by harmlessly deferring the
    497	 * failure to the execve() stage.
    498	 */
    499	if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
    500			new->user != INIT_USER)
    501		current->flags |= PF_NPROC_EXCEEDED;
    502	else
    503		current->flags &= ~PF_NPROC_EXCEEDED;
    504}
    505
    506/*
    507 * Unprivileged users may change the real uid to the effective uid
    508 * or vice versa.  (BSD-style)
    509 *
    510 * If you set the real uid at all, or set the effective uid to a value not
    511 * equal to the real uid, then the saved uid is set to the new effective uid.
    512 *
    513 * This makes it possible for a setuid program to completely drop its
    514 * privileges, which is often a useful assertion to make when you are doing
    515 * a security audit over a program.
    516 *
    517 * The general idea is that a program which uses just setreuid() will be
    518 * 100% compatible with BSD.  A program which uses just setuid() will be
    519 * 100% compatible with POSIX with saved IDs.
    520 */
    521long __sys_setreuid(uid_t ruid, uid_t euid)
    522{
    523	struct user_namespace *ns = current_user_ns();
    524	const struct cred *old;
    525	struct cred *new;
    526	int retval;
    527	kuid_t kruid, keuid;
    528
    529	kruid = make_kuid(ns, ruid);
    530	keuid = make_kuid(ns, euid);
    531
    532	if ((ruid != (uid_t) -1) && !uid_valid(kruid))
    533		return -EINVAL;
    534	if ((euid != (uid_t) -1) && !uid_valid(keuid))
    535		return -EINVAL;
    536
    537	new = prepare_creds();
    538	if (!new)
    539		return -ENOMEM;
    540	old = current_cred();
    541
    542	retval = -EPERM;
    543	if (ruid != (uid_t) -1) {
    544		new->uid = kruid;
    545		if (!uid_eq(old->uid, kruid) &&
    546		    !uid_eq(old->euid, kruid) &&
    547		    !ns_capable_setid(old->user_ns, CAP_SETUID))
    548			goto error;
    549	}
    550
    551	if (euid != (uid_t) -1) {
    552		new->euid = keuid;
    553		if (!uid_eq(old->uid, keuid) &&
    554		    !uid_eq(old->euid, keuid) &&
    555		    !uid_eq(old->suid, keuid) &&
    556		    !ns_capable_setid(old->user_ns, CAP_SETUID))
    557			goto error;
    558	}
    559
    560	if (!uid_eq(new->uid, old->uid)) {
    561		retval = set_user(new);
    562		if (retval < 0)
    563			goto error;
    564	}
    565	if (ruid != (uid_t) -1 ||
    566	    (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
    567		new->suid = new->euid;
    568	new->fsuid = new->euid;
    569
    570	retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
    571	if (retval < 0)
    572		goto error;
    573
    574	retval = set_cred_ucounts(new);
    575	if (retval < 0)
    576		goto error;
    577
    578	flag_nproc_exceeded(new);
    579	return commit_creds(new);
    580
    581error:
    582	abort_creds(new);
    583	return retval;
    584}
    585
    586SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
    587{
    588	return __sys_setreuid(ruid, euid);
    589}
    590
    591/*
    592 * setuid() is implemented like SysV with SAVED_IDS
    593 *
    594 * Note that SAVED_ID's is deficient in that a setuid root program
    595 * like sendmail, for example, cannot set its uid to be a normal
    596 * user and then switch back, because if you're root, setuid() sets
    597 * the saved uid too.  If you don't like this, blame the bright people
    598 * in the POSIX committee and/or USG.  Note that the BSD-style setreuid()
    599 * will allow a root program to temporarily drop privileges and be able to
    600 * regain them by swapping the real and effective uid.
    601 */
    602long __sys_setuid(uid_t uid)
    603{
    604	struct user_namespace *ns = current_user_ns();
    605	const struct cred *old;
    606	struct cred *new;
    607	int retval;
    608	kuid_t kuid;
    609
    610	kuid = make_kuid(ns, uid);
    611	if (!uid_valid(kuid))
    612		return -EINVAL;
    613
    614	new = prepare_creds();
    615	if (!new)
    616		return -ENOMEM;
    617	old = current_cred();
    618
    619	retval = -EPERM;
    620	if (ns_capable_setid(old->user_ns, CAP_SETUID)) {
    621		new->suid = new->uid = kuid;
    622		if (!uid_eq(kuid, old->uid)) {
    623			retval = set_user(new);
    624			if (retval < 0)
    625				goto error;
    626		}
    627	} else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
    628		goto error;
    629	}
    630
    631	new->fsuid = new->euid = kuid;
    632
    633	retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
    634	if (retval < 0)
    635		goto error;
    636
    637	retval = set_cred_ucounts(new);
    638	if (retval < 0)
    639		goto error;
    640
    641	flag_nproc_exceeded(new);
    642	return commit_creds(new);
    643
    644error:
    645	abort_creds(new);
    646	return retval;
    647}
    648
    649SYSCALL_DEFINE1(setuid, uid_t, uid)
    650{
    651	return __sys_setuid(uid);
    652}
    653
    654
    655/*
    656 * This function implements a generic ability to update ruid, euid,
    657 * and suid.  This allows you to implement the 4.4 compatible seteuid().
    658 */
    659long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
    660{
    661	struct user_namespace *ns = current_user_ns();
    662	const struct cred *old;
    663	struct cred *new;
    664	int retval;
    665	kuid_t kruid, keuid, ksuid;
    666
    667	kruid = make_kuid(ns, ruid);
    668	keuid = make_kuid(ns, euid);
    669	ksuid = make_kuid(ns, suid);
    670
    671	if ((ruid != (uid_t) -1) && !uid_valid(kruid))
    672		return -EINVAL;
    673
    674	if ((euid != (uid_t) -1) && !uid_valid(keuid))
    675		return -EINVAL;
    676
    677	if ((suid != (uid_t) -1) && !uid_valid(ksuid))
    678		return -EINVAL;
    679
    680	new = prepare_creds();
    681	if (!new)
    682		return -ENOMEM;
    683
    684	old = current_cred();
    685
    686	retval = -EPERM;
    687	if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
    688		if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
    689		    !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
    690			goto error;
    691		if (euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
    692		    !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
    693			goto error;
    694		if (suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
    695		    !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
    696			goto error;
    697	}
    698
    699	if (ruid != (uid_t) -1) {
    700		new->uid = kruid;
    701		if (!uid_eq(kruid, old->uid)) {
    702			retval = set_user(new);
    703			if (retval < 0)
    704				goto error;
    705		}
    706	}
    707	if (euid != (uid_t) -1)
    708		new->euid = keuid;
    709	if (suid != (uid_t) -1)
    710		new->suid = ksuid;
    711	new->fsuid = new->euid;
    712
    713	retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
    714	if (retval < 0)
    715		goto error;
    716
    717	retval = set_cred_ucounts(new);
    718	if (retval < 0)
    719		goto error;
    720
    721	flag_nproc_exceeded(new);
    722	return commit_creds(new);
    723
    724error:
    725	abort_creds(new);
    726	return retval;
    727}
    728
    729SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
    730{
    731	return __sys_setresuid(ruid, euid, suid);
    732}
    733
    734SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
    735{
    736	const struct cred *cred = current_cred();
    737	int retval;
    738	uid_t ruid, euid, suid;
    739
    740	ruid = from_kuid_munged(cred->user_ns, cred->uid);
    741	euid = from_kuid_munged(cred->user_ns, cred->euid);
    742	suid = from_kuid_munged(cred->user_ns, cred->suid);
    743
    744	retval = put_user(ruid, ruidp);
    745	if (!retval) {
    746		retval = put_user(euid, euidp);
    747		if (!retval)
    748			return put_user(suid, suidp);
    749	}
    750	return retval;
    751}
    752
    753/*
    754 * Same as above, but for rgid, egid, sgid.
    755 */
    756long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
    757{
    758	struct user_namespace *ns = current_user_ns();
    759	const struct cred *old;
    760	struct cred *new;
    761	int retval;
    762	kgid_t krgid, kegid, ksgid;
    763
    764	krgid = make_kgid(ns, rgid);
    765	kegid = make_kgid(ns, egid);
    766	ksgid = make_kgid(ns, sgid);
    767
    768	if ((rgid != (gid_t) -1) && !gid_valid(krgid))
    769		return -EINVAL;
    770	if ((egid != (gid_t) -1) && !gid_valid(kegid))
    771		return -EINVAL;
    772	if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
    773		return -EINVAL;
    774
    775	new = prepare_creds();
    776	if (!new)
    777		return -ENOMEM;
    778	old = current_cred();
    779
    780	retval = -EPERM;
    781	if (!ns_capable_setid(old->user_ns, CAP_SETGID)) {
    782		if (rgid != (gid_t) -1        && !gid_eq(krgid, old->gid) &&
    783		    !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
    784			goto error;
    785		if (egid != (gid_t) -1        && !gid_eq(kegid, old->gid) &&
    786		    !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
    787			goto error;
    788		if (sgid != (gid_t) -1        && !gid_eq(ksgid, old->gid) &&
    789		    !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
    790			goto error;
    791	}
    792
    793	if (rgid != (gid_t) -1)
    794		new->gid = krgid;
    795	if (egid != (gid_t) -1)
    796		new->egid = kegid;
    797	if (sgid != (gid_t) -1)
    798		new->sgid = ksgid;
    799	new->fsgid = new->egid;
    800
    801	retval = security_task_fix_setgid(new, old, LSM_SETID_RES);
    802	if (retval < 0)
    803		goto error;
    804
    805	return commit_creds(new);
    806
    807error:
    808	abort_creds(new);
    809	return retval;
    810}
    811
    812SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
    813{
    814	return __sys_setresgid(rgid, egid, sgid);
    815}
    816
    817SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
    818{
    819	const struct cred *cred = current_cred();
    820	int retval;
    821	gid_t rgid, egid, sgid;
    822
    823	rgid = from_kgid_munged(cred->user_ns, cred->gid);
    824	egid = from_kgid_munged(cred->user_ns, cred->egid);
    825	sgid = from_kgid_munged(cred->user_ns, cred->sgid);
    826
    827	retval = put_user(rgid, rgidp);
    828	if (!retval) {
    829		retval = put_user(egid, egidp);
    830		if (!retval)
    831			retval = put_user(sgid, sgidp);
    832	}
    833
    834	return retval;
    835}
    836
    837
    838/*
    839 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
    840 * is used for "access()" and for the NFS daemon (letting nfsd stay at
    841 * whatever uid it wants to). It normally shadows "euid", except when
    842 * explicitly set by setfsuid() or for access..
    843 */
    844long __sys_setfsuid(uid_t uid)
    845{
    846	const struct cred *old;
    847	struct cred *new;
    848	uid_t old_fsuid;
    849	kuid_t kuid;
    850
    851	old = current_cred();
    852	old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
    853
    854	kuid = make_kuid(old->user_ns, uid);
    855	if (!uid_valid(kuid))
    856		return old_fsuid;
    857
    858	new = prepare_creds();
    859	if (!new)
    860		return old_fsuid;
    861
    862	if (uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
    863	    uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
    864	    ns_capable_setid(old->user_ns, CAP_SETUID)) {
    865		if (!uid_eq(kuid, old->fsuid)) {
    866			new->fsuid = kuid;
    867			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
    868				goto change_okay;
    869		}
    870	}
    871
    872	abort_creds(new);
    873	return old_fsuid;
    874
    875change_okay:
    876	commit_creds(new);
    877	return old_fsuid;
    878}
    879
    880SYSCALL_DEFINE1(setfsuid, uid_t, uid)
    881{
    882	return __sys_setfsuid(uid);
    883}
    884
    885/*
    886 * Samma på svenska..
    887 */
    888long __sys_setfsgid(gid_t gid)
    889{
    890	const struct cred *old;
    891	struct cred *new;
    892	gid_t old_fsgid;
    893	kgid_t kgid;
    894
    895	old = current_cred();
    896	old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
    897
    898	kgid = make_kgid(old->user_ns, gid);
    899	if (!gid_valid(kgid))
    900		return old_fsgid;
    901
    902	new = prepare_creds();
    903	if (!new)
    904		return old_fsgid;
    905
    906	if (gid_eq(kgid, old->gid)  || gid_eq(kgid, old->egid)  ||
    907	    gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
    908	    ns_capable_setid(old->user_ns, CAP_SETGID)) {
    909		if (!gid_eq(kgid, old->fsgid)) {
    910			new->fsgid = kgid;
    911			if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0)
    912				goto change_okay;
    913		}
    914	}
    915
    916	abort_creds(new);
    917	return old_fsgid;
    918
    919change_okay:
    920	commit_creds(new);
    921	return old_fsgid;
    922}
    923
    924SYSCALL_DEFINE1(setfsgid, gid_t, gid)
    925{
    926	return __sys_setfsgid(gid);
    927}
    928#endif /* CONFIG_MULTIUSER */
    929
    930/**
    931 * sys_getpid - return the thread group id of the current process
    932 *
    933 * Note, despite the name, this returns the tgid not the pid.  The tgid and
    934 * the pid are identical unless CLONE_THREAD was specified on clone() in
    935 * which case the tgid is the same in all threads of the same group.
    936 *
    937 * This is SMP safe as current->tgid does not change.
    938 */
    939SYSCALL_DEFINE0(getpid)
    940{
    941	return task_tgid_vnr(current);
    942}
    943
    944/* Thread ID - the internal kernel "pid" */
    945SYSCALL_DEFINE0(gettid)
    946{
    947	return task_pid_vnr(current);
    948}
    949
    950/*
    951 * Accessing ->real_parent is not SMP-safe, it could
    952 * change from under us. However, we can use a stale
    953 * value of ->real_parent under rcu_read_lock(), see
    954 * release_task()->call_rcu(delayed_put_task_struct).
    955 */
    956SYSCALL_DEFINE0(getppid)
    957{
    958	int pid;
    959
    960	rcu_read_lock();
    961	pid = task_tgid_vnr(rcu_dereference(current->real_parent));
    962	rcu_read_unlock();
    963
    964	return pid;
    965}
    966
    967SYSCALL_DEFINE0(getuid)
    968{
    969	/* Only we change this so SMP safe */
    970	return from_kuid_munged(current_user_ns(), current_uid());
    971}
    972
    973SYSCALL_DEFINE0(geteuid)
    974{
    975	/* Only we change this so SMP safe */
    976	return from_kuid_munged(current_user_ns(), current_euid());
    977}
    978
    979SYSCALL_DEFINE0(getgid)
    980{
    981	/* Only we change this so SMP safe */
    982	return from_kgid_munged(current_user_ns(), current_gid());
    983}
    984
    985SYSCALL_DEFINE0(getegid)
    986{
    987	/* Only we change this so SMP safe */
    988	return from_kgid_munged(current_user_ns(), current_egid());
    989}
    990
    991static void do_sys_times(struct tms *tms)
    992{
    993	u64 tgutime, tgstime, cutime, cstime;
    994
    995	thread_group_cputime_adjusted(current, &tgutime, &tgstime);
    996	cutime = current->signal->cutime;
    997	cstime = current->signal->cstime;
    998	tms->tms_utime = nsec_to_clock_t(tgutime);
    999	tms->tms_stime = nsec_to_clock_t(tgstime);
   1000	tms->tms_cutime = nsec_to_clock_t(cutime);
   1001	tms->tms_cstime = nsec_to_clock_t(cstime);
   1002}
   1003
   1004SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
   1005{
   1006	if (tbuf) {
   1007		struct tms tmp;
   1008
   1009		do_sys_times(&tmp);
   1010		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
   1011			return -EFAULT;
   1012	}
   1013	force_successful_syscall_return();
   1014	return (long) jiffies_64_to_clock_t(get_jiffies_64());
   1015}
   1016
   1017#ifdef CONFIG_COMPAT
   1018static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
   1019{
   1020	return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
   1021}
   1022
   1023COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
   1024{
   1025	if (tbuf) {
   1026		struct tms tms;
   1027		struct compat_tms tmp;
   1028
   1029		do_sys_times(&tms);
   1030		/* Convert our struct tms to the compat version. */
   1031		tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
   1032		tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
   1033		tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
   1034		tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
   1035		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
   1036			return -EFAULT;
   1037	}
   1038	force_successful_syscall_return();
   1039	return compat_jiffies_to_clock_t(jiffies);
   1040}
   1041#endif
   1042
   1043/*
   1044 * This needs some heavy checking ...
   1045 * I just haven't the stomach for it. I also don't fully
   1046 * understand sessions/pgrp etc. Let somebody who does explain it.
   1047 *
   1048 * OK, I think I have the protection semantics right.... this is really
   1049 * only important on a multi-user system anyway, to make sure one user
   1050 * can't send a signal to a process owned by another.  -TYT, 12/12/91
   1051 *
   1052 * !PF_FORKNOEXEC check to conform completely to POSIX.
   1053 */
   1054SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
   1055{
   1056	struct task_struct *p;
   1057	struct task_struct *group_leader = current->group_leader;
   1058	struct pid *pgrp;
   1059	int err;
   1060
   1061	if (!pid)
   1062		pid = task_pid_vnr(group_leader);
   1063	if (!pgid)
   1064		pgid = pid;
   1065	if (pgid < 0)
   1066		return -EINVAL;
   1067	rcu_read_lock();
   1068
   1069	/* From this point forward we keep holding onto the tasklist lock
   1070	 * so that our parent does not change from under us. -DaveM
   1071	 */
   1072	write_lock_irq(&tasklist_lock);
   1073
   1074	err = -ESRCH;
   1075	p = find_task_by_vpid(pid);
   1076	if (!p)
   1077		goto out;
   1078
   1079	err = -EINVAL;
   1080	if (!thread_group_leader(p))
   1081		goto out;
   1082
   1083	if (same_thread_group(p->real_parent, group_leader)) {
   1084		err = -EPERM;
   1085		if (task_session(p) != task_session(group_leader))
   1086			goto out;
   1087		err = -EACCES;
   1088		if (!(p->flags & PF_FORKNOEXEC))
   1089			goto out;
   1090	} else {
   1091		err = -ESRCH;
   1092		if (p != group_leader)
   1093			goto out;
   1094	}
   1095
   1096	err = -EPERM;
   1097	if (p->signal->leader)
   1098		goto out;
   1099
   1100	pgrp = task_pid(p);
   1101	if (pgid != pid) {
   1102		struct task_struct *g;
   1103
   1104		pgrp = find_vpid(pgid);
   1105		g = pid_task(pgrp, PIDTYPE_PGID);
   1106		if (!g || task_session(g) != task_session(group_leader))
   1107			goto out;
   1108	}
   1109
   1110	err = security_task_setpgid(p, pgid);
   1111	if (err)
   1112		goto out;
   1113
   1114	if (task_pgrp(p) != pgrp)
   1115		change_pid(p, PIDTYPE_PGID, pgrp);
   1116
   1117	err = 0;
   1118out:
   1119	/* All paths lead to here, thus we are safe. -DaveM */
   1120	write_unlock_irq(&tasklist_lock);
   1121	rcu_read_unlock();
   1122	return err;
   1123}
   1124
   1125static int do_getpgid(pid_t pid)
   1126{
   1127	struct task_struct *p;
   1128	struct pid *grp;
   1129	int retval;
   1130
   1131	rcu_read_lock();
   1132	if (!pid)
   1133		grp = task_pgrp(current);
   1134	else {
   1135		retval = -ESRCH;
   1136		p = find_task_by_vpid(pid);
   1137		if (!p)
   1138			goto out;
   1139		grp = task_pgrp(p);
   1140		if (!grp)
   1141			goto out;
   1142
   1143		retval = security_task_getpgid(p);
   1144		if (retval)
   1145			goto out;
   1146	}
   1147	retval = pid_vnr(grp);
   1148out:
   1149	rcu_read_unlock();
   1150	return retval;
   1151}
   1152
   1153SYSCALL_DEFINE1(getpgid, pid_t, pid)
   1154{
   1155	return do_getpgid(pid);
   1156}
   1157
   1158#ifdef __ARCH_WANT_SYS_GETPGRP
   1159
   1160SYSCALL_DEFINE0(getpgrp)
   1161{
   1162	return do_getpgid(0);
   1163}
   1164
   1165#endif
   1166
   1167SYSCALL_DEFINE1(getsid, pid_t, pid)
   1168{
   1169	struct task_struct *p;
   1170	struct pid *sid;
   1171	int retval;
   1172
   1173	rcu_read_lock();
   1174	if (!pid)
   1175		sid = task_session(current);
   1176	else {
   1177		retval = -ESRCH;
   1178		p = find_task_by_vpid(pid);
   1179		if (!p)
   1180			goto out;
   1181		sid = task_session(p);
   1182		if (!sid)
   1183			goto out;
   1184
   1185		retval = security_task_getsid(p);
   1186		if (retval)
   1187			goto out;
   1188	}
   1189	retval = pid_vnr(sid);
   1190out:
   1191	rcu_read_unlock();
   1192	return retval;
   1193}
   1194
   1195static void set_special_pids(struct pid *pid)
   1196{
   1197	struct task_struct *curr = current->group_leader;
   1198
   1199	if (task_session(curr) != pid)
   1200		change_pid(curr, PIDTYPE_SID, pid);
   1201
   1202	if (task_pgrp(curr) != pid)
   1203		change_pid(curr, PIDTYPE_PGID, pid);
   1204}
   1205
   1206int ksys_setsid(void)
   1207{
   1208	struct task_struct *group_leader = current->group_leader;
   1209	struct pid *sid = task_pid(group_leader);
   1210	pid_t session = pid_vnr(sid);
   1211	int err = -EPERM;
   1212
   1213	write_lock_irq(&tasklist_lock);
   1214	/* Fail if I am already a session leader */
   1215	if (group_leader->signal->leader)
   1216		goto out;
   1217
   1218	/* Fail if a process group id already exists that equals the
   1219	 * proposed session id.
   1220	 */
   1221	if (pid_task(sid, PIDTYPE_PGID))
   1222		goto out;
   1223
   1224	group_leader->signal->leader = 1;
   1225	set_special_pids(sid);
   1226
   1227	proc_clear_tty(group_leader);
   1228
   1229	err = session;
   1230out:
   1231	write_unlock_irq(&tasklist_lock);
   1232	if (err > 0) {
   1233		proc_sid_connector(group_leader);
   1234		sched_autogroup_create_attach(group_leader);
   1235	}
   1236	return err;
   1237}
   1238
   1239SYSCALL_DEFINE0(setsid)
   1240{
   1241	return ksys_setsid();
   1242}
   1243
   1244DECLARE_RWSEM(uts_sem);
   1245
   1246#ifdef COMPAT_UTS_MACHINE
   1247#define override_architecture(name) \
   1248	(personality(current->personality) == PER_LINUX32 && \
   1249	 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
   1250		      sizeof(COMPAT_UTS_MACHINE)))
   1251#else
   1252#define override_architecture(name)	0
   1253#endif
   1254
   1255/*
   1256 * Work around broken programs that cannot handle "Linux 3.0".
   1257 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
   1258 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be
   1259 * 2.6.60.
   1260 */
   1261static int override_release(char __user *release, size_t len)
   1262{
   1263	int ret = 0;
   1264
   1265	if (current->personality & UNAME26) {
   1266		const char *rest = UTS_RELEASE;
   1267		char buf[65] = { 0 };
   1268		int ndots = 0;
   1269		unsigned v;
   1270		size_t copy;
   1271
   1272		while (*rest) {
   1273			if (*rest == '.' && ++ndots >= 3)
   1274				break;
   1275			if (!isdigit(*rest) && *rest != '.')
   1276				break;
   1277			rest++;
   1278		}
   1279		v = LINUX_VERSION_PATCHLEVEL + 60;
   1280		copy = clamp_t(size_t, len, 1, sizeof(buf));
   1281		copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
   1282		ret = copy_to_user(release, buf, copy + 1);
   1283	}
   1284	return ret;
   1285}
   1286
   1287SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
   1288{
   1289	struct new_utsname tmp;
   1290
   1291	down_read(&uts_sem);
   1292	memcpy(&tmp, utsname(), sizeof(tmp));
   1293	up_read(&uts_sem);
   1294	if (copy_to_user(name, &tmp, sizeof(tmp)))
   1295		return -EFAULT;
   1296
   1297	if (override_release(name->release, sizeof(name->release)))
   1298		return -EFAULT;
   1299	if (override_architecture(name))
   1300		return -EFAULT;
   1301	return 0;
   1302}
   1303
   1304#ifdef __ARCH_WANT_SYS_OLD_UNAME
   1305/*
   1306 * Old cruft
   1307 */
   1308SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
   1309{
   1310	struct old_utsname tmp;
   1311
   1312	if (!name)
   1313		return -EFAULT;
   1314
   1315	down_read(&uts_sem);
   1316	memcpy(&tmp, utsname(), sizeof(tmp));
   1317	up_read(&uts_sem);
   1318	if (copy_to_user(name, &tmp, sizeof(tmp)))
   1319		return -EFAULT;
   1320
   1321	if (override_release(name->release, sizeof(name->release)))
   1322		return -EFAULT;
   1323	if (override_architecture(name))
   1324		return -EFAULT;
   1325	return 0;
   1326}
   1327
   1328SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
   1329{
   1330	struct oldold_utsname tmp;
   1331
   1332	if (!name)
   1333		return -EFAULT;
   1334
   1335	memset(&tmp, 0, sizeof(tmp));
   1336
   1337	down_read(&uts_sem);
   1338	memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN);
   1339	memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN);
   1340	memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN);
   1341	memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN);
   1342	memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN);
   1343	up_read(&uts_sem);
   1344	if (copy_to_user(name, &tmp, sizeof(tmp)))
   1345		return -EFAULT;
   1346
   1347	if (override_architecture(name))
   1348		return -EFAULT;
   1349	if (override_release(name->release, sizeof(name->release)))
   1350		return -EFAULT;
   1351	return 0;
   1352}
   1353#endif
   1354
   1355SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
   1356{
   1357	int errno;
   1358	char tmp[__NEW_UTS_LEN];
   1359
   1360	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
   1361		return -EPERM;
   1362
   1363	if (len < 0 || len > __NEW_UTS_LEN)
   1364		return -EINVAL;
   1365	errno = -EFAULT;
   1366	if (!copy_from_user(tmp, name, len)) {
   1367		struct new_utsname *u;
   1368
   1369		down_write(&uts_sem);
   1370		u = utsname();
   1371		memcpy(u->nodename, tmp, len);
   1372		memset(u->nodename + len, 0, sizeof(u->nodename) - len);
   1373		errno = 0;
   1374		uts_proc_notify(UTS_PROC_HOSTNAME);
   1375		up_write(&uts_sem);
   1376	}
   1377	return errno;
   1378}
   1379
   1380#ifdef __ARCH_WANT_SYS_GETHOSTNAME
   1381
   1382SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
   1383{
   1384	int i;
   1385	struct new_utsname *u;
   1386	char tmp[__NEW_UTS_LEN + 1];
   1387
   1388	if (len < 0)
   1389		return -EINVAL;
   1390	down_read(&uts_sem);
   1391	u = utsname();
   1392	i = 1 + strlen(u->nodename);
   1393	if (i > len)
   1394		i = len;
   1395	memcpy(tmp, u->nodename, i);
   1396	up_read(&uts_sem);
   1397	if (copy_to_user(name, tmp, i))
   1398		return -EFAULT;
   1399	return 0;
   1400}
   1401
   1402#endif
   1403
   1404/*
   1405 * Only setdomainname; getdomainname can be implemented by calling
   1406 * uname()
   1407 */
   1408SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
   1409{
   1410	int errno;
   1411	char tmp[__NEW_UTS_LEN];
   1412
   1413	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
   1414		return -EPERM;
   1415	if (len < 0 || len > __NEW_UTS_LEN)
   1416		return -EINVAL;
   1417
   1418	errno = -EFAULT;
   1419	if (!copy_from_user(tmp, name, len)) {
   1420		struct new_utsname *u;
   1421
   1422		down_write(&uts_sem);
   1423		u = utsname();
   1424		memcpy(u->domainname, tmp, len);
   1425		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
   1426		errno = 0;
   1427		uts_proc_notify(UTS_PROC_DOMAINNAME);
   1428		up_write(&uts_sem);
   1429	}
   1430	return errno;
   1431}
   1432
   1433/* make sure you are allowed to change @tsk limits before calling this */
   1434static int do_prlimit(struct task_struct *tsk, unsigned int resource,
   1435		      struct rlimit *new_rlim, struct rlimit *old_rlim)
   1436{
   1437	struct rlimit *rlim;
   1438	int retval = 0;
   1439
   1440	if (resource >= RLIM_NLIMITS)
   1441		return -EINVAL;
   1442	if (new_rlim) {
   1443		if (new_rlim->rlim_cur > new_rlim->rlim_max)
   1444			return -EINVAL;
   1445		if (resource == RLIMIT_NOFILE &&
   1446				new_rlim->rlim_max > sysctl_nr_open)
   1447			return -EPERM;
   1448	}
   1449
   1450	/* Holding a refcount on tsk protects tsk->signal from disappearing. */
   1451	rlim = tsk->signal->rlim + resource;
   1452	task_lock(tsk->group_leader);
   1453	if (new_rlim) {
   1454		/*
   1455		 * Keep the capable check against init_user_ns until cgroups can
   1456		 * contain all limits.
   1457		 */
   1458		if (new_rlim->rlim_max > rlim->rlim_max &&
   1459				!capable(CAP_SYS_RESOURCE))
   1460			retval = -EPERM;
   1461		if (!retval)
   1462			retval = security_task_setrlimit(tsk, resource, new_rlim);
   1463	}
   1464	if (!retval) {
   1465		if (old_rlim)
   1466			*old_rlim = *rlim;
   1467		if (new_rlim)
   1468			*rlim = *new_rlim;
   1469	}
   1470	task_unlock(tsk->group_leader);
   1471
   1472	/*
   1473	 * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
   1474	 * infinite. In case of RLIM_INFINITY the posix CPU timer code
   1475	 * ignores the rlimit.
   1476	 */
   1477	if (!retval && new_rlim && resource == RLIMIT_CPU &&
   1478	    new_rlim->rlim_cur != RLIM_INFINITY &&
   1479	    IS_ENABLED(CONFIG_POSIX_TIMERS)) {
   1480		/*
   1481		 * update_rlimit_cpu can fail if the task is exiting, but there
   1482		 * may be other tasks in the thread group that are not exiting,
   1483		 * and they need their cpu timers adjusted.
   1484		 *
   1485		 * The group_leader is the last task to be released, so if we
   1486		 * cannot update_rlimit_cpu on it, then the entire process is
   1487		 * exiting and we do not need to update at all.
   1488		 */
   1489		update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
   1490	}
   1491
   1492	return retval;
   1493}
   1494
   1495SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
   1496{
   1497	struct rlimit value;
   1498	int ret;
   1499
   1500	ret = do_prlimit(current, resource, NULL, &value);
   1501	if (!ret)
   1502		ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
   1503
   1504	return ret;
   1505}
   1506
   1507#ifdef CONFIG_COMPAT
   1508
   1509COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
   1510		       struct compat_rlimit __user *, rlim)
   1511{
   1512	struct rlimit r;
   1513	struct compat_rlimit r32;
   1514
   1515	if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit)))
   1516		return -EFAULT;
   1517
   1518	if (r32.rlim_cur == COMPAT_RLIM_INFINITY)
   1519		r.rlim_cur = RLIM_INFINITY;
   1520	else
   1521		r.rlim_cur = r32.rlim_cur;
   1522	if (r32.rlim_max == COMPAT_RLIM_INFINITY)
   1523		r.rlim_max = RLIM_INFINITY;
   1524	else
   1525		r.rlim_max = r32.rlim_max;
   1526	return do_prlimit(current, resource, &r, NULL);
   1527}
   1528
   1529COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
   1530		       struct compat_rlimit __user *, rlim)
   1531{
   1532	struct rlimit r;
   1533	int ret;
   1534
   1535	ret = do_prlimit(current, resource, NULL, &r);
   1536	if (!ret) {
   1537		struct compat_rlimit r32;
   1538		if (r.rlim_cur > COMPAT_RLIM_INFINITY)
   1539			r32.rlim_cur = COMPAT_RLIM_INFINITY;
   1540		else
   1541			r32.rlim_cur = r.rlim_cur;
   1542		if (r.rlim_max > COMPAT_RLIM_INFINITY)
   1543			r32.rlim_max = COMPAT_RLIM_INFINITY;
   1544		else
   1545			r32.rlim_max = r.rlim_max;
   1546
   1547		if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit)))
   1548			return -EFAULT;
   1549	}
   1550	return ret;
   1551}
   1552
   1553#endif
   1554
   1555#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
   1556
   1557/*
   1558 *	Back compatibility for getrlimit. Needed for some apps.
   1559 */
   1560SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
   1561		struct rlimit __user *, rlim)
   1562{
   1563	struct rlimit x;
   1564	if (resource >= RLIM_NLIMITS)
   1565		return -EINVAL;
   1566
   1567	resource = array_index_nospec(resource, RLIM_NLIMITS);
   1568	task_lock(current->group_leader);
   1569	x = current->signal->rlim[resource];
   1570	task_unlock(current->group_leader);
   1571	if (x.rlim_cur > 0x7FFFFFFF)
   1572		x.rlim_cur = 0x7FFFFFFF;
   1573	if (x.rlim_max > 0x7FFFFFFF)
   1574		x.rlim_max = 0x7FFFFFFF;
   1575	return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
   1576}
   1577
   1578#ifdef CONFIG_COMPAT
   1579COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
   1580		       struct compat_rlimit __user *, rlim)
   1581{
   1582	struct rlimit r;
   1583
   1584	if (resource >= RLIM_NLIMITS)
   1585		return -EINVAL;
   1586
   1587	resource = array_index_nospec(resource, RLIM_NLIMITS);
   1588	task_lock(current->group_leader);
   1589	r = current->signal->rlim[resource];
   1590	task_unlock(current->group_leader);
   1591	if (r.rlim_cur > 0x7FFFFFFF)
   1592		r.rlim_cur = 0x7FFFFFFF;
   1593	if (r.rlim_max > 0x7FFFFFFF)
   1594		r.rlim_max = 0x7FFFFFFF;
   1595
   1596	if (put_user(r.rlim_cur, &rlim->rlim_cur) ||
   1597	    put_user(r.rlim_max, &rlim->rlim_max))
   1598		return -EFAULT;
   1599	return 0;
   1600}
   1601#endif
   1602
   1603#endif
   1604
   1605static inline bool rlim64_is_infinity(__u64 rlim64)
   1606{
   1607#if BITS_PER_LONG < 64
   1608	return rlim64 >= ULONG_MAX;
   1609#else
   1610	return rlim64 == RLIM64_INFINITY;
   1611#endif
   1612}
   1613
   1614static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
   1615{
   1616	if (rlim->rlim_cur == RLIM_INFINITY)
   1617		rlim64->rlim_cur = RLIM64_INFINITY;
   1618	else
   1619		rlim64->rlim_cur = rlim->rlim_cur;
   1620	if (rlim->rlim_max == RLIM_INFINITY)
   1621		rlim64->rlim_max = RLIM64_INFINITY;
   1622	else
   1623		rlim64->rlim_max = rlim->rlim_max;
   1624}
   1625
   1626static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
   1627{
   1628	if (rlim64_is_infinity(rlim64->rlim_cur))
   1629		rlim->rlim_cur = RLIM_INFINITY;
   1630	else
   1631		rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
   1632	if (rlim64_is_infinity(rlim64->rlim_max))
   1633		rlim->rlim_max = RLIM_INFINITY;
   1634	else
   1635		rlim->rlim_max = (unsigned long)rlim64->rlim_max;
   1636}
   1637
   1638/* rcu lock must be held */
   1639static int check_prlimit_permission(struct task_struct *task,
   1640				    unsigned int flags)
   1641{
   1642	const struct cred *cred = current_cred(), *tcred;
   1643	bool id_match;
   1644
   1645	if (current == task)
   1646		return 0;
   1647
   1648	tcred = __task_cred(task);
   1649	id_match = (uid_eq(cred->uid, tcred->euid) &&
   1650		    uid_eq(cred->uid, tcred->suid) &&
   1651		    uid_eq(cred->uid, tcred->uid)  &&
   1652		    gid_eq(cred->gid, tcred->egid) &&
   1653		    gid_eq(cred->gid, tcred->sgid) &&
   1654		    gid_eq(cred->gid, tcred->gid));
   1655	if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
   1656		return -EPERM;
   1657
   1658	return security_task_prlimit(cred, tcred, flags);
   1659}
   1660
   1661SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
   1662		const struct rlimit64 __user *, new_rlim,
   1663		struct rlimit64 __user *, old_rlim)
   1664{
   1665	struct rlimit64 old64, new64;
   1666	struct rlimit old, new;
   1667	struct task_struct *tsk;
   1668	unsigned int checkflags = 0;
   1669	int ret;
   1670
   1671	if (old_rlim)
   1672		checkflags |= LSM_PRLIMIT_READ;
   1673
   1674	if (new_rlim) {
   1675		if (copy_from_user(&new64, new_rlim, sizeof(new64)))
   1676			return -EFAULT;
   1677		rlim64_to_rlim(&new64, &new);
   1678		checkflags |= LSM_PRLIMIT_WRITE;
   1679	}
   1680
   1681	rcu_read_lock();
   1682	tsk = pid ? find_task_by_vpid(pid) : current;
   1683	if (!tsk) {
   1684		rcu_read_unlock();
   1685		return -ESRCH;
   1686	}
   1687	ret = check_prlimit_permission(tsk, checkflags);
   1688	if (ret) {
   1689		rcu_read_unlock();
   1690		return ret;
   1691	}
   1692	get_task_struct(tsk);
   1693	rcu_read_unlock();
   1694
   1695	ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
   1696			old_rlim ? &old : NULL);
   1697
   1698	if (!ret && old_rlim) {
   1699		rlim_to_rlim64(&old, &old64);
   1700		if (copy_to_user(old_rlim, &old64, sizeof(old64)))
   1701			ret = -EFAULT;
   1702	}
   1703
   1704	put_task_struct(tsk);
   1705	return ret;
   1706}
   1707
   1708SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
   1709{
   1710	struct rlimit new_rlim;
   1711
   1712	if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
   1713		return -EFAULT;
   1714	return do_prlimit(current, resource, &new_rlim, NULL);
   1715}
   1716
   1717/*
   1718 * It would make sense to put struct rusage in the task_struct,
   1719 * except that would make the task_struct be *really big*.  After
   1720 * task_struct gets moved into malloc'ed memory, it would
   1721 * make sense to do this.  It will make moving the rest of the information
   1722 * a lot simpler!  (Which we're not doing right now because we're not
   1723 * measuring them yet).
   1724 *
   1725 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
   1726 * races with threads incrementing their own counters.  But since word
   1727 * reads are atomic, we either get new values or old values and we don't
   1728 * care which for the sums.  We always take the siglock to protect reading
   1729 * the c* fields from p->signal from races with exit.c updating those
   1730 * fields when reaping, so a sample either gets all the additions of a
   1731 * given child after it's reaped, or none so this sample is before reaping.
   1732 *
   1733 * Locking:
 * We need to take the siglock for CHILDREN, SELF and BOTH
 * for the cases: current multithreaded, non-current single threaded,
 * and non-current multithreaded.  Thread traversal is now safe with
 * the siglock held.
 * Strictly speaking, we do not need to take the siglock if we are current and
 * single threaded, as no one else can take our signal_struct away, no one
 * else can reap the children to update signal->c* counters, and no one else
 * can race with the signal-> fields. If we do not take any lock, the
 * signal-> fields could be read out of order while another thread was just
 * exiting. So we should place a read memory barrier when we avoid the lock.
 * On the writer side, a write memory barrier is implied in __exit_signal
 * as __exit_signal releases the siglock spinlock after updating the signal->
 * fields. But we don't do this yet to keep things simple.
   1747 *
   1748 */
   1749
   1750static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
   1751{
   1752	r->ru_nvcsw += t->nvcsw;
   1753	r->ru_nivcsw += t->nivcsw;
   1754	r->ru_minflt += t->min_flt;
   1755	r->ru_majflt += t->maj_flt;
   1756	r->ru_inblock += task_io_get_inblock(t);
   1757	r->ru_oublock += task_io_get_oublock(t);
   1758}
   1759
   1760void getrusage(struct task_struct *p, int who, struct rusage *r)
   1761{
   1762	struct task_struct *t;
   1763	unsigned long flags;
   1764	u64 tgutime, tgstime, utime, stime;
   1765	unsigned long maxrss = 0;
   1766
   1767	memset((char *)r, 0, sizeof (*r));
   1768	utime = stime = 0;
   1769
   1770	if (who == RUSAGE_THREAD) {
   1771		task_cputime_adjusted(current, &utime, &stime);
   1772		accumulate_thread_rusage(p, r);
   1773		maxrss = p->signal->maxrss;
   1774		goto out;
   1775	}
   1776
   1777	if (!lock_task_sighand(p, &flags))
   1778		return;
   1779
   1780	switch (who) {
   1781	case RUSAGE_BOTH:
   1782	case RUSAGE_CHILDREN:
   1783		utime = p->signal->cutime;
   1784		stime = p->signal->cstime;
   1785		r->ru_nvcsw = p->signal->cnvcsw;
   1786		r->ru_nivcsw = p->signal->cnivcsw;
   1787		r->ru_minflt = p->signal->cmin_flt;
   1788		r->ru_majflt = p->signal->cmaj_flt;
   1789		r->ru_inblock = p->signal->cinblock;
   1790		r->ru_oublock = p->signal->coublock;
   1791		maxrss = p->signal->cmaxrss;
   1792
   1793		if (who == RUSAGE_CHILDREN)
   1794			break;
   1795		fallthrough;
   1796
   1797	case RUSAGE_SELF:
   1798		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
   1799		utime += tgutime;
   1800		stime += tgstime;
   1801		r->ru_nvcsw += p->signal->nvcsw;
   1802		r->ru_nivcsw += p->signal->nivcsw;
   1803		r->ru_minflt += p->signal->min_flt;
   1804		r->ru_majflt += p->signal->maj_flt;
   1805		r->ru_inblock += p->signal->inblock;
   1806		r->ru_oublock += p->signal->oublock;
   1807		if (maxrss < p->signal->maxrss)
   1808			maxrss = p->signal->maxrss;
   1809		t = p;
   1810		do {
   1811			accumulate_thread_rusage(t, r);
   1812		} while_each_thread(p, t);
   1813		break;
   1814
   1815	default:
   1816		BUG();
   1817	}
   1818	unlock_task_sighand(p, &flags);
   1819
   1820out:
   1821	r->ru_utime = ns_to_kernel_old_timeval(utime);
   1822	r->ru_stime = ns_to_kernel_old_timeval(stime);
   1823
   1824	if (who != RUSAGE_CHILDREN) {
   1825		struct mm_struct *mm = get_task_mm(p);
   1826
   1827		if (mm) {
   1828			setmax_mm_hiwater_rss(&maxrss, mm);
   1829			mmput(mm);
   1830		}
   1831	}
   1832	r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
   1833}
   1834
   1835SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
   1836{
   1837	struct rusage r;
   1838
   1839	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
   1840	    who != RUSAGE_THREAD)
   1841		return -EINVAL;
   1842
   1843	getrusage(current, who, &r);
   1844	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
   1845}
   1846
   1847#ifdef CONFIG_COMPAT
   1848COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
   1849{
   1850	struct rusage r;
   1851
   1852	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
   1853	    who != RUSAGE_THREAD)
   1854		return -EINVAL;
   1855
   1856	getrusage(current, who, &r);
   1857	return put_compat_rusage(&r, ru);
   1858}
   1859#endif
   1860
   1861SYSCALL_DEFINE1(umask, int, mask)
   1862{
   1863	mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
   1864	return mask;
   1865}
   1866
   1867static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
   1868{
   1869	struct fd exe;
   1870	struct inode *inode;
   1871	int err;
   1872
   1873	exe = fdget(fd);
   1874	if (!exe.file)
   1875		return -EBADF;
   1876
   1877	inode = file_inode(exe.file);
   1878
   1879	/*
   1880	 * Because the original mm->exe_file points to executable file, make
   1881	 * sure that this one is executable as well, to avoid breaking an
   1882	 * overall picture.
   1883	 */
   1884	err = -EACCES;
   1885	if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
   1886		goto exit;
   1887
   1888	err = file_permission(exe.file, MAY_EXEC);
   1889	if (err)
   1890		goto exit;
   1891
   1892	err = replace_mm_exe_file(mm, exe.file);
   1893exit:
   1894	fdput(exe);
   1895	return err;
   1896}
   1897
   1898/*
   1899 * Check arithmetic relations of passed addresses.
   1900 *
   1901 * WARNING: we don't require any capability here so be very careful
   1902 * in what is allowed for modification from userspace.
   1903 */
   1904static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
   1905{
   1906	unsigned long mmap_max_addr = TASK_SIZE;
   1907	int error = -EINVAL, i;
   1908
   1909	static const unsigned char offsets[] = {
   1910		offsetof(struct prctl_mm_map, start_code),
   1911		offsetof(struct prctl_mm_map, end_code),
   1912		offsetof(struct prctl_mm_map, start_data),
   1913		offsetof(struct prctl_mm_map, end_data),
   1914		offsetof(struct prctl_mm_map, start_brk),
   1915		offsetof(struct prctl_mm_map, brk),
   1916		offsetof(struct prctl_mm_map, start_stack),
   1917		offsetof(struct prctl_mm_map, arg_start),
   1918		offsetof(struct prctl_mm_map, arg_end),
   1919		offsetof(struct prctl_mm_map, env_start),
   1920		offsetof(struct prctl_mm_map, env_end),
   1921	};
   1922
   1923	/*
   1924	 * Make sure the members are not somewhere outside
   1925	 * of allowed address space.
   1926	 */
   1927	for (i = 0; i < ARRAY_SIZE(offsets); i++) {
   1928		u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
   1929
   1930		if ((unsigned long)val >= mmap_max_addr ||
   1931		    (unsigned long)val < mmap_min_addr)
   1932			goto out;
   1933	}
   1934
   1935	/*
   1936	 * Make sure the pairs are ordered.
   1937	 */
   1938#define __prctl_check_order(__m1, __op, __m2)				\
   1939	((unsigned long)prctl_map->__m1 __op				\
   1940	 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
   1941	error  = __prctl_check_order(start_code, <, end_code);
   1942	error |= __prctl_check_order(start_data,<=, end_data);
   1943	error |= __prctl_check_order(start_brk, <=, brk);
   1944	error |= __prctl_check_order(arg_start, <=, arg_end);
   1945	error |= __prctl_check_order(env_start, <=, env_end);
   1946	if (error)
   1947		goto out;
   1948#undef __prctl_check_order
   1949
   1950	error = -EINVAL;
   1951
   1952	/*
   1953	 * Neither we should allow to override limits if they set.
   1954	 */
   1955	if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
   1956			      prctl_map->start_brk, prctl_map->end_data,
   1957			      prctl_map->start_data))
   1958			goto out;
   1959
   1960	error = 0;
   1961out:
   1962	return error;
   1963}
   1964
   1965#ifdef CONFIG_CHECKPOINT_RESTORE
   1966static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
   1967{
   1968	struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
   1969	unsigned long user_auxv[AT_VECTOR_SIZE];
   1970	struct mm_struct *mm = current->mm;
   1971	int error;
   1972
   1973	BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
   1974	BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
   1975
   1976	if (opt == PR_SET_MM_MAP_SIZE)
   1977		return put_user((unsigned int)sizeof(prctl_map),
   1978				(unsigned int __user *)addr);
   1979
   1980	if (data_size != sizeof(prctl_map))
   1981		return -EINVAL;
   1982
   1983	if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
   1984		return -EFAULT;
   1985
   1986	error = validate_prctl_map_addr(&prctl_map);
   1987	if (error)
   1988		return error;
   1989
   1990	if (prctl_map.auxv_size) {
   1991		/*
   1992		 * Someone is trying to cheat the auxv vector.
   1993		 */
   1994		if (!prctl_map.auxv ||
   1995				prctl_map.auxv_size > sizeof(mm->saved_auxv))
   1996			return -EINVAL;
   1997
   1998		memset(user_auxv, 0, sizeof(user_auxv));
   1999		if (copy_from_user(user_auxv,
   2000				   (const void __user *)prctl_map.auxv,
   2001				   prctl_map.auxv_size))
   2002			return -EFAULT;
   2003
   2004		/* Last entry must be AT_NULL as specification requires */
   2005		user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
   2006		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
   2007	}
   2008
   2009	if (prctl_map.exe_fd != (u32)-1) {
   2010		/*
   2011		 * Check if the current user is checkpoint/restore capable.
   2012		 * At the time of this writing, it checks for CAP_SYS_ADMIN
   2013		 * or CAP_CHECKPOINT_RESTORE.
   2014		 * Note that a user with access to ptrace can masquerade an
   2015		 * arbitrary program as any executable, even setuid ones.
   2016		 * This may have implications in the tomoyo subsystem.
   2017		 */
   2018		if (!checkpoint_restore_ns_capable(current_user_ns()))
   2019			return -EPERM;
   2020
   2021		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
   2022		if (error)
   2023			return error;
   2024	}
   2025
   2026	/*
   2027	 * arg_lock protects concurrent updates but we still need mmap_lock for
   2028	 * read to exclude races with sys_brk.
   2029	 */
   2030	mmap_read_lock(mm);
   2031
   2032	/*
   2033	 * We don't validate if these members are pointing to
   2034	 * real present VMAs because application may have correspond
   2035	 * VMAs already unmapped and kernel uses these members for statistics
   2036	 * output in procfs mostly, except
   2037	 *
   2038	 *  - @start_brk/@brk which are used in do_brk_flags but kernel lookups
   2039	 *    for VMAs when updating these members so anything wrong written
   2040	 *    here cause kernel to swear at userspace program but won't lead
   2041	 *    to any problem in kernel itself
   2042	 */
   2043
   2044	spin_lock(&mm->arg_lock);
   2045	mm->start_code	= prctl_map.start_code;
   2046	mm->end_code	= prctl_map.end_code;
   2047	mm->start_data	= prctl_map.start_data;
   2048	mm->end_data	= prctl_map.end_data;
   2049	mm->start_brk	= prctl_map.start_brk;
   2050	mm->brk		= prctl_map.brk;
   2051	mm->start_stack	= prctl_map.start_stack;
   2052	mm->arg_start	= prctl_map.arg_start;
   2053	mm->arg_end	= prctl_map.arg_end;
   2054	mm->env_start	= prctl_map.env_start;
   2055	mm->env_end	= prctl_map.env_end;
   2056	spin_unlock(&mm->arg_lock);
   2057
   2058	/*
   2059	 * Note this update of @saved_auxv is lockless thus
   2060	 * if someone reads this member in procfs while we're
   2061	 * updating -- it may get partly updated results. It's
   2062	 * known and acceptable trade off: we leave it as is to
   2063	 * not introduce additional locks here making the kernel
   2064	 * more complex.
   2065	 */
   2066	if (prctl_map.auxv_size)
   2067		memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
   2068
   2069	mmap_read_unlock(mm);
   2070	return 0;
   2071}
   2072#endif /* CONFIG_CHECKPOINT_RESTORE */
   2073
   2074static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
   2075			  unsigned long len)
   2076{
   2077	/*
   2078	 * This doesn't move the auxiliary vector itself since it's pinned to
   2079	 * mm_struct, but it permits filling the vector with new values.  It's
   2080	 * up to the caller to provide sane values here, otherwise userspace
   2081	 * tools which use this vector might be unhappy.
   2082	 */
   2083	unsigned long user_auxv[AT_VECTOR_SIZE] = {};
   2084
   2085	if (len > sizeof(user_auxv))
   2086		return -EINVAL;
   2087
   2088	if (copy_from_user(user_auxv, (const void __user *)addr, len))
   2089		return -EFAULT;
   2090
   2091	/* Make sure the last entry is always AT_NULL */
   2092	user_auxv[AT_VECTOR_SIZE - 2] = 0;
   2093	user_auxv[AT_VECTOR_SIZE - 1] = 0;
   2094
   2095	BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
   2096
   2097	task_lock(current);
   2098	memcpy(mm->saved_auxv, user_auxv, len);
   2099	task_unlock(current);
   2100
   2101	return 0;
   2102}
   2103
/*
 * Handle prctl(PR_SET_MM, opt, addr, arg4, arg5) for the calling task.
 *
 * @opt:  PR_SET_MM_* sub-command selecting what to update
 * @addr: new address value (or, for PR_SET_MM_EXE_FILE, a file descriptor;
 *        for PR_SET_MM_AUXV / PR_SET_MM_MAP*, a userspace pointer)
 * @arg4: length argument; only accepted non-zero for PR_SET_MM_AUXV,
 *        PR_SET_MM_MAP and PR_SET_MM_MAP_SIZE
 * @arg5: must be zero
 *
 * For the single-field options the current layout is snapshotted into a
 * struct prctl_mm_map, the requested field is replaced, the whole map is
 * validated, and only then is it written back to current->mm.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EPERM, -EFAULT, or
 * whatever the delegated helper returns).
 */
static int prctl_set_mm(int opt, unsigned long addr,
			unsigned long arg4, unsigned long arg5)
{
	struct mm_struct *mm = current->mm;
	struct prctl_mm_map prctl_map = {
		.auxv = NULL,
		.auxv_size = 0,
		.exe_fd = -1,
	};
	struct vm_area_struct *vma;
	int error;

	/* arg5 is never used; arg4 only by the AUXV and MAP sub-commands */
	if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
			      opt != PR_SET_MM_MAP &&
			      opt != PR_SET_MM_MAP_SIZE)))
		return -EINVAL;

#ifdef CONFIG_CHECKPOINT_RESTORE
	/*
	 * The MAP sub-commands are dispatched before the CAP_SYS_RESOURCE
	 * test below, so that test does not apply to them here.
	 */
	if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
		return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
#endif

	if (!capable(CAP_SYS_RESOURCE))
		return -EPERM;

	/* here @addr carries a file descriptor, not an address */
	if (opt == PR_SET_MM_EXE_FILE)
		return prctl_set_mm_exe_file(mm, (unsigned int)addr);

	if (opt == PR_SET_MM_AUXV)
		return prctl_set_auxv(mm, addr, arg4);

	/* everything below treats @addr as a user virtual address */
	if (addr >= TASK_SIZE || addr < mmap_min_addr)
		return -EINVAL;

	error = -EINVAL;

	/*
	 * arg_lock protects concurrent updates of arg boundaries, we need
	 * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
	 * validation.
	 */
	mmap_read_lock(mm);
	vma = find_vma(mm, addr);

	/* snapshot the current layout so a single field can be swapped in */
	spin_lock(&mm->arg_lock);
	prctl_map.start_code	= mm->start_code;
	prctl_map.end_code	= mm->end_code;
	prctl_map.start_data	= mm->start_data;
	prctl_map.end_data	= mm->end_data;
	prctl_map.start_brk	= mm->start_brk;
	prctl_map.brk		= mm->brk;
	prctl_map.start_stack	= mm->start_stack;
	prctl_map.arg_start	= mm->arg_start;
	prctl_map.arg_end	= mm->arg_end;
	prctl_map.env_start	= mm->env_start;
	prctl_map.env_end	= mm->env_end;

	/* replace exactly one field; unknown options leave with -EINVAL */
	switch (opt) {
	case PR_SET_MM_START_CODE:
		prctl_map.start_code = addr;
		break;
	case PR_SET_MM_END_CODE:
		prctl_map.end_code = addr;
		break;
	case PR_SET_MM_START_DATA:
		prctl_map.start_data = addr;
		break;
	case PR_SET_MM_END_DATA:
		prctl_map.end_data = addr;
		break;
	case PR_SET_MM_START_STACK:
		prctl_map.start_stack = addr;
		break;
	case PR_SET_MM_START_BRK:
		prctl_map.start_brk = addr;
		break;
	case PR_SET_MM_BRK:
		prctl_map.brk = addr;
		break;
	case PR_SET_MM_ARG_START:
		prctl_map.arg_start = addr;
		break;
	case PR_SET_MM_ARG_END:
		prctl_map.arg_end = addr;
		break;
	case PR_SET_MM_ENV_START:
		prctl_map.env_start = addr;
		break;
	case PR_SET_MM_ENV_END:
		prctl_map.env_end = addr;
		break;
	default:
		goto out;
	}

	/* the modified map as a whole must still pass validation */
	error = validate_prctl_map_addr(&prctl_map);
	if (error)
		goto out;

	/*
	 * Deliberately no default case: only the stack/arg/env options
	 * additionally require find_vma() to have returned a VMA for @addr.
	 */
	switch (opt) {
	/*
	 * If command line arguments and environment
	 * are placed somewhere else on stack, we can
	 * set them up here, ARG_START/END to setup
	 * command line arguments and ENV_START/END
	 * for environment.
	 */
	case PR_SET_MM_START_STACK:
	case PR_SET_MM_ARG_START:
	case PR_SET_MM_ARG_END:
	case PR_SET_MM_ENV_START:
	case PR_SET_MM_ENV_END:
		if (!vma) {
			error = -EFAULT;
			goto out;
		}
	}

	/* commit: write the validated snapshot back, still under arg_lock */
	mm->start_code	= prctl_map.start_code;
	mm->end_code	= prctl_map.end_code;
	mm->start_data	= prctl_map.start_data;
	mm->end_data	= prctl_map.end_data;
	mm->start_brk	= prctl_map.start_brk;
	mm->brk		= prctl_map.brk;
	mm->start_stack	= prctl_map.start_stack;
	mm->arg_start	= prctl_map.arg_start;
	mm->arg_end	= prctl_map.arg_end;
	mm->env_start	= prctl_map.env_start;
	mm->env_end	= prctl_map.env_end;

	error = 0;
out:
	/* locks are released in the reverse order of acquisition */
	spin_unlock(&mm->arg_lock);
	mmap_read_unlock(mm);
	return error;
}
   2240
/*
 * PR_GET_TID_ADDRESS helper: report @me's clear_child_tid pointer to
 * userspace.  Only functional when CONFIG_CHECKPOINT_RESTORE is enabled;
 * otherwise the request is rejected with -EINVAL.
 */
#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
	/* stores the pointer value itself at @tid_addr; returns put_user()'s result */
	return put_user(me->clear_child_tid, tid_addr);
}
#else
static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
	return -EINVAL;
}
#endif
   2252
   2253static int propagate_has_child_subreaper(struct task_struct *p, void *data)
   2254{
   2255	/*
   2256	 * If task has has_child_subreaper - all its descendants
   2257	 * already have these flag too and new descendants will
   2258	 * inherit it on fork, skip them.
   2259	 *
   2260	 * If we've found child_reaper - skip descendants in
   2261	 * it's subtree as they will never get out pidns.
   2262	 */
   2263	if (p->signal->has_child_subreaper ||
   2264	    is_child_reaper(task_pid(p)))
   2265		return 0;
   2266
   2267	p->signal->has_child_subreaper = 1;
   2268	return 1;
   2269}
   2270
/*
 * Weak default backend for PR_GET_SPECULATION_CTRL: rejects every request
 * with -EINVAL.  Being __weak, it is presumably replaced by an
 * architecture-specific definition where one exists.
 */
int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
{
	return -EINVAL;
}
   2275
/*
 * Weak default backend for PR_SET_SPECULATION_CTRL: rejects every request
 * with -EINVAL.  Being __weak, it is presumably replaced by an
 * architecture-specific definition where one exists.
 */
int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
				    unsigned long ctrl)
{
	return -EINVAL;
}
   2281
/*
 * Task flag pair handled by PR_SET_IO_FLUSHER / PR_GET_IO_FLUSHER: both
 * bits are set or cleared together in current->flags, and the getter
 * reports 1 only when both of them are present.
 */
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
   2283
   2284#ifdef CONFIG_ANON_VMA_NAME
   2285
   2286#define ANON_VMA_NAME_MAX_LEN		80
   2287#define ANON_VMA_NAME_INVALID_CHARS	"\\`$[]"
   2288
   2289static inline bool is_valid_name_char(char ch)
   2290{
   2291	/* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
   2292	return ch > 0x1f && ch < 0x7f &&
   2293		!strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
   2294}
   2295
   2296static int prctl_set_vma(unsigned long opt, unsigned long addr,
   2297			 unsigned long size, unsigned long arg)
   2298{
   2299	struct mm_struct *mm = current->mm;
   2300	const char __user *uname;
   2301	struct anon_vma_name *anon_name = NULL;
   2302	int error;
   2303
   2304	switch (opt) {
   2305	case PR_SET_VMA_ANON_NAME:
   2306		uname = (const char __user *)arg;
   2307		if (uname) {
   2308			char *name, *pch;
   2309
   2310			name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
   2311			if (IS_ERR(name))
   2312				return PTR_ERR(name);
   2313
   2314			for (pch = name; *pch != '\0'; pch++) {
   2315				if (!is_valid_name_char(*pch)) {
   2316					kfree(name);
   2317					return -EINVAL;
   2318				}
   2319			}
   2320			/* anon_vma has its own copy */
   2321			anon_name = anon_vma_name_alloc(name);
   2322			kfree(name);
   2323			if (!anon_name)
   2324				return -ENOMEM;
   2325
   2326		}
   2327
   2328		mmap_write_lock(mm);
   2329		error = madvise_set_anon_name(mm, addr, size, anon_name);
   2330		mmap_write_unlock(mm);
   2331		anon_vma_name_put(anon_name);
   2332		break;
   2333	default:
   2334		error = -EINVAL;
   2335	}
   2336
   2337	return error;
   2338}
   2339
   2340#else /* CONFIG_ANON_VMA_NAME */
/*
 * PR_SET_VMA fallback for kernels built without CONFIG_ANON_VMA_NAME:
 * every request is rejected with -EINVAL.
 */
static int prctl_set_vma(unsigned long opt, unsigned long start,
			 unsigned long size, unsigned long arg)
{
	return -EINVAL;
}
   2346#endif /* CONFIG_ANON_VMA_NAME */
   2347
/*
 * prctl() system call: dispatch on @option to one of many per-task or
 * per-mm controls of the calling process.
 *
 * @option: PR_* request code
 * @arg2..@arg5: request-specific arguments; several requests insist that
 *               the arguments they do not use are zero
 *
 * The security hook is consulted first and any answer other than -ENOSYS
 * is returned unchanged.  Otherwise the return value is request-specific:
 * some getters return their result directly as a non-negative value,
 * others store it through a user pointer and return 0, and failures are
 * reported as a negative errno.  Unknown options yield -EINVAL.
 *
 * NOTE(review): the SET_*_CTL/GET_*_CTL, *_ENDIAN, *_FP_MODE, SVE_*, SME_*,
 * PAC_* and *_TAGGED_ADDR_CTRL names used below are not defined in this
 * part of the file; presumably they are arch-overridable macros -- confirm
 * against their definitions.
 */
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
		unsigned long, arg4, unsigned long, arg5)
{
	struct task_struct *me = current;
	unsigned char comm[sizeof(me->comm)];
	long error;

	/* -ENOSYS means "no opinion"; anything else is the final result */
	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
	if (error != -ENOSYS)
		return error;

	error = 0;
	switch (option) {
	case PR_SET_PDEATHSIG:
		if (!valid_signal(arg2)) {
			error = -EINVAL;
			break;
		}
		me->pdeath_signal = arg2;
		break;
	case PR_GET_PDEATHSIG:
		error = put_user(me->pdeath_signal, (int __user *)arg2);
		break;
	case PR_GET_DUMPABLE:
		/* result is returned directly rather than via a pointer */
		error = get_dumpable(me->mm);
		break;
	case PR_SET_DUMPABLE:
		/* only these two values may be requested through prctl */
		if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
			error = -EINVAL;
			break;
		}
		set_dumpable(me->mm, arg2);
		break;

	case PR_SET_UNALIGN:
		error = SET_UNALIGN_CTL(me, arg2);
		break;
	case PR_GET_UNALIGN:
		error = GET_UNALIGN_CTL(me, arg2);
		break;
	case PR_SET_FPEMU:
		error = SET_FPEMU_CTL(me, arg2);
		break;
	case PR_GET_FPEMU:
		error = GET_FPEMU_CTL(me, arg2);
		break;
	case PR_SET_FPEXC:
		error = SET_FPEXC_CTL(me, arg2);
		break;
	case PR_GET_FPEXC:
		error = GET_FPEXC_CTL(me, arg2);
		break;
	case PR_GET_TIMING:
		/* statistical timing is the only mode ever reported ... */
		error = PR_TIMING_STATISTICAL;
		break;
	case PR_SET_TIMING:
		/* ... and the only mode that may be selected */
		if (arg2 != PR_TIMING_STATISTICAL)
			error = -EINVAL;
		break;
	case PR_SET_NAME:
		/*
		 * Terminate the buffer up front; at most sizeof - 1 bytes
		 * are copied, so the final byte stays NUL.
		 */
		comm[sizeof(me->comm) - 1] = 0;
		if (strncpy_from_user(comm, (char __user *)arg2,
				      sizeof(me->comm) - 1) < 0)
			return -EFAULT;
		set_task_comm(me, comm);
		proc_comm_connector(me);
		break;
	case PR_GET_NAME:
		/* the whole comm buffer is copied out, not just the string */
		get_task_comm(comm, me);
		if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
			return -EFAULT;
		break;
	case PR_GET_ENDIAN:
		error = GET_ENDIAN(me, arg2);
		break;
	case PR_SET_ENDIAN:
		error = SET_ENDIAN(me, arg2);
		break;
	case PR_GET_SECCOMP:
		error = prctl_get_seccomp();
		break;
	case PR_SET_SECCOMP:
		error = prctl_set_seccomp(arg2, (char __user *)arg3);
		break;
	case PR_GET_TSC:
		error = GET_TSC_CTL(arg2);
		break;
	case PR_SET_TSC:
		error = SET_TSC_CTL(arg2);
		break;
	case PR_TASK_PERF_EVENTS_DISABLE:
		error = perf_event_task_disable();
		break;
	case PR_TASK_PERF_EVENTS_ENABLE:
		error = perf_event_task_enable();
		break;
	case PR_GET_TIMERSLACK:
		/* returned directly; clamped to ULONG_MAX if it is larger */
		if (current->timer_slack_ns > ULONG_MAX)
			error = ULONG_MAX;
		else
			error = current->timer_slack_ns;
		break;
	case PR_SET_TIMERSLACK:
		/*
		 * arg2 is unsigned, so this test is effectively arg2 == 0:
		 * zero restores the task's default slack.
		 */
		if (arg2 <= 0)
			current->timer_slack_ns =
					current->default_timer_slack_ns;
		else
			current->timer_slack_ns = arg2;
		break;
	case PR_MCE_KILL:
		if (arg4 | arg5)
			return -EINVAL;
		switch (arg2) {
		case PR_MCE_KILL_CLEAR:
			if (arg3 != 0)
				return -EINVAL;
			current->flags &= ~PF_MCE_PROCESS;
			break;
		case PR_MCE_KILL_SET:
			/*
			 * PF_MCE_PROCESS is set before arg3 is examined, so
			 * it remains set on the -EINVAL path below; the
			 * DEFAULT policy clears both flags again.
			 */
			current->flags |= PF_MCE_PROCESS;
			if (arg3 == PR_MCE_KILL_EARLY)
				current->flags |= PF_MCE_EARLY;
			else if (arg3 == PR_MCE_KILL_LATE)
				current->flags &= ~PF_MCE_EARLY;
			else if (arg3 == PR_MCE_KILL_DEFAULT)
				current->flags &=
						~(PF_MCE_EARLY|PF_MCE_PROCESS);
			else
				return -EINVAL;
			break;
		default:
			return -EINVAL;
		}
		break;
	case PR_MCE_KILL_GET:
		if (arg2 | arg3 | arg4 | arg5)
			return -EINVAL;
		/* without PF_MCE_PROCESS the task is on the default policy */
		if (current->flags & PF_MCE_PROCESS)
			error = (current->flags & PF_MCE_EARLY) ?
				PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
		else
			error = PR_MCE_KILL_DEFAULT;
		break;
	case PR_SET_MM:
		error = prctl_set_mm(arg2, arg3, arg4, arg5);
		break;
	case PR_GET_TID_ADDRESS:
		error = prctl_get_tid_address(me, (int __user * __user *)arg2);
		break;
	case PR_SET_CHILD_SUBREAPER:
		me->signal->is_child_subreaper = !!arg2;
		/* clearing the attribute does not walk the process tree */
		if (!arg2)
			break;

		walk_process_tree(me, propagate_has_child_subreaper, NULL);
		break;
	case PR_GET_CHILD_SUBREAPER:
		error = put_user(me->signal->is_child_subreaper,
				 (int __user *)arg2);
		break;
	case PR_SET_NO_NEW_PRIVS:
		/* only arg2 == 1 is accepted: no request here clears the bit */
		if (arg2 != 1 || arg3 || arg4 || arg5)
			return -EINVAL;

		task_set_no_new_privs(current);
		break;
	case PR_GET_NO_NEW_PRIVS:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		return task_no_new_privs(current) ? 1 : 0;
	case PR_GET_THP_DISABLE:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
		break;
	case PR_SET_THP_DISABLE:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		/* the flag is flipped under the mmap write lock */
		if (mmap_write_lock_killable(me->mm))
			return -EINTR;
		if (arg2)
			set_bit(MMF_DISABLE_THP, &me->mm->flags);
		else
			clear_bit(MMF_DISABLE_THP, &me->mm->flags);
		mmap_write_unlock(me->mm);
		break;
	case PR_MPX_ENABLE_MANAGEMENT:
	case PR_MPX_DISABLE_MANAGEMENT:
		/* No longer implemented: */
		return -EINVAL;
	case PR_SET_FP_MODE:
		error = SET_FP_MODE(me, arg2);
		break;
	case PR_GET_FP_MODE:
		error = GET_FP_MODE(me);
		break;
	case PR_SVE_SET_VL:
		error = SVE_SET_VL(arg2);
		break;
	case PR_SVE_GET_VL:
		error = SVE_GET_VL();
		break;
	case PR_SME_SET_VL:
		error = SME_SET_VL(arg2);
		break;
	case PR_SME_GET_VL:
		error = SME_GET_VL();
		break;
	case PR_GET_SPECULATION_CTRL:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		error = arch_prctl_spec_ctrl_get(me, arg2);
		break;
	case PR_SET_SPECULATION_CTRL:
		if (arg4 || arg5)
			return -EINVAL;
		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
		break;
	case PR_PAC_RESET_KEYS:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		error = PAC_RESET_KEYS(me, arg2);
		break;
	case PR_PAC_SET_ENABLED_KEYS:
		if (arg4 || arg5)
			return -EINVAL;
		error = PAC_SET_ENABLED_KEYS(me, arg2, arg3);
		break;
	case PR_PAC_GET_ENABLED_KEYS:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = PAC_GET_ENABLED_KEYS(me);
		break;
	case PR_SET_TAGGED_ADDR_CTRL:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		error = SET_TAGGED_ADDR_CTRL(arg2);
		break;
	case PR_GET_TAGGED_ADDR_CTRL:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = GET_TAGGED_ADDR_CTRL();
		break;
	case PR_SET_IO_FLUSHER:
		/* the capability is checked before the arguments are */
		if (!capable(CAP_SYS_RESOURCE))
			return -EPERM;

		if (arg3 || arg4 || arg5)
			return -EINVAL;

		if (arg2 == 1)
			current->flags |= PR_IO_FLUSHER;
		else if (!arg2)
			current->flags &= ~PR_IO_FLUSHER;
		else
			return -EINVAL;
		break;
	case PR_GET_IO_FLUSHER:
		if (!capable(CAP_SYS_RESOURCE))
			return -EPERM;

		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;

		/* 1 only when every bit of PR_IO_FLUSHER is set */
		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
		break;
	case PR_SET_SYSCALL_USER_DISPATCH:
		error = set_syscall_user_dispatch(arg2, arg3, arg4,
						  (char __user *) arg5);
		break;
#ifdef CONFIG_SCHED_CORE
	case PR_SCHED_CORE:
		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
		break;
#endif
	case PR_SET_VMA:
		error = prctl_set_vma(arg2, arg3, arg4, arg5);
		break;
	default:
		error = -EINVAL;
		break;
	}
	return error;
}
   2632
   2633SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
   2634		struct getcpu_cache __user *, unused)
   2635{
   2636	int err = 0;
   2637	int cpu = raw_smp_processor_id();
   2638
   2639	if (cpup)
   2640		err |= put_user(cpu, cpup);
   2641	if (nodep)
   2642		err |= put_user(cpu_to_node(cpu), nodep);
   2643	return err ? -EFAULT : 0;
   2644}
   2645
   2646/**
   2647 * do_sysinfo - fill in sysinfo struct
   2648 * @info: pointer to buffer to fill
   2649 */
   2650static int do_sysinfo(struct sysinfo *info)
   2651{
   2652	unsigned long mem_total, sav_total;
   2653	unsigned int mem_unit, bitcount;
   2654	struct timespec64 tp;
   2655
   2656	memset(info, 0, sizeof(struct sysinfo));
   2657
   2658	ktime_get_boottime_ts64(&tp);
   2659	timens_add_boottime(&tp);
   2660	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
   2661
   2662	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
   2663
   2664	info->procs = nr_threads;
   2665
   2666	si_meminfo(info);
   2667	si_swapinfo(info);
   2668
   2669	/*
   2670	 * If the sum of all the available memory (i.e. ram + swap)
   2671	 * is less than can be stored in a 32 bit unsigned long then
   2672	 * we can be binary compatible with 2.2.x kernels.  If not,
   2673	 * well, in that case 2.2.x was broken anyways...
   2674	 *
   2675	 *  -Erik Andersen <andersee@debian.org>
   2676	 */
   2677
   2678	mem_total = info->totalram + info->totalswap;
   2679	if (mem_total < info->totalram || mem_total < info->totalswap)
   2680		goto out;
   2681	bitcount = 0;
   2682	mem_unit = info->mem_unit;
   2683	while (mem_unit > 1) {
   2684		bitcount++;
   2685		mem_unit >>= 1;
   2686		sav_total = mem_total;
   2687		mem_total <<= 1;
   2688		if (mem_total < sav_total)
   2689			goto out;
   2690	}
   2691
   2692	/*
   2693	 * If mem_total did not overflow, multiply all memory values by
   2694	 * info->mem_unit and set it to 1.  This leaves things compatible
   2695	 * with 2.2.x, and also retains compatibility with earlier 2.4.x
   2696	 * kernels...
   2697	 */
   2698
   2699	info->mem_unit = 1;
   2700	info->totalram <<= bitcount;
   2701	info->freeram <<= bitcount;
   2702	info->sharedram <<= bitcount;
   2703	info->bufferram <<= bitcount;
   2704	info->totalswap <<= bitcount;
   2705	info->freeswap <<= bitcount;
   2706	info->totalhigh <<= bitcount;
   2707	info->freehigh <<= bitcount;
   2708
   2709out:
   2710	return 0;
   2711}
   2712
   2713SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
   2714{
   2715	struct sysinfo val;
   2716
   2717	do_sysinfo(&val);
   2718
   2719	if (copy_to_user(info, &val, sizeof(struct sysinfo)))
   2720		return -EFAULT;
   2721
   2722	return 0;
   2723}
   2724
   2725#ifdef CONFIG_COMPAT
/*
 * Layout of the sysinfo result handed to compat callers.  The compat
 * sysinfo handler in this file zeroes the whole structure and then copies
 * each named field from the native struct sysinfo, narrowing it to the
 * fixed-width type declared here.
 */
struct compat_sysinfo {
	s32 uptime;	/* from sysinfo.uptime */
	u32 loads[3];	/* from sysinfo.loads[0..2] */
	u32 totalram;	/* memory fields: in units of mem_unit */
	u32 freeram;
	u32 sharedram;
	u32 bufferram;
	u32 totalswap;
	u32 freeswap;
	u16 procs;	/* from sysinfo.procs */
	u16 pad;	/* never assigned by the handler; stays zero */
	u32 totalhigh;
	u32 freehigh;
	u32 mem_unit;	/* from sysinfo.mem_unit, possibly scaled up */
	/* trailing filler: 20 bytes less two u32 and one int */
	char _f[20-2*sizeof(u32)-sizeof(int)];
};
   2742
   2743COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
   2744{
   2745	struct sysinfo s;
   2746	struct compat_sysinfo s_32;
   2747
   2748	do_sysinfo(&s);
   2749
   2750	/* Check to see if any memory value is too large for 32-bit and scale
   2751	 *  down if needed
   2752	 */
   2753	if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
   2754		int bitcount = 0;
   2755
   2756		while (s.mem_unit < PAGE_SIZE) {
   2757			s.mem_unit <<= 1;
   2758			bitcount++;
   2759		}
   2760
   2761		s.totalram >>= bitcount;
   2762		s.freeram >>= bitcount;
   2763		s.sharedram >>= bitcount;
   2764		s.bufferram >>= bitcount;
   2765		s.totalswap >>= bitcount;
   2766		s.freeswap >>= bitcount;
   2767		s.totalhigh >>= bitcount;
   2768		s.freehigh >>= bitcount;
   2769	}
   2770
   2771	memset(&s_32, 0, sizeof(s_32));
   2772	s_32.uptime = s.uptime;
   2773	s_32.loads[0] = s.loads[0];
   2774	s_32.loads[1] = s.loads[1];
   2775	s_32.loads[2] = s.loads[2];
   2776	s_32.totalram = s.totalram;
   2777	s_32.freeram = s.freeram;
   2778	s_32.sharedram = s.sharedram;
   2779	s_32.bufferram = s.bufferram;
   2780	s_32.totalswap = s.totalswap;
   2781	s_32.freeswap = s.freeswap;
   2782	s_32.procs = s.procs;
   2783	s_32.totalhigh = s.totalhigh;
   2784	s_32.freehigh = s.freehigh;
   2785	s_32.mem_unit = s.mem_unit;
   2786	if (copy_to_user(info, &s_32, sizeof(s_32)))
   2787		return -EFAULT;
   2788	return 0;
   2789}
   2790#endif /* CONFIG_COMPAT */