cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

commoncap.c (43990B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/* Common capabilities, needed by capability.o.
      3 */
      4
      5#include <linux/capability.h>
      6#include <linux/audit.h>
      7#include <linux/init.h>
      8#include <linux/kernel.h>
      9#include <linux/lsm_hooks.h>
     10#include <linux/file.h>
     11#include <linux/mm.h>
     12#include <linux/mman.h>
     13#include <linux/pagemap.h>
     14#include <linux/swap.h>
     15#include <linux/skbuff.h>
     16#include <linux/netlink.h>
     17#include <linux/ptrace.h>
     18#include <linux/xattr.h>
     19#include <linux/hugetlb.h>
     20#include <linux/mount.h>
     21#include <linux/sched.h>
     22#include <linux/prctl.h>
     23#include <linux/securebits.h>
     24#include <linux/user_namespace.h>
     25#include <linux/binfmts.h>
     26#include <linux/personality.h>
     27#include <linux/mnt_idmapping.h>
     28
     29/*
     30 * If a non-root user executes a setuid-root binary in
     31 * !secure(SECURE_NOROOT) mode, then we raise capabilities.
     32 * However if fE is also set, then the intent is for only
     33 * the file capabilities to be applied, and the setuid-root
     34 * bit is left on either to change the uid (plausible) or
     35 * to get full privilege on a kernel without file capabilities
     36 * support.  So in that case we do not raise capabilities.
     37 *
     38 * Warn if that happens, once per boot.
     39 */
     40static void warn_setuid_and_fcaps_mixed(const char *fname)
     41{
     42	static int warned;
     43	if (!warned) {
     44		printk(KERN_INFO "warning: `%s' has both setuid-root and"
     45			" effective capabilities. Therefore not raising all"
     46			" capabilities.\n", fname);
     47		warned = 1;
     48	}
     49}
     50
     51/**
     52 * cap_capable - Determine whether a task has a particular effective capability
     53 * @cred: The credentials to use
     54 * @targ_ns:  The user namespace in which we need the capability
     55 * @cap: The capability to check for
     56 * @opts: Bitmask of options defined in include/linux/security.h
     57 *
     58 * Determine whether the nominated task has the specified capability amongst
     59 * its effective set, returning 0 if it does, -ve if it does not.
     60 *
     61 * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
     62 * and has_capability() functions.  That is, it has the reverse semantics:
     63 * cap_has_capability() returns 0 when a task has a capability, but the
     64 * kernel's capable() and has_capability() returns 1 for this case.
     65 */
     66int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
     67		int cap, unsigned int opts)
     68{
     69	struct user_namespace *ns = targ_ns;
     70
     71	/* See if cred has the capability in the target user namespace
     72	 * by examining the target user namespace and all of the target
     73	 * user namespace's parents.
     74	 */
     75	for (;;) {
     76		/* Do we have the necessary capabilities? */
     77		if (ns == cred->user_ns)
     78			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
     79
     80		/*
     81		 * If we're already at a lower level than we're looking for,
     82		 * we're done searching.
     83		 */
     84		if (ns->level <= cred->user_ns->level)
     85			return -EPERM;
     86
     87		/* 
     88		 * The owner of the user namespace in the parent of the
     89		 * user namespace has all caps.
     90		 */
     91		if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
     92			return 0;
     93
     94		/*
     95		 * If you have a capability in a parent user ns, then you have
     96		 * it over all children user namespaces as well.
     97		 */
     98		ns = ns->parent;
     99	}
    100
    101	/* We never get here */
    102}
    103
    104/**
    105 * cap_settime - Determine whether the current process may set the system clock
    106 * @ts: The time to set
    107 * @tz: The timezone to set
    108 *
    109 * Determine whether the current process may set the system clock and timezone
    110 * information, returning 0 if permission granted, -ve if denied.
    111 */
    112int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
    113{
    114	if (!capable(CAP_SYS_TIME))
    115		return -EPERM;
    116	return 0;
    117}
    118
    119/**
    120 * cap_ptrace_access_check - Determine whether the current process may access
    121 *			   another
    122 * @child: The process to be accessed
    123 * @mode: The mode of attachment.
    124 *
    125 * If we are in the same or an ancestor user_ns and have all the target
    126 * task's capabilities, then ptrace access is allowed.
    127 * If we have the ptrace capability to the target user_ns, then ptrace
    128 * access is allowed.
    129 * Else denied.
    130 *
    131 * Determine whether a process may access another, returning 0 if permission
    132 * granted, -ve if denied.
    133 */
    134int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
    135{
    136	int ret = 0;
    137	const struct cred *cred, *child_cred;
    138	const kernel_cap_t *caller_caps;
    139
    140	rcu_read_lock();
    141	cred = current_cred();
    142	child_cred = __task_cred(child);
    143	if (mode & PTRACE_MODE_FSCREDS)
    144		caller_caps = &cred->cap_effective;
    145	else
    146		caller_caps = &cred->cap_permitted;
    147	if (cred->user_ns == child_cred->user_ns &&
    148	    cap_issubset(child_cred->cap_permitted, *caller_caps))
    149		goto out;
    150	if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
    151		goto out;
    152	ret = -EPERM;
    153out:
    154	rcu_read_unlock();
    155	return ret;
    156}
    157
    158/**
    159 * cap_ptrace_traceme - Determine whether another process may trace the current
    160 * @parent: The task proposed to be the tracer
    161 *
    162 * If parent is in the same or an ancestor user_ns and has all current's
    163 * capabilities, then ptrace access is allowed.
    164 * If parent has the ptrace capability to current's user_ns, then ptrace
    165 * access is allowed.
    166 * Else denied.
    167 *
    168 * Determine whether the nominated task is permitted to trace the current
    169 * process, returning 0 if permission is granted, -ve if denied.
    170 */
    171int cap_ptrace_traceme(struct task_struct *parent)
    172{
    173	int ret = 0;
    174	const struct cred *cred, *child_cred;
    175
    176	rcu_read_lock();
    177	cred = __task_cred(parent);
    178	child_cred = current_cred();
    179	if (cred->user_ns == child_cred->user_ns &&
    180	    cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
    181		goto out;
    182	if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
    183		goto out;
    184	ret = -EPERM;
    185out:
    186	rcu_read_unlock();
    187	return ret;
    188}
    189
    190/**
    191 * cap_capget - Retrieve a task's capability sets
    192 * @target: The task from which to retrieve the capability sets
    193 * @effective: The place to record the effective set
    194 * @inheritable: The place to record the inheritable set
    195 * @permitted: The place to record the permitted set
    196 *
    197 * This function retrieves the capabilities of the nominated task and returns
    198 * them to the caller.
    199 */
    200int cap_capget(struct task_struct *target, kernel_cap_t *effective,
    201	       kernel_cap_t *inheritable, kernel_cap_t *permitted)
    202{
    203	const struct cred *cred;
    204
    205	/* Derived from kernel/capability.c:sys_capget. */
    206	rcu_read_lock();
    207	cred = __task_cred(target);
    208	*effective   = cred->cap_effective;
    209	*inheritable = cred->cap_inheritable;
    210	*permitted   = cred->cap_permitted;
    211	rcu_read_unlock();
    212	return 0;
    213}
    214
    215/*
    216 * Determine whether the inheritable capabilities are limited to the old
    217 * permitted set.  Returns 1 if they are limited, 0 if they are not.
    218 */
    219static inline int cap_inh_is_capped(void)
    220{
    221	/* they are so limited unless the current task has the CAP_SETPCAP
    222	 * capability
    223	 */
    224	if (cap_capable(current_cred(), current_cred()->user_ns,
    225			CAP_SETPCAP, CAP_OPT_NONE) == 0)
    226		return 0;
    227	return 1;
    228}
    229
    230/**
    231 * cap_capset - Validate and apply proposed changes to current's capabilities
    232 * @new: The proposed new credentials; alterations should be made here
    233 * @old: The current task's current credentials
    234 * @effective: A pointer to the proposed new effective capabilities set
    235 * @inheritable: A pointer to the proposed new inheritable capabilities set
    236 * @permitted: A pointer to the proposed new permitted capabilities set
    237 *
    238 * This function validates and applies a proposed mass change to the current
    239 * process's capability sets.  The changes are made to the proposed new
    240 * credentials, and assuming no error, will be committed by the caller of LSM.
    241 */
    242int cap_capset(struct cred *new,
    243	       const struct cred *old,
    244	       const kernel_cap_t *effective,
    245	       const kernel_cap_t *inheritable,
    246	       const kernel_cap_t *permitted)
    247{
    248	if (cap_inh_is_capped() &&
    249	    !cap_issubset(*inheritable,
    250			  cap_combine(old->cap_inheritable,
    251				      old->cap_permitted)))
    252		/* incapable of using this inheritable set */
    253		return -EPERM;
    254
    255	if (!cap_issubset(*inheritable,
    256			  cap_combine(old->cap_inheritable,
    257				      old->cap_bset)))
    258		/* no new pI capabilities outside bounding set */
    259		return -EPERM;
    260
    261	/* verify restrictions on target's new Permitted set */
    262	if (!cap_issubset(*permitted, old->cap_permitted))
    263		return -EPERM;
    264
    265	/* verify the _new_Effective_ is a subset of the _new_Permitted_ */
    266	if (!cap_issubset(*effective, *permitted))
    267		return -EPERM;
    268
    269	new->cap_effective   = *effective;
    270	new->cap_inheritable = *inheritable;
    271	new->cap_permitted   = *permitted;
    272
    273	/*
    274	 * Mask off ambient bits that are no longer both permitted and
    275	 * inheritable.
    276	 */
    277	new->cap_ambient = cap_intersect(new->cap_ambient,
    278					 cap_intersect(*permitted,
    279						       *inheritable));
    280	if (WARN_ON(!cap_ambient_invariant_ok(new)))
    281		return -EINVAL;
    282	return 0;
    283}
    284
    285/**
    286 * cap_inode_need_killpriv - Determine if inode change affects privileges
    287 * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
    288 *
    289 * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
    290 * affects the security markings on that inode, and if it is, should
    291 * inode_killpriv() be invoked or the change rejected.
    292 *
    293 * Return: 1 if security.capability has a value, meaning inode_killpriv()
    294 * is required, 0 otherwise, meaning inode_killpriv() is not required.
    295 */
    296int cap_inode_need_killpriv(struct dentry *dentry)
    297{
    298	struct inode *inode = d_backing_inode(dentry);
    299	int error;
    300
    301	error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0);
    302	return error > 0;
    303}
    304
    305/**
    306 * cap_inode_killpriv - Erase the security markings on an inode
    307 *
    308 * @mnt_userns:	user namespace of the mount the inode was found from
    309 * @dentry:	The inode/dentry to alter
    310 *
    311 * Erase the privilege-enhancing security markings on an inode.
    312 *
    313 * If the inode has been found through an idmapped mount the user namespace of
    314 * the vfsmount must be passed through @mnt_userns. This function will then
    315 * take care to map the inode according to @mnt_userns before checking
    316 * permissions. On non-idmapped mounts or if permission checking is to be
    317 * performed on the raw inode simply passs init_user_ns.
    318 *
    319 * Return: 0 if successful, -ve on error.
    320 */
    321int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry)
    322{
    323	int error;
    324
    325	error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS);
    326	if (error == -EOPNOTSUPP)
    327		error = 0;
    328	return error;
    329}
    330
    331static bool rootid_owns_currentns(kuid_t kroot)
    332{
    333	struct user_namespace *ns;
    334
    335	if (!uid_valid(kroot))
    336		return false;
    337
    338	for (ns = current_user_ns(); ; ns = ns->parent) {
    339		if (from_kuid(ns, kroot) == 0)
    340			return true;
    341		if (ns == &init_user_ns)
    342			break;
    343	}
    344
    345	return false;
    346}
    347
    348static __u32 sansflags(__u32 m)
    349{
    350	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
    351}
    352
    353static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
    354{
    355	if (size != XATTR_CAPS_SZ_2)
    356		return false;
    357	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
    358}
    359
    360static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
    361{
    362	if (size != XATTR_CAPS_SZ_3)
    363		return false;
    364	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
    365}
    366
    367/*
    368 * getsecurity: We are called for security.* before any attempt to read the
    369 * xattr from the inode itself.
    370 *
    371 * This gives us a chance to read the on-disk value and convert it.  If we
    372 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
    373 *
    374 * Note we are not called by vfs_getxattr_alloc(), but that is only called
    375 * by the integrity subsystem, which really wants the unconverted values -
    376 * so that's good.
    377 */
    378int cap_inode_getsecurity(struct user_namespace *mnt_userns,
    379			  struct inode *inode, const char *name, void **buffer,
    380			  bool alloc)
    381{
    382	int size, ret;
    383	kuid_t kroot;
    384	u32 nsmagic, magic;
    385	uid_t root, mappedroot;
    386	char *tmpbuf = NULL;
    387	struct vfs_cap_data *cap;
    388	struct vfs_ns_cap_data *nscap = NULL;
    389	struct dentry *dentry;
    390	struct user_namespace *fs_ns;
    391
    392	if (strcmp(name, "capability") != 0)
    393		return -EOPNOTSUPP;
    394
    395	dentry = d_find_any_alias(inode);
    396	if (!dentry)
    397		return -EINVAL;
    398
    399	size = sizeof(struct vfs_ns_cap_data);
    400	ret = (int)vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS,
    401				      &tmpbuf, size, GFP_NOFS);
    402	dput(dentry);
    403
    404	if (ret < 0 || !tmpbuf)
    405		return ret;
    406
    407	fs_ns = inode->i_sb->s_user_ns;
    408	cap = (struct vfs_cap_data *) tmpbuf;
    409	if (is_v2header((size_t) ret, cap)) {
    410		root = 0;
    411	} else if (is_v3header((size_t) ret, cap)) {
    412		nscap = (struct vfs_ns_cap_data *) tmpbuf;
    413		root = le32_to_cpu(nscap->rootid);
    414	} else {
    415		size = -EINVAL;
    416		goto out_free;
    417	}
    418
    419	kroot = make_kuid(fs_ns, root);
    420
    421	/* If this is an idmapped mount shift the kuid. */
    422	kroot = mapped_kuid_fs(mnt_userns, fs_ns, kroot);
    423
    424	/* If the root kuid maps to a valid uid in current ns, then return
    425	 * this as a nscap. */
    426	mappedroot = from_kuid(current_user_ns(), kroot);
    427	if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
    428		size = sizeof(struct vfs_ns_cap_data);
    429		if (alloc) {
    430			if (!nscap) {
    431				/* v2 -> v3 conversion */
    432				nscap = kzalloc(size, GFP_ATOMIC);
    433				if (!nscap) {
    434					size = -ENOMEM;
    435					goto out_free;
    436				}
    437				nsmagic = VFS_CAP_REVISION_3;
    438				magic = le32_to_cpu(cap->magic_etc);
    439				if (magic & VFS_CAP_FLAGS_EFFECTIVE)
    440					nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
    441				memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
    442				nscap->magic_etc = cpu_to_le32(nsmagic);
    443			} else {
    444				/* use allocated v3 buffer */
    445				tmpbuf = NULL;
    446			}
    447			nscap->rootid = cpu_to_le32(mappedroot);
    448			*buffer = nscap;
    449		}
    450		goto out_free;
    451	}
    452
    453	if (!rootid_owns_currentns(kroot)) {
    454		size = -EOVERFLOW;
    455		goto out_free;
    456	}
    457
    458	/* This comes from a parent namespace.  Return as a v2 capability */
    459	size = sizeof(struct vfs_cap_data);
    460	if (alloc) {
    461		if (nscap) {
    462			/* v3 -> v2 conversion */
    463			cap = kzalloc(size, GFP_ATOMIC);
    464			if (!cap) {
    465				size = -ENOMEM;
    466				goto out_free;
    467			}
    468			magic = VFS_CAP_REVISION_2;
    469			nsmagic = le32_to_cpu(nscap->magic_etc);
    470			if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
    471				magic |= VFS_CAP_FLAGS_EFFECTIVE;
    472			memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
    473			cap->magic_etc = cpu_to_le32(magic);
    474		} else {
    475			/* use unconverted v2 */
    476			tmpbuf = NULL;
    477		}
    478		*buffer = cap;
    479	}
    480out_free:
    481	kfree(tmpbuf);
    482	return size;
    483}
    484
    485/**
    486 * rootid_from_xattr - translate root uid of vfs caps
    487 *
    488 * @value:	vfs caps value which may be modified by this function
    489 * @size:	size of @ivalue
    490 * @task_ns:	user namespace of the caller
    491 * @mnt_userns:	user namespace of the mount the inode was found from
    492 * @fs_userns:	user namespace of the filesystem
    493 *
    494 * If the inode has been found through an idmapped mount the user namespace of
    495 * the vfsmount must be passed through @mnt_userns. This function will then
    496 * take care to map the inode according to @mnt_userns before checking
    497 * permissions. On non-idmapped mounts or if permission checking is to be
    498 * performed on the raw inode simply passs init_user_ns.
    499 */
    500static kuid_t rootid_from_xattr(const void *value, size_t size,
    501				struct user_namespace *task_ns,
    502				struct user_namespace *mnt_userns,
    503				struct user_namespace *fs_userns)
    504{
    505	const struct vfs_ns_cap_data *nscap = value;
    506	kuid_t rootkid;
    507	uid_t rootid = 0;
    508
    509	if (size == XATTR_CAPS_SZ_3)
    510		rootid = le32_to_cpu(nscap->rootid);
    511
    512	rootkid = make_kuid(task_ns, rootid);
    513	return mapped_kuid_user(mnt_userns, fs_userns, rootkid);
    514}
    515
    516static bool validheader(size_t size, const struct vfs_cap_data *cap)
    517{
    518	return is_v2header(size, cap) || is_v3header(size, cap);
    519}
    520
/**
 * cap_convert_nscap - check vfs caps
 *
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dentry:	used to retrieve inode to check permissions on
 * @ivalue:	vfs caps value which may be modified by this function
 * @size:	size of @ivalue
 *
 * User requested a write of security.capability.  If needed, update the
 * xattr to change from v2 to v3, or to fixup the v3 rootid.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then
 * take care to map the inode according to @mnt_userns before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass init_user_ns.
 *
 * When a conversion takes place, *@ivalue is redirected to a newly
 * allocated v3 buffer; the buffer it pointed to before is not freed here.
 * NOTE(review): the caller presumably owns and frees the new buffer -
 * confirm against the caller.
 *
 * Return: On success, return the new size; on error, return < 0
 * (-EINVAL for a missing/invalid value or an unmappable root id, -EPERM
 * without CAP_SETFCAP over the inode, -ENOMEM on allocation failure).
 */
int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
		      const void **ivalue, size_t size)
{
	struct vfs_ns_cap_data *nscap;
	uid_t nsrootid;
	const struct vfs_cap_data *cap = *ivalue;
	__u32 magic, nsmagic;
	struct inode *inode = d_backing_inode(dentry);
	struct user_namespace *task_ns = current_user_ns(),
		*fs_ns = inode->i_sb->s_user_ns;
	kuid_t rootid;
	size_t newsize;

	/* Reject a missing value or anything that is not a v2/v3 header. */
	if (!*ivalue)
		return -EINVAL;
	if (!validheader(size, cap))
		return -EINVAL;
	if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
		return -EPERM;
	/*
	 * A v2 value on a mount whose userns is the filesystem's userns is
	 * written unchanged when the caller has CAP_SETFCAP in that userns.
	 */
	if (size == XATTR_CAPS_SZ_2 && (mnt_userns == fs_ns))
		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
			/* user is privileged, just write the v2 */
			return size;

	/* Work out which kuid the capability's root id refers to. */
	rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns, fs_ns);
	if (!uid_valid(rootid))
		return -EINVAL;

	/* The root id is stored on disk relative to the filesystem's userns. */
	nsrootid = from_kuid(fs_ns, rootid);
	if (nsrootid == -1)
		return -EINVAL;

	/* Build a v3 value: every field of the new buffer is filled in below. */
	newsize = sizeof(struct vfs_ns_cap_data);
	nscap = kmalloc(newsize, GFP_ATOMIC);
	if (!nscap)
		return -ENOMEM;
	nscap->rootid = cpu_to_le32(nsrootid);
	nsmagic = VFS_CAP_REVISION_3;
	magic = le32_to_cpu(cap->magic_etc);
	/* Carry the effective flag over from the supplied value. */
	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
		nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
	nscap->magic_etc = cpu_to_le32(nsmagic);
	memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);

	*ivalue = nscap;
	return newsize;
}
    587
    588/*
    589 * Calculate the new process capability sets from the capability sets attached
    590 * to a file.
    591 */
    592static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
    593					  struct linux_binprm *bprm,
    594					  bool *effective,
    595					  bool *has_fcap)
    596{
    597	struct cred *new = bprm->cred;
    598	unsigned i;
    599	int ret = 0;
    600
    601	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
    602		*effective = true;
    603
    604	if (caps->magic_etc & VFS_CAP_REVISION_MASK)
    605		*has_fcap = true;
    606
    607	CAP_FOR_EACH_U32(i) {
    608		__u32 permitted = caps->permitted.cap[i];
    609		__u32 inheritable = caps->inheritable.cap[i];
    610
    611		/*
    612		 * pP' = (X & fP) | (pI & fI)
    613		 * The addition of pA' is handled later.
    614		 */
    615		new->cap_permitted.cap[i] =
    616			(new->cap_bset.cap[i] & permitted) |
    617			(new->cap_inheritable.cap[i] & inheritable);
    618
    619		if (permitted & ~new->cap_permitted.cap[i])
    620			/* insufficient to execute correctly */
    621			ret = -EPERM;
    622	}
    623
    624	/*
    625	 * For legacy apps, with no internal support for recognizing they
    626	 * do not have enough capabilities, we return an error if they are
    627	 * missing some "forced" (aka file-permitted) capabilities.
    628	 */
    629	return *effective ? ret : 0;
    630}
    631
/**
 * get_vfs_caps_from_disk - retrieve vfs caps from disk
 *
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dentry:	dentry from which @inode is retrieved
 * @cpu_caps:	vfs capabilities
 *
 * Extract the on-exec-apply capability sets for an executable file.
 * @cpu_caps is zeroed first and is filled with the CPU-endian magic,
 * permitted and inheritable sets and the (mapped) root id.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then
 * take care to map the inode according to @mnt_userns before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass init_user_ns.
 *
 * Return: 0 on success; -ENODATA if there is no inode, no usable xattr, or
 * the root id has no standing in the current user namespace; -EINVAL if the
 * xattr is malformed; otherwise the error from __vfs_getxattr().
 */
int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
			   const struct dentry *dentry,
			   struct cpu_vfs_cap_data *cpu_caps)
{
	struct inode *inode = d_backing_inode(dentry);
	__u32 magic_etc;
	unsigned tocopy, i;
	int size;
	/* One on-stack buffer, viewed through both the v3 and the v1/v2 layout. */
	struct vfs_ns_cap_data data, *nscaps = &data;
	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
	kuid_t rootkuid;
	struct user_namespace *fs_ns;

	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

	if (!inode)
		return -ENODATA;

	fs_ns = inode->i_sb->s_user_ns;
	size = __vfs_getxattr((struct dentry *)dentry, inode,
			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
	if (size == -ENODATA || size == -EOPNOTSUPP)
		/* no data, that's ok */
		return -ENODATA;

	if (size < 0)
		return size;

	/* Need at least the magic word to tell the revision. */
	if (size < sizeof(magic_etc))
		return -EINVAL;

	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);

	/* Default root id: uid 0 of the filesystem's userns (v3 overrides it). */
	rootkuid = make_kuid(fs_ns, 0);
	/* Each revision has exactly one acceptable xattr size. */
	switch (magic_etc & VFS_CAP_REVISION_MASK) {
	case VFS_CAP_REVISION_1:
		if (size != XATTR_CAPS_SZ_1)
			return -EINVAL;
		tocopy = VFS_CAP_U32_1;
		break;
	case VFS_CAP_REVISION_2:
		if (size != XATTR_CAPS_SZ_2)
			return -EINVAL;
		tocopy = VFS_CAP_U32_2;
		break;
	case VFS_CAP_REVISION_3:
		if (size != XATTR_CAPS_SZ_3)
			return -EINVAL;
		tocopy = VFS_CAP_U32_3;
		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
		break;

	default:
		return -EINVAL;
	}
	/* Limit the caps to the mounter of the filesystem
	 * or the more limited uid specified in the xattr.
	 */
	rootkuid = mapped_kuid_fs(mnt_userns, fs_ns, rootkuid);
	if (!rootid_owns_currentns(rootkuid))
		return -ENODATA;

	/* Copy only as many words as this revision stores; the rest stay zero. */
	CAP_FOR_EACH_U32(i) {
		if (i >= tocopy)
			break;
		cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
	}

	/* Mask the last word with CAP_LAST_U32_VALID_MASK. */
	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;

	cpu_caps->rootid = rootkuid;

	return 0;
}
    723
    724/*
    725 * Attempt to get the on-exec apply capability sets for an executable file from
    726 * its xattrs and, if present, apply them to the proposed credentials being
    727 * constructed by execve().
    728 */
    729static int get_file_caps(struct linux_binprm *bprm, struct file *file,
    730			 bool *effective, bool *has_fcap)
    731{
    732	int rc = 0;
    733	struct cpu_vfs_cap_data vcaps;
    734
    735	cap_clear(bprm->cred->cap_permitted);
    736
    737	if (!file_caps_enabled)
    738		return 0;
    739
    740	if (!mnt_may_suid(file->f_path.mnt))
    741		return 0;
    742
    743	/*
    744	 * This check is redundant with mnt_may_suid() but is kept to make
    745	 * explicit that capability bits are limited to s_user_ns and its
    746	 * descendants.
    747	 */
    748	if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
    749		return 0;
    750
    751	rc = get_vfs_caps_from_disk(file_mnt_user_ns(file),
    752				    file->f_path.dentry, &vcaps);
    753	if (rc < 0) {
    754		if (rc == -EINVAL)
    755			printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
    756					bprm->filename);
    757		else if (rc == -ENODATA)
    758			rc = 0;
    759		goto out;
    760	}
    761
    762	rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);
    763
    764out:
    765	if (rc)
    766		cap_clear(bprm->cred->cap_permitted);
    767
    768	return rc;
    769}
    770
    771static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }
    772
    773static inline bool __is_real(kuid_t uid, struct cred *cred)
    774{ return uid_eq(cred->uid, uid); }
    775
    776static inline bool __is_eff(kuid_t uid, struct cred *cred)
    777{ return uid_eq(cred->euid, uid); }
    778
    779static inline bool __is_suid(kuid_t uid, struct cred *cred)
    780{ return !__is_real(uid, cred) && __is_eff(uid, cred); }
    781
    782/*
    783 * handle_privileged_root - Handle case of privileged root
    784 * @bprm: The execution parameters, including the proposed creds
    785 * @has_fcap: Are any file capabilities set?
    786 * @effective: Do we have effective root privilege?
    787 * @root_uid: This namespace' root UID WRT initial USER namespace
    788 *
    789 * Handle the case where root is privileged and hasn't been neutered by
    790 * SECURE_NOROOT.  If file capabilities are set, they won't be combined with
    791 * set UID root and nothing is changed.  If we are root, cap_permitted is
    792 * updated.  If we have become set UID root, the effective bit is set.
    793 */
    794static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
    795				   bool *effective, kuid_t root_uid)
    796{
    797	const struct cred *old = current_cred();
    798	struct cred *new = bprm->cred;
    799
    800	if (!root_privileged())
    801		return;
    802	/*
    803	 * If the legacy file capability is set, then don't set privs
    804	 * for a setuid root binary run by a non-root user.  Do set it
    805	 * for a root user just to cause least surprise to an admin.
    806	 */
    807	if (has_fcap && __is_suid(root_uid, new)) {
    808		warn_setuid_and_fcaps_mixed(bprm->filename);
    809		return;
    810	}
    811	/*
    812	 * To support inheritance of root-permissions and suid-root
    813	 * executables under compatibility mode, we override the
    814	 * capability sets for the file.
    815	 */
    816	if (__is_eff(root_uid, new) || __is_real(root_uid, new)) {
    817		/* pP' = (cap_bset & ~0) | (pI & ~0) */
    818		new->cap_permitted = cap_combine(old->cap_bset,
    819						 old->cap_inheritable);
    820	}
    821	/*
    822	 * If only the real uid is 0, we do not set the effective bit.
    823	 */
    824	if (__is_eff(root_uid, new))
    825		*effective = true;
    826}
    827
    828#define __cap_gained(field, target, source) \
    829	!cap_issubset(target->cap_##field, source->cap_##field)
    830#define __cap_grew(target, source, cred) \
    831	!cap_issubset(cred->cap_##target, cred->cap_##source)
    832#define __cap_full(field, cred) \
    833	cap_issubset(CAP_FULL_SET, cred->cap_##field)
    834
    835static inline bool __is_setuid(struct cred *new, const struct cred *old)
    836{ return !uid_eq(new->euid, old->uid); }
    837
    838static inline bool __is_setgid(struct cred *new, const struct cred *old)
    839{ return !gid_eq(new->egid, old->gid); }
    840
/*
 * Decide whether an exec should be audited because capabilities were
 * raised in a way that is not simply "root executing a program".
 *
 * 1) Audit candidate if the new cap_effective contains a capability that
 *    the new cap_ambient does not.
 *
 * We do not bother to audit if 3 things are true:
 *   1) cap_effective has all caps
 *   2) we became root *OR* were already root
 *   3) root is supposed to have all caps (SECURE_NOROOT is not set)
 * Since this is just a normal root execing a process.
 *
 * Number 1 above might fail if you don't have a full bset, but I think
 * that is interesting information to audit.
 *
 * A number of other conditions require logging:
 * 2) something prevented setuid root getting all caps
 * 3) non-setuid root gets fcaps
 * 4) non-setuid root gets ambient
 *
 * @new: the proposed credentials
 * @old: the credentials before the exec
 * @root: the kuid to treat as root
 * @has_fcap: whether file capabilities were found on the executable
 *
 * Returns true if any of the conditions above holds.
 */
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
				     kuid_t root, bool has_fcap)
{
	bool ret = false;

	/*
	 * The three ||-joined groups below are, in order: condition 1 with
	 * its "normal root exec" exemption, condition 2, and conditions
	 * 3 and 4 (which share the !__is_setuid() guard).
	 */
	if ((__cap_grew(effective, ambient, new) &&
	     !(__cap_full(effective, new) &&
	       (__is_eff(root, new) || __is_real(root, new)) &&
	       root_privileged())) ||
	    (root_privileged() &&
	     __is_suid(root, new) &&
	     !__cap_full(effective, new)) ||
	    (!__is_setuid(new, old) &&
	     ((has_fcap &&
	       __cap_gained(permitted, new, old)) ||
	      __cap_gained(ambient, new, old))))

		ret = true;

	return ret;
}
    879
    880/**
    881 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
    882 * @bprm: The execution parameters, including the proposed creds
    883 * @file: The file to pull the credentials from
    884 *
    885 * Set up the proposed credentials for a new execution context being
    886 * constructed by execve().  The proposed creds in @bprm->cred is altered,
    887 * which won't take effect immediately.
    888 *
    889 * Return: 0 if successful, -ve on error.
    890 */
    891int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
    892{
    893	/* Process setpcap binaries and capabilities for uid 0 */
    894	const struct cred *old = current_cred();
    895	struct cred *new = bprm->cred;
    896	bool effective = false, has_fcap = false, is_setid;
    897	int ret;
    898	kuid_t root_uid;
    899
    900	if (WARN_ON(!cap_ambient_invariant_ok(old)))
    901		return -EPERM;
    902
    903	ret = get_file_caps(bprm, file, &effective, &has_fcap);
    904	if (ret < 0)
    905		return ret;
    906
    907	root_uid = make_kuid(new->user_ns, 0);
    908
    909	handle_privileged_root(bprm, has_fcap, &effective, root_uid);
    910
    911	/* if we have fs caps, clear dangerous personality flags */
    912	if (__cap_gained(permitted, new, old))
    913		bprm->per_clear |= PER_CLEAR_ON_SETID;
    914
    915	/* Don't let someone trace a set[ug]id/setpcap binary with the revised
    916	 * credentials unless they have the appropriate permit.
    917	 *
    918	 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
    919	 */
    920	is_setid = __is_setuid(new, old) || __is_setgid(new, old);
    921
    922	if ((is_setid || __cap_gained(permitted, new, old)) &&
    923	    ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
    924	     !ptracer_capable(current, new->user_ns))) {
    925		/* downgrade; they get no more than they had, and maybe less */
    926		if (!ns_capable(new->user_ns, CAP_SETUID) ||
    927		    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
    928			new->euid = new->uid;
    929			new->egid = new->gid;
    930		}
    931		new->cap_permitted = cap_intersect(new->cap_permitted,
    932						   old->cap_permitted);
    933	}
    934
    935	new->suid = new->fsuid = new->euid;
    936	new->sgid = new->fsgid = new->egid;
    937
    938	/* File caps or setid cancels ambient. */
    939	if (has_fcap || is_setid)
    940		cap_clear(new->cap_ambient);
    941
    942	/*
    943	 * Now that we've computed pA', update pP' to give:
    944	 *   pP' = (X & fP) | (pI & fI) | pA'
    945	 */
    946	new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
    947
    948	/*
    949	 * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
    950	 * this is the same as pE' = (fE ? pP' : 0) | pA'.
    951	 */
    952	if (effective)
    953		new->cap_effective = new->cap_permitted;
    954	else
    955		new->cap_effective = new->cap_ambient;
    956
    957	if (WARN_ON(!cap_ambient_invariant_ok(new)))
    958		return -EPERM;
    959
    960	if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
    961		ret = audit_log_bprm_fcaps(bprm, new, old);
    962		if (ret < 0)
    963			return ret;
    964	}
    965
    966	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
    967
    968	if (WARN_ON(!cap_ambient_invariant_ok(new)))
    969		return -EPERM;
    970
    971	/* Check for privilege-elevated exec. */
    972	if (is_setid ||
    973	    (!__is_real(root_uid, new) &&
    974	     (effective ||
    975	      __cap_grew(permitted, ambient, new))))
    976		bprm->secureexec = 1;
    977
    978	return 0;
    979}
    980
    981/**
    982 * cap_inode_setxattr - Determine whether an xattr may be altered
    983 * @dentry: The inode/dentry being altered
    984 * @name: The name of the xattr to be changed
    985 * @value: The value that the xattr will be changed to
    986 * @size: The size of value
    987 * @flags: The replacement flag
    988 *
    989 * Determine whether an xattr may be altered or set on an inode, returning 0 if
    990 * permission is granted, -ve if denied.
    991 *
    992 * This is used to make sure security xattrs don't get updated or set by those
    993 * who aren't privileged to do so.
    994 */
    995int cap_inode_setxattr(struct dentry *dentry, const char *name,
    996		       const void *value, size_t size, int flags)
    997{
    998	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
    999
   1000	/* Ignore non-security xattrs */
   1001	if (strncmp(name, XATTR_SECURITY_PREFIX,
   1002			XATTR_SECURITY_PREFIX_LEN) != 0)
   1003		return 0;
   1004
   1005	/*
   1006	 * For XATTR_NAME_CAPS the check will be done in
   1007	 * cap_convert_nscap(), called by setxattr()
   1008	 */
   1009	if (strcmp(name, XATTR_NAME_CAPS) == 0)
   1010		return 0;
   1011
   1012	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
   1013		return -EPERM;
   1014	return 0;
   1015}
   1016
   1017/**
   1018 * cap_inode_removexattr - Determine whether an xattr may be removed
   1019 *
   1020 * @mnt_userns:	User namespace of the mount the inode was found from
   1021 * @dentry:	The inode/dentry being altered
   1022 * @name:	The name of the xattr to be changed
   1023 *
   1024 * Determine whether an xattr may be removed from an inode, returning 0 if
   1025 * permission is granted, -ve if denied.
   1026 *
   1027 * If the inode has been found through an idmapped mount the user namespace of
   1028 * the vfsmount must be passed through @mnt_userns. This function will then
   1029 * take care to map the inode according to @mnt_userns before checking
   1030 * permissions. On non-idmapped mounts or if permission checking is to be
   1031 * performed on the raw inode simply passs init_user_ns.
   1032 *
   1033 * This is used to make sure security xattrs don't get removed by those who
   1034 * aren't privileged to remove them.
   1035 */
   1036int cap_inode_removexattr(struct user_namespace *mnt_userns,
   1037			  struct dentry *dentry, const char *name)
   1038{
   1039	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
   1040
   1041	/* Ignore non-security xattrs */
   1042	if (strncmp(name, XATTR_SECURITY_PREFIX,
   1043			XATTR_SECURITY_PREFIX_LEN) != 0)
   1044		return 0;
   1045
   1046	if (strcmp(name, XATTR_NAME_CAPS) == 0) {
   1047		/* security.capability gets namespaced */
   1048		struct inode *inode = d_backing_inode(dentry);
   1049		if (!inode)
   1050			return -EINVAL;
   1051		if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
   1052			return -EPERM;
   1053		return 0;
   1054	}
   1055
   1056	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
   1057		return -EPERM;
   1058	return 0;
   1059}
   1060
   1061/*
   1062 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
   1063 * a process after a call to setuid, setreuid, or setresuid.
   1064 *
   1065 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
   1066 *  {r,e,s}uid != 0, the permitted and effective capabilities are
   1067 *  cleared.
   1068 *
   1069 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
   1070 *  capabilities of the process are cleared.
   1071 *
   1072 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
   1073 *  capabilities are set to the permitted capabilities.
   1074 *
   1075 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
   1076 *  never happen.
   1077 *
   1078 *  -astor
   1079 *
   1080 * cevans - New behaviour, Oct '99
   1081 * A process may, via prctl(), elect to keep its capabilities when it
   1082 * calls setuid() and switches away from uid==0. Both permitted and
   1083 * effective sets will be retained.
   1084 * Without this change, it was impossible for a daemon to drop only some
   1085 * of its privilege. The call to setuid(!=0) would drop all privileges!
   1086 * Keeping uid 0 is not an option because uid 0 owns too many vital
   1087 * files..
   1088 * Thanks to Olaf Kirch and Peter Benie for spotting this.
   1089 */
   1090static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
   1091{
   1092	kuid_t root_uid = make_kuid(old->user_ns, 0);
   1093
   1094	if ((uid_eq(old->uid, root_uid) ||
   1095	     uid_eq(old->euid, root_uid) ||
   1096	     uid_eq(old->suid, root_uid)) &&
   1097	    (!uid_eq(new->uid, root_uid) &&
   1098	     !uid_eq(new->euid, root_uid) &&
   1099	     !uid_eq(new->suid, root_uid))) {
   1100		if (!issecure(SECURE_KEEP_CAPS)) {
   1101			cap_clear(new->cap_permitted);
   1102			cap_clear(new->cap_effective);
   1103		}
   1104
   1105		/*
   1106		 * Pre-ambient programs expect setresuid to nonroot followed
   1107		 * by exec to drop capabilities.  We should make sure that
   1108		 * this remains the case.
   1109		 */
   1110		cap_clear(new->cap_ambient);
   1111	}
   1112	if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
   1113		cap_clear(new->cap_effective);
   1114	if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
   1115		new->cap_effective = new->cap_permitted;
   1116}
   1117
   1118/**
   1119 * cap_task_fix_setuid - Fix up the results of setuid() call
   1120 * @new: The proposed credentials
   1121 * @old: The current task's current credentials
   1122 * @flags: Indications of what has changed
   1123 *
   1124 * Fix up the results of setuid() call before the credential changes are
   1125 * actually applied.
   1126 *
   1127 * Return: 0 to grant the changes, -ve to deny them.
   1128 */
   1129int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
   1130{
   1131	switch (flags) {
   1132	case LSM_SETID_RE:
   1133	case LSM_SETID_ID:
   1134	case LSM_SETID_RES:
   1135		/* juggle the capabilities to follow [RES]UID changes unless
   1136		 * otherwise suppressed */
   1137		if (!issecure(SECURE_NO_SETUID_FIXUP))
   1138			cap_emulate_setxuid(new, old);
   1139		break;
   1140
   1141	case LSM_SETID_FS:
   1142		/* juggle the capabilties to follow FSUID changes, unless
   1143		 * otherwise suppressed
   1144		 *
   1145		 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
   1146		 *          if not, we might be a bit too harsh here.
   1147		 */
   1148		if (!issecure(SECURE_NO_SETUID_FIXUP)) {
   1149			kuid_t root_uid = make_kuid(old->user_ns, 0);
   1150			if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
   1151				new->cap_effective =
   1152					cap_drop_fs_set(new->cap_effective);
   1153
   1154			if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
   1155				new->cap_effective =
   1156					cap_raise_fs_set(new->cap_effective,
   1157							 new->cap_permitted);
   1158		}
   1159		break;
   1160
   1161	default:
   1162		return -EINVAL;
   1163	}
   1164
   1165	return 0;
   1166}
   1167
   1168/*
   1169 * Rationale: code calling task_setscheduler, task_setioprio, and
   1170 * task_setnice, assumes that
   1171 *   . if capable(cap_sys_nice), then those actions should be allowed
   1172 *   . if not capable(cap_sys_nice), but acting on your own processes,
   1173 *   	then those actions should be allowed
   1174 * This is insufficient now since you can call code without suid, but
   1175 * yet with increased caps.
   1176 * So we check for increased caps on the target process.
   1177 */
   1178static int cap_safe_nice(struct task_struct *p)
   1179{
   1180	int is_subset, ret = 0;
   1181
   1182	rcu_read_lock();
   1183	is_subset = cap_issubset(__task_cred(p)->cap_permitted,
   1184				 current_cred()->cap_permitted);
   1185	if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
   1186		ret = -EPERM;
   1187	rcu_read_unlock();
   1188
   1189	return ret;
   1190}
   1191
   1192/**
   1193 * cap_task_setscheduler - Detemine if scheduler policy change is permitted
   1194 * @p: The task to affect
   1195 *
   1196 * Detemine if the requested scheduler policy change is permitted for the
   1197 * specified task.
   1198 *
   1199 * Return: 0 if permission is granted, -ve if denied.
   1200 */
   1201int cap_task_setscheduler(struct task_struct *p)
   1202{
   1203	return cap_safe_nice(p);
   1204}
   1205
   1206/**
   1207 * cap_task_setioprio - Detemine if I/O priority change is permitted
   1208 * @p: The task to affect
   1209 * @ioprio: The I/O priority to set
   1210 *
   1211 * Detemine if the requested I/O priority change is permitted for the specified
   1212 * task.
   1213 *
   1214 * Return: 0 if permission is granted, -ve if denied.
   1215 */
   1216int cap_task_setioprio(struct task_struct *p, int ioprio)
   1217{
   1218	return cap_safe_nice(p);
   1219}
   1220
   1221/**
   1222 * cap_task_setnice - Detemine if task priority change is permitted
   1223 * @p: The task to affect
   1224 * @nice: The nice value to set
   1225 *
   1226 * Detemine if the requested task priority change is permitted for the
   1227 * specified task.
   1228 *
   1229 * Return: 0 if permission is granted, -ve if denied.
   1230 */
   1231int cap_task_setnice(struct task_struct *p, int nice)
   1232{
   1233	return cap_safe_nice(p);
   1234}
   1235
   1236/*
   1237 * Implement PR_CAPBSET_DROP.  Attempt to remove the specified capability from
   1238 * the current task's bounding set.  Returns 0 on success, -ve on error.
   1239 */
   1240static int cap_prctl_drop(unsigned long cap)
   1241{
   1242	struct cred *new;
   1243
   1244	if (!ns_capable(current_user_ns(), CAP_SETPCAP))
   1245		return -EPERM;
   1246	if (!cap_valid(cap))
   1247		return -EINVAL;
   1248
   1249	new = prepare_creds();
   1250	if (!new)
   1251		return -ENOMEM;
   1252	cap_lower(new->cap_bset, cap);
   1253	return commit_creds(new);
   1254}
   1255
   1256/**
   1257 * cap_task_prctl - Implement process control functions for this security module
   1258 * @option: The process control function requested
   1259 * @arg2: The argument data for this function
   1260 * @arg3: The argument data for this function
   1261 * @arg4: The argument data for this function
   1262 * @arg5: The argument data for this function
   1263 *
   1264 * Allow process control functions (sys_prctl()) to alter capabilities; may
   1265 * also deny access to other functions not otherwise implemented here.
   1266 *
   1267 * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
   1268 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
   1269 * modules will consider performing the function.
   1270 */
   1271int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
   1272		   unsigned long arg4, unsigned long arg5)
   1273{
   1274	const struct cred *old = current_cred();
   1275	struct cred *new;
   1276
   1277	switch (option) {
   1278	case PR_CAPBSET_READ:
   1279		if (!cap_valid(arg2))
   1280			return -EINVAL;
   1281		return !!cap_raised(old->cap_bset, arg2);
   1282
   1283	case PR_CAPBSET_DROP:
   1284		return cap_prctl_drop(arg2);
   1285
   1286	/*
   1287	 * The next four prctl's remain to assist with transitioning a
   1288	 * system from legacy UID=0 based privilege (when filesystem
   1289	 * capabilities are not in use) to a system using filesystem
   1290	 * capabilities only - as the POSIX.1e draft intended.
   1291	 *
   1292	 * Note:
   1293	 *
   1294	 *  PR_SET_SECUREBITS =
   1295	 *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
   1296	 *    | issecure_mask(SECURE_NOROOT)
   1297	 *    | issecure_mask(SECURE_NOROOT_LOCKED)
   1298	 *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
   1299	 *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
   1300	 *
   1301	 * will ensure that the current process and all of its
   1302	 * children will be locked into a pure
   1303	 * capability-based-privilege environment.
   1304	 */
   1305	case PR_SET_SECUREBITS:
   1306		if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
   1307		     & (old->securebits ^ arg2))			/*[1]*/
   1308		    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
   1309		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
   1310		    || (cap_capable(current_cred(),
   1311				    current_cred()->user_ns,
   1312				    CAP_SETPCAP,
   1313				    CAP_OPT_NONE) != 0)			/*[4]*/
   1314			/*
   1315			 * [1] no changing of bits that are locked
   1316			 * [2] no unlocking of locks
   1317			 * [3] no setting of unsupported bits
   1318			 * [4] doing anything requires privilege (go read about
   1319			 *     the "sendmail capabilities bug")
   1320			 */
   1321		    )
   1322			/* cannot change a locked bit */
   1323			return -EPERM;
   1324
   1325		new = prepare_creds();
   1326		if (!new)
   1327			return -ENOMEM;
   1328		new->securebits = arg2;
   1329		return commit_creds(new);
   1330
   1331	case PR_GET_SECUREBITS:
   1332		return old->securebits;
   1333
   1334	case PR_GET_KEEPCAPS:
   1335		return !!issecure(SECURE_KEEP_CAPS);
   1336
   1337	case PR_SET_KEEPCAPS:
   1338		if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
   1339			return -EINVAL;
   1340		if (issecure(SECURE_KEEP_CAPS_LOCKED))
   1341			return -EPERM;
   1342
   1343		new = prepare_creds();
   1344		if (!new)
   1345			return -ENOMEM;
   1346		if (arg2)
   1347			new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
   1348		else
   1349			new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
   1350		return commit_creds(new);
   1351
   1352	case PR_CAP_AMBIENT:
   1353		if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
   1354			if (arg3 | arg4 | arg5)
   1355				return -EINVAL;
   1356
   1357			new = prepare_creds();
   1358			if (!new)
   1359				return -ENOMEM;
   1360			cap_clear(new->cap_ambient);
   1361			return commit_creds(new);
   1362		}
   1363
   1364		if (((!cap_valid(arg3)) | arg4 | arg5))
   1365			return -EINVAL;
   1366
   1367		if (arg2 == PR_CAP_AMBIENT_IS_SET) {
   1368			return !!cap_raised(current_cred()->cap_ambient, arg3);
   1369		} else if (arg2 != PR_CAP_AMBIENT_RAISE &&
   1370			   arg2 != PR_CAP_AMBIENT_LOWER) {
   1371			return -EINVAL;
   1372		} else {
   1373			if (arg2 == PR_CAP_AMBIENT_RAISE &&
   1374			    (!cap_raised(current_cred()->cap_permitted, arg3) ||
   1375			     !cap_raised(current_cred()->cap_inheritable,
   1376					 arg3) ||
   1377			     issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
   1378				return -EPERM;
   1379
   1380			new = prepare_creds();
   1381			if (!new)
   1382				return -ENOMEM;
   1383			if (arg2 == PR_CAP_AMBIENT_RAISE)
   1384				cap_raise(new->cap_ambient, arg3);
   1385			else
   1386				cap_lower(new->cap_ambient, arg3);
   1387			return commit_creds(new);
   1388		}
   1389
   1390	default:
   1391		/* No functionality available - continue with default */
   1392		return -ENOSYS;
   1393	}
   1394}
   1395
   1396/**
   1397 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
   1398 * @mm: The VM space in which the new mapping is to be made
   1399 * @pages: The size of the mapping
   1400 *
   1401 * Determine whether the allocation of a new virtual mapping by the current
   1402 * task is permitted.
   1403 *
   1404 * Return: 1 if permission is granted, 0 if not.
   1405 */
   1406int cap_vm_enough_memory(struct mm_struct *mm, long pages)
   1407{
   1408	int cap_sys_admin = 0;
   1409
   1410	if (cap_capable(current_cred(), &init_user_ns,
   1411				CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0)
   1412		cap_sys_admin = 1;
   1413
   1414	return cap_sys_admin;
   1415}
   1416
   1417/**
   1418 * cap_mmap_addr - check if able to map given addr
   1419 * @addr: address attempting to be mapped
   1420 *
   1421 * If the process is attempting to map memory below dac_mmap_min_addr they need
   1422 * CAP_SYS_RAWIO.  The other parameters to this function are unused by the
   1423 * capability security module.
   1424 *
   1425 * Return: 0 if this mapping should be allowed or -EPERM if not.
   1426 */
   1427int cap_mmap_addr(unsigned long addr)
   1428{
   1429	int ret = 0;
   1430
   1431	if (addr < dac_mmap_min_addr) {
   1432		ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
   1433				  CAP_OPT_NONE);
   1434		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
   1435		if (ret == 0)
   1436			current->flags |= PF_SUPERPRIV;
   1437	}
   1438	return ret;
   1439}
   1440
   1441int cap_mmap_file(struct file *file, unsigned long reqprot,
   1442		  unsigned long prot, unsigned long flags)
   1443{
   1444	return 0;
   1445}
   1446
   1447#ifdef CONFIG_SECURITY
   1448
   1449static struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
       	/*
       	 * One LSM_HOOK_INIT(hook, handler) entry per hook this module
       	 * implements.  The whole table is handed to security_add_hooks()
       	 * by capability_init() under the name "capability".
       	 */
   1450	LSM_HOOK_INIT(capable, cap_capable),
   1451	LSM_HOOK_INIT(settime, cap_settime),
   1452	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
   1453	LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
   1454	LSM_HOOK_INIT(capget, cap_capget),
   1455	LSM_HOOK_INIT(capset, cap_capset),
   1456	LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
   1457	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
   1458	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
   1459	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
   1460	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
   1461	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
   1462	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
   1463	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
   1464	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
   1465	LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
   1466	LSM_HOOK_INIT(task_setnice, cap_task_setnice),
   1467	LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
   1468};
   1469
   1470static int __init capability_init(void)
   1471{
   1472	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
   1473				"capability");
   1474	return 0;
   1475}
   1476
   1477DEFINE_LSM(capability) = {
   1478	.name = "capability",
   1479	.order = LSM_ORDER_FIRST,
   1480	.init = capability_init,
   1481};
   1482
   1483#endif /* CONFIG_SECURITY */