cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

pid_namespace.c (11543B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Pid namespaces
      4 *
      5 * Authors:
      6 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
      7 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
      8 *     Many thanks to Oleg Nesterov for comments and help
      9 *
     10 */
     11
     12#include <linux/pid.h>
     13#include <linux/pid_namespace.h>
     14#include <linux/user_namespace.h>
     15#include <linux/syscalls.h>
     16#include <linux/cred.h>
     17#include <linux/err.h>
     18#include <linux/acct.h>
     19#include <linux/slab.h>
     20#include <linux/proc_ns.h>
     21#include <linux/reboot.h>
     22#include <linux/export.h>
     23#include <linux/sched/task.h>
     24#include <linux/sched/signal.h>
     25#include <linux/idr.h>
     26
     27static DEFINE_MUTEX(pid_caches_mutex);
     28static struct kmem_cache *pid_ns_cachep;
     29/* Write once array, filled from the beginning. */
     30static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
     31
     32/*
     33 * creates the kmem cache to allocate pids from.
     34 * @level: pid namespace level
     35 */
     36
     37static struct kmem_cache *create_pid_cachep(unsigned int level)
     38{
     39	/* Level 0 is init_pid_ns.pid_cachep */
     40	struct kmem_cache **pkc = &pid_cache[level - 1];
     41	struct kmem_cache *kc;
     42	char name[4 + 10 + 1];
     43	unsigned int len;
     44
     45	kc = READ_ONCE(*pkc);
     46	if (kc)
     47		return kc;
     48
     49	snprintf(name, sizeof(name), "pid_%u", level + 1);
     50	len = sizeof(struct pid) + level * sizeof(struct upid);
     51	mutex_lock(&pid_caches_mutex);
     52	/* Name collision forces to do allocation under mutex. */
     53	if (!*pkc)
     54		*pkc = kmem_cache_create(name, len, 0,
     55					 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
     56	mutex_unlock(&pid_caches_mutex);
     57	/* current can fail, but someone else can succeed. */
     58	return READ_ONCE(*pkc);
     59}
     60
     61static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
     62{
     63	return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
     64}
     65
     66static void dec_pid_namespaces(struct ucounts *ucounts)
     67{
     68	dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
     69}
     70
     71static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
     72	struct pid_namespace *parent_pid_ns)
     73{
     74	struct pid_namespace *ns;
     75	unsigned int level = parent_pid_ns->level + 1;
     76	struct ucounts *ucounts;
     77	int err;
     78
     79	err = -EINVAL;
     80	if (!in_userns(parent_pid_ns->user_ns, user_ns))
     81		goto out;
     82
     83	err = -ENOSPC;
     84	if (level > MAX_PID_NS_LEVEL)
     85		goto out;
     86	ucounts = inc_pid_namespaces(user_ns);
     87	if (!ucounts)
     88		goto out;
     89
     90	err = -ENOMEM;
     91	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
     92	if (ns == NULL)
     93		goto out_dec;
     94
     95	idr_init(&ns->idr);
     96
     97	ns->pid_cachep = create_pid_cachep(level);
     98	if (ns->pid_cachep == NULL)
     99		goto out_free_idr;
    100
    101	err = ns_alloc_inum(&ns->ns);
    102	if (err)
    103		goto out_free_idr;
    104	ns->ns.ops = &pidns_operations;
    105
    106	refcount_set(&ns->ns.count, 1);
    107	ns->level = level;
    108	ns->parent = get_pid_ns(parent_pid_ns);
    109	ns->user_ns = get_user_ns(user_ns);
    110	ns->ucounts = ucounts;
    111	ns->pid_allocated = PIDNS_ADDING;
    112
    113	return ns;
    114
    115out_free_idr:
    116	idr_destroy(&ns->idr);
    117	kmem_cache_free(pid_ns_cachep, ns);
    118out_dec:
    119	dec_pid_namespaces(ucounts);
    120out:
    121	return ERR_PTR(err);
    122}
    123
    124static void delayed_free_pidns(struct rcu_head *p)
    125{
    126	struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
    127
    128	dec_pid_namespaces(ns->ucounts);
    129	put_user_ns(ns->user_ns);
    130
    131	kmem_cache_free(pid_ns_cachep, ns);
    132}
    133
    134static void destroy_pid_namespace(struct pid_namespace *ns)
    135{
    136	ns_free_inum(&ns->ns);
    137
    138	idr_destroy(&ns->idr);
    139	call_rcu(&ns->rcu, delayed_free_pidns);
    140}
    141
    142struct pid_namespace *copy_pid_ns(unsigned long flags,
    143	struct user_namespace *user_ns, struct pid_namespace *old_ns)
    144{
    145	if (!(flags & CLONE_NEWPID))
    146		return get_pid_ns(old_ns);
    147	if (task_active_pid_ns(current) != old_ns)
    148		return ERR_PTR(-EINVAL);
    149	return create_pid_namespace(user_ns, old_ns);
    150}
    151
    152void put_pid_ns(struct pid_namespace *ns)
    153{
    154	struct pid_namespace *parent;
    155
    156	while (ns != &init_pid_ns) {
    157		parent = ns->parent;
    158		if (!refcount_dec_and_test(&ns->ns.count))
    159			break;
    160		destroy_pid_namespace(ns);
    161		ns = parent;
    162	}
    163}
    164EXPORT_SYMBOL_GPL(put_pid_ns);
    165
    166void zap_pid_ns_processes(struct pid_namespace *pid_ns)
    167{
    168	int nr;
    169	int rc;
    170	struct task_struct *task, *me = current;
    171	int init_pids = thread_group_leader(me) ? 1 : 2;
    172	struct pid *pid;
    173
    174	/* Don't allow any more processes into the pid namespace */
    175	disable_pid_allocation(pid_ns);
    176
    177	/*
    178	 * Ignore SIGCHLD causing any terminated children to autoreap.
    179	 * This speeds up the namespace shutdown, plus see the comment
    180	 * below.
    181	 */
    182	spin_lock_irq(&me->sighand->siglock);
    183	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
    184	spin_unlock_irq(&me->sighand->siglock);
    185
    186	/*
    187	 * The last thread in the cgroup-init thread group is terminating.
    188	 * Find remaining pid_ts in the namespace, signal and wait for them
    189	 * to exit.
    190	 *
    191	 * Note:  This signals each threads in the namespace - even those that
    192	 * 	  belong to the same thread group, To avoid this, we would have
    193	 * 	  to walk the entire tasklist looking a processes in this
    194	 * 	  namespace, but that could be unnecessarily expensive if the
    195	 * 	  pid namespace has just a few processes. Or we need to
    196	 * 	  maintain a tasklist for each pid namespace.
    197	 *
    198	 */
    199	rcu_read_lock();
    200	read_lock(&tasklist_lock);
    201	nr = 2;
    202	idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
    203		task = pid_task(pid, PIDTYPE_PID);
    204		if (task && !__fatal_signal_pending(task))
    205			group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX);
    206	}
    207	read_unlock(&tasklist_lock);
    208	rcu_read_unlock();
    209
    210	/*
    211	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
    212	 * kernel_wait4() will also block until our children traced from the
    213	 * parent namespace are detached and become EXIT_DEAD.
    214	 */
    215	do {
    216		clear_thread_flag(TIF_SIGPENDING);
    217		rc = kernel_wait4(-1, NULL, __WALL, NULL);
    218	} while (rc != -ECHILD);
    219
    220	/*
    221	 * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
    222	 * process whose parents processes are outside of the pid
    223	 * namespace.  Such processes are created with setns()+fork().
    224	 *
    225	 * If those EXIT_ZOMBIE processes are not reaped by their
    226	 * parents before their parents exit, they will be reparented
    227	 * to pid_ns->child_reaper.  Thus pidns->child_reaper needs to
    228	 * stay valid until they all go away.
    229	 *
    230	 * The code relies on the pid_ns->child_reaper ignoring
    231	 * SIGCHILD to cause those EXIT_ZOMBIE processes to be
    232	 * autoreaped if reparented.
    233	 *
    234	 * Semantically it is also desirable to wait for EXIT_ZOMBIE
    235	 * processes before allowing the child_reaper to be reaped, as
    236	 * that gives the invariant that when the init process of a
    237	 * pid namespace is reaped all of the processes in the pid
    238	 * namespace are gone.
    239	 *
    240	 * Once all of the other tasks are gone from the pid_namespace
    241	 * free_pid() will awaken this task.
    242	 */
    243	for (;;) {
    244		set_current_state(TASK_INTERRUPTIBLE);
    245		if (pid_ns->pid_allocated == init_pids)
    246			break;
    247		schedule();
    248	}
    249	__set_current_state(TASK_RUNNING);
    250
    251	if (pid_ns->reboot)
    252		current->signal->group_exit_code = pid_ns->reboot;
    253
    254	acct_exit_ns(pid_ns);
    255	return;
    256}
    257
    258#ifdef CONFIG_CHECKPOINT_RESTORE
    259static int pid_ns_ctl_handler(struct ctl_table *table, int write,
    260		void *buffer, size_t *lenp, loff_t *ppos)
    261{
    262	struct pid_namespace *pid_ns = task_active_pid_ns(current);
    263	struct ctl_table tmp = *table;
    264	int ret, next;
    265
    266	if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
    267		return -EPERM;
    268
    269	/*
    270	 * Writing directly to ns' last_pid field is OK, since this field
    271	 * is volatile in a living namespace anyway and a code writing to
    272	 * it should synchronize its usage with external means.
    273	 */
    274
    275	next = idr_get_cursor(&pid_ns->idr) - 1;
    276
    277	tmp.data = &next;
    278	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
    279	if (!ret && write)
    280		idr_set_cursor(&pid_ns->idr, next + 1);
    281
    282	return ret;
    283}
    284
    285extern int pid_max;
    286static struct ctl_table pid_ns_ctl_table[] = {
    287	{
    288		.procname = "ns_last_pid",
    289		.maxlen = sizeof(int),
    290		.mode = 0666, /* permissions are checked in the handler */
    291		.proc_handler = pid_ns_ctl_handler,
    292		.extra1 = SYSCTL_ZERO,
    293		.extra2 = &pid_max,
    294	},
    295	{ }
    296};
    297static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
    298#endif	/* CONFIG_CHECKPOINT_RESTORE */
    299
    300int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
    301{
    302	if (pid_ns == &init_pid_ns)
    303		return 0;
    304
    305	switch (cmd) {
    306	case LINUX_REBOOT_CMD_RESTART2:
    307	case LINUX_REBOOT_CMD_RESTART:
    308		pid_ns->reboot = SIGHUP;
    309		break;
    310
    311	case LINUX_REBOOT_CMD_POWER_OFF:
    312	case LINUX_REBOOT_CMD_HALT:
    313		pid_ns->reboot = SIGINT;
    314		break;
    315	default:
    316		return -EINVAL;
    317	}
    318
    319	read_lock(&tasklist_lock);
    320	send_sig(SIGKILL, pid_ns->child_reaper, 1);
    321	read_unlock(&tasklist_lock);
    322
    323	do_exit(0);
    324
    325	/* Not reached */
    326	return 0;
    327}
    328
    329static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
    330{
    331	return container_of(ns, struct pid_namespace, ns);
    332}
    333
    334static struct ns_common *pidns_get(struct task_struct *task)
    335{
    336	struct pid_namespace *ns;
    337
    338	rcu_read_lock();
    339	ns = task_active_pid_ns(task);
    340	if (ns)
    341		get_pid_ns(ns);
    342	rcu_read_unlock();
    343
    344	return ns ? &ns->ns : NULL;
    345}
    346
    347static struct ns_common *pidns_for_children_get(struct task_struct *task)
    348{
    349	struct pid_namespace *ns = NULL;
    350
    351	task_lock(task);
    352	if (task->nsproxy) {
    353		ns = task->nsproxy->pid_ns_for_children;
    354		get_pid_ns(ns);
    355	}
    356	task_unlock(task);
    357
    358	if (ns) {
    359		read_lock(&tasklist_lock);
    360		if (!ns->child_reaper) {
    361			put_pid_ns(ns);
    362			ns = NULL;
    363		}
    364		read_unlock(&tasklist_lock);
    365	}
    366
    367	return ns ? &ns->ns : NULL;
    368}
    369
    370static void pidns_put(struct ns_common *ns)
    371{
    372	put_pid_ns(to_pid_ns(ns));
    373}
    374
    375static int pidns_install(struct nsset *nsset, struct ns_common *ns)
    376{
    377	struct nsproxy *nsproxy = nsset->nsproxy;
    378	struct pid_namespace *active = task_active_pid_ns(current);
    379	struct pid_namespace *ancestor, *new = to_pid_ns(ns);
    380
    381	if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
    382	    !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
    383		return -EPERM;
    384
    385	/*
    386	 * Only allow entering the current active pid namespace
    387	 * or a child of the current active pid namespace.
    388	 *
    389	 * This is required for fork to return a usable pid value and
    390	 * this maintains the property that processes and their
    391	 * children can not escape their current pid namespace.
    392	 */
    393	if (new->level < active->level)
    394		return -EINVAL;
    395
    396	ancestor = new;
    397	while (ancestor->level > active->level)
    398		ancestor = ancestor->parent;
    399	if (ancestor != active)
    400		return -EINVAL;
    401
    402	put_pid_ns(nsproxy->pid_ns_for_children);
    403	nsproxy->pid_ns_for_children = get_pid_ns(new);
    404	return 0;
    405}
    406
    407static struct ns_common *pidns_get_parent(struct ns_common *ns)
    408{
    409	struct pid_namespace *active = task_active_pid_ns(current);
    410	struct pid_namespace *pid_ns, *p;
    411
    412	/* See if the parent is in the current namespace */
    413	pid_ns = p = to_pid_ns(ns)->parent;
    414	for (;;) {
    415		if (!p)
    416			return ERR_PTR(-EPERM);
    417		if (p == active)
    418			break;
    419		p = p->parent;
    420	}
    421
    422	return &get_pid_ns(pid_ns)->ns;
    423}
    424
    425static struct user_namespace *pidns_owner(struct ns_common *ns)
    426{
    427	return to_pid_ns(ns)->user_ns;
    428}
    429
    430const struct proc_ns_operations pidns_operations = {
    431	.name		= "pid",
    432	.type		= CLONE_NEWPID,
    433	.get		= pidns_get,
    434	.put		= pidns_put,
    435	.install	= pidns_install,
    436	.owner		= pidns_owner,
    437	.get_parent	= pidns_get_parent,
    438};
    439
    440const struct proc_ns_operations pidns_for_children_operations = {
    441	.name		= "pid_for_children",
    442	.real_ns_name	= "pid",
    443	.type		= CLONE_NEWPID,
    444	.get		= pidns_for_children_get,
    445	.put		= pidns_put,
    446	.install	= pidns_install,
    447	.owner		= pidns_owner,
    448	.get_parent	= pidns_get_parent,
    449};
    450
    451static __init int pid_namespaces_init(void)
    452{
    453	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
    454
    455#ifdef CONFIG_CHECKPOINT_RESTORE
    456	register_sysctl_paths(kern_path, pid_ns_ctl_table);
    457#endif
    458	return 0;
    459}
    460
    461__initcall(pid_namespaces_init);