summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c100
-rw-r--r--kernel/pid.c86
-rw-r--r--kernel/pid_namespace.c2
4 files changed, 156 insertions, 34 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index a46a50d67002..f2d20ab74422 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1457,7 +1457,7 @@ repeat:
*/
wo->notask_error = -ECHILD;
if ((wo->wo_type < PIDTYPE_MAX) &&
- (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+ (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/fork.c b/kernel/fork.c
index 13b38794efb5..35f91ee91057 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
spin_unlock_irq(&current->sighand->siglock);
+
+ /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
+ if (clone_flags & CLONE_CLEAR_SIGHAND)
+ flush_signal_handlers(tsk, 0);
+
return 0;
}
@@ -1695,12 +1700,68 @@ static int pidfd_release(struct inode *inode, struct file *file)
}
#ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid of a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestoral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ * namespace (also take care to create new mount namespaces in the
+ * new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ * have exactly one entry, which is 0
+ */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
- struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
struct pid *pid = f->private_data;
+ struct pid_namespace *ns;
+ pid_t nr = -1;
+
+ if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+ ns = proc_pid_ns(file_inode(m->file));
+ nr = pid_nr_ns(pid, ns);
+ }
+
+ seq_put_decimal_ll(m, "Pid:\t", nr);
+
+#ifdef CONFIG_PID_NS
+ seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+ if (nr > 0) {
+ int i;
- seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
+ /* If nr is non-zero it means that 'pid' is valid and that
+ * ns, i.e. the pid namespace associated with the procfs
+ * instance, is in the pid namespace hierarchy of pid.
+ * Start at one below the already printed level.
+ */
+ for (i = ns->level + 1; i <= pid->level; i++)
+ seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+ }
+#endif
seq_putc(m, '\n');
}
#endif
@@ -2026,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process(
stackleak_task_init(p);
if (pid != &init_struct_pid) {
- pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
+ args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
@@ -2529,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
{
int err;
struct clone_args args;
+ pid_t *kset_tid = kargs->set_tid;
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
@@ -2539,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
if (err)
return err;
+ if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
+ return -EINVAL;
+
+ if (unlikely(!args.set_tid && args.set_tid_size > 0))
+ return -EINVAL;
+
+ if (unlikely(args.set_tid && args.set_tid_size == 0))
+ return -EINVAL;
+
/*
* Verify that higher 32bits of exit_signal are unset and that
* it is a valid signal
@@ -2556,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
+ .set_tid_size = args.set_tid_size,
};
+ if (args.set_tid &&
+ copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
+ (kargs->set_tid_size * sizeof(pid_t))))
+ return -EFAULT;
+
+ kargs->set_tid = kset_tid;
+
return 0;
}
@@ -2591,11 +2671,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
- /*
- * All lower bits of the flag word are taken.
- * Verify that no other unknown flags are passed along.
- */
- if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+ /* Verify that no unknown flags are passed along. */
+ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
return false;
/*
@@ -2605,6 +2682,10 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
return false;
+ if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
+ (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
+ return false;
+
if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
kargs->exit_signal)
return false;
@@ -2631,6 +2712,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
int err;
struct kernel_clone_args kargs;
+ pid_t set_tid[MAX_PID_NS_LEVEL];
+
+ kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size);
if (err)
diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..2278e249141d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -157,7 +157,8 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
+ size_t set_tid_size)
{
struct pid *pid;
enum pid_type type;
@@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct upid *upid;
int retval = -ENOMEM;
+ /*
+ * set_tid_size contains the size of the set_tid array. Starting at
+ * the most nested currently active PID namespace it tells alloc_pid()
+ * which PID to set for a process in that most nested PID namespace
+ * up to set_tid_size PID namespaces. It does not have to set the PID
+ * for a process in all nested PID namespaces but set_tid_size must
+ * never be greater than the current ns->level + 1.
+ */
+ if (set_tid_size > ns->level + 1)
+ return ERR_PTR(-EINVAL);
+
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
@@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns)
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
- int pid_min = 1;
+ int tid = 0;
+
+ if (set_tid_size) {
+ tid = set_tid[ns->level - i];
+
+ retval = -EINVAL;
+ if (tid < 1 || tid >= pid_max)
+ goto out_free;
+ /*
+ * Also fail if a PID != 1 is requested and
+ * no PID 1 exists.
+ */
+ if (tid != 1 && !tmp->child_reaper)
+ goto out_free;
+ retval = -EPERM;
+ if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
+ goto out_free;
+ set_tid_size--;
+ }
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
- /*
- * init really needs pid 1, but after reaching the maximum
- * wrap back to RESERVED_PIDS
- */
- if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
- pid_min = RESERVED_PIDS;
-
- /*
- * Store a null pointer so find_pid_ns does not find
- * a partially initialized PID (see below).
- */
- nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ if (tid) {
+ nr = idr_alloc(&tmp->idr, NULL, tid,
+ tid + 1, GFP_ATOMIC);
+ /*
+ * If ENOSPC is returned it means that the PID is
+ * alreay in use. Return EEXIST in that case.
+ */
+ if (nr == -ENOSPC)
+ nr = -EEXIST;
+ } else {
+ int pid_min = 1;
+ /*
+ * init really needs pid 1, but after reaching the
+ * maximum wrap back to RESERVED_PIDS
+ */
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ }
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -299,7 +341,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
*pid_ptr = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
- if (!hlist_empty(&pid->tasks[tmp]))
+ if (pid_has_task(pid, tmp))
return;
free_pid(pid);
@@ -497,7 +539,7 @@ static int pidfd_create(struct pid *pid)
*/
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
- int fd, ret;
+ int fd;
struct pid *p;
if (flags)
@@ -510,13 +552,11 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
if (!p)
return -ESRCH;
- ret = 0;
- rcu_read_lock();
- if (!pid_task(p, PIDTYPE_TGID))
- ret = -EINVAL;
- rcu_read_unlock();
+ if (pid_has_task(p, PIDTYPE_TGID))
+ fd = pidfd_create(p);
+ else
+ fd = -EINVAL;
- fd = ret ?: pidfd_create(p);
put_pid(p);
return fd;
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a6a79f85c81a..d40017e79ebe 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -26,8 +26,6 @@
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
-/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
-#define MAX_PID_NS_LEVEL 32
/* Write once array, filled from the beginning. */
static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];