cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

exit.c (45982B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  linux/kernel/exit.c
      4 *
      5 *  Copyright (C) 1991, 1992  Linus Torvalds
      6 */
      7
      8#include <linux/mm.h>
      9#include <linux/slab.h>
     10#include <linux/sched/autogroup.h>
     11#include <linux/sched/mm.h>
     12#include <linux/sched/stat.h>
     13#include <linux/sched/task.h>
     14#include <linux/sched/task_stack.h>
     15#include <linux/sched/cputime.h>
     16#include <linux/interrupt.h>
     17#include <linux/module.h>
     18#include <linux/capability.h>
     19#include <linux/completion.h>
     20#include <linux/personality.h>
     21#include <linux/tty.h>
     22#include <linux/iocontext.h>
     23#include <linux/key.h>
     24#include <linux/cpu.h>
     25#include <linux/acct.h>
     26#include <linux/tsacct_kern.h>
     27#include <linux/file.h>
     28#include <linux/fdtable.h>
     29#include <linux/freezer.h>
     30#include <linux/binfmts.h>
     31#include <linux/nsproxy.h>
     32#include <linux/pid_namespace.h>
     33#include <linux/ptrace.h>
     34#include <linux/profile.h>
     35#include <linux/mount.h>
     36#include <linux/proc_fs.h>
     37#include <linux/kthread.h>
     38#include <linux/mempolicy.h>
     39#include <linux/taskstats_kern.h>
     40#include <linux/delayacct.h>
     41#include <linux/cgroup.h>
     42#include <linux/syscalls.h>
     43#include <linux/signal.h>
     44#include <linux/posix-timers.h>
     45#include <linux/cn_proc.h>
     46#include <linux/mutex.h>
     47#include <linux/futex.h>
     48#include <linux/pipe_fs_i.h>
     49#include <linux/audit.h> /* for audit_free() */
     50#include <linux/resource.h>
     51#include <linux/task_io_accounting_ops.h>
     52#include <linux/blkdev.h>
     53#include <linux/task_work.h>
     54#include <linux/fs_struct.h>
     55#include <linux/init_task.h>
     56#include <linux/perf_event.h>
     57#include <trace/events/sched.h>
     58#include <linux/hw_breakpoint.h>
     59#include <linux/oom.h>
     60#include <linux/writeback.h>
     61#include <linux/shm.h>
     62#include <linux/kcov.h>
     63#include <linux/random.h>
     64#include <linux/rcuwait.h>
     65#include <linux/compat.h>
     66#include <linux/io_uring.h>
     67#include <linux/kprobes.h>
     68#include <linux/rethook.h>
     69
     70#include <linux/uaccess.h>
     71#include <asm/unistd.h>
     72#include <asm/mmu_context.h>
     73
     74static void __unhash_process(struct task_struct *p, bool group_dead)
     75{
     76	nr_threads--;
     77	detach_pid(p, PIDTYPE_PID);
     78	if (group_dead) {
     79		detach_pid(p, PIDTYPE_TGID);
     80		detach_pid(p, PIDTYPE_PGID);
     81		detach_pid(p, PIDTYPE_SID);
     82
     83		list_del_rcu(&p->tasks);
     84		list_del_init(&p->sibling);
     85		__this_cpu_dec(process_counts);
     86	}
     87	list_del_rcu(&p->thread_group);
     88	list_del_rcu(&p->thread_node);
     89}
     90
     91/*
     92 * This function expects the tasklist_lock write-locked.
     93 */
     94static void __exit_signal(struct task_struct *tsk)
     95{
     96	struct signal_struct *sig = tsk->signal;
     97	bool group_dead = thread_group_leader(tsk);
     98	struct sighand_struct *sighand;
     99	struct tty_struct *tty;
    100	u64 utime, stime;
    101
    102	sighand = rcu_dereference_check(tsk->sighand,
    103					lockdep_tasklist_lock_is_held());
    104	spin_lock(&sighand->siglock);
    105
    106#ifdef CONFIG_POSIX_TIMERS
    107	posix_cpu_timers_exit(tsk);
    108	if (group_dead)
    109		posix_cpu_timers_exit_group(tsk);
    110#endif
    111
    112	if (group_dead) {
    113		tty = sig->tty;
    114		sig->tty = NULL;
    115	} else {
    116		/*
    117		 * If there is any task waiting for the group exit
    118		 * then notify it:
    119		 */
    120		if (sig->notify_count > 0 && !--sig->notify_count)
    121			wake_up_process(sig->group_exec_task);
    122
    123		if (tsk == sig->curr_target)
    124			sig->curr_target = next_thread(tsk);
    125	}
    126
    127	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
    128			      sizeof(unsigned long long));
    129
    130	/*
    131	 * Accumulate here the counters for all threads as they die. We could
    132	 * skip the group leader because it is the last user of signal_struct,
    133	 * but we want to avoid the race with thread_group_cputime() which can
    134	 * see the empty ->thread_head list.
    135	 */
    136	task_cputime(tsk, &utime, &stime);
    137	write_seqlock(&sig->stats_lock);
    138	sig->utime += utime;
    139	sig->stime += stime;
    140	sig->gtime += task_gtime(tsk);
    141	sig->min_flt += tsk->min_flt;
    142	sig->maj_flt += tsk->maj_flt;
    143	sig->nvcsw += tsk->nvcsw;
    144	sig->nivcsw += tsk->nivcsw;
    145	sig->inblock += task_io_get_inblock(tsk);
    146	sig->oublock += task_io_get_oublock(tsk);
    147	task_io_accounting_add(&sig->ioac, &tsk->ioac);
    148	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
    149	sig->nr_threads--;
    150	__unhash_process(tsk, group_dead);
    151	write_sequnlock(&sig->stats_lock);
    152
    153	/*
    154	 * Do this under ->siglock, we can race with another thread
    155	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
    156	 */
    157	flush_sigqueue(&tsk->pending);
    158	tsk->sighand = NULL;
    159	spin_unlock(&sighand->siglock);
    160
    161	__cleanup_sighand(sighand);
    162	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
    163	if (group_dead) {
    164		flush_sigqueue(&sig->shared_pending);
    165		tty_kref_put(tty);
    166	}
    167}
    168
    169static void delayed_put_task_struct(struct rcu_head *rhp)
    170{
    171	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
    172
    173	kprobe_flush_task(tsk);
    174	rethook_flush_task(tsk);
    175	perf_event_delayed_put(tsk);
    176	trace_sched_process_free(tsk);
    177	put_task_struct(tsk);
    178}
    179
    180void put_task_struct_rcu_user(struct task_struct *task)
    181{
    182	if (refcount_dec_and_test(&task->rcu_users))
    183		call_rcu(&task->rcu, delayed_put_task_struct);
    184}
    185
    186void release_task(struct task_struct *p)
    187{
    188	struct task_struct *leader;
    189	struct pid *thread_pid;
    190	int zap_leader;
    191repeat:
    192	/* don't need to get the RCU readlock here - the process is dead and
    193	 * can't be modifying its own credentials. But shut RCU-lockdep up */
    194	rcu_read_lock();
    195	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
    196	rcu_read_unlock();
    197
    198	cgroup_release(p);
    199
    200	write_lock_irq(&tasklist_lock);
    201	ptrace_release_task(p);
    202	thread_pid = get_pid(p->thread_pid);
    203	__exit_signal(p);
    204
    205	/*
    206	 * If we are the last non-leader member of the thread
    207	 * group, and the leader is zombie, then notify the
    208	 * group leader's parent process. (if it wants notification.)
    209	 */
    210	zap_leader = 0;
    211	leader = p->group_leader;
    212	if (leader != p && thread_group_empty(leader)
    213			&& leader->exit_state == EXIT_ZOMBIE) {
    214		/*
    215		 * If we were the last child thread and the leader has
    216		 * exited already, and the leader's parent ignores SIGCHLD,
    217		 * then we are the one who should release the leader.
    218		 */
    219		zap_leader = do_notify_parent(leader, leader->exit_signal);
    220		if (zap_leader)
    221			leader->exit_state = EXIT_DEAD;
    222	}
    223
    224	write_unlock_irq(&tasklist_lock);
    225	seccomp_filter_release(p);
    226	proc_flush_pid(thread_pid);
    227	put_pid(thread_pid);
    228	release_thread(p);
    229	put_task_struct_rcu_user(p);
    230
    231	p = leader;
    232	if (unlikely(zap_leader))
    233		goto repeat;
    234}
    235
    236int rcuwait_wake_up(struct rcuwait *w)
    237{
    238	int ret = 0;
    239	struct task_struct *task;
    240
    241	rcu_read_lock();
    242
    243	/*
    244	 * Order condition vs @task, such that everything prior to the load
    245	 * of @task is visible. This is the condition as to why the user called
    246	 * rcuwait_wake() in the first place. Pairs with set_current_state()
    247	 * barrier (A) in rcuwait_wait_event().
    248	 *
    249	 *    WAIT                WAKE
    250	 *    [S] tsk = current	  [S] cond = true
    251	 *        MB (A)	      MB (B)
    252	 *    [L] cond		  [L] tsk
    253	 */
    254	smp_mb(); /* (B) */
    255
    256	task = rcu_dereference(w->task);
    257	if (task)
    258		ret = wake_up_process(task);
    259	rcu_read_unlock();
    260
    261	return ret;
    262}
    263EXPORT_SYMBOL_GPL(rcuwait_wake_up);
    264
    265/*
    266 * Determine if a process group is "orphaned", according to the POSIX
    267 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
    268 * by terminal-generated stop signals.  Newly orphaned process groups are
    269 * to receive a SIGHUP and a SIGCONT.
    270 *
    271 * "I ask you, have you ever known what it is to be an orphan?"
    272 */
    273static int will_become_orphaned_pgrp(struct pid *pgrp,
    274					struct task_struct *ignored_task)
    275{
    276	struct task_struct *p;
    277
    278	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
    279		if ((p == ignored_task) ||
    280		    (p->exit_state && thread_group_empty(p)) ||
    281		    is_global_init(p->real_parent))
    282			continue;
    283
    284		if (task_pgrp(p->real_parent) != pgrp &&
    285		    task_session(p->real_parent) == task_session(p))
    286			return 0;
    287	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
    288
    289	return 1;
    290}
    291
    292int is_current_pgrp_orphaned(void)
    293{
    294	int retval;
    295
    296	read_lock(&tasklist_lock);
    297	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
    298	read_unlock(&tasklist_lock);
    299
    300	return retval;
    301}
    302
    303static bool has_stopped_jobs(struct pid *pgrp)
    304{
    305	struct task_struct *p;
    306
    307	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
    308		if (p->signal->flags & SIGNAL_STOP_STOPPED)
    309			return true;
    310	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
    311
    312	return false;
    313}
    314
    315/*
    316 * Check to see if any process groups have become orphaned as
    317 * a result of our exiting, and if they have any stopped jobs,
    318 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
    319 */
    320static void
    321kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
    322{
    323	struct pid *pgrp = task_pgrp(tsk);
    324	struct task_struct *ignored_task = tsk;
    325
    326	if (!parent)
    327		/* exit: our father is in a different pgrp than
    328		 * we are and we were the only connection outside.
    329		 */
    330		parent = tsk->real_parent;
    331	else
    332		/* reparent: our child is in a different pgrp than
    333		 * we are, and it was the only connection outside.
    334		 */
    335		ignored_task = NULL;
    336
    337	if (task_pgrp(parent) != pgrp &&
    338	    task_session(parent) == task_session(tsk) &&
    339	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
    340	    has_stopped_jobs(pgrp)) {
    341		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
    342		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
    343	}
    344}
    345
    346static void coredump_task_exit(struct task_struct *tsk)
    347{
    348	struct core_state *core_state;
    349
    350	/*
    351	 * Serialize with any possible pending coredump.
    352	 * We must hold siglock around checking core_state
    353	 * and setting PF_POSTCOREDUMP.  The core-inducing thread
    354	 * will increment ->nr_threads for each thread in the
    355	 * group without PF_POSTCOREDUMP set.
    356	 */
    357	spin_lock_irq(&tsk->sighand->siglock);
    358	tsk->flags |= PF_POSTCOREDUMP;
    359	core_state = tsk->signal->core_state;
    360	spin_unlock_irq(&tsk->sighand->siglock);
    361	if (core_state) {
    362		struct core_thread self;
    363
    364		self.task = current;
    365		if (self.task->flags & PF_SIGNALED)
    366			self.next = xchg(&core_state->dumper.next, &self);
    367		else
    368			self.task = NULL;
    369		/*
    370		 * Implies mb(), the result of xchg() must be visible
    371		 * to core_state->dumper.
    372		 */
    373		if (atomic_dec_and_test(&core_state->nr_threads))
    374			complete(&core_state->startup);
    375
    376		for (;;) {
    377			set_current_state(TASK_UNINTERRUPTIBLE);
    378			if (!self.task) /* see coredump_finish() */
    379				break;
    380			freezable_schedule();
    381		}
    382		__set_current_state(TASK_RUNNING);
    383	}
    384}
    385
    386#ifdef CONFIG_MEMCG
    387/*
    388 * A task is exiting.   If it owned this mm, find a new owner for the mm.
    389 */
    390void mm_update_next_owner(struct mm_struct *mm)
    391{
    392	struct task_struct *c, *g, *p = current;
    393
    394retry:
    395	/*
    396	 * If the exiting or execing task is not the owner, it's
    397	 * someone else's problem.
    398	 */
    399	if (mm->owner != p)
    400		return;
    401	/*
    402	 * The current owner is exiting/execing and there are no other
    403	 * candidates.  Do not leave the mm pointing to a possibly
    404	 * freed task structure.
    405	 */
    406	if (atomic_read(&mm->mm_users) <= 1) {
    407		WRITE_ONCE(mm->owner, NULL);
    408		return;
    409	}
    410
    411	read_lock(&tasklist_lock);
    412	/*
    413	 * Search in the children
    414	 */
    415	list_for_each_entry(c, &p->children, sibling) {
    416		if (c->mm == mm)
    417			goto assign_new_owner;
    418	}
    419
    420	/*
    421	 * Search in the siblings
    422	 */
    423	list_for_each_entry(c, &p->real_parent->children, sibling) {
    424		if (c->mm == mm)
    425			goto assign_new_owner;
    426	}
    427
    428	/*
    429	 * Search through everything else, we should not get here often.
    430	 */
    431	for_each_process(g) {
    432		if (g->flags & PF_KTHREAD)
    433			continue;
    434		for_each_thread(g, c) {
    435			if (c->mm == mm)
    436				goto assign_new_owner;
    437			if (c->mm)
    438				break;
    439		}
    440	}
    441	read_unlock(&tasklist_lock);
    442	/*
    443	 * We found no owner yet mm_users > 1: this implies that we are
    444	 * most likely racing with swapoff (try_to_unuse()) or /proc or
    445	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
    446	 */
    447	WRITE_ONCE(mm->owner, NULL);
    448	return;
    449
    450assign_new_owner:
    451	BUG_ON(c == p);
    452	get_task_struct(c);
    453	/*
    454	 * The task_lock protects c->mm from changing.
    455	 * We always want mm->owner->mm == mm
    456	 */
    457	task_lock(c);
    458	/*
    459	 * Delay read_unlock() till we have the task_lock()
    460	 * to ensure that c does not slip away underneath us
    461	 */
    462	read_unlock(&tasklist_lock);
    463	if (c->mm != mm) {
    464		task_unlock(c);
    465		put_task_struct(c);
    466		goto retry;
    467	}
    468	WRITE_ONCE(mm->owner, c);
    469	task_unlock(c);
    470	put_task_struct(c);
    471}
    472#endif /* CONFIG_MEMCG */
    473
    474/*
    475 * Turn us into a lazy TLB process if we
    476 * aren't already..
    477 */
    478static void exit_mm(void)
    479{
    480	struct mm_struct *mm = current->mm;
    481
    482	exit_mm_release(current, mm);
    483	if (!mm)
    484		return;
    485	sync_mm_rss(mm);
    486	mmap_read_lock(mm);
    487	mmgrab(mm);
    488	BUG_ON(mm != current->active_mm);
    489	/* more a memory barrier than a real lock */
    490	task_lock(current);
    491	/*
    492	 * When a thread stops operating on an address space, the loop
    493	 * in membarrier_private_expedited() may not observe that
    494	 * tsk->mm, and the loop in membarrier_global_expedited() may
    495	 * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
    496	 * rq->membarrier_state, so those would not issue an IPI.
    497	 * Membarrier requires a memory barrier after accessing
    498	 * user-space memory, before clearing tsk->mm or the
    499	 * rq->membarrier_state.
    500	 */
    501	smp_mb__after_spinlock();
    502	local_irq_disable();
    503	current->mm = NULL;
    504	membarrier_update_current_mm(NULL);
    505	enter_lazy_tlb(mm, current);
    506	local_irq_enable();
    507	task_unlock(current);
    508	mmap_read_unlock(mm);
    509	mm_update_next_owner(mm);
    510	mmput(mm);
    511	if (test_thread_flag(TIF_MEMDIE))
    512		exit_oom_victim();
    513}
    514
    515static struct task_struct *find_alive_thread(struct task_struct *p)
    516{
    517	struct task_struct *t;
    518
    519	for_each_thread(p, t) {
    520		if (!(t->flags & PF_EXITING))
    521			return t;
    522	}
    523	return NULL;
    524}
    525
    526static struct task_struct *find_child_reaper(struct task_struct *father,
    527						struct list_head *dead)
    528	__releases(&tasklist_lock)
    529	__acquires(&tasklist_lock)
    530{
    531	struct pid_namespace *pid_ns = task_active_pid_ns(father);
    532	struct task_struct *reaper = pid_ns->child_reaper;
    533	struct task_struct *p, *n;
    534
    535	if (likely(reaper != father))
    536		return reaper;
    537
    538	reaper = find_alive_thread(father);
    539	if (reaper) {
    540		pid_ns->child_reaper = reaper;
    541		return reaper;
    542	}
    543
    544	write_unlock_irq(&tasklist_lock);
    545
    546	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
    547		list_del_init(&p->ptrace_entry);
    548		release_task(p);
    549	}
    550
    551	zap_pid_ns_processes(pid_ns);
    552	write_lock_irq(&tasklist_lock);
    553
    554	return father;
    555}
    556
    557/*
    558 * When we die, we re-parent all our children, and try to:
    559 * 1. give them to another thread in our thread group, if such a member exists
    560 * 2. give it to the first ancestor process which prctl'd itself as a
    561 *    child_subreaper for its children (like a service manager)
    562 * 3. give it to the init process (PID 1) in our pid namespace
    563 */
    564static struct task_struct *find_new_reaper(struct task_struct *father,
    565					   struct task_struct *child_reaper)
    566{
    567	struct task_struct *thread, *reaper;
    568
    569	thread = find_alive_thread(father);
    570	if (thread)
    571		return thread;
    572
    573	if (father->signal->has_child_subreaper) {
    574		unsigned int ns_level = task_pid(father)->level;
    575		/*
    576		 * Find the first ->is_child_subreaper ancestor in our pid_ns.
    577		 * We can't check reaper != child_reaper to ensure we do not
    578		 * cross the namespaces, the exiting parent could be injected
    579		 * by setns() + fork().
    580		 * We check pid->level, this is slightly more efficient than
    581		 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
    582		 */
    583		for (reaper = father->real_parent;
    584		     task_pid(reaper)->level == ns_level;
    585		     reaper = reaper->real_parent) {
    586			if (reaper == &init_task)
    587				break;
    588			if (!reaper->signal->is_child_subreaper)
    589				continue;
    590			thread = find_alive_thread(reaper);
    591			if (thread)
    592				return thread;
    593		}
    594	}
    595
    596	return child_reaper;
    597}
    598
    599/*
     600 * Any that need to be release_task'd are put on the @dead list.
    601 */
    602static void reparent_leader(struct task_struct *father, struct task_struct *p,
    603				struct list_head *dead)
    604{
    605	if (unlikely(p->exit_state == EXIT_DEAD))
    606		return;
    607
    608	/* We don't want people slaying init. */
    609	p->exit_signal = SIGCHLD;
    610
    611	/* If it has exited notify the new parent about this child's death. */
    612	if (!p->ptrace &&
    613	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
    614		if (do_notify_parent(p, p->exit_signal)) {
    615			p->exit_state = EXIT_DEAD;
    616			list_add(&p->ptrace_entry, dead);
    617		}
    618	}
    619
    620	kill_orphaned_pgrp(p, father);
    621}
    622
    623/*
    624 * This does two things:
    625 *
    626 * A.  Make init inherit all the child processes
    627 * B.  Check to see if any process groups have become orphaned
    628 *	as a result of our exiting, and if they have any stopped
    629 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
    630 */
    631static void forget_original_parent(struct task_struct *father,
    632					struct list_head *dead)
    633{
    634	struct task_struct *p, *t, *reaper;
    635
    636	if (unlikely(!list_empty(&father->ptraced)))
    637		exit_ptrace(father, dead);
    638
    639	/* Can drop and reacquire tasklist_lock */
    640	reaper = find_child_reaper(father, dead);
    641	if (list_empty(&father->children))
    642		return;
    643
    644	reaper = find_new_reaper(father, reaper);
    645	list_for_each_entry(p, &father->children, sibling) {
    646		for_each_thread(p, t) {
    647			RCU_INIT_POINTER(t->real_parent, reaper);
    648			BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
    649			if (likely(!t->ptrace))
    650				t->parent = t->real_parent;
    651			if (t->pdeath_signal)
    652				group_send_sig_info(t->pdeath_signal,
    653						    SEND_SIG_NOINFO, t,
    654						    PIDTYPE_TGID);
    655		}
    656		/*
    657		 * If this is a threaded reparent there is no need to
    658		 * notify anyone anything has happened.
    659		 */
    660		if (!same_thread_group(reaper, father))
    661			reparent_leader(father, p, dead);
    662	}
    663	list_splice_tail_init(&father->children, &reaper->children);
    664}
    665
    666/*
    667 * Send signals to all our closest relatives so that they know
    668 * to properly mourn us..
    669 */
    670static void exit_notify(struct task_struct *tsk, int group_dead)
    671{
    672	bool autoreap;
    673	struct task_struct *p, *n;
    674	LIST_HEAD(dead);
    675
    676	write_lock_irq(&tasklist_lock);
    677	forget_original_parent(tsk, &dead);
    678
    679	if (group_dead)
    680		kill_orphaned_pgrp(tsk->group_leader, NULL);
    681
    682	tsk->exit_state = EXIT_ZOMBIE;
    683	if (unlikely(tsk->ptrace)) {
    684		int sig = thread_group_leader(tsk) &&
    685				thread_group_empty(tsk) &&
    686				!ptrace_reparented(tsk) ?
    687			tsk->exit_signal : SIGCHLD;
    688		autoreap = do_notify_parent(tsk, sig);
    689	} else if (thread_group_leader(tsk)) {
    690		autoreap = thread_group_empty(tsk) &&
    691			do_notify_parent(tsk, tsk->exit_signal);
    692	} else {
    693		autoreap = true;
    694	}
    695
    696	if (autoreap) {
    697		tsk->exit_state = EXIT_DEAD;
    698		list_add(&tsk->ptrace_entry, &dead);
    699	}
    700
    701	/* mt-exec, de_thread() is waiting for group leader */
    702	if (unlikely(tsk->signal->notify_count < 0))
    703		wake_up_process(tsk->signal->group_exec_task);
    704	write_unlock_irq(&tasklist_lock);
    705
    706	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
    707		list_del_init(&p->ptrace_entry);
    708		release_task(p);
    709	}
    710}
    711
    712#ifdef CONFIG_DEBUG_STACK_USAGE
    713static void check_stack_usage(void)
    714{
    715	static DEFINE_SPINLOCK(low_water_lock);
    716	static int lowest_to_date = THREAD_SIZE;
    717	unsigned long free;
    718
    719	free = stack_not_used(current);
    720
    721	if (free >= lowest_to_date)
    722		return;
    723
    724	spin_lock(&low_water_lock);
    725	if (free < lowest_to_date) {
    726		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
    727			current->comm, task_pid_nr(current), free);
    728		lowest_to_date = free;
    729	}
    730	spin_unlock(&low_water_lock);
    731}
    732#else
    733static inline void check_stack_usage(void) {}
    734#endif
    735
    736void __noreturn do_exit(long code)
    737{
    738	struct task_struct *tsk = current;
    739	int group_dead;
    740
    741	WARN_ON(tsk->plug);
    742
    743	kcov_task_exit(tsk);
    744
    745	coredump_task_exit(tsk);
    746	ptrace_event(PTRACE_EVENT_EXIT, code);
    747
    748	validate_creds_for_do_exit(tsk);
    749
    750	io_uring_files_cancel();
    751	exit_signals(tsk);  /* sets PF_EXITING */
    752
    753	/* sync mm's RSS info before statistics gathering */
    754	if (tsk->mm)
    755		sync_mm_rss(tsk->mm);
    756	acct_update_integrals(tsk);
    757	group_dead = atomic_dec_and_test(&tsk->signal->live);
    758	if (group_dead) {
    759		/*
    760		 * If the last thread of global init has exited, panic
    761		 * immediately to get a useable coredump.
    762		 */
    763		if (unlikely(is_global_init(tsk)))
    764			panic("Attempted to kill init! exitcode=0x%08x\n",
    765				tsk->signal->group_exit_code ?: (int)code);
    766
    767#ifdef CONFIG_POSIX_TIMERS
    768		hrtimer_cancel(&tsk->signal->real_timer);
    769		exit_itimers(tsk->signal);
    770#endif
    771		if (tsk->mm)
    772			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
    773	}
    774	acct_collect(code, group_dead);
    775	if (group_dead)
    776		tty_audit_exit();
    777	audit_free(tsk);
    778
    779	tsk->exit_code = code;
    780	taskstats_exit(tsk, group_dead);
    781
    782	exit_mm();
    783
    784	if (group_dead)
    785		acct_process();
    786	trace_sched_process_exit(tsk);
    787
    788	exit_sem(tsk);
    789	exit_shm(tsk);
    790	exit_files(tsk);
    791	exit_fs(tsk);
    792	if (group_dead)
    793		disassociate_ctty(1);
    794	exit_task_namespaces(tsk);
    795	exit_task_work(tsk);
    796	exit_thread(tsk);
    797
    798	/*
    799	 * Flush inherited counters to the parent - before the parent
    800	 * gets woken up by child-exit notifications.
    801	 *
    802	 * because of cgroup mode, must be called before cgroup_exit()
    803	 */
    804	perf_event_exit_task(tsk);
    805
    806	sched_autogroup_exit_task(tsk);
    807	cgroup_exit(tsk);
    808
    809	/*
    810	 * FIXME: do that only when needed, using sched_exit tracepoint
    811	 */
    812	flush_ptrace_hw_breakpoint(tsk);
    813
    814	exit_tasks_rcu_start();
    815	exit_notify(tsk, group_dead);
    816	proc_exit_connector(tsk);
    817	mpol_put_task_policy(tsk);
    818#ifdef CONFIG_FUTEX
    819	if (unlikely(current->pi_state_cache))
    820		kfree(current->pi_state_cache);
    821#endif
    822	/*
    823	 * Make sure we are holding no locks:
    824	 */
    825	debug_check_no_locks_held();
    826
    827	if (tsk->io_context)
    828		exit_io_context(tsk);
    829
    830	if (tsk->splice_pipe)
    831		free_pipe_info(tsk->splice_pipe);
    832
    833	if (tsk->task_frag.page)
    834		put_page(tsk->task_frag.page);
    835
    836	validate_creds_for_do_exit(tsk);
    837	exit_task_stack_account(tsk);
    838
    839	check_stack_usage();
    840	preempt_disable();
    841	if (tsk->nr_dirtied)
    842		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
    843	exit_rcu();
    844	exit_tasks_rcu_finish();
    845
    846	lockdep_free_task(tsk);
    847	do_task_dead();
    848}
    849
    850void __noreturn make_task_dead(int signr)
    851{
    852	/*
    853	 * Take the task off the cpu after something catastrophic has
    854	 * happened.
    855	 *
    856	 * We can get here from a kernel oops, sometimes with preemption off.
    857	 * Start by checking for critical errors.
    858	 * Then fix up important state like USER_DS and preemption.
    859	 * Then do everything else.
    860	 */
    861	struct task_struct *tsk = current;
    862
    863	if (unlikely(in_interrupt()))
    864		panic("Aiee, killing interrupt handler!");
    865	if (unlikely(!tsk->pid))
    866		panic("Attempted to kill the idle task!");
    867
    868	if (unlikely(in_atomic())) {
    869		pr_info("note: %s[%d] exited with preempt_count %d\n",
    870			current->comm, task_pid_nr(current),
    871			preempt_count());
    872		preempt_count_set(PREEMPT_ENABLED);
    873	}
    874
    875	/*
    876	 * We're taking recursive faults here in make_task_dead. Safest is to just
    877	 * leave this task alone and wait for reboot.
    878	 */
    879	if (unlikely(tsk->flags & PF_EXITING)) {
    880		pr_alert("Fixing recursive fault but reboot is needed!\n");
    881		futex_exit_recursive(tsk);
    882		tsk->exit_state = EXIT_DEAD;
    883		refcount_inc(&tsk->rcu_users);
    884		do_task_dead();
    885	}
    886
    887	do_exit(signr);
    888}
    889
    890SYSCALL_DEFINE1(exit, int, error_code)
    891{
    892	do_exit((error_code&0xff)<<8);
    893}
    894
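
A minimal user-space sketch (not part of exit.c, build with -pthread) of the difference between this single-thread exit path and the group exit below: a raw SYS_exit call ends only the calling thread via do_exit(), while glibc's exit()/_exit() go through exit_group and take down the whole thread group. Bypassing glibc's normal thread teardown like this is only for illustration.

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *worker(void *arg)
{
	/* Raw exit syscall: do_exit() for this thread only. */
	syscall(SYS_exit, 0);
	return NULL;			/* never reached */
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);		/* kernel clears the tid futex when the thread exits */
	printf("process still alive after SYS_exit in the worker\n");
	return 0;			/* returning from main -> exit_group() */
}
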
    895/*
    896 * Take down every thread in the group.  This is called by fatal signals
    897 * as well as by sys_exit_group (below).
    898 */
    899void __noreturn
    900do_group_exit(int exit_code)
    901{
    902	struct signal_struct *sig = current->signal;
    903
    904	if (sig->flags & SIGNAL_GROUP_EXIT)
    905		exit_code = sig->group_exit_code;
    906	else if (sig->group_exec_task)
    907		exit_code = 0;
    908	else if (!thread_group_empty(current)) {
    909		struct sighand_struct *const sighand = current->sighand;
    910
    911		spin_lock_irq(&sighand->siglock);
    912		if (sig->flags & SIGNAL_GROUP_EXIT)
    913			/* Another thread got here before we took the lock.  */
    914			exit_code = sig->group_exit_code;
    915		else if (sig->group_exec_task)
    916			exit_code = 0;
    917		else {
    918			sig->group_exit_code = exit_code;
    919			sig->flags = SIGNAL_GROUP_EXIT;
    920			zap_other_threads(current);
    921		}
    922		spin_unlock_irq(&sighand->siglock);
    923	}
    924
    925	do_exit(exit_code);
    926	/* NOTREACHED */
    927}
    928
    929/*
    930 * this kills every thread in the thread group. Note that any externally
    931 * wait4()-ing process will get the correct exit code - even if this
    932 * thread is not the thread group leader.
    933 */
    934SYSCALL_DEFINE1(exit_group, int, error_code)
    935{
    936	do_group_exit((error_code & 0xff) << 8);
    937	/* NOTREACHED */
    938	return 0;
    939}
    940
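
As a user-space complement (not part of exit.c), the (error_code & 0xff) << 8 encoding above is exactly what the wait*() status macros undo: WIFEXITED() checks that the low seven bits are clear and WEXITSTATUS() shifts the value back down.

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		_exit(42);			/* kernel stores (42 & 0xff) << 8 */

	int status;
	if (waitpid(pid, &status, 0) < 0)
		return 1;

	if (WIFEXITED(status))			/* (status & 0x7f) == 0 */
		printf("exit status %d, raw status 0x%x\n",
		       WEXITSTATUS(status), status);	/* 42, 0x2a00 */
	return 0;
}
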
    941struct waitid_info {
    942	pid_t pid;
    943	uid_t uid;
    944	int status;
    945	int cause;
    946};
    947
    948struct wait_opts {
    949	enum pid_type		wo_type;
    950	int			wo_flags;
    951	struct pid		*wo_pid;
    952
    953	struct waitid_info	*wo_info;
    954	int			wo_stat;
    955	struct rusage		*wo_rusage;
    956
    957	wait_queue_entry_t		child_wait;
    958	int			notask_error;
    959};
    960
    961static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
    962{
    963	return	wo->wo_type == PIDTYPE_MAX ||
    964		task_pid_type(p, wo->wo_type) == wo->wo_pid;
    965}
    966
    967static int
    968eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
    969{
    970	if (!eligible_pid(wo, p))
    971		return 0;
    972
    973	/*
    974	 * Wait for all children (clone and not) if __WALL is set or
    975	 * if it is traced by us.
    976	 */
    977	if (ptrace || (wo->wo_flags & __WALL))
    978		return 1;
    979
    980	/*
    981	 * Otherwise, wait for clone children *only* if __WCLONE is set;
    982	 * otherwise, wait for non-clone children *only*.
    983	 *
    984	 * Note: a "clone" child here is one that reports to its parent
    985	 * using a signal other than SIGCHLD, or a non-leader thread which
    986	 * we can only see if it is traced by us.
    987	 */
    988	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
    989		return 0;
    990
    991	return 1;
    992}
    993
    994/*
    995 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
    996 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
    997 * the lock and this task is uninteresting.  If we return nonzero, we have
    998 * released the lock and the system call should return.
    999 */
   1000static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
   1001{
   1002	int state, status;
   1003	pid_t pid = task_pid_vnr(p);
   1004	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
   1005	struct waitid_info *infop;
   1006
   1007	if (!likely(wo->wo_flags & WEXITED))
   1008		return 0;
   1009
   1010	if (unlikely(wo->wo_flags & WNOWAIT)) {
   1011		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
   1012			? p->signal->group_exit_code : p->exit_code;
   1013		get_task_struct(p);
   1014		read_unlock(&tasklist_lock);
   1015		sched_annotate_sleep();
   1016		if (wo->wo_rusage)
   1017			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
   1018		put_task_struct(p);
   1019		goto out_info;
   1020	}
   1021	/*
   1022	 * Move the task's state to DEAD/TRACE, only one thread can do this.
   1023	 */
   1024	state = (ptrace_reparented(p) && thread_group_leader(p)) ?
   1025		EXIT_TRACE : EXIT_DEAD;
   1026	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
   1027		return 0;
   1028	/*
   1029	 * We own this thread, nobody else can reap it.
   1030	 */
   1031	read_unlock(&tasklist_lock);
   1032	sched_annotate_sleep();
   1033
   1034	/*
   1035	 * Check thread_group_leader() to exclude the traced sub-threads.
   1036	 */
   1037	if (state == EXIT_DEAD && thread_group_leader(p)) {
   1038		struct signal_struct *sig = p->signal;
   1039		struct signal_struct *psig = current->signal;
   1040		unsigned long maxrss;
   1041		u64 tgutime, tgstime;
   1042
   1043		/*
   1044		 * The resource counters for the group leader are in its
   1045		 * own task_struct.  Those for dead threads in the group
   1046		 * are in its signal_struct, as are those for the child
   1047		 * processes it has previously reaped.  All these
   1048		 * accumulate in the parent's signal_struct c* fields.
   1049		 *
   1050		 * We don't bother to take a lock here to protect these
   1051		 * p->signal fields because the whole thread group is dead
   1052		 * and nobody can change them.
   1053		 *
    1054		 * psig->stats_lock also protects us from our sub-threads
   1055		 * which can reap other children at the same time. Until
   1056		 * we change k_getrusage()-like users to rely on this lock
   1057		 * we have to take ->siglock as well.
   1058		 *
   1059		 * We use thread_group_cputime_adjusted() to get times for
   1060		 * the thread group, which consolidates times for all threads
   1061		 * in the group including the group leader.
   1062		 */
   1063		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
   1064		spin_lock_irq(&current->sighand->siglock);
   1065		write_seqlock(&psig->stats_lock);
   1066		psig->cutime += tgutime + sig->cutime;
   1067		psig->cstime += tgstime + sig->cstime;
   1068		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
   1069		psig->cmin_flt +=
   1070			p->min_flt + sig->min_flt + sig->cmin_flt;
   1071		psig->cmaj_flt +=
   1072			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
   1073		psig->cnvcsw +=
   1074			p->nvcsw + sig->nvcsw + sig->cnvcsw;
   1075		psig->cnivcsw +=
   1076			p->nivcsw + sig->nivcsw + sig->cnivcsw;
   1077		psig->cinblock +=
   1078			task_io_get_inblock(p) +
   1079			sig->inblock + sig->cinblock;
   1080		psig->coublock +=
   1081			task_io_get_oublock(p) +
   1082			sig->oublock + sig->coublock;
   1083		maxrss = max(sig->maxrss, sig->cmaxrss);
   1084		if (psig->cmaxrss < maxrss)
   1085			psig->cmaxrss = maxrss;
   1086		task_io_accounting_add(&psig->ioac, &p->ioac);
   1087		task_io_accounting_add(&psig->ioac, &sig->ioac);
   1088		write_sequnlock(&psig->stats_lock);
   1089		spin_unlock_irq(&current->sighand->siglock);
   1090	}
   1091
   1092	if (wo->wo_rusage)
   1093		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
   1094	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
   1095		? p->signal->group_exit_code : p->exit_code;
   1096	wo->wo_stat = status;
   1097
   1098	if (state == EXIT_TRACE) {
   1099		write_lock_irq(&tasklist_lock);
   1100		/* We dropped tasklist, ptracer could die and untrace */
   1101		ptrace_unlink(p);
   1102
   1103		/* If parent wants a zombie, don't release it now */
   1104		state = EXIT_ZOMBIE;
   1105		if (do_notify_parent(p, p->exit_signal))
   1106			state = EXIT_DEAD;
   1107		p->exit_state = state;
   1108		write_unlock_irq(&tasklist_lock);
   1109	}
   1110	if (state == EXIT_DEAD)
   1111		release_task(p);
   1112
   1113out_info:
   1114	infop = wo->wo_info;
   1115	if (infop) {
   1116		if ((status & 0x7f) == 0) {
   1117			infop->cause = CLD_EXITED;
   1118			infop->status = status >> 8;
   1119		} else {
   1120			infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
   1121			infop->status = status & 0x7f;
   1122		}
   1123		infop->pid = pid;
   1124		infop->uid = uid;
   1125	}
   1126
   1127	return pid;
   1128}
   1129
   1130static int *task_stopped_code(struct task_struct *p, bool ptrace)
   1131{
   1132	if (ptrace) {
   1133		if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
   1134			return &p->exit_code;
   1135	} else {
   1136		if (p->signal->flags & SIGNAL_STOP_STOPPED)
   1137			return &p->signal->group_exit_code;
   1138	}
   1139	return NULL;
   1140}
   1141
   1142/**
   1143 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
   1144 * @wo: wait options
   1145 * @ptrace: is the wait for ptrace
   1146 * @p: task to wait for
   1147 *
   1148 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
   1149 *
   1150 * CONTEXT:
   1151 * read_lock(&tasklist_lock), which is released if return value is
   1152 * non-zero.  Also, grabs and releases @p->sighand->siglock.
   1153 *
   1154 * RETURNS:
   1155 * 0 if wait condition didn't exist and search for other wait conditions
   1156 * should continue.  Non-zero return, -errno on failure and @p's pid on
   1157 * success, implies that tasklist_lock is released and wait condition
   1158 * search should terminate.
   1159 */
   1160static int wait_task_stopped(struct wait_opts *wo,
   1161				int ptrace, struct task_struct *p)
   1162{
   1163	struct waitid_info *infop;
   1164	int exit_code, *p_code, why;
   1165	uid_t uid = 0; /* unneeded, required by compiler */
   1166	pid_t pid;
   1167
   1168	/*
   1169	 * Traditionally we see ptrace'd stopped tasks regardless of options.
   1170	 */
   1171	if (!ptrace && !(wo->wo_flags & WUNTRACED))
   1172		return 0;
   1173
   1174	if (!task_stopped_code(p, ptrace))
   1175		return 0;
   1176
   1177	exit_code = 0;
   1178	spin_lock_irq(&p->sighand->siglock);
   1179
   1180	p_code = task_stopped_code(p, ptrace);
   1181	if (unlikely(!p_code))
   1182		goto unlock_sig;
   1183
   1184	exit_code = *p_code;
   1185	if (!exit_code)
   1186		goto unlock_sig;
   1187
   1188	if (!unlikely(wo->wo_flags & WNOWAIT))
   1189		*p_code = 0;
   1190
   1191	uid = from_kuid_munged(current_user_ns(), task_uid(p));
   1192unlock_sig:
   1193	spin_unlock_irq(&p->sighand->siglock);
   1194	if (!exit_code)
   1195		return 0;
   1196
   1197	/*
   1198	 * Now we are pretty sure this task is interesting.
   1199	 * Make sure it doesn't get reaped out from under us while we
   1200	 * give up the lock and then examine it below.  We don't want to
   1201	 * keep holding onto the tasklist_lock while we call getrusage and
   1202	 * possibly take page faults for user memory.
   1203	 */
   1204	get_task_struct(p);
   1205	pid = task_pid_vnr(p);
   1206	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
   1207	read_unlock(&tasklist_lock);
   1208	sched_annotate_sleep();
   1209	if (wo->wo_rusage)
   1210		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
   1211	put_task_struct(p);
   1212
   1213	if (likely(!(wo->wo_flags & WNOWAIT)))
   1214		wo->wo_stat = (exit_code << 8) | 0x7f;
   1215
   1216	infop = wo->wo_info;
   1217	if (infop) {
   1218		infop->cause = why;
   1219		infop->status = exit_code;
   1220		infop->pid = pid;
   1221		infop->uid = uid;
   1222	}
   1223	return pid;
   1224}
   1225
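
An illustrative user-space counterpart (not part of exit.c): the (exit_code << 8) | 0x7f value stored in wo_stat above is what WIFSTOPPED() and WSTOPSIG() decode when a parent waits with WUNTRACED.

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		raise(SIGSTOP);			/* child stops itself */
		_exit(0);
	}

	int status;
	waitpid(pid, &status, WUNTRACED);	/* returns once the child stops */
	if (WIFSTOPPED(status))			/* (status & 0xff) == 0x7f */
		printf("stopped by signal %d\n", WSTOPSIG(status));

	kill(pid, SIGCONT);
	waitpid(pid, &status, 0);		/* reap the eventual exit */
	return 0;
}
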
   1226/*
   1227 * Handle do_wait work for one task in a live, non-stopped state.
   1228 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
   1229 * the lock and this task is uninteresting.  If we return nonzero, we have
   1230 * released the lock and the system call should return.
   1231 */
   1232static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
   1233{
   1234	struct waitid_info *infop;
   1235	pid_t pid;
   1236	uid_t uid;
   1237
   1238	if (!unlikely(wo->wo_flags & WCONTINUED))
   1239		return 0;
   1240
   1241	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
   1242		return 0;
   1243
   1244	spin_lock_irq(&p->sighand->siglock);
   1245	/* Re-check with the lock held.  */
   1246	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
   1247		spin_unlock_irq(&p->sighand->siglock);
   1248		return 0;
   1249	}
   1250	if (!unlikely(wo->wo_flags & WNOWAIT))
   1251		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
   1252	uid = from_kuid_munged(current_user_ns(), task_uid(p));
   1253	spin_unlock_irq(&p->sighand->siglock);
   1254
   1255	pid = task_pid_vnr(p);
   1256	get_task_struct(p);
   1257	read_unlock(&tasklist_lock);
   1258	sched_annotate_sleep();
   1259	if (wo->wo_rusage)
   1260		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
   1261	put_task_struct(p);
   1262
   1263	infop = wo->wo_info;
   1264	if (!infop) {
   1265		wo->wo_stat = 0xffff;
   1266	} else {
   1267		infop->cause = CLD_CONTINUED;
   1268		infop->pid = pid;
   1269		infop->uid = uid;
   1270		infop->status = SIGCONT;
   1271	}
   1272	return pid;
   1273}
   1274
   1275/*
   1276 * Consider @p for a wait by @parent.
   1277 *
   1278 * -ECHILD should be in ->notask_error before the first call.
   1279 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
   1280 * Returns zero if the search for a child should continue;
   1281 * then ->notask_error is 0 if @p is an eligible child,
   1282 * or still -ECHILD.
   1283 */
   1284static int wait_consider_task(struct wait_opts *wo, int ptrace,
   1285				struct task_struct *p)
   1286{
   1287	/*
   1288	 * We can race with wait_task_zombie() from another thread.
   1289	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
   1290	 * can't confuse the checks below.
   1291	 */
   1292	int exit_state = READ_ONCE(p->exit_state);
   1293	int ret;
   1294
   1295	if (unlikely(exit_state == EXIT_DEAD))
   1296		return 0;
   1297
   1298	ret = eligible_child(wo, ptrace, p);
   1299	if (!ret)
   1300		return ret;
   1301
   1302	if (unlikely(exit_state == EXIT_TRACE)) {
   1303		/*
   1304		 * ptrace == 0 means we are the natural parent. In this case
   1305		 * we should clear notask_error, debugger will notify us.
   1306		 */
   1307		if (likely(!ptrace))
   1308			wo->notask_error = 0;
   1309		return 0;
   1310	}
   1311
   1312	if (likely(!ptrace) && unlikely(p->ptrace)) {
   1313		/*
   1314		 * If it is traced by its real parent's group, just pretend
   1315		 * the caller is ptrace_do_wait() and reap this child if it
   1316		 * is zombie.
   1317		 *
   1318		 * This also hides group stop state from real parent; otherwise
   1319		 * a single stop can be reported twice as group and ptrace stop.
   1320		 * If a ptracer wants to distinguish these two events for its
   1321		 * own children it should create a separate process which takes
   1322		 * the role of real parent.
   1323		 */
   1324		if (!ptrace_reparented(p))
   1325			ptrace = 1;
   1326	}
   1327
   1328	/* slay zombie? */
   1329	if (exit_state == EXIT_ZOMBIE) {
   1330		/* we don't reap group leaders with subthreads */
   1331		if (!delay_group_leader(p)) {
   1332			/*
   1333			 * A zombie ptracee is only visible to its ptracer.
   1334			 * Notification and reaping will be cascaded to the
   1335			 * real parent when the ptracer detaches.
   1336			 */
   1337			if (unlikely(ptrace) || likely(!p->ptrace))
   1338				return wait_task_zombie(wo, p);
   1339		}
   1340
   1341		/*
   1342		 * Allow access to stopped/continued state via zombie by
   1343		 * falling through.  Clearing of notask_error is complex.
   1344		 *
   1345		 * When !@ptrace:
   1346		 *
   1347		 * If WEXITED is set, notask_error should naturally be
   1348		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
   1349		 * so, if there are live subthreads, there are events to
   1350		 * wait for.  If all subthreads are dead, it's still safe
    1351		 * to clear - this function will be called again in a finite
    1352		 * amount of time once all the subthreads are released and
   1353		 * will then return without clearing.
   1354		 *
   1355		 * When @ptrace:
   1356		 *
   1357		 * Stopped state is per-task and thus can't change once the
   1358		 * target task dies.  Only continued and exited can happen.
   1359		 * Clear notask_error if WCONTINUED | WEXITED.
   1360		 */
   1361		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
   1362			wo->notask_error = 0;
   1363	} else {
   1364		/*
   1365		 * @p is alive and it's gonna stop, continue or exit, so
   1366		 * there always is something to wait for.
   1367		 */
   1368		wo->notask_error = 0;
   1369	}
   1370
   1371	/*
   1372	 * Wait for stopped.  Depending on @ptrace, different stopped state
   1373	 * is used and the two don't interact with each other.
   1374	 */
   1375	ret = wait_task_stopped(wo, ptrace, p);
   1376	if (ret)
   1377		return ret;
   1378
   1379	/*
   1380	 * Wait for continued.  There's only one continued state and the
   1381	 * ptracer can consume it which can confuse the real parent.  Don't
   1382	 * use WCONTINUED from ptracer.  You don't need or want it.
   1383	 */
   1384	return wait_task_continued(wo, p);
   1385}
   1386
   1387/*
   1388 * Do the work of do_wait() for one thread in the group, @tsk.
   1389 *
   1390 * -ECHILD should be in ->notask_error before the first call.
   1391 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
   1392 * Returns zero if the search for a child should continue; then
   1393 * ->notask_error is 0 if there were any eligible children,
   1394 * or still -ECHILD.
   1395 */
   1396static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
   1397{
   1398	struct task_struct *p;
   1399
   1400	list_for_each_entry(p, &tsk->children, sibling) {
   1401		int ret = wait_consider_task(wo, 0, p);
   1402
   1403		if (ret)
   1404			return ret;
   1405	}
   1406
   1407	return 0;
   1408}
   1409
   1410static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
   1411{
   1412	struct task_struct *p;
   1413
   1414	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
   1415		int ret = wait_consider_task(wo, 1, p);
   1416
   1417		if (ret)
   1418			return ret;
   1419	}
   1420
   1421	return 0;
   1422}
   1423
   1424static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
   1425				int sync, void *key)
   1426{
   1427	struct wait_opts *wo = container_of(wait, struct wait_opts,
   1428						child_wait);
   1429	struct task_struct *p = key;
   1430
   1431	if (!eligible_pid(wo, p))
   1432		return 0;
   1433
   1434	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
   1435		return 0;
   1436
   1437	return default_wake_function(wait, mode, sync, key);
   1438}
   1439
   1440void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
   1441{
   1442	__wake_up_sync_key(&parent->signal->wait_chldexit,
   1443			   TASK_INTERRUPTIBLE, p);
   1444}
   1445
   1446static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
   1447				 struct task_struct *target)
   1448{
   1449	struct task_struct *parent =
   1450		!ptrace ? target->real_parent : target->parent;
   1451
   1452	return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
   1453				     same_thread_group(current, parent));
   1454}
   1455
   1456/*
   1457 * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
   1458 * and tracee lists to find the target task.
   1459 */
   1460static int do_wait_pid(struct wait_opts *wo)
   1461{
   1462	bool ptrace;
   1463	struct task_struct *target;
   1464	int retval;
   1465
   1466	ptrace = false;
   1467	target = pid_task(wo->wo_pid, PIDTYPE_TGID);
   1468	if (target && is_effectively_child(wo, ptrace, target)) {
   1469		retval = wait_consider_task(wo, ptrace, target);
   1470		if (retval)
   1471			return retval;
   1472	}
   1473
   1474	ptrace = true;
   1475	target = pid_task(wo->wo_pid, PIDTYPE_PID);
   1476	if (target && target->ptrace &&
   1477	    is_effectively_child(wo, ptrace, target)) {
   1478		retval = wait_consider_task(wo, ptrace, target);
   1479		if (retval)
   1480			return retval;
   1481	}
   1482
   1483	return 0;
   1484}
   1485
   1486static long do_wait(struct wait_opts *wo)
   1487{
   1488	int retval;
   1489
   1490	trace_sched_process_wait(wo->wo_pid);
   1491
   1492	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
   1493	wo->child_wait.private = current;
   1494	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
   1495repeat:
   1496	/*
   1497	 * If there is nothing that can match our criteria, just get out.
   1498	 * We will clear ->notask_error to zero if we see any child that
   1499	 * might later match our criteria, even if we are not able to reap
   1500	 * it yet.
   1501	 */
   1502	wo->notask_error = -ECHILD;
   1503	if ((wo->wo_type < PIDTYPE_MAX) &&
   1504	   (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
   1505		goto notask;
   1506
   1507	set_current_state(TASK_INTERRUPTIBLE);
   1508	read_lock(&tasklist_lock);
   1509
   1510	if (wo->wo_type == PIDTYPE_PID) {
   1511		retval = do_wait_pid(wo);
   1512		if (retval)
   1513			goto end;
   1514	} else {
   1515		struct task_struct *tsk = current;
   1516
   1517		do {
   1518			retval = do_wait_thread(wo, tsk);
   1519			if (retval)
   1520				goto end;
   1521
   1522			retval = ptrace_do_wait(wo, tsk);
   1523			if (retval)
   1524				goto end;
   1525
   1526			if (wo->wo_flags & __WNOTHREAD)
   1527				break;
   1528		} while_each_thread(current, tsk);
   1529	}
   1530	read_unlock(&tasklist_lock);
   1531
   1532notask:
   1533	retval = wo->notask_error;
   1534	if (!retval && !(wo->wo_flags & WNOHANG)) {
   1535		retval = -ERESTARTSYS;
   1536		if (!signal_pending(current)) {
   1537			schedule();
   1538			goto repeat;
   1539		}
   1540	}
   1541end:
   1542	__set_current_state(TASK_RUNNING);
   1543	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
   1544	return retval;
   1545}
   1546
   1547static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
   1548			  int options, struct rusage *ru)
   1549{
   1550	struct wait_opts wo;
   1551	struct pid *pid = NULL;
   1552	enum pid_type type;
   1553	long ret;
   1554	unsigned int f_flags = 0;
   1555
   1556	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
   1557			__WNOTHREAD|__WCLONE|__WALL))
   1558		return -EINVAL;
   1559	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
   1560		return -EINVAL;
   1561
   1562	switch (which) {
   1563	case P_ALL:
   1564		type = PIDTYPE_MAX;
   1565		break;
   1566	case P_PID:
   1567		type = PIDTYPE_PID;
   1568		if (upid <= 0)
   1569			return -EINVAL;
   1570
   1571		pid = find_get_pid(upid);
   1572		break;
   1573	case P_PGID:
   1574		type = PIDTYPE_PGID;
   1575		if (upid < 0)
   1576			return -EINVAL;
   1577
   1578		if (upid)
   1579			pid = find_get_pid(upid);
   1580		else
   1581			pid = get_task_pid(current, PIDTYPE_PGID);
   1582		break;
   1583	case P_PIDFD:
   1584		type = PIDTYPE_PID;
   1585		if (upid < 0)
   1586			return -EINVAL;
   1587
   1588		pid = pidfd_get_pid(upid, &f_flags);
   1589		if (IS_ERR(pid))
   1590			return PTR_ERR(pid);
   1591
   1592		break;
   1593	default:
   1594		return -EINVAL;
   1595	}
   1596
   1597	wo.wo_type	= type;
   1598	wo.wo_pid	= pid;
   1599	wo.wo_flags	= options;
   1600	wo.wo_info	= infop;
   1601	wo.wo_rusage	= ru;
   1602	if (f_flags & O_NONBLOCK)
   1603		wo.wo_flags |= WNOHANG;
   1604
   1605	ret = do_wait(&wo);
   1606	if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
   1607		ret = -EAGAIN;
   1608
   1609	put_pid(pid);
   1610	return ret;
   1611}
   1612
   1613SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
   1614		infop, int, options, struct rusage __user *, ru)
   1615{
   1616	struct rusage r;
   1617	struct waitid_info info = {.status = 0};
   1618	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
   1619	int signo = 0;
   1620
   1621	if (err > 0) {
   1622		signo = SIGCHLD;
   1623		err = 0;
   1624		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
   1625			return -EFAULT;
   1626	}
   1627	if (!infop)
   1628		return err;
   1629
   1630	if (!user_write_access_begin(infop, sizeof(*infop)))
   1631		return -EFAULT;
   1632
   1633	unsafe_put_user(signo, &infop->si_signo, Efault);
   1634	unsafe_put_user(0, &infop->si_errno, Efault);
   1635	unsafe_put_user(info.cause, &infop->si_code, Efault);
   1636	unsafe_put_user(info.pid, &infop->si_pid, Efault);
   1637	unsafe_put_user(info.uid, &infop->si_uid, Efault);
   1638	unsafe_put_user(info.status, &infop->si_status, Efault);
   1639	user_write_access_end();
   1640	return err;
   1641Efault:
   1642	user_write_access_end();
   1643	return -EFAULT;
   1644}
   1645
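
A short user-space sketch (not part of exit.c) of how the waitid_info fields filled in above surface through the syscall: si_code carries the CLD_* cause and si_status the unshifted exit code.

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		_exit(7);

	siginfo_t info = { 0 };
	if (waitid(P_PID, pid, &info, WEXITED) == 0)
		printf("pid=%d si_code=%d (CLD_EXITED=%d) si_status=%d\n",
		       info.si_pid, info.si_code, CLD_EXITED, info.si_status);
	return 0;
}
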
   1646long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
   1647		  struct rusage *ru)
   1648{
   1649	struct wait_opts wo;
   1650	struct pid *pid = NULL;
   1651	enum pid_type type;
   1652	long ret;
   1653
   1654	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
   1655			__WNOTHREAD|__WCLONE|__WALL))
   1656		return -EINVAL;
   1657
   1658	/* -INT_MIN is not defined */
   1659	if (upid == INT_MIN)
   1660		return -ESRCH;
   1661
   1662	if (upid == -1)
   1663		type = PIDTYPE_MAX;
   1664	else if (upid < 0) {
   1665		type = PIDTYPE_PGID;
   1666		pid = find_get_pid(-upid);
   1667	} else if (upid == 0) {
   1668		type = PIDTYPE_PGID;
   1669		pid = get_task_pid(current, PIDTYPE_PGID);
   1670	} else /* upid > 0 */ {
   1671		type = PIDTYPE_PID;
   1672		pid = find_get_pid(upid);
   1673	}
   1674
   1675	wo.wo_type	= type;
   1676	wo.wo_pid	= pid;
   1677	wo.wo_flags	= options | WEXITED;
   1678	wo.wo_info	= NULL;
   1679	wo.wo_stat	= 0;
   1680	wo.wo_rusage	= ru;
   1681	ret = do_wait(&wo);
   1682	put_pid(pid);
   1683	if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
   1684		ret = -EFAULT;
   1685
   1686	return ret;
   1687}
   1688
   1689int kernel_wait(pid_t pid, int *stat)
   1690{
   1691	struct wait_opts wo = {
   1692		.wo_type	= PIDTYPE_PID,
   1693		.wo_pid		= find_get_pid(pid),
   1694		.wo_flags	= WEXITED,
   1695	};
   1696	int ret;
   1697
   1698	ret = do_wait(&wo);
   1699	if (ret > 0 && wo.wo_stat)
   1700		*stat = wo.wo_stat;
   1701	put_pid(wo.wo_pid);
   1702	return ret;
   1703}
   1704
   1705SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
   1706		int, options, struct rusage __user *, ru)
   1707{
   1708	struct rusage r;
   1709	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
   1710
   1711	if (err > 0) {
   1712		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
   1713			return -EFAULT;
   1714	}
   1715	return err;
   1716}
   1717
   1718#ifdef __ARCH_WANT_SYS_WAITPID
   1719
   1720/*
   1721 * sys_waitpid() remains for compatibility. waitpid() should be
   1722 * implemented by calling sys_wait4() from libc.a.
   1723 */
   1724SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
   1725{
   1726	return kernel_wait4(pid, stat_addr, options, NULL);
   1727}
   1728
   1729#endif
   1730
   1731#ifdef CONFIG_COMPAT
   1732COMPAT_SYSCALL_DEFINE4(wait4,
   1733	compat_pid_t, pid,
   1734	compat_uint_t __user *, stat_addr,
   1735	int, options,
   1736	struct compat_rusage __user *, ru)
   1737{
   1738	struct rusage r;
   1739	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
   1740	if (err > 0) {
   1741		if (ru && put_compat_rusage(&r, ru))
   1742			return -EFAULT;
   1743	}
   1744	return err;
   1745}
   1746
   1747COMPAT_SYSCALL_DEFINE5(waitid,
   1748		int, which, compat_pid_t, pid,
   1749		struct compat_siginfo __user *, infop, int, options,
   1750		struct compat_rusage __user *, uru)
   1751{
   1752	struct rusage ru;
   1753	struct waitid_info info = {.status = 0};
   1754	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
   1755	int signo = 0;
   1756	if (err > 0) {
   1757		signo = SIGCHLD;
   1758		err = 0;
   1759		if (uru) {
   1760			/* kernel_waitid() overwrites everything in ru */
   1761			if (COMPAT_USE_64BIT_TIME)
   1762				err = copy_to_user(uru, &ru, sizeof(ru));
   1763			else
   1764				err = put_compat_rusage(&ru, uru);
   1765			if (err)
   1766				return -EFAULT;
   1767		}
   1768	}
   1769
   1770	if (!infop)
   1771		return err;
   1772
   1773	if (!user_write_access_begin(infop, sizeof(*infop)))
   1774		return -EFAULT;
   1775
   1776	unsafe_put_user(signo, &infop->si_signo, Efault);
   1777	unsafe_put_user(0, &infop->si_errno, Efault);
   1778	unsafe_put_user(info.cause, &infop->si_code, Efault);
   1779	unsafe_put_user(info.pid, &infop->si_pid, Efault);
   1780	unsafe_put_user(info.uid, &infop->si_uid, Efault);
   1781	unsafe_put_user(info.status, &infop->si_status, Efault);
   1782	user_write_access_end();
   1783	return err;
   1784Efault:
   1785	user_write_access_end();
   1786	return -EFAULT;
   1787}
   1788#endif
   1789
   1790/**
   1791 * thread_group_exited - check that a thread group has exited
   1792 * @pid: tgid of thread group to be checked.
   1793 *
   1794 * Test if the thread group represented by tgid has exited (all
   1795 * threads are zombies, dead or completely gone).
   1796 *
   1797 * Return: true if the thread group has exited. false otherwise.
   1798 */
   1799bool thread_group_exited(struct pid *pid)
   1800{
   1801	struct task_struct *task;
   1802	bool exited;
   1803
   1804	rcu_read_lock();
   1805	task = pid_task(pid, PIDTYPE_PID);
   1806	exited = !task ||
   1807		(READ_ONCE(task->exit_state) && thread_group_empty(task));
   1808	rcu_read_unlock();
   1809
   1810	return exited;
   1811}
   1812EXPORT_SYMBOL(thread_group_exited);
   1813
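
One notable user of thread_group_exited() is pidfd polling, which reports a pidfd as readable once the thread group has exited. A hedged user-space sketch (not part of exit.c, and assuming a kernel and libc recent enough to expose SYS_pidfd_open) shows the observable effect; the zombie still has to be reaped normally afterwards.

#define _GNU_SOURCE
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		sleep(1);
		_exit(0);
	}

	int pidfd = syscall(SYS_pidfd_open, pid, 0);
	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };

	poll(&pfd, 1, -1);		/* readable once the thread group exits */
	printf("pidfd readable, revents=0x%x\n", pfd.revents);

	waitpid(pid, NULL, 0);		/* the zombie still needs reaping */
	close(pidfd);
	return 0;
}
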
   1814__weak void abort(void)
   1815{
   1816	BUG();
   1817
   1818	/* if that doesn't kill us, halt */
   1819	panic("Oops failed to kill thread");
   1820}
   1821EXPORT_SYMBOL(abort);