cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sem.c (64696B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * linux/ipc/sem.c
      4 * Copyright (C) 1992 Krishna Balasubramanian
      5 * Copyright (C) 1995 Eric Schenk, Bruno Haible
      6 *
      7 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
      8 *
      9 * SMP-threaded, sysctl's added
     10 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
     11 * Enforced range limit on SEM_UNDO
     12 * (c) 2001 Red Hat Inc
     13 * Lockless wakeup
     14 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
     15 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
     16 * Further wakeup optimizations, documentation
     17 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
     18 *
     19 * support for audit of ipc object properties and permission changes
     20 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
     21 *
     22 * namespaces support
     23 * OpenVZ, SWsoft Inc.
     24 * Pavel Emelianov <xemul@openvz.org>
     25 *
     26 * Implementation notes: (May 2010)
     27 * This file implements System V semaphores.
     28 *
     29 * User space visible behavior:
     30 * - FIFO ordering for semop() operations (just FIFO, not starvation
     31 *   protection)
     32 * - multiple semaphore operations that alter the same semaphore in
     33 *   one semop() are handled.
     34 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
     35 *   SETALL calls.
     36 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
     37 * - undo adjustments at process exit are limited to 0..SEMVMX.
     38 * - namespaces are supported.
     39 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
     40 *   to /proc/sys/kernel/sem.
     41 * - statistics about the usage are reported in /proc/sysvipc/sem.
     42 *
     43 * Internals:
     44 * - scalability:
     45 *   - all global variables are read-mostly.
     46 *   - semop() calls and semctl(RMID) are synchronized by RCU.
     47 *   - most operations do write operations (actually: spin_lock calls) to
     48 *     the per-semaphore array structure.
     49 *   Thus: Perfect SMP scaling between independent semaphore arrays.
     50 *         If multiple semaphores in one array are used, then cache line
     51 *         thrashing on the semaphore array spinlock will limit the scaling.
     52 * - semncnt and semzcnt are calculated on demand in count_semcnt()
     53 * - the task that performs a successful semop() scans the list of all
     54 *   sleeping tasks and completes any pending operations that can be fulfilled.
     55 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
     56 *   (see update_queue())
     57 * - To improve the scalability, the actual wake-up calls are performed after
     58 *   dropping all locks. (see wake_up_sem_queue_prepare())
     59 * - All work is done by the waker, the woken up task does not have to do
     60 *   anything - not even acquiring a lock or dropping a refcount.
     61 * - A woken up task may not even touch the semaphore array anymore, it may
     62 *   have been destroyed already by a semctl(RMID).
     63 * - UNDO values are stored in an array (one per process and per
     64 *   semaphore array, lazily allocated). For backwards compatibility, multiple
     65 *   modes for the UNDO variables are supported (per process, per thread)
     66 *   (see copy_semundo, CLONE_SYSVSEM)
     67 * - There are two lists of the pending operations: a per-array list
     68 *   and per-semaphore list (stored in the array). This allows achieving FIFO
     69 *   ordering without always scanning all pending operations.
     70 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
     71 */
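/*
 * A minimal user-space sketch of the behavior documented above (illustration
 * only; error handling omitted). It creates a one-semaphore set, performs an
 * increment and a decrement with SEM_UNDO, and removes the set:
 *
 *	#include <sys/sem.h>
 *
 *	int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
 *	struct sembuf up   = { .sem_num = 0, .sem_op = +1, .sem_flg = SEM_UNDO };
 *	struct sembuf down = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
 *	semop(id, &up, 1);	// sem_op > 0: never blocks, semadj becomes -1
 *	semop(id, &down, 1);	// semval back to 0, semadj back to 0
 *	semctl(id, 0, IPC_RMID);
 */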
     72
     73#include <linux/compat.h>
     74#include <linux/slab.h>
     75#include <linux/spinlock.h>
     76#include <linux/init.h>
     77#include <linux/proc_fs.h>
     78#include <linux/time.h>
     79#include <linux/security.h>
     80#include <linux/syscalls.h>
     81#include <linux/audit.h>
     82#include <linux/capability.h>
     83#include <linux/seq_file.h>
     84#include <linux/rwsem.h>
     85#include <linux/nsproxy.h>
     86#include <linux/ipc_namespace.h>
     87#include <linux/sched/wake_q.h>
     88#include <linux/nospec.h>
     89#include <linux/rhashtable.h>
     90
     91#include <linux/uaccess.h>
     92#include "util.h"
     93
     94/* One semaphore structure for each semaphore in the system. */
     95struct sem {
     96	int	semval;		/* current value */
     97	/*
     98	 * PID of the process that last modified the semaphore. For
     99	 * Linux, specifically these are:
    100	 *  - semop
    101	 *  - semctl, via SETVAL and SETALL.
    102	 *  - at task exit when performing undo adjustments (see exit_sem).
    103	 */
    104	struct pid *sempid;
    105	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
    106	struct list_head pending_alter; /* pending single-sop operations */
    107					/* that alter the semaphore */
    108	struct list_head pending_const; /* pending single-sop operations */
    109					/* that do not alter the semaphore*/
    110	time64_t	 sem_otime;	/* candidate for sem_otime */
    111} ____cacheline_aligned_in_smp;
    112
    113/* One sem_array data structure for each set of semaphores in the system. */
    114struct sem_array {
    115	struct kern_ipc_perm	sem_perm;	/* permissions .. see ipc.h */
    116	time64_t		sem_ctime;	/* create/last semctl() time */
    117	struct list_head	pending_alter;	/* pending operations */
    118						/* that alter the array */
    119	struct list_head	pending_const;	/* pending complex operations */
    120						/* that do not alter semvals */
    121	struct list_head	list_id;	/* undo requests on this array */
    122	int			sem_nsems;	/* no. of semaphores in array */
    123	int			complex_count;	/* pending complex operations */
    124	unsigned int		use_global_lock;/* >0: global lock required */
    125
    126	struct sem		sems[];
    127} __randomize_layout;
    128
    129/* One queue for each sleeping process in the system. */
    130struct sem_queue {
    131	struct list_head	list;	 /* queue of pending operations */
    132	struct task_struct	*sleeper; /* this process */
    133	struct sem_undo		*undo;	 /* undo structure */
    134	struct pid		*pid;	 /* process id of requesting process */
    135	int			status;	 /* completion status of operation */
    136	struct sembuf		*sops;	 /* array of pending operations */
    137	struct sembuf		*blocking; /* the operation that blocked */
    138	int			nsops;	 /* number of operations */
    139	bool			alter;	 /* does *sops alter the array? */
     140	bool                    dupsop;	 /* any sem_num used more than once in sops? */
    141};
    142
    143/* Each task has a list of undo requests. They are executed automatically
    144 * when the process exits.
    145 */
    146struct sem_undo {
    147	struct list_head	list_proc;	/* per-process list: *
    148						 * all undos from one process
    149						 * rcu protected */
    150	struct rcu_head		rcu;		/* rcu struct for sem_undo */
    151	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
    152	struct list_head	list_id;	/* per semaphore array list:
    153						 * all undos for one array */
    154	int			semid;		/* semaphore set identifier */
    155	short			*semadj;	/* array of adjustments */
    156						/* one per semaphore */
    157};
    158
    159/* sem_undo_list controls shared access to the list of sem_undo structures
     160 * that may be shared among the tasks of a CLONE_SYSVSEM task group.
    161 */
    162struct sem_undo_list {
    163	refcount_t		refcnt;
    164	spinlock_t		lock;
    165	struct list_head	list_proc;
    166};
    167
    168
    169#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])
    170
    171static int newary(struct ipc_namespace *, struct ipc_params *);
    172static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
    173#ifdef CONFIG_PROC_FS
    174static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
    175#endif
    176
    177#define SEMMSL_FAST	256 /* 512 bytes on stack */
    178#define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
    179
    180/*
    181 * Switching from the mode suitable for simple ops
    182 * to the mode for complex ops is costly. Therefore:
    183 * use some hysteresis
    184 */
    185#define USE_GLOBAL_LOCK_HYSTERESIS	10
    186
    187/*
    188 * Locking:
    189 * a) global sem_lock() for read/write
    190 *	sem_undo.id_next,
    191 *	sem_array.complex_count,
    192 *	sem_array.pending{_alter,_const},
    193 *	sem_array.sem_undo
    194 *
    195 * b) global or semaphore sem_lock() for read/write:
    196 *	sem_array.sems[i].pending_{const,alter}:
    197 *
    198 * c) special:
    199 *	sem_undo_list.list_proc:
    200 *	* undo_list->lock for write
    201 *	* rcu for read
    202 *	use_global_lock:
    203 *	* global sem_lock() for write
    204 *	* either local or global sem_lock() for read.
    205 *
    206 * Memory ordering:
    207 * Most ordering is enforced by using spin_lock() and spin_unlock().
    208 *
    209 * Exceptions:
    210 * 1) use_global_lock: (SEM_BARRIER_1)
    211 * Setting it from non-zero to 0 is a RELEASE, this is ensured by
    212 * using smp_store_release(): Immediately after setting it to 0,
    213 * a simple op can start.
    214 * Testing if it is non-zero is an ACQUIRE, this is ensured by using
    215 * smp_load_acquire().
    216 * Setting it from 0 to non-zero must be ordered with regards to
    217 * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
    218 * is inside a spin_lock() and after a write from 0 to non-zero a
    219 * spin_lock()+spin_unlock() is done.
    220 * To prevent the compiler/cpu temporarily writing 0 to use_global_lock,
    221 * READ_ONCE()/WRITE_ONCE() is used.
    222 *
    223 * 2) queue.status: (SEM_BARRIER_2)
    224 * Initialization is done while holding sem_lock(), so no further barrier is
    225 * required.
    226 * Setting it to a result code is a RELEASE, this is ensured by both a
    227 * smp_store_release() (for case a) and while holding sem_lock()
    228 * (for case b).
    229 * The ACQUIRE when reading the result code without holding sem_lock() is
    230 * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
    231 * (case a above).
    232 * Reading the result code while holding sem_lock() needs no further barriers,
    233 * the locks inside sem_lock() enforce ordering (case b above)
    234 *
    235 * 3) current->state:
    236 * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
    237 * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
    238 * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
    239 * when holding sem_lock(), no further barriers are required.
    240 *
    241 * See also ipc/mqueue.c for more details on the covered races.
    242 */
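/*
 * Condensed sketch of the SEM_BARRIER_1 pairing described above (illustration
 * only; the real sites are complexmode_tryleave() and sem_lock() below):
 *
 *	// leaving complex mode, sem_perm.lock held:
 *	smp_store_release(&sma->use_global_lock, 0);	// RELEASE
 *
 *	// simple-op fast path:
 *	spin_lock(&sem->lock);
 *	if (!smp_load_acquire(&sma->use_global_lock))	// ACQUIRE, pairs with above
 *		...;		// per-semaphore lock suffices
 */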
    243
    244#define sc_semmsl	sem_ctls[0]
    245#define sc_semmns	sem_ctls[1]
    246#define sc_semopm	sem_ctls[2]
    247#define sc_semmni	sem_ctls[3]
    248
    249void sem_init_ns(struct ipc_namespace *ns)
    250{
    251	ns->sc_semmsl = SEMMSL;
    252	ns->sc_semmns = SEMMNS;
    253	ns->sc_semopm = SEMOPM;
    254	ns->sc_semmni = SEMMNI;
    255	ns->used_sems = 0;
    256	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
    257}
    258
    259#ifdef CONFIG_IPC_NS
    260void sem_exit_ns(struct ipc_namespace *ns)
    261{
    262	free_ipcs(ns, &sem_ids(ns), freeary);
    263	idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
    264	rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
    265}
    266#endif
    267
    268void __init sem_init(void)
    269{
    270	sem_init_ns(&init_ipc_ns);
    271	ipc_init_proc_interface("sysvipc/sem",
    272				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
    273				IPC_SEM_IDS, sysvipc_sem_proc_show);
    274}
    275
    276/**
    277 * unmerge_queues - unmerge queues, if possible.
    278 * @sma: semaphore array
    279 *
    280 * The function unmerges the wait queues if complex_count is 0.
    281 * It must be called prior to dropping the global semaphore array lock.
    282 */
    283static void unmerge_queues(struct sem_array *sma)
    284{
    285	struct sem_queue *q, *tq;
    286
    287	/* complex operations still around? */
    288	if (sma->complex_count)
    289		return;
    290	/*
    291	 * We will switch back to simple mode.
    292	 * Move all pending operation back into the per-semaphore
    293	 * queues.
    294	 */
    295	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
    296		struct sem *curr;
    297		curr = &sma->sems[q->sops[0].sem_num];
    298
    299		list_add_tail(&q->list, &curr->pending_alter);
    300	}
    301	INIT_LIST_HEAD(&sma->pending_alter);
    302}
    303
    304/**
    305 * merge_queues - merge single semop queues into global queue
    306 * @sma: semaphore array
    307 *
    308 * This function merges all per-semaphore queues into the global queue.
    309 * It is necessary to achieve FIFO ordering for the pending single-sop
    310 * operations when a multi-semop operation must sleep.
    311 * Only the alter operations must be moved, the const operations can stay.
    312 */
    313static void merge_queues(struct sem_array *sma)
    314{
    315	int i;
    316	for (i = 0; i < sma->sem_nsems; i++) {
    317		struct sem *sem = &sma->sems[i];
    318
    319		list_splice_init(&sem->pending_alter, &sma->pending_alter);
    320	}
    321}
    322
    323static void sem_rcu_free(struct rcu_head *head)
    324{
    325	struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu);
    326	struct sem_array *sma = container_of(p, struct sem_array, sem_perm);
    327
    328	security_sem_free(&sma->sem_perm);
    329	kvfree(sma);
    330}
    331
    332/*
    333 * Enter the mode suitable for non-simple operations:
    334 * Caller must own sem_perm.lock.
    335 */
    336static void complexmode_enter(struct sem_array *sma)
    337{
    338	int i;
    339	struct sem *sem;
    340
    341	if (sma->use_global_lock > 0)  {
    342		/*
    343		 * We are already in global lock mode.
    344		 * Nothing to do, just reset the
    345		 * counter until we return to simple mode.
    346		 */
    347		WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS);
    348		return;
    349	}
    350	WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS);
    351
    352	for (i = 0; i < sma->sem_nsems; i++) {
    353		sem = &sma->sems[i];
    354		spin_lock(&sem->lock);
    355		spin_unlock(&sem->lock);
    356	}
    357}
    358
    359/*
    360 * Try to leave the mode that disallows simple operations:
    361 * Caller must own sem_perm.lock.
    362 */
    363static void complexmode_tryleave(struct sem_array *sma)
    364{
    365	if (sma->complex_count)  {
    366		/* Complex ops are sleeping.
    367		 * We must stay in complex mode
    368		 */
    369		return;
    370	}
    371	if (sma->use_global_lock == 1) {
    372
    373		/* See SEM_BARRIER_1 for purpose/pairing */
    374		smp_store_release(&sma->use_global_lock, 0);
    375	} else {
    376		WRITE_ONCE(sma->use_global_lock,
    377				sma->use_global_lock-1);
    378	}
    379}
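/*
 * Worked example of the hysteresis (illustration only): a complex op sets
 * use_global_lock to USE_GLOBAL_LOCK_HYSTERESIS (10). Each subsequent
 * global-lock release with no complex ops pending counts it down
 * 10 -> 9 -> ... -> 1, and only the final step publishes 0 with
 * smp_store_release(). From a caller's perspective:
 *
 *	for (i = 0; i < USE_GLOBAL_LOCK_HYSTERESIS; i++)
 *		semop(id, &simple_op, 1);	// still takes the global lock
 *	semop(id, &simple_op, 1);		// now uses the per-semaphore lock
 */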
    380
    381#define SEM_GLOBAL_LOCK	(-1)
    382/*
    383 * If the request contains only one semaphore operation, and there are
    384 * no complex transactions pending, lock only the semaphore involved.
    385 * Otherwise, lock the entire semaphore array, since we either have
    386 * multiple semaphores in our own semops, or we need to look at
    387 * semaphores from other pending complex operations.
    388 */
    389static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
    390			      int nsops)
    391{
    392	struct sem *sem;
    393	int idx;
    394
    395	if (nsops != 1) {
    396		/* Complex operation - acquire a full lock */
    397		ipc_lock_object(&sma->sem_perm);
    398
    399		/* Prevent parallel simple ops */
    400		complexmode_enter(sma);
    401		return SEM_GLOBAL_LOCK;
    402	}
    403
    404	/*
    405	 * Only one semaphore affected - try to optimize locking.
    406	 * Optimized locking is possible if no complex operation
    407	 * is either enqueued or processed right now.
    408	 *
     409	 * Both facts are tracked by use_global_lock.
    410	 */
    411	idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
    412	sem = &sma->sems[idx];
    413
    414	/*
    415	 * Initial check for use_global_lock. Just an optimization,
    416	 * no locking, no memory barrier.
    417	 */
    418	if (!READ_ONCE(sma->use_global_lock)) {
    419		/*
    420		 * It appears that no complex operation is around.
    421		 * Acquire the per-semaphore lock.
    422		 */
    423		spin_lock(&sem->lock);
    424
    425		/* see SEM_BARRIER_1 for purpose/pairing */
    426		if (!smp_load_acquire(&sma->use_global_lock)) {
    427			/* fast path successful! */
    428			return sops->sem_num;
    429		}
    430		spin_unlock(&sem->lock);
    431	}
    432
    433	/* slow path: acquire the full lock */
    434	ipc_lock_object(&sma->sem_perm);
    435
    436	if (sma->use_global_lock == 0) {
    437		/*
    438		 * The use_global_lock mode ended while we waited for
    439		 * sma->sem_perm.lock. Thus we must switch to locking
    440		 * with sem->lock.
    441		 * Unlike in the fast path, there is no need to recheck
    442		 * sma->use_global_lock after we have acquired sem->lock:
    443		 * We own sma->sem_perm.lock, thus use_global_lock cannot
    444		 * change.
    445		 */
    446		spin_lock(&sem->lock);
    447
    448		ipc_unlock_object(&sma->sem_perm);
    449		return sops->sem_num;
    450	} else {
    451		/*
    452		 * Not a false alarm, thus continue to use the global lock
    453		 * mode. No need for complexmode_enter(), this was done by
     454		 * the caller that has set use_global_lock to non-zero.
    455		 */
    456		return SEM_GLOBAL_LOCK;
    457	}
    458}
    459
    460static inline void sem_unlock(struct sem_array *sma, int locknum)
    461{
    462	if (locknum == SEM_GLOBAL_LOCK) {
    463		unmerge_queues(sma);
    464		complexmode_tryleave(sma);
    465		ipc_unlock_object(&sma->sem_perm);
    466	} else {
    467		struct sem *sem = &sma->sems[locknum];
    468		spin_unlock(&sem->lock);
    469	}
    470}
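/*
 * Typical usage of the pair above (sketch; see the semtimedop and semctl
 * paths below for the real call sites):
 *
 *	int locknum = sem_lock(sma, sops, nsops);
 *	// ... modify sma, protected either by the global lock
 *	// (locknum == SEM_GLOBAL_LOCK) or by sems[locknum].lock ...
 *	sem_unlock(sma, locknum);
 */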
    471
    472/*
    473 * sem_lock_(check_) routines are called in the paths where the rwsem
    474 * is not held.
    475 *
    476 * The caller holds the RCU read lock.
    477 */
    478static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
    479{
    480	struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
    481
    482	if (IS_ERR(ipcp))
    483		return ERR_CAST(ipcp);
    484
    485	return container_of(ipcp, struct sem_array, sem_perm);
    486}
    487
    488static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
    489							int id)
    490{
    491	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);
    492
    493	if (IS_ERR(ipcp))
    494		return ERR_CAST(ipcp);
    495
    496	return container_of(ipcp, struct sem_array, sem_perm);
    497}
    498
    499static inline void sem_lock_and_putref(struct sem_array *sma)
    500{
    501	sem_lock(sma, NULL, -1);
    502	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
    503}
    504
    505static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
    506{
    507	ipc_rmid(&sem_ids(ns), &s->sem_perm);
    508}
    509
    510static struct sem_array *sem_alloc(size_t nsems)
    511{
    512	struct sem_array *sma;
    513
    514	if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]))
    515		return NULL;
    516
    517	sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT);
    518	if (unlikely(!sma))
    519		return NULL;
    520
    521	return sma;
    522}
    523
    524/**
    525 * newary - Create a new semaphore set
    526 * @ns: namespace
    527 * @params: ptr to the structure that contains key, semflg and nsems
    528 *
    529 * Called with sem_ids.rwsem held (as a writer)
    530 */
    531static int newary(struct ipc_namespace *ns, struct ipc_params *params)
    532{
    533	int retval;
    534	struct sem_array *sma;
    535	key_t key = params->key;
    536	int nsems = params->u.nsems;
    537	int semflg = params->flg;
    538	int i;
    539
    540	if (!nsems)
    541		return -EINVAL;
    542	if (ns->used_sems + nsems > ns->sc_semmns)
    543		return -ENOSPC;
    544
    545	sma = sem_alloc(nsems);
    546	if (!sma)
    547		return -ENOMEM;
    548
    549	sma->sem_perm.mode = (semflg & S_IRWXUGO);
    550	sma->sem_perm.key = key;
    551
    552	sma->sem_perm.security = NULL;
    553	retval = security_sem_alloc(&sma->sem_perm);
    554	if (retval) {
    555		kvfree(sma);
    556		return retval;
    557	}
    558
    559	for (i = 0; i < nsems; i++) {
    560		INIT_LIST_HEAD(&sma->sems[i].pending_alter);
    561		INIT_LIST_HEAD(&sma->sems[i].pending_const);
    562		spin_lock_init(&sma->sems[i].lock);
    563	}
    564
    565	sma->complex_count = 0;
    566	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
    567	INIT_LIST_HEAD(&sma->pending_alter);
    568	INIT_LIST_HEAD(&sma->pending_const);
    569	INIT_LIST_HEAD(&sma->list_id);
    570	sma->sem_nsems = nsems;
    571	sma->sem_ctime = ktime_get_real_seconds();
    572
    573	/* ipc_addid() locks sma upon success. */
    574	retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
    575	if (retval < 0) {
    576		ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
    577		return retval;
    578	}
    579	ns->used_sems += nsems;
    580
    581	sem_unlock(sma, -1);
    582	rcu_read_unlock();
    583
    584	return sma->sem_perm.id;
    585}
    586
    587
    588/*
    589 * Called with sem_ids.rwsem and ipcp locked.
    590 */
    591static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params)
    592{
    593	struct sem_array *sma;
    594
    595	sma = container_of(ipcp, struct sem_array, sem_perm);
    596	if (params->u.nsems > sma->sem_nsems)
    597		return -EINVAL;
    598
    599	return 0;
    600}
    601
    602long ksys_semget(key_t key, int nsems, int semflg)
    603{
    604	struct ipc_namespace *ns;
    605	static const struct ipc_ops sem_ops = {
    606		.getnew = newary,
    607		.associate = security_sem_associate,
    608		.more_checks = sem_more_checks,
    609	};
    610	struct ipc_params sem_params;
    611
    612	ns = current->nsproxy->ipc_ns;
    613
    614	if (nsems < 0 || nsems > ns->sc_semmsl)
    615		return -EINVAL;
    616
    617	sem_params.key = key;
    618	sem_params.flg = semflg;
    619	sem_params.u.nsems = nsems;
    620
    621	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
    622}
    623
    624SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
    625{
    626	return ksys_semget(key, nsems, semflg);
    627}
    628
    629/**
    630 * perform_atomic_semop[_slow] - Attempt to perform semaphore
    631 *                               operations on a given array.
    632 * @sma: semaphore array
    633 * @q: struct sem_queue that describes the operation
    634 *
     635 * Whether the caller blocks is as follows, based on the value
    636 * indicated by the semaphore operation (sem_op):
    637 *
    638 *  (1) >0 never blocks.
     639 *  (2)  0 (wait-for-zero operation): blocks while semval is non-zero.
     640 *  (3) <0 blocks if the decrement would bring semval below zero.
    641 *
    642 * Returns 0 if the operation was possible.
    643 * Returns 1 if the operation is impossible, the caller must sleep.
    644 * Returns <0 for error codes.
    645 */
    646static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
    647{
    648	int result, sem_op, nsops;
    649	struct pid *pid;
    650	struct sembuf *sop;
    651	struct sem *curr;
    652	struct sembuf *sops;
    653	struct sem_undo *un;
    654
    655	sops = q->sops;
    656	nsops = q->nsops;
    657	un = q->undo;
    658
    659	for (sop = sops; sop < sops + nsops; sop++) {
    660		int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);
    661		curr = &sma->sems[idx];
    662		sem_op = sop->sem_op;
    663		result = curr->semval;
    664
    665		if (!sem_op && result)
    666			goto would_block;
    667
    668		result += sem_op;
    669		if (result < 0)
    670			goto would_block;
    671		if (result > SEMVMX)
    672			goto out_of_range;
    673
    674		if (sop->sem_flg & SEM_UNDO) {
    675			int undo = un->semadj[sop->sem_num] - sem_op;
    676			/* Exceeding the undo range is an error. */
    677			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
    678				goto out_of_range;
    679			un->semadj[sop->sem_num] = undo;
    680		}
    681
    682		curr->semval = result;
    683	}
    684
    685	sop--;
    686	pid = q->pid;
    687	while (sop >= sops) {
    688		ipc_update_pid(&sma->sems[sop->sem_num].sempid, pid);
    689		sop--;
    690	}
    691
    692	return 0;
    693
    694out_of_range:
    695	result = -ERANGE;
    696	goto undo;
    697
    698would_block:
    699	q->blocking = sop;
    700
    701	if (sop->sem_flg & IPC_NOWAIT)
    702		result = -EAGAIN;
    703	else
    704		result = 1;
    705
    706undo:
    707	sop--;
    708	while (sop >= sops) {
    709		sem_op = sop->sem_op;
    710		sma->sems[sop->sem_num].semval -= sem_op;
    711		if (sop->sem_flg & SEM_UNDO)
    712			un->semadj[sop->sem_num] += sem_op;
    713		sop--;
    714	}
    715
    716	return result;
    717}
    718
    719static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
    720{
    721	int result, sem_op, nsops;
    722	struct sembuf *sop;
    723	struct sem *curr;
    724	struct sembuf *sops;
    725	struct sem_undo *un;
    726
    727	sops = q->sops;
    728	nsops = q->nsops;
    729	un = q->undo;
    730
    731	if (unlikely(q->dupsop))
    732		return perform_atomic_semop_slow(sma, q);
    733
    734	/*
    735	 * We scan the semaphore set twice, first to ensure that the entire
    736	 * operation can succeed, therefore avoiding any pointless writes
    737	 * to shared memory and having to undo such changes in order to block
    738	 * until the operations can go through.
    739	 */
    740	for (sop = sops; sop < sops + nsops; sop++) {
    741		int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);
    742
    743		curr = &sma->sems[idx];
    744		sem_op = sop->sem_op;
    745		result = curr->semval;
    746
    747		if (!sem_op && result)
    748			goto would_block; /* wait-for-zero */
    749
    750		result += sem_op;
    751		if (result < 0)
    752			goto would_block;
    753
    754		if (result > SEMVMX)
    755			return -ERANGE;
    756
    757		if (sop->sem_flg & SEM_UNDO) {
    758			int undo = un->semadj[sop->sem_num] - sem_op;
    759
    760			/* Exceeding the undo range is an error. */
    761			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
    762				return -ERANGE;
    763		}
    764	}
    765
    766	for (sop = sops; sop < sops + nsops; sop++) {
    767		curr = &sma->sems[sop->sem_num];
    768		sem_op = sop->sem_op;
    769
    770		if (sop->sem_flg & SEM_UNDO) {
    771			int undo = un->semadj[sop->sem_num] - sem_op;
    772
    773			un->semadj[sop->sem_num] = undo;
    774		}
    775		curr->semval += sem_op;
    776		ipc_update_pid(&curr->sempid, q->pid);
    777	}
    778
    779	return 0;
    780
    781would_block:
    782	q->blocking = sop;
    783	return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
    784}
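/*
 * Example of when the slow path is required (illustration only): a batch
 * that names the same semaphore twice, e.g.
 *
 *	struct sembuf sops[2] = {
 *		{ .sem_num = 0, .sem_op = +1 },
 *		{ .sem_num = 0, .sem_op = -1 },
 *	};
 *
 * has q->dupsop set. The validation-only first pass above would check the
 * -1 against the original semval (not against the +1 result) and wrongly
 * report blocking, so such batches go through perform_atomic_semop_slow(),
 * which applies the operations in place and undoes them on failure.
 */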
    785
    786static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
    787					     struct wake_q_head *wake_q)
    788{
    789	struct task_struct *sleeper;
    790
    791	sleeper = get_task_struct(q->sleeper);
    792
    793	/* see SEM_BARRIER_2 for purpose/pairing */
    794	smp_store_release(&q->status, error);
    795
    796	wake_q_add_safe(wake_q, sleeper);
    797}
    798
    799static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
    800{
    801	list_del(&q->list);
    802	if (q->nsops > 1)
    803		sma->complex_count--;
    804}
    805
    806/** check_restart(sma, q)
    807 * @sma: semaphore array
    808 * @q: the operation that just completed
    809 *
    810 * update_queue is O(N^2) when it restarts scanning the whole queue of
    811 * waiting operations. Therefore this function checks if the restart is
    812 * really necessary. It is called after a previously waiting operation
    813 * modified the array.
    814 * Note that wait-for-zero operations are handled without restart.
    815 */
    816static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
    817{
    818	/* pending complex alter operations are too difficult to analyse */
    819	if (!list_empty(&sma->pending_alter))
    820		return 1;
    821
    822	/* we were a sleeping complex operation. Too difficult */
    823	if (q->nsops > 1)
    824		return 1;
    825
    826	/* It is impossible that someone waits for the new value:
    827	 * - complex operations always restart.
    828	 * - wait-for-zero are handled separately.
    829	 * - q is a previously sleeping simple operation that
    830	 *   altered the array. It must be a decrement, because
    831	 *   simple increments never sleep.
    832	 * - If there are older (higher priority) decrements
    833	 *   in the queue, then they have observed the original
     834 *   semval value and couldn't proceed. The operation only
     835 *   decremented that value - thus they won't proceed either.
    836	 */
    837	return 0;
    838}
    839
    840/**
    841 * wake_const_ops - wake up non-alter tasks
    842 * @sma: semaphore array.
    843 * @semnum: semaphore that was modified.
    844 * @wake_q: lockless wake-queue head.
    845 *
    846 * wake_const_ops must be called after a semaphore in a semaphore array
    847 * was set to 0. If complex const operations are pending, wake_const_ops must
    848 * be called with semnum = -1, as well as with the number of each modified
    849 * semaphore.
    850 * The tasks that must be woken up are added to @wake_q. The return code
     851 * is stored in q->status.
    852 * The function returns 1 if at least one operation was completed successfully.
    853 */
    854static int wake_const_ops(struct sem_array *sma, int semnum,
    855			  struct wake_q_head *wake_q)
    856{
    857	struct sem_queue *q, *tmp;
    858	struct list_head *pending_list;
    859	int semop_completed = 0;
    860
    861	if (semnum == -1)
    862		pending_list = &sma->pending_const;
    863	else
    864		pending_list = &sma->sems[semnum].pending_const;
    865
    866	list_for_each_entry_safe(q, tmp, pending_list, list) {
    867		int error = perform_atomic_semop(sma, q);
    868
    869		if (error > 0)
    870			continue;
    871		/* operation completed, remove from queue & wakeup */
    872		unlink_queue(sma, q);
    873
    874		wake_up_sem_queue_prepare(q, error, wake_q);
    875		if (error == 0)
    876			semop_completed = 1;
    877	}
    878
    879	return semop_completed;
    880}
    881
    882/**
    883 * do_smart_wakeup_zero - wakeup all wait for zero tasks
    884 * @sma: semaphore array
    885 * @sops: operations that were performed
    886 * @nsops: number of operations
    887 * @wake_q: lockless wake-queue head
    888 *
     889 * Checks all required queues for wait-for-zero operations, based
    890 * on the actual changes that were performed on the semaphore array.
    891 * The function returns 1 if at least one operation was completed successfully.
    892 */
    893static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
    894				int nsops, struct wake_q_head *wake_q)
    895{
    896	int i;
    897	int semop_completed = 0;
    898	int got_zero = 0;
    899
    900	/* first: the per-semaphore queues, if known */
    901	if (sops) {
    902		for (i = 0; i < nsops; i++) {
    903			int num = sops[i].sem_num;
    904
    905			if (sma->sems[num].semval == 0) {
    906				got_zero = 1;
    907				semop_completed |= wake_const_ops(sma, num, wake_q);
    908			}
    909		}
    910	} else {
    911		/*
    912		 * No sops means modified semaphores not known.
    913		 * Assume all were changed.
    914		 */
    915		for (i = 0; i < sma->sem_nsems; i++) {
    916			if (sma->sems[i].semval == 0) {
    917				got_zero = 1;
    918				semop_completed |= wake_const_ops(sma, i, wake_q);
    919			}
    920		}
    921	}
    922	/*
    923	 * If one of the modified semaphores got 0,
    924	 * then check the global queue, too.
    925	 */
    926	if (got_zero)
    927		semop_completed |= wake_const_ops(sma, -1, wake_q);
    928
    929	return semop_completed;
    930}
    931
    932
    933/**
    934 * update_queue - look for tasks that can be completed.
    935 * @sma: semaphore array.
    936 * @semnum: semaphore that was modified.
    937 * @wake_q: lockless wake-queue head.
    938 *
    939 * update_queue must be called after a semaphore in a semaphore array
    940 * was modified. If multiple semaphores were modified, update_queue must
    941 * be called with semnum = -1, as well as with the number of each modified
    942 * semaphore.
    943 * The tasks that must be woken up are added to @wake_q. The return code
     944 * is stored in q->status.
    945 * The function internally checks if const operations can now succeed.
    946 *
     947 * The function returns 1 if at least one semop was completed successfully.
    948 */
    949static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
    950{
    951	struct sem_queue *q, *tmp;
    952	struct list_head *pending_list;
    953	int semop_completed = 0;
    954
    955	if (semnum == -1)
    956		pending_list = &sma->pending_alter;
    957	else
    958		pending_list = &sma->sems[semnum].pending_alter;
    959
    960again:
    961	list_for_each_entry_safe(q, tmp, pending_list, list) {
    962		int error, restart;
    963
     964	/* If we are scanning the single-sop, per-semaphore list of
    965		 * one semaphore and that semaphore is 0, then it is not
    966		 * necessary to scan further: simple increments
    967		 * that affect only one entry succeed immediately and cannot
     968	 * be in the per-semaphore pending queue, and decrements
    969		 * cannot be successful if the value is already 0.
    970		 */
    971		if (semnum != -1 && sma->sems[semnum].semval == 0)
    972			break;
    973
    974		error = perform_atomic_semop(sma, q);
    975
    976		/* Does q->sleeper still need to sleep? */
    977		if (error > 0)
    978			continue;
    979
    980		unlink_queue(sma, q);
    981
    982		if (error) {
    983			restart = 0;
    984		} else {
    985			semop_completed = 1;
    986			do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
    987			restart = check_restart(sma, q);
    988		}
    989
    990		wake_up_sem_queue_prepare(q, error, wake_q);
    991		if (restart)
    992			goto again;
    993	}
    994	return semop_completed;
    995}
    996
    997/**
    998 * set_semotime - set sem_otime
    999 * @sma: semaphore array
   1000 * @sops: operations that modified the array, may be NULL
   1001 *
    1002 * sem_otime is replicated to avoid cache line thrashing.
   1003 * This function sets one instance to the current time.
   1004 */
   1005static void set_semotime(struct sem_array *sma, struct sembuf *sops)
   1006{
   1007	if (sops == NULL) {
   1008		sma->sems[0].sem_otime = ktime_get_real_seconds();
   1009	} else {
   1010		sma->sems[sops[0].sem_num].sem_otime =
   1011						ktime_get_real_seconds();
   1012	}
   1013}
   1014
   1015/**
   1016 * do_smart_update - optimized update_queue
   1017 * @sma: semaphore array
   1018 * @sops: operations that were performed
   1019 * @nsops: number of operations
   1020 * @otime: force setting otime
   1021 * @wake_q: lockless wake-queue head
   1022 *
   1023 * do_smart_update() does the required calls to update_queue and wakeup_zero,
   1024 * based on the actual changes that were performed on the semaphore array.
   1025 * Note that the function does not do the actual wake-up: the caller is
   1026 * responsible for calling wake_up_q().
   1027 * It is safe to perform this call after dropping all locks.
   1028 */
   1029static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
   1030			    int otime, struct wake_q_head *wake_q)
   1031{
   1032	int i;
   1033
   1034	otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
   1035
   1036	if (!list_empty(&sma->pending_alter)) {
   1037		/* semaphore array uses the global queue - just process it. */
   1038		otime |= update_queue(sma, -1, wake_q);
   1039	} else {
   1040		if (!sops) {
   1041			/*
   1042			 * No sops, thus the modified semaphores are not
   1043			 * known. Check all.
   1044			 */
   1045			for (i = 0; i < sma->sem_nsems; i++)
   1046				otime |= update_queue(sma, i, wake_q);
   1047		} else {
   1048			/*
   1049			 * Check the semaphores that were increased:
   1050			 * - No complex ops, thus all sleeping ops are
    1051			 *   decreases.
   1052			 * - if we decreased the value, then any sleeping
   1053			 *   semaphore ops won't be able to run: If the
   1054			 *   previous value was too small, then the new
   1055			 *   value will be too small, too.
   1056			 */
   1057			for (i = 0; i < nsops; i++) {
   1058				if (sops[i].sem_op > 0) {
   1059					otime |= update_queue(sma,
   1060							      sops[i].sem_num, wake_q);
   1061				}
   1062			}
   1063		}
   1064	}
   1065	if (otime)
   1066		set_semotime(sma, sops);
   1067}
   1068
   1069/*
   1070 * check_qop: Test if a queued operation sleeps on the semaphore semnum
   1071 */
   1072static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q,
   1073			bool count_zero)
   1074{
   1075	struct sembuf *sop = q->blocking;
   1076
   1077	/*
   1078	 * Linux always (since 0.99.10) reported a task as sleeping on all
   1079	 * semaphores. This violates SUS, therefore it was changed to the
    1080	 * standard-compliant behavior.
   1081	 * Give the administrators a chance to notice that an application
   1082	 * might misbehave because it relies on the Linux behavior.
   1083	 */
   1084	pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
   1085			"The task %s (%d) triggered the difference, watch for misbehavior.\n",
   1086			current->comm, task_pid_nr(current));
   1087
   1088	if (sop->sem_num != semnum)
   1089		return 0;
   1090
   1091	if (count_zero && sop->sem_op == 0)
   1092		return 1;
   1093	if (!count_zero && sop->sem_op < 0)
   1094		return 1;
   1095
   1096	return 0;
   1097}
   1098
   1099/* The following counts are associated to each semaphore:
   1100 *   semncnt        number of tasks waiting on semval being nonzero
   1101 *   semzcnt        number of tasks waiting on semval being zero
   1102 *
    1103 * By definition, a task waits only on the semaphore of the first semop
    1104 * that cannot proceed, even if additional operations would block, too.
   1105 */
   1106static int count_semcnt(struct sem_array *sma, ushort semnum,
   1107			bool count_zero)
   1108{
   1109	struct list_head *l;
   1110	struct sem_queue *q;
   1111	int semcnt;
   1112
   1113	semcnt = 0;
   1114	/* First: check the simple operations. They are easy to evaluate */
   1115	if (count_zero)
   1116		l = &sma->sems[semnum].pending_const;
   1117	else
   1118		l = &sma->sems[semnum].pending_alter;
   1119
   1120	list_for_each_entry(q, l, list) {
    1121		/* all tasks on a per-semaphore list sleep on exactly
   1122		 * that semaphore
   1123		 */
   1124		semcnt++;
   1125	}
   1126
   1127	/* Then: check the complex operations. */
   1128	list_for_each_entry(q, &sma->pending_alter, list) {
   1129		semcnt += check_qop(sma, semnum, q, count_zero);
   1130	}
   1131	if (count_zero) {
   1132		list_for_each_entry(q, &sma->pending_const, list) {
   1133			semcnt += check_qop(sma, semnum, q, count_zero);
   1134		}
   1135	}
   1136	return semcnt;
   1137}
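/*
 * User-space view of these counts (sketch): with one task blocked in
 *
 *	struct sembuf op = { .sem_num = 0, .sem_op = -1 };
 *	semop(id, &op, 1);		// sleeps while semval == 0
 *
 * another task observes
 *
 *	semctl(id, 0, GETNCNT);		// -> 1 (waiting for semval != 0)
 *	semctl(id, 0, GETZCNT);		// -> 0
 */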
   1138
   1139/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
    1140 * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
   1141 * remains locked on exit.
   1142 */
   1143static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
   1144{
   1145	struct sem_undo *un, *tu;
   1146	struct sem_queue *q, *tq;
   1147	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
   1148	int i;
   1149	DEFINE_WAKE_Q(wake_q);
   1150
   1151	/* Free the existing undo structures for this semaphore set.  */
   1152	ipc_assert_locked_object(&sma->sem_perm);
   1153	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
   1154		list_del(&un->list_id);
   1155		spin_lock(&un->ulp->lock);
   1156		un->semid = -1;
   1157		list_del_rcu(&un->list_proc);
   1158		spin_unlock(&un->ulp->lock);
   1159		kvfree_rcu(un, rcu);
   1160	}
   1161
   1162	/* Wake up all pending processes and let them fail with EIDRM. */
   1163	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
   1164		unlink_queue(sma, q);
   1165		wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
   1166	}
   1167
   1168	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
   1169		unlink_queue(sma, q);
   1170		wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
   1171	}
   1172	for (i = 0; i < sma->sem_nsems; i++) {
   1173		struct sem *sem = &sma->sems[i];
   1174		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
   1175			unlink_queue(sma, q);
   1176			wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
   1177		}
   1178		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
   1179			unlink_queue(sma, q);
   1180			wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
   1181		}
   1182		ipc_update_pid(&sem->sempid, NULL);
   1183	}
   1184
   1185	/* Remove the semaphore set from the IDR */
   1186	sem_rmid(ns, sma);
   1187	sem_unlock(sma, -1);
   1188	rcu_read_unlock();
   1189
   1190	wake_up_q(&wake_q);
   1191	ns->used_sems -= sma->sem_nsems;
   1192	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
   1193}
   1194
   1195static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
   1196{
   1197	switch (version) {
   1198	case IPC_64:
   1199		return copy_to_user(buf, in, sizeof(*in));
   1200	case IPC_OLD:
   1201	    {
   1202		struct semid_ds out;
   1203
   1204		memset(&out, 0, sizeof(out));
   1205
   1206		ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);
   1207
   1208		out.sem_otime	= in->sem_otime;
   1209		out.sem_ctime	= in->sem_ctime;
   1210		out.sem_nsems	= in->sem_nsems;
   1211
   1212		return copy_to_user(buf, &out, sizeof(out));
   1213	    }
   1214	default:
   1215		return -EINVAL;
   1216	}
   1217}
   1218
   1219static time64_t get_semotime(struct sem_array *sma)
   1220{
   1221	int i;
   1222	time64_t res;
   1223
   1224	res = sma->sems[0].sem_otime;
   1225	for (i = 1; i < sma->sem_nsems; i++) {
   1226		time64_t to = sma->sems[i].sem_otime;
   1227
   1228		if (to > res)
   1229			res = to;
   1230	}
   1231	return res;
   1232}
   1233
   1234static int semctl_stat(struct ipc_namespace *ns, int semid,
   1235			 int cmd, struct semid64_ds *semid64)
   1236{
   1237	struct sem_array *sma;
   1238	time64_t semotime;
   1239	int err;
   1240
   1241	memset(semid64, 0, sizeof(*semid64));
   1242
   1243	rcu_read_lock();
   1244	if (cmd == SEM_STAT || cmd == SEM_STAT_ANY) {
   1245		sma = sem_obtain_object(ns, semid);
   1246		if (IS_ERR(sma)) {
   1247			err = PTR_ERR(sma);
   1248			goto out_unlock;
   1249		}
   1250	} else { /* IPC_STAT */
   1251		sma = sem_obtain_object_check(ns, semid);
   1252		if (IS_ERR(sma)) {
   1253			err = PTR_ERR(sma);
   1254			goto out_unlock;
   1255		}
   1256	}
   1257
   1258	/* see comment for SHM_STAT_ANY */
   1259	if (cmd == SEM_STAT_ANY)
   1260		audit_ipc_obj(&sma->sem_perm);
   1261	else {
   1262		err = -EACCES;
   1263		if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
   1264			goto out_unlock;
   1265	}
   1266
   1267	err = security_sem_semctl(&sma->sem_perm, cmd);
   1268	if (err)
   1269		goto out_unlock;
   1270
   1271	ipc_lock_object(&sma->sem_perm);
   1272
   1273	if (!ipc_valid_object(&sma->sem_perm)) {
   1274		ipc_unlock_object(&sma->sem_perm);
   1275		err = -EIDRM;
   1276		goto out_unlock;
   1277	}
   1278
   1279	kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm);
   1280	semotime = get_semotime(sma);
   1281	semid64->sem_otime = semotime;
   1282	semid64->sem_ctime = sma->sem_ctime;
   1283#ifndef CONFIG_64BIT
   1284	semid64->sem_otime_high = semotime >> 32;
   1285	semid64->sem_ctime_high = sma->sem_ctime >> 32;
   1286#endif
   1287	semid64->sem_nsems = sma->sem_nsems;
   1288
   1289	if (cmd == IPC_STAT) {
   1290		/*
   1291		 * As defined in SUS:
   1292		 * Return 0 on success
   1293		 */
   1294		err = 0;
   1295	} else {
   1296		/*
   1297		 * SEM_STAT and SEM_STAT_ANY (both Linux specific)
   1298		 * Return the full id, including the sequence number
   1299		 */
   1300		err = sma->sem_perm.id;
   1301	}
   1302	ipc_unlock_object(&sma->sem_perm);
   1303out_unlock:
   1304	rcu_read_unlock();
   1305	return err;
   1306}
   1307
   1308static int semctl_info(struct ipc_namespace *ns, int semid,
   1309			 int cmd, void __user *p)
   1310{
   1311	struct seminfo seminfo;
   1312	int max_idx;
   1313	int err;
   1314
   1315	err = security_sem_semctl(NULL, cmd);
   1316	if (err)
   1317		return err;
   1318
   1319	memset(&seminfo, 0, sizeof(seminfo));
   1320	seminfo.semmni = ns->sc_semmni;
   1321	seminfo.semmns = ns->sc_semmns;
   1322	seminfo.semmsl = ns->sc_semmsl;
   1323	seminfo.semopm = ns->sc_semopm;
   1324	seminfo.semvmx = SEMVMX;
   1325	seminfo.semmnu = SEMMNU;
   1326	seminfo.semmap = SEMMAP;
   1327	seminfo.semume = SEMUME;
   1328	down_read(&sem_ids(ns).rwsem);
   1329	if (cmd == SEM_INFO) {
   1330		seminfo.semusz = sem_ids(ns).in_use;
   1331		seminfo.semaem = ns->used_sems;
   1332	} else {
   1333		seminfo.semusz = SEMUSZ;
   1334		seminfo.semaem = SEMAEM;
   1335	}
   1336	max_idx = ipc_get_maxidx(&sem_ids(ns));
   1337	up_read(&sem_ids(ns).rwsem);
   1338	if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
   1339		return -EFAULT;
   1340	return (max_idx < 0) ? 0 : max_idx;
   1341}
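/*
 * The same limits are visible from user space (sketch; the values shown are
 * the common defaults):
 *
 *	$ cat /proc/sys/kernel/sem
 *	32000	1024000000	500	32000	(SEMMSL SEMMNS SEMOPM SEMMNI)
 *
 * or programmatically:
 *
 *	struct seminfo si;
 *	union semun { int val; struct semid_ds *buf;
 *		      unsigned short *array; struct seminfo *__buf; } arg;
 *	arg.__buf = &si;
 *	semctl(0, 0, IPC_INFO, arg);	// returns the highest used index
 */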
   1342
   1343static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
   1344		int val)
   1345{
   1346	struct sem_undo *un;
   1347	struct sem_array *sma;
   1348	struct sem *curr;
   1349	int err;
   1350	DEFINE_WAKE_Q(wake_q);
   1351
   1352	if (val > SEMVMX || val < 0)
   1353		return -ERANGE;
   1354
   1355	rcu_read_lock();
   1356	sma = sem_obtain_object_check(ns, semid);
   1357	if (IS_ERR(sma)) {
   1358		rcu_read_unlock();
   1359		return PTR_ERR(sma);
   1360	}
   1361
   1362	if (semnum < 0 || semnum >= sma->sem_nsems) {
   1363		rcu_read_unlock();
   1364		return -EINVAL;
   1365	}
   1366
   1367
   1368	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
   1369		rcu_read_unlock();
   1370		return -EACCES;
   1371	}
   1372
   1373	err = security_sem_semctl(&sma->sem_perm, SETVAL);
   1374	if (err) {
   1375		rcu_read_unlock();
   1376		return -EACCES;
   1377	}
   1378
   1379	sem_lock(sma, NULL, -1);
   1380
   1381	if (!ipc_valid_object(&sma->sem_perm)) {
   1382		sem_unlock(sma, -1);
   1383		rcu_read_unlock();
   1384		return -EIDRM;
   1385	}
   1386
   1387	semnum = array_index_nospec(semnum, sma->sem_nsems);
   1388	curr = &sma->sems[semnum];
   1389
   1390	ipc_assert_locked_object(&sma->sem_perm);
   1391	list_for_each_entry(un, &sma->list_id, list_id)
   1392		un->semadj[semnum] = 0;
   1393
   1394	curr->semval = val;
   1395	ipc_update_pid(&curr->sempid, task_tgid(current));
   1396	sma->sem_ctime = ktime_get_real_seconds();
   1397	/* maybe some queued-up processes were waiting for this */
   1398	do_smart_update(sma, NULL, 0, 0, &wake_q);
   1399	sem_unlock(sma, -1);
   1400	rcu_read_unlock();
   1401	wake_up_q(&wake_q);
   1402	return 0;
   1403}
   1404
   1405static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
   1406		int cmd, void __user *p)
   1407{
   1408	struct sem_array *sma;
   1409	struct sem *curr;
   1410	int err, nsems;
   1411	ushort fast_sem_io[SEMMSL_FAST];
   1412	ushort *sem_io = fast_sem_io;
   1413	DEFINE_WAKE_Q(wake_q);
   1414
   1415	rcu_read_lock();
   1416	sma = sem_obtain_object_check(ns, semid);
   1417	if (IS_ERR(sma)) {
   1418		rcu_read_unlock();
   1419		return PTR_ERR(sma);
   1420	}
   1421
   1422	nsems = sma->sem_nsems;
   1423
   1424	err = -EACCES;
   1425	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
   1426		goto out_rcu_wakeup;
   1427
   1428	err = security_sem_semctl(&sma->sem_perm, cmd);
   1429	if (err)
   1430		goto out_rcu_wakeup;
   1431
   1432	switch (cmd) {
   1433	case GETALL:
   1434	{
   1435		ushort __user *array = p;
   1436		int i;
   1437
   1438		sem_lock(sma, NULL, -1);
   1439		if (!ipc_valid_object(&sma->sem_perm)) {
   1440			err = -EIDRM;
   1441			goto out_unlock;
   1442		}
   1443		if (nsems > SEMMSL_FAST) {
   1444			if (!ipc_rcu_getref(&sma->sem_perm)) {
   1445				err = -EIDRM;
   1446				goto out_unlock;
   1447			}
   1448			sem_unlock(sma, -1);
   1449			rcu_read_unlock();
   1450			sem_io = kvmalloc_array(nsems, sizeof(ushort),
   1451						GFP_KERNEL);
   1452			if (sem_io == NULL) {
   1453				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
   1454				return -ENOMEM;
   1455			}
   1456
   1457			rcu_read_lock();
   1458			sem_lock_and_putref(sma);
   1459			if (!ipc_valid_object(&sma->sem_perm)) {
   1460				err = -EIDRM;
   1461				goto out_unlock;
   1462			}
   1463		}
   1464		for (i = 0; i < sma->sem_nsems; i++)
   1465			sem_io[i] = sma->sems[i].semval;
   1466		sem_unlock(sma, -1);
   1467		rcu_read_unlock();
   1468		err = 0;
   1469		if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
   1470			err = -EFAULT;
   1471		goto out_free;
   1472	}
   1473	case SETALL:
   1474	{
   1475		int i;
   1476		struct sem_undo *un;
   1477
   1478		if (!ipc_rcu_getref(&sma->sem_perm)) {
   1479			err = -EIDRM;
   1480			goto out_rcu_wakeup;
   1481		}
   1482		rcu_read_unlock();
   1483
   1484		if (nsems > SEMMSL_FAST) {
   1485			sem_io = kvmalloc_array(nsems, sizeof(ushort),
   1486						GFP_KERNEL);
   1487			if (sem_io == NULL) {
   1488				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
   1489				return -ENOMEM;
   1490			}
   1491		}
   1492
   1493		if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
   1494			ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
   1495			err = -EFAULT;
   1496			goto out_free;
   1497		}
   1498
   1499		for (i = 0; i < nsems; i++) {
   1500			if (sem_io[i] > SEMVMX) {
   1501				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
   1502				err = -ERANGE;
   1503				goto out_free;
   1504			}
   1505		}
   1506		rcu_read_lock();
   1507		sem_lock_and_putref(sma);
   1508		if (!ipc_valid_object(&sma->sem_perm)) {
   1509			err = -EIDRM;
   1510			goto out_unlock;
   1511		}
   1512
   1513		for (i = 0; i < nsems; i++) {
   1514			sma->sems[i].semval = sem_io[i];
   1515			ipc_update_pid(&sma->sems[i].sempid, task_tgid(current));
   1516		}
   1517
   1518		ipc_assert_locked_object(&sma->sem_perm);
   1519		list_for_each_entry(un, &sma->list_id, list_id) {
   1520			for (i = 0; i < nsems; i++)
   1521				un->semadj[i] = 0;
   1522		}
   1523		sma->sem_ctime = ktime_get_real_seconds();
   1524		/* maybe some queued-up processes were waiting for this */
   1525		do_smart_update(sma, NULL, 0, 0, &wake_q);
   1526		err = 0;
   1527		goto out_unlock;
   1528	}
    1529	/* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
   1530	}
   1531	err = -EINVAL;
   1532	if (semnum < 0 || semnum >= nsems)
   1533		goto out_rcu_wakeup;
   1534
   1535	sem_lock(sma, NULL, -1);
   1536	if (!ipc_valid_object(&sma->sem_perm)) {
   1537		err = -EIDRM;
   1538		goto out_unlock;
   1539	}
   1540
   1541	semnum = array_index_nospec(semnum, nsems);
   1542	curr = &sma->sems[semnum];
   1543
   1544	switch (cmd) {
   1545	case GETVAL:
   1546		err = curr->semval;
   1547		goto out_unlock;
   1548	case GETPID:
   1549		err = pid_vnr(curr->sempid);
   1550		goto out_unlock;
   1551	case GETNCNT:
   1552		err = count_semcnt(sma, semnum, 0);
   1553		goto out_unlock;
   1554	case GETZCNT:
   1555		err = count_semcnt(sma, semnum, 1);
   1556		goto out_unlock;
   1557	}
   1558
   1559out_unlock:
   1560	sem_unlock(sma, -1);
   1561out_rcu_wakeup:
   1562	rcu_read_unlock();
   1563	wake_up_q(&wake_q);
   1564out_free:
   1565	if (sem_io != fast_sem_io)
   1566		kvfree(sem_io);
   1567	return err;
   1568}
   1569
   1570static inline unsigned long
   1571copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
   1572{
   1573	switch (version) {
   1574	case IPC_64:
   1575		if (copy_from_user(out, buf, sizeof(*out)))
   1576			return -EFAULT;
   1577		return 0;
   1578	case IPC_OLD:
   1579	    {
   1580		struct semid_ds tbuf_old;
   1581
   1582		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
   1583			return -EFAULT;
   1584
   1585		out->sem_perm.uid	= tbuf_old.sem_perm.uid;
   1586		out->sem_perm.gid	= tbuf_old.sem_perm.gid;
   1587		out->sem_perm.mode	= tbuf_old.sem_perm.mode;
   1588
   1589		return 0;
   1590	    }
   1591	default:
   1592		return -EINVAL;
   1593	}
   1594}
   1595
   1596/*
   1597 * This function handles some semctl commands which require the rwsem
   1598 * to be held in write mode.
   1599 * NOTE: no locks must be held, the rwsem is taken inside this function.
   1600 */
   1601static int semctl_down(struct ipc_namespace *ns, int semid,
   1602		       int cmd, struct semid64_ds *semid64)
   1603{
   1604	struct sem_array *sma;
   1605	int err;
   1606	struct kern_ipc_perm *ipcp;
   1607
   1608	down_write(&sem_ids(ns).rwsem);
   1609	rcu_read_lock();
   1610
   1611	ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd,
   1612				      &semid64->sem_perm, 0);
   1613	if (IS_ERR(ipcp)) {
   1614		err = PTR_ERR(ipcp);
   1615		goto out_unlock1;
   1616	}
   1617
   1618	sma = container_of(ipcp, struct sem_array, sem_perm);
   1619
   1620	err = security_sem_semctl(&sma->sem_perm, cmd);
   1621	if (err)
   1622		goto out_unlock1;
   1623
   1624	switch (cmd) {
   1625	case IPC_RMID:
   1626		sem_lock(sma, NULL, -1);
   1627		/* freeary unlocks the ipc object and rcu */
   1628		freeary(ns, ipcp);
   1629		goto out_up;
   1630	case IPC_SET:
   1631		sem_lock(sma, NULL, -1);
   1632		err = ipc_update_perm(&semid64->sem_perm, ipcp);
   1633		if (err)
   1634			goto out_unlock0;
   1635		sma->sem_ctime = ktime_get_real_seconds();
   1636		break;
   1637	default:
   1638		err = -EINVAL;
   1639		goto out_unlock1;
   1640	}
   1641
   1642out_unlock0:
   1643	sem_unlock(sma, -1);
   1644out_unlock1:
   1645	rcu_read_unlock();
   1646out_up:
   1647	up_write(&sem_ids(ns).rwsem);
   1648	return err;
   1649}
   1650
   1651static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version)
   1652{
   1653	struct ipc_namespace *ns;
   1654	void __user *p = (void __user *)arg;
   1655	struct semid64_ds semid64;
   1656	int err;
   1657
   1658	if (semid < 0)
   1659		return -EINVAL;
   1660
   1661	ns = current->nsproxy->ipc_ns;
   1662
   1663	switch (cmd) {
   1664	case IPC_INFO:
   1665	case SEM_INFO:
   1666		return semctl_info(ns, semid, cmd, p);
   1667	case IPC_STAT:
   1668	case SEM_STAT:
   1669	case SEM_STAT_ANY:
   1670		err = semctl_stat(ns, semid, cmd, &semid64);
   1671		if (err < 0)
   1672			return err;
   1673		if (copy_semid_to_user(p, &semid64, version))
   1674			err = -EFAULT;
   1675		return err;
   1676	case GETALL:
   1677	case GETVAL:
   1678	case GETPID:
   1679	case GETNCNT:
   1680	case GETZCNT:
   1681	case SETALL:
   1682		return semctl_main(ns, semid, semnum, cmd, p);
   1683	case SETVAL: {
   1684		int val;
   1685#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
   1686		/* big-endian 64bit */
   1687		val = arg >> 32;
   1688#else
   1689		/* 32bit or little-endian 64bit */
   1690		val = arg;
   1691#endif
   1692		return semctl_setval(ns, semid, semnum, val);
   1693	}
   1694	case IPC_SET:
   1695		if (copy_semid_from_user(&semid64, p, version))
   1696			return -EFAULT;
   1697		fallthrough;
   1698	case IPC_RMID:
   1699		return semctl_down(ns, semid, cmd, &semid64);
   1700	default:
   1701		return -EINVAL;
   1702	}
   1703}
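
/*
 * Userspace view of the SETVAL quirk above: the last semctl() argument is
 * the caller-defined union semun, passed by value, so on big-endian 64-bit
 * ABIs the 32-bit 'val' member travels in the upper half of the
 * register-sized argument - which is why the #if block above shifts it
 * down.  A hedged sketch (error handling elided):
 *
 *	#include <sys/sem.h>
 *
 *	union semun { int val; struct semid_ds *buf; unsigned short *array; };
 *
 *	void init_to_one(int semid)
 *	{
 *		union semun arg = { .val = 1 };
 *
 *		semctl(semid, 0, SETVAL, arg);
 *	}
 */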
   1704
   1705SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
   1706{
   1707	return ksys_semctl(semid, semnum, cmd, arg, IPC_64);
   1708}
   1709
   1710#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
   1711long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg)
   1712{
   1713	int version = ipc_parse_version(&cmd);
   1714
   1715	return ksys_semctl(semid, semnum, cmd, arg, version);
   1716}
   1717
   1718SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
   1719{
   1720	return ksys_old_semctl(semid, semnum, cmd, arg);
   1721}
   1722#endif
   1723
   1724#ifdef CONFIG_COMPAT
   1725
   1726struct compat_semid_ds {
   1727	struct compat_ipc_perm sem_perm;
   1728	old_time32_t sem_otime;
   1729	old_time32_t sem_ctime;
   1730	compat_uptr_t sem_base;
   1731	compat_uptr_t sem_pending;
   1732	compat_uptr_t sem_pending_last;
   1733	compat_uptr_t undo;
   1734	unsigned short sem_nsems;
   1735};
   1736
   1737static int copy_compat_semid_from_user(struct semid64_ds *out, void __user *buf,
   1738					int version)
   1739{
   1740	memset(out, 0, sizeof(*out));
   1741	if (version == IPC_64) {
   1742		struct compat_semid64_ds __user *p = buf;
   1743		return get_compat_ipc64_perm(&out->sem_perm, &p->sem_perm);
   1744	} else {
   1745		struct compat_semid_ds __user *p = buf;
   1746		return get_compat_ipc_perm(&out->sem_perm, &p->sem_perm);
   1747	}
   1748}
   1749
   1750static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in,
   1751					int version)
   1752{
   1753	if (version == IPC_64) {
   1754		struct compat_semid64_ds v;
   1755		memset(&v, 0, sizeof(v));
   1756		to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm);
   1757		v.sem_otime	 = lower_32_bits(in->sem_otime);
   1758		v.sem_otime_high = upper_32_bits(in->sem_otime);
   1759		v.sem_ctime	 = lower_32_bits(in->sem_ctime);
   1760		v.sem_ctime_high = upper_32_bits(in->sem_ctime);
   1761		v.sem_nsems = in->sem_nsems;
   1762		return copy_to_user(buf, &v, sizeof(v));
   1763	} else {
   1764		struct compat_semid_ds v;
   1765		memset(&v, 0, sizeof(v));
   1766		to_compat_ipc_perm(&v.sem_perm, &in->sem_perm);
   1767		v.sem_otime = in->sem_otime;
   1768		v.sem_ctime = in->sem_ctime;
   1769		v.sem_nsems = in->sem_nsems;
   1770		return copy_to_user(buf, &v, sizeof(v));
   1771	}
   1772}
   1773
   1774static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version)
   1775{
   1776	void __user *p = compat_ptr(arg);
   1777	struct ipc_namespace *ns;
   1778	struct semid64_ds semid64;
   1779	int err;
   1780
   1781	ns = current->nsproxy->ipc_ns;
   1782
   1783	if (semid < 0)
   1784		return -EINVAL;
   1785
   1786	switch (cmd & (~IPC_64)) {
   1787	case IPC_INFO:
   1788	case SEM_INFO:
   1789		return semctl_info(ns, semid, cmd, p);
   1790	case IPC_STAT:
   1791	case SEM_STAT:
   1792	case SEM_STAT_ANY:
   1793		err = semctl_stat(ns, semid, cmd, &semid64);
   1794		if (err < 0)
   1795			return err;
   1796		if (copy_compat_semid_to_user(p, &semid64, version))
   1797			err = -EFAULT;
   1798		return err;
   1799	case GETVAL:
   1800	case GETPID:
   1801	case GETNCNT:
   1802	case GETZCNT:
   1803	case GETALL:
   1804	case SETALL:
   1805		return semctl_main(ns, semid, semnum, cmd, p);
   1806	case SETVAL:
   1807		return semctl_setval(ns, semid, semnum, arg);
   1808	case IPC_SET:
   1809		if (copy_compat_semid_from_user(&semid64, p, version))
   1810			return -EFAULT;
   1811		fallthrough;
   1812	case IPC_RMID:
   1813		return semctl_down(ns, semid, cmd, &semid64);
   1814	default:
   1815		return -EINVAL;
   1816	}
   1817}
   1818
   1819COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
   1820{
   1821	return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64);
   1822}
   1823
   1824#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
   1825long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg)
   1826{
   1827	int version = compat_ipc_parse_version(&cmd);
   1828
   1829	return compat_ksys_semctl(semid, semnum, cmd, arg, version);
   1830}
   1831
   1832COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg)
   1833{
   1834	return compat_ksys_old_semctl(semid, semnum, cmd, arg);
   1835}
   1836#endif
   1837#endif
   1838
    1839/* If the task doesn't already have an undo_list, then allocate one
    1840 * here.  We guarantee there is only one thread using this undo list,
    1841 * and current is THE ONE.
   1842 *
   1843 * If this allocation and assignment succeeds, but later
   1844 * portions of this code fail, there is no need to free the sem_undo_list.
   1845 * Just let it stay associated with the task, and it'll be freed later
   1846 * at exit time.
   1847 *
   1848 * This can block, so callers must hold no locks.
   1849 */
   1850static inline int get_undo_list(struct sem_undo_list **undo_listp)
   1851{
   1852	struct sem_undo_list *undo_list;
   1853
   1854	undo_list = current->sysvsem.undo_list;
   1855	if (!undo_list) {
   1856		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT);
   1857		if (undo_list == NULL)
   1858			return -ENOMEM;
   1859		spin_lock_init(&undo_list->lock);
   1860		refcount_set(&undo_list->refcnt, 1);
   1861		INIT_LIST_HEAD(&undo_list->list_proc);
   1862
   1863		current->sysvsem.undo_list = undo_list;
   1864	}
   1865	*undo_listp = undo_list;
   1866	return 0;
   1867}
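
/*
 * Note: get_undo_list() is reached from exactly two places in this file:
 * find_alloc_undo(), on the first semop() that uses SEM_UNDO, and
 * copy_semundo(), on clone() with CLONE_SYSVSEM.  Until one of those
 * happens a task carries no semaphore undo state at all.
 */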
   1868
   1869static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
   1870{
   1871	struct sem_undo *un;
   1872
   1873	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc,
   1874				spin_is_locked(&ulp->lock)) {
   1875		if (un->semid == semid)
   1876			return un;
   1877	}
   1878	return NULL;
   1879}
   1880
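/*
 * lookup_undo - like __lookup_undo(), but on a hit the entry is moved to
 * the front of ulp->list_proc: a simple MRU scheme, so that a task that
 * repeatedly operates on the same array finds its undo structure first.
 * Must be called with ulp->lock held.
 */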
   1881static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
   1882{
   1883	struct sem_undo *un;
   1884
   1885	assert_spin_locked(&ulp->lock);
   1886
   1887	un = __lookup_undo(ulp, semid);
   1888	if (un) {
   1889		list_del_rcu(&un->list_proc);
   1890		list_add_rcu(&un->list_proc, &ulp->list_proc);
   1891	}
   1892	return un;
   1893}
   1894
   1895/**
   1896 * find_alloc_undo - lookup (and if not present create) undo array
   1897 * @ns: namespace
   1898 * @semid: semaphore array id
   1899 *
   1900 * The function looks up (and if not present creates) the undo structure.
   1901 * The size of the undo structure depends on the size of the semaphore
   1902 * array, thus the alloc path is not that straightforward.
    1903 * Lifetime rules: sem_undo is rcu-protected; on success, the function
    1904 * returns with rcu_read_lock() held.
   1905 */
   1906static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
   1907{
   1908	struct sem_array *sma;
   1909	struct sem_undo_list *ulp;
   1910	struct sem_undo *un, *new;
   1911	int nsems, error;
   1912
   1913	error = get_undo_list(&ulp);
   1914	if (error)
   1915		return ERR_PTR(error);
   1916
   1917	rcu_read_lock();
   1918	spin_lock(&ulp->lock);
   1919	un = lookup_undo(ulp, semid);
   1920	spin_unlock(&ulp->lock);
   1921	if (likely(un != NULL))
   1922		goto out;
   1923
   1924	/* no undo structure around - allocate one. */
   1925	/* step 1: figure out the size of the semaphore array */
   1926	sma = sem_obtain_object_check(ns, semid);
   1927	if (IS_ERR(sma)) {
   1928		rcu_read_unlock();
   1929		return ERR_CAST(sma);
   1930	}
   1931
   1932	nsems = sma->sem_nsems;
   1933	if (!ipc_rcu_getref(&sma->sem_perm)) {
   1934		rcu_read_unlock();
   1935		un = ERR_PTR(-EIDRM);
   1936		goto out;
   1937	}
   1938	rcu_read_unlock();
   1939
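	/*
	 * The reference taken via ipc_rcu_getref() above keeps sma alive
	 * while we block in kvzalloc() below, even though the rcu read
	 * lock is dropped; it is released again by sem_lock_and_putref()
	 * in step 3 (or by ipc_rcu_putref() on allocation failure).
	 */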
   1940	/* step 2: allocate new undo structure */
   1941	new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
   1942		       GFP_KERNEL_ACCOUNT);
   1943	if (!new) {
   1944		ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
   1945		return ERR_PTR(-ENOMEM);
   1946	}
   1947
   1948	/* step 3: Acquire the lock on semaphore array */
   1949	rcu_read_lock();
   1950	sem_lock_and_putref(sma);
   1951	if (!ipc_valid_object(&sma->sem_perm)) {
   1952		sem_unlock(sma, -1);
   1953		rcu_read_unlock();
   1954		kvfree(new);
   1955		un = ERR_PTR(-EIDRM);
   1956		goto out;
   1957	}
   1958	spin_lock(&ulp->lock);
   1959
   1960	/*
   1961	 * step 4: check for races: did someone else allocate the undo struct?
   1962	 */
   1963	un = lookup_undo(ulp, semid);
   1964	if (un) {
   1965		spin_unlock(&ulp->lock);
   1966		kvfree(new);
   1967		goto success;
   1968	}
   1969	/* step 5: initialize & link new undo structure */
   1970	new->semadj = (short *) &new[1];
   1971	new->ulp = ulp;
   1972	new->semid = semid;
   1973	assert_spin_locked(&ulp->lock);
   1974	list_add_rcu(&new->list_proc, &ulp->list_proc);
   1975	ipc_assert_locked_object(&sma->sem_perm);
   1976	list_add(&new->list_id, &sma->list_id);
   1977	un = new;
   1978	spin_unlock(&ulp->lock);
   1979success:
   1980	sem_unlock(sma, -1);
   1981out:
   1982	return un;
   1983}
   1984
   1985long __do_semtimedop(int semid, struct sembuf *sops,
   1986		unsigned nsops, const struct timespec64 *timeout,
   1987		struct ipc_namespace *ns)
   1988{
   1989	int error = -EINVAL;
   1990	struct sem_array *sma;
   1991	struct sembuf *sop;
   1992	struct sem_undo *un;
   1993	int max, locknum;
   1994	bool undos = false, alter = false, dupsop = false;
   1995	struct sem_queue queue;
   1996	unsigned long dup = 0;
   1997	ktime_t expires, *exp = NULL;
   1998	bool timed_out = false;
   1999
   2000	if (nsops < 1 || semid < 0)
   2001		return -EINVAL;
   2002	if (nsops > ns->sc_semopm)
   2003		return -E2BIG;
   2004
   2005	if (timeout) {
   2006		if (!timespec64_valid(timeout))
   2007			return -EINVAL;
   2008		expires = ktime_add_safe(ktime_get(),
   2009				timespec64_to_ktime(*timeout));
   2010		exp = &expires;
   2011	}
    2012
   2014	max = 0;
   2015	for (sop = sops; sop < sops + nsops; sop++) {
   2016		unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
   2017
   2018		if (sop->sem_num >= max)
   2019			max = sop->sem_num;
   2020		if (sop->sem_flg & SEM_UNDO)
   2021			undos = true;
   2022		if (dup & mask) {
   2023			/*
   2024			 * There was a previous alter access that appears
   2025			 * to have accessed the same semaphore, thus use
   2026			 * the dupsop logic. "appears", because the detection
    2027			 * can only compare sem_num % BITS_PER_LONG.
   2028			 */
   2029			dupsop = true;
   2030		}
   2031		if (sop->sem_op != 0) {
   2032			alter = true;
   2033			dup |= mask;
   2034		}
   2035	}
   2036
   2037	if (undos) {
   2038		/* On success, find_alloc_undo takes the rcu_read_lock */
   2039		un = find_alloc_undo(ns, semid);
   2040		if (IS_ERR(un)) {
   2041			error = PTR_ERR(un);
   2042			goto out;
   2043		}
   2044	} else {
   2045		un = NULL;
   2046		rcu_read_lock();
   2047	}
   2048
   2049	sma = sem_obtain_object_check(ns, semid);
   2050	if (IS_ERR(sma)) {
   2051		rcu_read_unlock();
   2052		error = PTR_ERR(sma);
   2053		goto out;
   2054	}
   2055
   2056	error = -EFBIG;
   2057	if (max >= sma->sem_nsems) {
   2058		rcu_read_unlock();
   2059		goto out;
   2060	}
   2061
   2062	error = -EACCES;
   2063	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
   2064		rcu_read_unlock();
   2065		goto out;
   2066	}
   2067
   2068	error = security_sem_semop(&sma->sem_perm, sops, nsops, alter);
   2069	if (error) {
   2070		rcu_read_unlock();
   2071		goto out;
   2072	}
   2073
   2074	error = -EIDRM;
   2075	locknum = sem_lock(sma, sops, nsops);
   2076	/*
   2077	 * We eventually might perform the following check in a lockless
   2078	 * fashion, considering ipc_valid_object() locking constraints.
   2079	 * If nsops == 1 and there is no contention for sem_perm.lock, then
   2080	 * only a per-semaphore lock is held and it's OK to proceed with the
   2081	 * check below. More details on the fine grained locking scheme
    2082	 * check below. The comments at sem_lock() explain the fine-grained
    2083	 * locking scheme entangled here and why it is RMID race safe.
   2084	if (!ipc_valid_object(&sma->sem_perm))
   2085		goto out_unlock;
   2086	/*
   2087	 * semid identifiers are not unique - find_alloc_undo may have
    2088	 * allocated an undo structure which was then invalidated by an
    2089	 * RMID, and now a new array has received the same id. Check and fail.
    2090	 * This case can be detected by checking un->semid. The existence of
   2091	 * "un" itself is guaranteed by rcu.
   2092	 */
   2093	if (un && un->semid == -1)
   2094		goto out_unlock;
   2095
   2096	queue.sops = sops;
   2097	queue.nsops = nsops;
   2098	queue.undo = un;
   2099	queue.pid = task_tgid(current);
   2100	queue.alter = alter;
   2101	queue.dupsop = dupsop;
   2102
   2103	error = perform_atomic_semop(sma, &queue);
   2104	if (error == 0) { /* non-blocking successful path */
   2105		DEFINE_WAKE_Q(wake_q);
   2106
   2107		/*
   2108		 * If the operation was successful, then do
   2109		 * the required updates.
   2110		 */
   2111		if (alter)
   2112			do_smart_update(sma, sops, nsops, 1, &wake_q);
   2113		else
   2114			set_semotime(sma, sops);
   2115
   2116		sem_unlock(sma, locknum);
   2117		rcu_read_unlock();
   2118		wake_up_q(&wake_q);
   2119
   2120		goto out;
   2121	}
   2122	if (error < 0) /* non-blocking error path */
   2123		goto out_unlock;
   2124
   2125	/*
   2126	 * We need to sleep on this operation, so we put the current
   2127	 * task into the pending queue and go to sleep.
   2128	 */
   2129	if (nsops == 1) {
   2130		struct sem *curr;
   2131		int idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
   2132		curr = &sma->sems[idx];
   2133
   2134		if (alter) {
   2135			if (sma->complex_count) {
   2136				list_add_tail(&queue.list,
   2137						&sma->pending_alter);
   2138			} else {
   2140				list_add_tail(&queue.list,
   2141						&curr->pending_alter);
   2142			}
   2143		} else {
   2144			list_add_tail(&queue.list, &curr->pending_const);
   2145		}
   2146	} else {
   2147		if (!sma->complex_count)
   2148			merge_queues(sma);
   2149
   2150		if (alter)
   2151			list_add_tail(&queue.list, &sma->pending_alter);
   2152		else
   2153			list_add_tail(&queue.list, &sma->pending_const);
   2154
   2155		sma->complex_count++;
   2156	}
   2157
   2158	do {
   2159		/* memory ordering ensured by the lock in sem_lock() */
   2160		WRITE_ONCE(queue.status, -EINTR);
   2161		queue.sleeper = current;
   2162
   2163		/* memory ordering is ensured by the lock in sem_lock() */
   2164		__set_current_state(TASK_INTERRUPTIBLE);
   2165		sem_unlock(sma, locknum);
   2166		rcu_read_unlock();
   2167
   2168		timed_out = !schedule_hrtimeout_range(exp,
   2169				current->timer_slack_ns, HRTIMER_MODE_ABS);
   2170
   2171		/*
    2172		 * fastpath: the semop has completed, either successfully or
    2173		 * not; which of the two is, from the syscall pov, quite
    2174		 * irrelevant to us at this point: we're done.
   2175		 *
   2176		 * We _do_ care, nonetheless, about being awoken by a signal or
   2177		 * spuriously.  The queue.status is checked again in the
   2178		 * slowpath (aka after taking sem_lock), such that we can detect
   2179		 * scenarios where we were awakened externally, during the
   2180		 * window between wake_q_add() and wake_up_q().
   2181		 */
   2182		error = READ_ONCE(queue.status);
   2183		if (error != -EINTR) {
   2184			/* see SEM_BARRIER_2 for purpose/pairing */
   2185			smp_acquire__after_ctrl_dep();
   2186			goto out;
   2187		}
   2188
   2189		rcu_read_lock();
   2190		locknum = sem_lock(sma, sops, nsops);
   2191
   2192		if (!ipc_valid_object(&sma->sem_perm))
   2193			goto out_unlock;
   2194
   2195		/*
    2196		 * No barrier is necessary: we are protected by sem_lock().
   2197		 */
   2198		error = READ_ONCE(queue.status);
   2199
   2200		/*
   2201		 * If queue.status != -EINTR we are woken up by another process.
   2202		 * Leave without unlink_queue(), but with sem_unlock().
   2203		 */
   2204		if (error != -EINTR)
   2205			goto out_unlock;
   2206
   2207		/*
    2208		 * If we timed out, report -EAGAIN; the queue is cleaned up below.
   2209		 */
   2210		if (timed_out)
   2211			error = -EAGAIN;
   2212	} while (error == -EINTR && !signal_pending(current)); /* spurious */
   2213
   2214	unlink_queue(sma, &queue);
   2215
   2216out_unlock:
   2217	sem_unlock(sma, locknum);
   2218	rcu_read_unlock();
   2219out:
   2220	return error;
   2221}
   2222
   2223static long do_semtimedop(int semid, struct sembuf __user *tsops,
   2224		unsigned nsops, const struct timespec64 *timeout)
   2225{
   2226	struct sembuf fast_sops[SEMOPM_FAST];
   2227	struct sembuf *sops = fast_sops;
   2228	struct ipc_namespace *ns;
   2229	int ret;
   2230
   2231	ns = current->nsproxy->ipc_ns;
   2232	if (nsops > ns->sc_semopm)
   2233		return -E2BIG;
   2234	if (nsops < 1)
   2235		return -EINVAL;
   2236
   2237	if (nsops > SEMOPM_FAST) {
   2238		sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
   2239		if (sops == NULL)
   2240			return -ENOMEM;
   2241	}
   2242
   2243	if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
    2244		ret = -EFAULT;
   2245		goto out_free;
   2246	}
   2247
   2248	ret = __do_semtimedop(semid, sops, nsops, timeout, ns);
   2249
   2250out_free:
   2251	if (sops != fast_sops)
   2252		kvfree(sops);
   2253
   2254	return ret;
   2255}
   2256
   2257long ksys_semtimedop(int semid, struct sembuf __user *tsops,
   2258		     unsigned int nsops, const struct __kernel_timespec __user *timeout)
   2259{
   2260	if (timeout) {
   2261		struct timespec64 ts;
   2262		if (get_timespec64(&ts, timeout))
   2263			return -EFAULT;
   2264		return do_semtimedop(semid, tsops, nsops, &ts);
   2265	}
   2266	return do_semtimedop(semid, tsops, nsops, NULL);
   2267}
   2268
   2269SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
   2270		unsigned int, nsops, const struct __kernel_timespec __user *, timeout)
   2271{
   2272	return ksys_semtimedop(semid, tsops, nsops, timeout);
   2273}
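
/*
 * Hedged userspace sketch of a bounded wait that lands in the entry point
 * above (assumes _GNU_SOURCE for the semtimedop() prototype; error
 * handling elided).  The timeout is relative; if it expires first, the
 * call fails with EAGAIN, matching the -EAGAIN set on timed_out in
 * __do_semtimedop().
 *
 *	#define _GNU_SOURCE
 *	#include <sys/sem.h>
 *	#include <time.h>
 *
 *	int take_token(int semid)
 *	{
 *		struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };
 *		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *		return semtimedop(semid, &op, 1, &ts);
 *	}
 */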
   2274
   2275#ifdef CONFIG_COMPAT_32BIT_TIME
   2276long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
   2277			    unsigned int nsops,
   2278			    const struct old_timespec32 __user *timeout)
   2279{
   2280	if (timeout) {
   2281		struct timespec64 ts;
   2282		if (get_old_timespec32(&ts, timeout))
   2283			return -EFAULT;
   2284		return do_semtimedop(semid, tsems, nsops, &ts);
   2285	}
   2286	return do_semtimedop(semid, tsems, nsops, NULL);
   2287}
   2288
   2289SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems,
   2290		       unsigned int, nsops,
   2291		       const struct old_timespec32 __user *, timeout)
   2292{
   2293	return compat_ksys_semtimedop(semid, tsems, nsops, timeout);
   2294}
   2295#endif
   2296
   2297SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
   2298		unsigned, nsops)
   2299{
   2300	return do_semtimedop(semid, tsops, nsops, NULL);
   2301}
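
/*
 * For reference, the classic mutex-style usage that ends up in the plain
 * semop() entry point above (a sketch; assumes the semaphore was
 * initialised to 1 with SETVAL, and ignores errors):
 *
 *	#include <sys/sem.h>
 *
 *	static struct sembuf p_op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };
 *	static struct sembuf v_op = { .sem_num = 0, .sem_op = +1, .sem_flg = 0 };
 *
 *	void lock(int semid)   { semop(semid, &p_op, 1); }
 *	void unlock(int semid) { semop(semid, &v_op, 1); }
 */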
   2302
   2303/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
   2304 * parent and child tasks.
   2305 */
   2306
   2307int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
   2308{
   2309	struct sem_undo_list *undo_list;
   2310	int error;
   2311
   2312	if (clone_flags & CLONE_SYSVSEM) {
   2313		error = get_undo_list(&undo_list);
   2314		if (error)
   2315			return error;
   2316		refcount_inc(&undo_list->refcnt);
   2317		tsk->sysvsem.undo_list = undo_list;
   2318	} else
   2319		tsk->sysvsem.undo_list = NULL;
   2320
   2321	return 0;
   2322}
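
/*
 * Note: CLONE_SYSVSEM is what makes semaphore adjustments per-process in
 * practice: glibc's NPTL passes it from pthread_create(), so all threads
 * of a process share one undo_list through the refcount taken above.
 */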
   2323
   2324/*
   2325 * add semadj values to semaphores, free undo structures.
   2326 * undo structures are not freed when semaphore arrays are destroyed
   2327 * so some of them may be out of date.
   2328 * IMPLEMENTATION NOTE: There is some confusion over whether the
   2329 * set of adjustments that needs to be done should be done in an atomic
   2330 * manner or not. That is, if we are attempting to decrement the semval
   2331 * should we queue up and wait until we can do so legally?
   2332 * The original implementation attempted to do this (queue and wait).
   2333 * The current implementation does not do so. The POSIX standard
   2334 * and SVID should be consulted to determine what behavior is mandated.
   2335 */
   2336void exit_sem(struct task_struct *tsk)
   2337{
   2338	struct sem_undo_list *ulp;
   2339
   2340	ulp = tsk->sysvsem.undo_list;
   2341	if (!ulp)
   2342		return;
   2343	tsk->sysvsem.undo_list = NULL;
   2344
   2345	if (!refcount_dec_and_test(&ulp->refcnt))
   2346		return;
   2347
   2348	for (;;) {
   2349		struct sem_array *sma;
   2350		struct sem_undo *un;
   2351		int semid, i;
   2352		DEFINE_WAKE_Q(wake_q);
   2353
   2354		cond_resched();
   2355
   2356		rcu_read_lock();
   2357		un = list_entry_rcu(ulp->list_proc.next,
   2358				    struct sem_undo, list_proc);
   2359		if (&un->list_proc == &ulp->list_proc) {
   2360			/*
   2361			 * We must wait for freeary() before freeing this ulp,
    2362			 * in case we raced with the last sem_undo. There is a
    2363			 * small window in which we may exit while freeary() has
    2364			 * not yet finished unlocking the sem_undo_list.
   2365			 */
   2366			spin_lock(&ulp->lock);
   2367			spin_unlock(&ulp->lock);
   2368			rcu_read_unlock();
   2369			break;
   2370		}
   2371		spin_lock(&ulp->lock);
   2372		semid = un->semid;
   2373		spin_unlock(&ulp->lock);
   2374
   2375		/* exit_sem raced with IPC_RMID, nothing to do */
   2376		if (semid == -1) {
   2377			rcu_read_unlock();
   2378			continue;
   2379		}
   2380
   2381		sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid);
   2382		/* exit_sem raced with IPC_RMID, nothing to do */
   2383		if (IS_ERR(sma)) {
   2384			rcu_read_unlock();
   2385			continue;
   2386		}
   2387
   2388		sem_lock(sma, NULL, -1);
   2389		/* exit_sem raced with IPC_RMID, nothing to do */
   2390		if (!ipc_valid_object(&sma->sem_perm)) {
   2391			sem_unlock(sma, -1);
   2392			rcu_read_unlock();
   2393			continue;
   2394		}
   2395		un = __lookup_undo(ulp, semid);
   2396		if (un == NULL) {
   2397			/* exit_sem raced with IPC_RMID+semget() that created
   2398			 * exactly the same semid. Nothing to do.
   2399			 */
   2400			sem_unlock(sma, -1);
   2401			rcu_read_unlock();
   2402			continue;
   2403		}
   2404
   2405		/* remove un from the linked lists */
   2406		ipc_assert_locked_object(&sma->sem_perm);
   2407		list_del(&un->list_id);
   2408
   2409		spin_lock(&ulp->lock);
   2410		list_del_rcu(&un->list_proc);
   2411		spin_unlock(&ulp->lock);
   2412
   2413		/* perform adjustments registered in un */
   2414		for (i = 0; i < sma->sem_nsems; i++) {
   2415			struct sem *semaphore = &sma->sems[i];
   2416			if (un->semadj[i]) {
   2417				semaphore->semval += un->semadj[i];
   2418				/*
   2419				 * Range checks of the new semaphore value,
    2420				 * not defined by SUS:
   2421				 * - Some unices ignore the undo entirely
   2422				 *   (e.g. HP UX 11i 11.22, Tru64 V5.1)
   2423				 * - some cap the value (e.g. FreeBSD caps
   2424				 *   at 0, but doesn't enforce SEMVMX)
   2425				 *
   2426				 * Linux caps the semaphore value, both at 0
   2427				 * and at SEMVMX.
   2428				 *
   2429				 *	Manfred <manfred@colorfullife.com>
   2430				 */
   2431				if (semaphore->semval < 0)
   2432					semaphore->semval = 0;
   2433				if (semaphore->semval > SEMVMX)
   2434					semaphore->semval = SEMVMX;
   2435				ipc_update_pid(&semaphore->sempid, task_tgid(current));
   2436			}
   2437		}
   2438		/* maybe some queued-up processes were waiting for this */
   2439		do_smart_update(sma, NULL, 0, 1, &wake_q);
   2440		sem_unlock(sma, -1);
   2441		rcu_read_unlock();
   2442		wake_up_q(&wake_q);
   2443
   2444		kvfree_rcu(un, rcu);
   2445	}
   2446	kfree(ulp);
   2447}
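
/*
 * A hedged userspace illustration of the semadj mechanics above: a
 * SEM_UNDO decrement records semadj = +1, which exit_sem() adds back
 * (clamped to 0..SEMVMX) when the process dies, however it dies.
 *
 *	#include <sys/sem.h>
 *	#include <unistd.h>
 *
 *	void hold_until_death(int semid)
 *	{
 *		struct sembuf op = {
 *			.sem_num = 0,
 *			.sem_op  = -1,		(* P(), with undo *)
 *			.sem_flg = SEM_UNDO,
 *		};
 *
 *		semop(semid, &op, 1);
 *		pause();	(* killed here? exit_sem() releases the token *)
 *	}
 *
 * (Nested comment markers are written as (* *) above only because this
 * example lives inside a C comment.)
 */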
   2448
   2449#ifdef CONFIG_PROC_FS
   2450static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
   2451{
   2452	struct user_namespace *user_ns = seq_user_ns(s);
   2453	struct kern_ipc_perm *ipcp = it;
   2454	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
   2455	time64_t sem_otime;
   2456
   2457	/*
   2458	 * The proc interface isn't aware of sem_lock(), it calls
   2459	 * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock).
   2460	 * (in sysvipc_find_ipc)
   2461	 * In order to stay compatible with sem_lock(), we must
   2462	 * enter / leave complex_mode.
   2463	 */
   2464	complexmode_enter(sma);
   2465
   2466	sem_otime = get_semotime(sma);
   2467
   2468	seq_printf(s,
   2469		   "%10d %10d  %4o %10u %5u %5u %5u %5u %10llu %10llu\n",
   2470		   sma->sem_perm.key,
   2471		   sma->sem_perm.id,
   2472		   sma->sem_perm.mode,
   2473		   sma->sem_nsems,
   2474		   from_kuid_munged(user_ns, sma->sem_perm.uid),
   2475		   from_kgid_munged(user_ns, sma->sem_perm.gid),
   2476		   from_kuid_munged(user_ns, sma->sem_perm.cuid),
   2477		   from_kgid_munged(user_ns, sma->sem_perm.cgid),
   2478		   sem_otime,
   2479		   sma->sem_ctime);
   2480
   2481	complexmode_tryleave(sma);
   2482
   2483	return 0;
   2484}
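
/*
 * Illustrative output (values made up) matching the format string above;
 * the corresponding header line is installed by the proc interface setup
 * earlier in this file:
 *
 *	       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime
 *	         0      32769   600          1  1000  1000  1000  1000 1700000000 1700000000
 */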
   2485#endif