cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pi.c (33140B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2
      3#include <linux/slab.h>
      4#include <linux/sched/task.h>
      5
      6#include "futex.h"
      7#include "../locking/rtmutex_common.h"
      8
      9/*
     10 * PI code:
     11 */
     12int refill_pi_state_cache(void)
     13{
     14	struct futex_pi_state *pi_state;
     15
     16	if (likely(current->pi_state_cache))
     17		return 0;
     18
     19	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
     20
     21	if (!pi_state)
     22		return -ENOMEM;
     23
     24	INIT_LIST_HEAD(&pi_state->list);
     25	/* pi_mutex gets initialized later */
     26	pi_state->owner = NULL;
     27	refcount_set(&pi_state->refcount, 1);
     28	pi_state->key = FUTEX_KEY_INIT;
     29
     30	current->pi_state_cache = pi_state;
     31
     32	return 0;
     33}
     34
     35static struct futex_pi_state *alloc_pi_state(void)
     36{
     37	struct futex_pi_state *pi_state = current->pi_state_cache;
     38
     39	WARN_ON(!pi_state);
     40	current->pi_state_cache = NULL;
     41
     42	return pi_state;
     43}
     44
     45static void pi_state_update_owner(struct futex_pi_state *pi_state,
     46				  struct task_struct *new_owner)
     47{
     48	struct task_struct *old_owner = pi_state->owner;
     49
     50	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
     51
     52	if (old_owner) {
     53		raw_spin_lock(&old_owner->pi_lock);
     54		WARN_ON(list_empty(&pi_state->list));
     55		list_del_init(&pi_state->list);
     56		raw_spin_unlock(&old_owner->pi_lock);
     57	}
     58
     59	if (new_owner) {
     60		raw_spin_lock(&new_owner->pi_lock);
     61		WARN_ON(!list_empty(&pi_state->list));
     62		list_add(&pi_state->list, &new_owner->pi_state_list);
     63		pi_state->owner = new_owner;
     64		raw_spin_unlock(&new_owner->pi_lock);
     65	}
     66}
     67
     68void get_pi_state(struct futex_pi_state *pi_state)
     69{
     70	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
     71}
     72
     73/*
     74 * Drops a reference to the pi_state object and frees or caches it
     75 * when the last reference is gone.
     76 */
     77void put_pi_state(struct futex_pi_state *pi_state)
     78{
     79	if (!pi_state)
     80		return;
     81
     82	if (!refcount_dec_and_test(&pi_state->refcount))
     83		return;
     84
     85	/*
     86	 * If pi_state->owner is NULL, the owner is most probably dying
     87	 * and has cleaned up the pi_state already
     88	 */
     89	if (pi_state->owner) {
     90		unsigned long flags;
     91
     92		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
     93		pi_state_update_owner(pi_state, NULL);
     94		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
     95		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
     96	}
     97
     98	if (current->pi_state_cache) {
     99		kfree(pi_state);
    100	} else {
    101		/*
    102		 * pi_state->list is already empty.
    103		 * clear pi_state->owner.
    104		 * refcount is at 0 - put it back to 1.
    105		 */
    106		pi_state->owner = NULL;
    107		refcount_set(&pi_state->refcount, 1);
    108		current->pi_state_cache = pi_state;
    109	}
    110}
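/*
 * Note on the life cycle above: refill_pi_state_cache() pre-allocates one
 * pi_state per task with GFP_KERNEL while sleeping is still allowed,
 * alloc_pi_state() later hands that slot out under locks where allocation
 * is not possible, and put_pi_state() either refills an empty slot or
 * frees the object.
 */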
    111
    112/*
    113 * We need to check the following states:
    114 *
    115 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
    116 *
    117 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
    118 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
    119 *
    120 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
    121 *
    122 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
    123 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
    124 *
    125 * [6]  Found  | Found    | task      | 0         | 1      | Valid
    126 *
    127 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
    128 *
    129 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
    130 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
    131 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
    132 *
    133 * [1]	Indicates that the kernel can acquire the futex atomically. We
    134 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
    135 *
    136 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
    137 *      thread is found then it indicates that the owner TID has died.
    138 *
    139 * [3]	Invalid. The waiter is queued on a non-PI futex
    140 *
    141 * [4]	Valid state after exit_robust_list(), which sets the user space
    142 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
    143 *
    144 * [5]	The user space value got manipulated between exit_robust_list()
    145 *	and exit_pi_state_list()
    146 *
    147 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
    148 *	the pi_state but cannot access the user space value.
    149 *
    150 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
    151 *
    152 * [8]	Owner and user space value match
    153 *
    154 * [9]	There is no transient state which sets the user space TID to 0
    155 *	except exit_robust_list(), but this is indicated by the
    156 *	FUTEX_OWNER_DIED bit. See [4]
    157 *
    158 * [10] There is no transient state which leaves owner and user space
    159 *	TID out of sync. Except one error case where the kernel is denied
    160 *	write access to the user address, see fixup_pi_state_owner().
    161 *
    162 *
    163 * Serialization and lifetime rules:
    164 *
    165 * hb->lock:
    166 *
    167 *	hb -> futex_q, relation
    168 *	futex_q -> pi_state, relation
    169 *
    170 *	(cannot be raw because hb can contain an arbitrary number
    171 *	 of futex_q's)
    172 *
    173 * pi_mutex->wait_lock:
    174 *
    175 *	{uval, pi_state}
    176 *
    177 *	(and pi_mutex 'obviously')
    178 *
    179 * p->pi_lock:
    180 *
    181 *	p->pi_state_list -> pi_state->list, relation
    182 *	pi_mutex->owner -> pi_state->owner, relation
    183 *
    184 * pi_state->refcount:
    185 *
    186 *	pi_state lifetime
    187 *
    188 *
    189 * Lock order:
    190 *
    191 *   hb->lock
    192 *     pi_mutex->wait_lock
    193 *       p->pi_lock
    194 *
    195 */
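/*
 * For reference when reading the table above: the user space value packs
 * the owner TID and two state bits (standard futex ABI), i.e.
 *
 *	uval = [FUTEX_WAITERS] | [FUTEX_OWNER_DIED] | (uTID & FUTEX_TID_MASK)
 *
 * so e.g. 0x80000PID means "owned by PID, waiters queued" and 0xC0000000
 * means "owner died, waiters queued, TID cleared".
 */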
    196
    197/*
    198 * Validate that the existing waiter has a pi_state and sanity check
    199 * the pi_state against the user space value. If correct, attach to
    200 * it.
    201 */
    202static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
    203			      struct futex_pi_state *pi_state,
    204			      struct futex_pi_state **ps)
    205{
    206	pid_t pid = uval & FUTEX_TID_MASK;
    207	u32 uval2;
    208	int ret;
    209
    210	/*
    211	 * Userspace might have messed up non-PI and PI futexes [3]
    212	 */
    213	if (unlikely(!pi_state))
    214		return -EINVAL;
    215
    216	/*
    217	 * We get here with hb->lock held, and having found a
    218	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
    219	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
    220	 * which in turn means that futex_lock_pi() still has a reference on
    221	 * our pi_state.
    222	 *
    223	 * The waiter holding a reference on @pi_state also protects against
    224	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
    225	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
    226	 * free pi_state before we can take a reference ourselves.
    227	 */
    228	WARN_ON(!refcount_read(&pi_state->refcount));
    229
    230	/*
    231	 * Now that we have a pi_state, we can acquire wait_lock
    232	 * and do the state validation.
    233	 */
    234	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    235
    236	/*
    237	 * Since {uval, pi_state} is serialized by wait_lock, and our current
    238	 * uval was read without holding it, it can have changed. Verify it
    239	 * still is what we expect it to be, otherwise retry the entire
    240	 * operation.
    241	 */
    242	if (futex_get_value_locked(&uval2, uaddr))
    243		goto out_efault;
    244
    245	if (uval != uval2)
    246		goto out_eagain;
    247
    248	/*
    249	 * Handle the owner died case:
    250	 */
    251	if (uval & FUTEX_OWNER_DIED) {
    252		/*
    253		 * exit_pi_state_list sets owner to NULL and wakes the
    254		 * topmost waiter. The task which acquires the
    255		 * pi_state->rt_mutex will fixup owner.
    256		 */
    257		if (!pi_state->owner) {
    258			/*
    259			 * No pi state owner, but the user space TID
    260			 * is not 0. Inconsistent state. [5]
    261			 */
    262			if (pid)
    263				goto out_einval;
    264			/*
    265			 * Take a ref on the state and return success. [4]
    266			 */
    267			goto out_attach;
    268		}
    269
    270		/*
    271		 * If TID is 0, then either the dying owner has not
    272		 * yet executed exit_pi_state_list() or some waiter
    273		 * acquired the rtmutex in the pi state, but did not
    274		 * yet fixup the TID in user space.
    275		 *
    276		 * Take a ref on the state and return success. [6]
    277		 */
    278		if (!pid)
    279			goto out_attach;
    280	} else {
    281		/*
    282		 * If the owner died bit is not set, then the pi_state
    283		 * must have an owner. [7]
    284		 */
    285		if (!pi_state->owner)
    286			goto out_einval;
    287	}
    288
    289	/*
    290	 * Bail out if user space manipulated the futex value. If pi
    291	 * state exists then the owner TID must be the same as the
    292	 * user space TID. [9/10]
    293	 */
    294	if (pid != task_pid_vnr(pi_state->owner))
    295		goto out_einval;
    296
    297out_attach:
    298	get_pi_state(pi_state);
    299	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    300	*ps = pi_state;
    301	return 0;
    302
    303out_einval:
    304	ret = -EINVAL;
    305	goto out_error;
    306
    307out_eagain:
    308	ret = -EAGAIN;
    309	goto out_error;
    310
    311out_efault:
    312	ret = -EFAULT;
    313	goto out_error;
    314
    315out_error:
    316	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    317	return ret;
    318}
    319
    320static int handle_exit_race(u32 __user *uaddr, u32 uval,
    321			    struct task_struct *tsk)
    322{
    323	u32 uval2;
    324
    325	/*
    326	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
    327	 * caller that the alleged owner is busy.
    328	 */
    329	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
    330		return -EBUSY;
    331
    332	/*
    333	 * Reread the user space value to handle the following situation:
    334	 *
    335	 * CPU0				CPU1
    336	 *
    337	 * sys_exit()			sys_futex()
    338	 *  do_exit()			 futex_lock_pi()
    339	 *                                futex_lock_pi_atomic()
    340	 *   exit_signals(tsk)		    No waiters:
    341	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
    342	 *  mm_release(tsk)		    Set waiter bit
    343	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
    344	 *      Set owner died		    attach_to_pi_owner() {
    345	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
    346	 *   }				     if (!tsk->flags & PF_EXITING) {
    347	 *  ...				       attach();
    348	 *  tsk->futex_state =               } else {
    349	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
    350	 *					  FUTEX_STATE_DEAD)
    351	 *				         return -EAGAIN;
    352	 *				       return -ESRCH; <--- FAIL
    353	 *				     }
    354	 *
    355	 * Returning ESRCH unconditionally is wrong here because the
    356	 * user space value has been changed by the exiting task.
    357	 *
    358	 * The same logic applies to the case where the exiting task is
    359	 * already gone.
    360	 */
    361	if (futex_get_value_locked(&uval2, uaddr))
    362		return -EFAULT;
    363
    364	/* If the user space value has changed, try again. */
    365	if (uval2 != uval)
    366		return -EAGAIN;
    367
    368	/*
    369	 * The exiting task did not have a robust list, the robust list was
    370	 * corrupted or the user space value in *uaddr is simply bogus.
    371	 * Give up and tell user space.
    372	 */
    373	return -ESRCH;
    374}
    375
    376static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
    377				 struct futex_pi_state **ps)
    378{
    379	/*
    380	 * No existing pi state. First waiter. [2]
    381	 *
    382	 * This creates pi_state while holding hb->lock, which means nothing can
    383	 * observe this state yet, so wait_lock is irrelevant.
    384	 */
    385	struct futex_pi_state *pi_state = alloc_pi_state();
    386
    387	/*
    388	 * Initialize the pi_mutex in locked state and make @p
    389	 * the owner of it:
    390	 */
    391	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
    392
    393	/* Store the key for possible exit cleanups: */
    394	pi_state->key = *key;
    395
    396	WARN_ON(!list_empty(&pi_state->list));
    397	list_add(&pi_state->list, &p->pi_state_list);
    398	/*
    399	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
    400	 * because there is no concurrency as the object is not published yet.
    401	 */
    402	pi_state->owner = p;
    403
    404	*ps = pi_state;
    405}
    406/*
    407 * Lookup the task for the TID provided from user space and attach to
    408 * it after doing proper sanity checks.
    409 */
    410static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
    411			      struct futex_pi_state **ps,
    412			      struct task_struct **exiting)
    413{
    414	pid_t pid = uval & FUTEX_TID_MASK;
    415	struct task_struct *p;
    416
    417	/*
    418	 * We are the first waiter - try to look up the real owner and attach
    419	 * the new pi_state to it, but bail out when TID = 0 [1]
    420	 *
    421	 * The !pid check is paranoid. None of the call sites should end up
    422	 * with pid == 0, but better safe than sorry. Let the caller retry
    423	 */
    424	if (!pid)
    425		return -EAGAIN;
    426	p = find_get_task_by_vpid(pid);
    427	if (!p)
    428		return handle_exit_race(uaddr, uval, NULL);
    429
    430	if (unlikely(p->flags & PF_KTHREAD)) {
    431		put_task_struct(p);
    432		return -EPERM;
    433	}
    434
    435	/*
    436	 * We need to look at the task state to figure out whether the
    437	 * task is exiting. To protect against the change of the task state
    438	 * in futex_exit_release(), we do this protected by p->pi_lock:
    439	 */
    440	raw_spin_lock_irq(&p->pi_lock);
    441	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
    442		/*
    443		 * The task is on the way out. When the futex state is
    444		 * FUTEX_STATE_DEAD, we know that the task has finished
    445		 * the cleanup:
    446		 */
    447		int ret = handle_exit_race(uaddr, uval, p);
    448
    449		raw_spin_unlock_irq(&p->pi_lock);
    450		/*
    451		 * If the owner task is between FUTEX_STATE_EXITING and
    452		 * FUTEX_STATE_DEAD then store the task pointer and keep
    453		 * the reference on the task struct. The calling code will
    454		 * drop all locks, wait for the task to reach
    455		 * FUTEX_STATE_DEAD and then drop the refcount. This is
    456		 * required to prevent a live lock when the current task
    457		 * preempted the exiting task between the two states.
    458		 */
    459		if (ret == -EBUSY)
    460			*exiting = p;
    461		else
    462			put_task_struct(p);
    463		return ret;
    464	}
    465
    466	__attach_to_pi_owner(p, key, ps);
    467	raw_spin_unlock_irq(&p->pi_lock);
    468
    469	put_task_struct(p);
    470
    471	return 0;
    472}
    473
    474static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
    475{
    476	int err;
    477	u32 curval;
    478
    479	if (unlikely(should_fail_futex(true)))
    480		return -EFAULT;
    481
    482	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
    483	if (unlikely(err))
    484		return err;
    485
    486	/* If user space value changed, let the caller retry */
    487	return curval != uval ? -EAGAIN : 0;
    488}
    489
    490/**
    491 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
    492 * @uaddr:		the pi futex user address
    493 * @hb:			the pi futex hash bucket
    494 * @key:		the futex key associated with uaddr and hb
    495 * @ps:			the pi_state pointer where we store the result of the
    496 *			lookup
    497 * @task:		the task to perform the atomic lock work for.  This will
    498 *			be "current" except in the case of requeue pi.
    499 * @exiting:		Pointer to store the task pointer of the owner task
    500 *			which is in the middle of exiting
    501 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
    502 *
    503 * Return:
    504 *  -  0 - ready to wait;
    505 *  -  1 - acquired the lock;
    506 *  - <0 - error
    507 *
    508 * The hb->lock must be held by the caller.
    509 *
    510 * @exiting is only set when the return value is -EBUSY. If so, this holds
    511 * a refcount on the exiting task on return and the caller needs to drop it
    512 * after waiting for the exit to complete.
    513 */
    514int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
    515			 union futex_key *key,
    516			 struct futex_pi_state **ps,
    517			 struct task_struct *task,
    518			 struct task_struct **exiting,
    519			 int set_waiters)
    520{
    521	u32 uval, newval, vpid = task_pid_vnr(task);
    522	struct futex_q *top_waiter;
    523	int ret;
    524
    525	/*
    526	 * Read the user space value first so we can validate a few
    527	 * things before proceeding further.
    528	 */
    529	if (futex_get_value_locked(&uval, uaddr))
    530		return -EFAULT;
    531
    532	if (unlikely(should_fail_futex(true)))
    533		return -EFAULT;
    534
    535	/*
    536	 * Detect deadlocks.
    537	 */
    538	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
    539		return -EDEADLK;
    540
    541	if ((unlikely(should_fail_futex(true))))
    542		return -EDEADLK;
    543
    544	/*
    545	 * Lookup existing state first. If it exists, try to attach to
    546	 * its pi_state.
    547	 */
    548	top_waiter = futex_top_waiter(hb, key);
    549	if (top_waiter)
    550		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
    551
    552	/*
    553	 * No waiter and the user space TID is 0. We are here because the
    554	 * waiters bit or the owner died bit is set, because we were called
    555	 * from requeue_cmp_pi, or because something else made the task take
    556	 * the syscall.
    557	 */
    558	if (!(uval & FUTEX_TID_MASK)) {
    559		/*
    560		 * We take over the futex. No other waiters and the user space
    561		 * TID is 0. We preserve the owner died bit.
    562		 */
    563		newval = uval & FUTEX_OWNER_DIED;
    564		newval |= vpid;
    565
    566		/* The futex requeue_pi code can enforce the waiters bit */
    567		if (set_waiters)
    568			newval |= FUTEX_WAITERS;
    569
    570		ret = lock_pi_update_atomic(uaddr, uval, newval);
    571		if (ret)
    572			return ret;
    573
    574		/*
    575		 * If the waiter bit was requested the caller also needs PI
    576		 * state attached to the new owner of the user space futex.
    577		 *
    578		 * @task is guaranteed to be alive and it cannot be exiting
    579		 * because it is either sleeping or waiting in
    580		 * futex_requeue_pi_wakeup_sync().
    581		 *
    582		 * No need to do the full attach_to_pi_owner() exercise
    583		 * because @task is known and valid.
    584		 */
    585		if (set_waiters) {
    586			raw_spin_lock_irq(&task->pi_lock);
    587			__attach_to_pi_owner(task, key, ps);
    588			raw_spin_unlock_irq(&task->pi_lock);
    589		}
    590		return 1;
    591	}
    592
    593	/*
    594	 * First waiter. Set the waiters bit before attaching ourself to
    595	 * the owner. If owner tries to unlock, it will be forced into
    596	 * the kernel and blocked on hb->lock.
    597	 */
    598	newval = uval | FUTEX_WAITERS;
    599	ret = lock_pi_update_atomic(uaddr, uval, newval);
    600	if (ret)
    601		return ret;
    602	/*
    603	 * If the update of the user space value succeeded, we try to
    604	 * attach to the owner. If that fails, no harm done, we only
    605	 * set the FUTEX_WAITERS bit in the user space variable.
    606	 */
    607	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
    608}
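/*
 * The three success paths above: attach_to_pi_state() when a kernel-side
 * top waiter already exists, direct take-over via lock_pi_update_atomic()
 * when the user space TID is 0, and attach_to_pi_owner() after forcing the
 * FUTEX_WAITERS bit when another task owns the futex.
 */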
    609
    610/*
    611 * Caller must hold a reference on @pi_state.
    612 */
    613static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
    614{
    615	struct rt_mutex_waiter *top_waiter;
    616	struct task_struct *new_owner;
    617	bool postunlock = false;
    618	DEFINE_RT_WAKE_Q(wqh);
    619	u32 curval, newval;
    620	int ret = 0;
    621
    622	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
    623	if (WARN_ON_ONCE(!top_waiter)) {
    624		/*
    625		 * As per the comment in futex_unlock_pi() this should not happen.
    626		 *
    627		 * When this happens, give up our locks and try again, giving
    628		 * the futex_lock_pi() instance time to complete, either by
    629		 * waiting on the rtmutex or removing itself from the futex
    630		 * queue.
    631		 */
    632		ret = -EAGAIN;
    633		goto out_unlock;
    634	}
    635
    636	new_owner = top_waiter->task;
    637
    638	/*
    639	 * We pass it to the next owner. The WAITERS bit is always kept
    640	 * enabled while there is PI state around. We clean up the owner
    641	 * died bit, because we are the owner.
    642	 */
    643	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
    644
    645	if (unlikely(should_fail_futex(true))) {
    646		ret = -EFAULT;
    647		goto out_unlock;
    648	}
    649
    650	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
    651	if (!ret && (curval != uval)) {
    652		/*
    653		 * If an unconditional UNLOCK_PI operation (user space did not
    654		 * try the TID->0 transition) raced with a waiter setting the
    655		 * FUTEX_WAITERS flag between get_user() and locking the hash
    656		 * bucket lock, retry the operation.
    657		 */
    658		if ((FUTEX_TID_MASK & curval) == uval)
    659			ret = -EAGAIN;
    660		else
    661			ret = -EINVAL;
    662	}
    663
    664	if (!ret) {
    665		/*
    666		 * This is a point of no return; once we modified the uval
    667		 * there is no going back and subsequent operations must
    668		 * not fail.
    669		 */
    670		pi_state_update_owner(pi_state, new_owner);
    671		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
    672	}
    673
    674out_unlock:
    675	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    676
    677	if (postunlock)
    678		rt_mutex_postunlock(&wqh);
    679
    680	return ret;
    681}
    682
    683static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
    684				  struct task_struct *argowner)
    685{
    686	struct futex_pi_state *pi_state = q->pi_state;
    687	struct task_struct *oldowner, *newowner;
    688	u32 uval, curval, newval, newtid;
    689	int err = 0;
    690
    691	oldowner = pi_state->owner;
    692
    693	/*
    694	 * We are here because either:
    695	 *
    696	 *  - we stole the lock and pi_state->owner needs updating to reflect
    697	 *    that (@argowner == current),
    698	 *
    699	 * or:
    700	 *
    701	 *  - someone stole our lock and we need to fix things to point to the
    702	 *    new owner (@argowner == NULL).
    703	 *
    704	 * Either way, we have to replace the TID in the user space variable.
    705	 * This must be atomic as we have to preserve the owner died bit here.
    706	 *
    707	 * Note: We write the user space value _before_ changing the pi_state
    708	 * because we can fault here. Imagine swapped out pages or a fork
    709	 * that marked all the anonymous memory readonly for cow.
    710	 *
    711	 * Modifying pi_state _before_ the user space value would leave the
    712	 * pi_state in an inconsistent state when we fault here, because we
    713	 * need to drop the locks to handle the fault. This might be observed
    714	 * in the PID checks when attaching to PI state.
    715	 */
    716retry:
    717	if (!argowner) {
    718		if (oldowner != current) {
    719			/*
    720			 * We raced against a concurrent self; things are
    721			 * already fixed up. Nothing to do.
    722			 */
    723			return 0;
    724		}
    725
    726		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
    727			/* We got the lock. pi_state is correct. Tell caller. */
    728			return 1;
    729		}
    730
    731		/*
    732		 * The trylock just failed, so either there is an owner or
    733		 * there is a higher priority waiter than this one.
    734		 */
    735		newowner = rt_mutex_owner(&pi_state->pi_mutex);
    736		/*
    737		 * If the higher priority waiter has not yet taken over the
    738		 * rtmutex then newowner is NULL. We can't return here with
    739		 * that state because it's inconsistent vs. the user space
    740		 * state. So drop the locks and try again. It's a valid
    741		 * situation and not any different from the other retry
    742		 * conditions.
    743		 */
    744		if (unlikely(!newowner)) {
    745			err = -EAGAIN;
    746			goto handle_err;
    747		}
    748	} else {
    749		WARN_ON_ONCE(argowner != current);
    750		if (oldowner == current) {
    751			/*
    752			 * We raced against a concurrent self; things are
    753			 * already fixed up. Nothing to do.
    754			 */
    755			return 1;
    756		}
    757		newowner = argowner;
    758	}
    759
    760	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
    761	/* Owner died? */
    762	if (!pi_state->owner)
    763		newtid |= FUTEX_OWNER_DIED;
    764
    765	err = futex_get_value_locked(&uval, uaddr);
    766	if (err)
    767		goto handle_err;
    768
    769	for (;;) {
    770		newval = (uval & FUTEX_OWNER_DIED) | newtid;
    771
    772		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
    773		if (err)
    774			goto handle_err;
    775
    776		if (curval == uval)
    777			break;
    778		uval = curval;
    779	}
    780
    781	/*
    782	 * We fixed up user space. Now we need to fix the pi_state
    783	 * itself.
    784	 */
    785	pi_state_update_owner(pi_state, newowner);
    786
    787	return argowner == current;
    788
    789	/*
    790	 * In order to reschedule or handle a page fault, we need to drop the
    791	 * locks here. In the case of a fault, this gives the other task
    792	 * (either the highest priority waiter itself or the task which stole
    793	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
    794	 * are back from handling the fault we need to check the pi_state after
    795	 * reacquiring the locks and before trying to do another fixup. When
    796	 * the fixup has been done already we simply return.
    797	 *
    798	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
    799	 * drop hb->lock since the caller owns the hb -> futex_q relation.
    800	 * Dropping the pi_mutex->wait_lock requires the state to be revalidated.
    801	 */
    802handle_err:
    803	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    804	spin_unlock(q->lock_ptr);
    805
    806	switch (err) {
    807	case -EFAULT:
    808		err = fault_in_user_writeable(uaddr);
    809		break;
    810
    811	case -EAGAIN:
    812		cond_resched();
    813		err = 0;
    814		break;
    815
    816	default:
    817		WARN_ON_ONCE(1);
    818		break;
    819	}
    820
    821	spin_lock(q->lock_ptr);
    822	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    823
    824	/*
    825	 * Check if someone else fixed it for us:
    826	 */
    827	if (pi_state->owner != oldowner)
    828		return argowner == current;
    829
    830	/* Retry if err was -EAGAIN or the fault in succeeded */
    831	if (!err)
    832		goto retry;
    833
    834	/*
    835	 * fault_in_user_writeable() failed so user state is immutable. At
    836	 * best we can make the kernel state consistent but user state will
    837	 * be most likely hosed and any subsequent unlock operation will be
    838	 * rejected due to PI futex rule [10].
    839	 *
    840	 * Ensure that the rtmutex owner is also the pi_state owner despite
    841	 * the user space value claiming something different. There is no
    842	 * point in unlocking the rtmutex if current is the owner as it
    843	 * would need to wait until the next waiter has taken the rtmutex
    844	 * to guarantee consistent state. Keep it simple. Userspace asked
    845	 * for this wrecked state.
    846	 *
    847	 * The rtmutex has an owner - either current or some other
    848	 * task. See the EAGAIN loop above.
    849	 */
    850	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
    851
    852	return err;
    853}
    854
    855static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
    856				struct task_struct *argowner)
    857{
    858	struct futex_pi_state *pi_state = q->pi_state;
    859	int ret;
    860
    861	lockdep_assert_held(q->lock_ptr);
    862
    863	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    864	ret = __fixup_pi_state_owner(uaddr, q, argowner);
    865	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    866	return ret;
    867}
    868
    869/**
    870 * fixup_pi_owner() - Post lock pi_state and corner case management
    871 * @uaddr:	user address of the futex
    872 * @q:		futex_q (contains pi_state and access to the rt_mutex)
    873 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
    874 *
    875 * After attempting to lock an rt_mutex, this function is called to cleanup
    876 * the pi_state owner as well as handle race conditions that may allow us to
    877 * acquire the lock. Must be called with the hb lock held.
    878 *
    879 * Return:
    880 *  -  1 - success, lock taken;
    881 *  -  0 - success, lock not taken;
    882 *  - <0 - on error (-EFAULT)
    883 */
    884int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
    885{
    886	if (locked) {
    887		/*
    888		 * Got the lock. We might not be the anticipated owner if we
    889		 * did a lock-steal - fix up the PI-state in that case:
    890		 *
    891		 * Speculative pi_state->owner read (we don't hold wait_lock);
    892		 * since we own the lock pi_state->owner == current is the
    893		 * stable state, anything else needs more attention.
    894		 */
    895		if (q->pi_state->owner != current)
    896			return fixup_pi_state_owner(uaddr, q, current);
    897		return 1;
    898	}
    899
    900	/*
    901	 * If we didn't get the lock; check if anybody stole it from us. In
    902	 * that case, we need to fix up the uval to point to them instead of
    903	 * us, otherwise bad things happen. [10]
    904	 *
    905	 * Another speculative read; pi_state->owner == current is unstable
    906	 * but needs our attention.
    907	 */
    908	if (q->pi_state->owner == current)
    909		return fixup_pi_state_owner(uaddr, q, NULL);
    910
    911	/*
    912	 * Paranoia check. If we did not take the lock, then we should not be
    913	 * the owner of the rt_mutex. Warn and establish consistent state.
    914	 */
    915	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
    916		return fixup_pi_state_owner(uaddr, q, current);
    917
    918	return 0;
    919}
    920
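/*
 * For orientation, a minimal user-space sketch of the fast path that the
 * slow path below backs up (illustrative only; uses gettid() and a
 * C11-style atomic compare-exchange, error handling omitted):
 *
 *	u32 zero = 0;
 *	if (!atomic_compare_exchange_strong(uaddr, &zero, gettid()))
 *		syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *
 * Only the failed 0 -> TID transition enters the kernel.
 */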
    921/*
    922 * Userspace tried a 0 -> TID atomic transition of the futex value
    923 * and failed. The kernel side here does the whole locking operation:
    924 * if there are waiters then it will block as a consequence of relying
    925 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
    926 * a 0 value of the futex too.)
    927 *
    928 * Also serves as futex trylock_pi(), with the corresponding semantics.
    929 */
    930int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
    931{
    932	struct hrtimer_sleeper timeout, *to;
    933	struct task_struct *exiting = NULL;
    934	struct rt_mutex_waiter rt_waiter;
    935	struct futex_hash_bucket *hb;
    936	struct futex_q q = futex_q_init;
    937	int res, ret;
    938
    939	if (!IS_ENABLED(CONFIG_FUTEX_PI))
    940		return -ENOSYS;
    941
    942	if (refill_pi_state_cache())
    943		return -ENOMEM;
    944
    945	to = futex_setup_timer(time, &timeout, flags, 0);
    946
    947retry:
    948	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
    949	if (unlikely(ret != 0))
    950		goto out;
    951
    952retry_private:
    953	hb = futex_q_lock(&q);
    954
    955	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
    956				   &exiting, 0);
    957	if (unlikely(ret)) {
    958		/*
    959		 * Atomic work succeeded and we got the lock,
    960		 * or failed. Either way, we do _not_ block.
    961		 */
    962		switch (ret) {
    963		case 1:
    964			/* We got the lock. */
    965			ret = 0;
    966			goto out_unlock_put_key;
    967		case -EFAULT:
    968			goto uaddr_faulted;
    969		case -EBUSY:
    970		case -EAGAIN:
    971			/*
    972			 * Two reasons for this:
    973			 * - EBUSY: Task is exiting and we just wait for the
    974			 *   exit to complete.
    975			 * - EAGAIN: The user space value changed.
    976			 */
    977			futex_q_unlock(hb);
    978			/*
    979			 * Handle the case where the owner is in the middle of
    980			 * exiting. Wait for the exit to complete otherwise
    981			 * this task might loop forever, aka. live lock.
    982			 */
    983			wait_for_owner_exiting(ret, exiting);
    984			cond_resched();
    985			goto retry;
    986		default:
    987			goto out_unlock_put_key;
    988		}
    989	}
    990
    991	WARN_ON(!q.pi_state);
    992
    993	/*
    994	 * Only actually queue now that the atomic ops are done:
    995	 */
    996	__futex_queue(&q, hb);
    997
    998	if (trylock) {
    999		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
   1000		/* Fixup the trylock return value: */
   1001		ret = ret ? 0 : -EWOULDBLOCK;
   1002		goto no_block;
   1003	}
   1004
   1005	rt_mutex_init_waiter(&rt_waiter);
   1006
   1007	/*
   1008	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
   1009	 * hold it while doing rt_mutex_start_proxy(), because then it will
   1010	 * include hb->lock in the blocking chain, even though we'll not in
   1011	 * fact hold it while blocking. This will lead it to report -EDEADLK
   1012	 * and BUG when futex_unlock_pi() interleaves with this.
   1013	 *
   1014	 * Therefore acquire wait_lock while holding hb->lock, but drop the
   1015	 * latter before calling __rt_mutex_start_proxy_lock(). This
   1016	 * interleaves with futex_unlock_pi() -- which does a similar lock
   1017	 * handoff -- such that the latter can observe the futex_q::pi_state
   1018	 * before __rt_mutex_start_proxy_lock() is done.
   1019	 */
   1020	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
   1021	spin_unlock(q.lock_ptr);
   1022	/*
   1023	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
   1024	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
   1025	 * it sees the futex_q::pi_state.
   1026	 */
   1027	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
   1028	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
   1029
   1030	if (ret) {
   1031		if (ret == 1)
   1032			ret = 0;
   1033		goto cleanup;
   1034	}
   1035
   1036	if (unlikely(to))
   1037		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
   1038
   1039	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
   1040
   1041cleanup:
   1042	spin_lock(q.lock_ptr);
   1043	/*
   1044	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
   1045	 * first acquire the hb->lock before removing the lock from the
   1046	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
   1047	 * lists consistent.
   1048	 *
   1049	 * In particular, it is important that futex_unlock_pi() cannot
   1050	 * observe this inconsistency.
   1051	 */
   1052	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
   1053		ret = 0;
   1054
   1055no_block:
   1056	/*
   1057	 * Fixup the pi_state owner and possibly acquire the lock if we
   1058	 * haven't already.
   1059	 */
   1060	res = fixup_pi_owner(uaddr, &q, !ret);
   1061	/*
   1062	 * If fixup_pi_owner() returned an error, propagate that.  If it acquired
   1063	 * the lock, clear our -ETIMEDOUT or -EINTR.
   1064	 */
   1065	if (res)
   1066		ret = (res < 0) ? res : 0;
   1067
   1068	futex_unqueue_pi(&q);
   1069	spin_unlock(q.lock_ptr);
   1070	goto out;
   1071
   1072out_unlock_put_key:
   1073	futex_q_unlock(hb);
   1074
   1075out:
   1076	if (to) {
   1077		hrtimer_cancel(&to->timer);
   1078		destroy_hrtimer_on_stack(&to->timer);
   1079	}
   1080	return ret != -EINTR ? ret : -ERESTARTNOINTR;
   1081
   1082uaddr_faulted:
   1083	futex_q_unlock(hb);
   1084
   1085	ret = fault_in_user_writeable(uaddr);
   1086	if (ret)
   1087		goto out;
   1088
   1089	if (!(flags & FLAGS_SHARED))
   1090		goto retry_private;
   1091
   1092	goto retry;
   1093}
   1094
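/*
 * The matching user-space unlock sketch (illustrative only, same caveats
 * as the locking sketch above): the owner first tries the TID -> 0
 * transition itself and only enters the kernel when that fails, e.g.
 * because FUTEX_WAITERS is set:
 *
 *	u32 tid = gettid();
 *	if (!atomic_compare_exchange_strong(uaddr, &tid, 0))
 *		syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */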
   1095/*
   1096 * Userspace attempted a TID -> 0 atomic transition, and failed.
   1097 * This is the in-kernel slowpath: we look up the PI state (if any),
   1098 * and do the rt-mutex unlock.
   1099 */
   1100int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
   1101{
   1102	u32 curval, uval, vpid = task_pid_vnr(current);
   1103	union futex_key key = FUTEX_KEY_INIT;
   1104	struct futex_hash_bucket *hb;
   1105	struct futex_q *top_waiter;
   1106	int ret;
   1107
   1108	if (!IS_ENABLED(CONFIG_FUTEX_PI))
   1109		return -ENOSYS;
   1110
   1111retry:
   1112	if (get_user(uval, uaddr))
   1113		return -EFAULT;
   1114	/*
   1115	 * We release only a lock we actually own:
   1116	 */
   1117	if ((uval & FUTEX_TID_MASK) != vpid)
   1118		return -EPERM;
   1119
   1120	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
   1121	if (ret)
   1122		return ret;
   1123
   1124	hb = futex_hash(&key);
   1125	spin_lock(&hb->lock);
   1126
   1127	/*
   1128	 * Check waiters first. We do not trust user space values at
   1129	 * all and we at least want to know if user space fiddled
   1130	 * with the futex value instead of blindly unlocking.
   1131	 */
   1132	top_waiter = futex_top_waiter(hb, &key);
   1133	if (top_waiter) {
   1134		struct futex_pi_state *pi_state = top_waiter->pi_state;
   1135
   1136		ret = -EINVAL;
   1137		if (!pi_state)
   1138			goto out_unlock;
   1139
   1140		/*
   1141		 * If current does not own the pi_state then the futex is
   1142		 * inconsistent and user space fiddled with the futex value.
   1143		 */
   1144		if (pi_state->owner != current)
   1145			goto out_unlock;
   1146
   1147		get_pi_state(pi_state);
   1148		/*
   1149		 * By taking wait_lock while still holding hb->lock, we ensure
   1150		 * there is no point where we hold neither; and therefore
   1151		 * wake_futex_pi() must observe a state consistent with what we
   1152		 * observed.
   1153		 *
   1154		 * In particular, this forces __rt_mutex_start_proxy_lock() to
   1155		 * complete such that we're guaranteed to observe the
   1156		 * rt_waiter. Also see the WARN in wake_futex_pi().
   1157		 */
   1158		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
   1159		spin_unlock(&hb->lock);
   1160
   1161		/* drops pi_state->pi_mutex.wait_lock */
   1162		ret = wake_futex_pi(uaddr, uval, pi_state);
   1163
   1164		put_pi_state(pi_state);
   1165
   1166		/*
   1167		 * Success, we're done! No tricky corner cases.
   1168		 */
   1169		if (!ret)
   1170			return ret;
   1171		/*
   1172		 * The atomic access to the futex value generated a
   1173		 * pagefault, so retry the user-access and the wakeup:
   1174		 */
   1175		if (ret == -EFAULT)
   1176			goto pi_faulted;
   1177		/*
   1178		 * An unconditional UNLOCK_PI op raced against a waiter
   1179		 * setting the FUTEX_WAITERS bit. Try again.
   1180		 */
   1181		if (ret == -EAGAIN)
   1182			goto pi_retry;
   1183		/*
   1184		 * wake_futex_pi has detected invalid state. Tell user
   1185		 * space.
   1186		 */
   1187		return ret;
   1188	}
   1189
   1190	/*
   1191	 * We have no kernel internal state, i.e. no waiters in the
   1192	 * kernel. Waiters which are about to queue themselves are stuck
   1193	 * on hb->lock. So we can safely ignore them. We preserve neither
   1194	 * the WAITERS bit nor the OWNER_DIED one. We are the
   1195	 * owner.
   1196	 */
   1197	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
   1198		spin_unlock(&hb->lock);
   1199		switch (ret) {
   1200		case -EFAULT:
   1201			goto pi_faulted;
   1202
   1203		case -EAGAIN:
   1204			goto pi_retry;
   1205
   1206		default:
   1207			WARN_ON_ONCE(1);
   1208			return ret;
   1209		}
   1210	}
   1211
   1212	/*
   1213	 * If uval has changed, let user space handle it.
   1214	 */
   1215	ret = (curval == uval) ? 0 : -EAGAIN;
   1216
   1217out_unlock:
   1218	spin_unlock(&hb->lock);
   1219	return ret;
   1220
   1221pi_retry:
   1222	cond_resched();
   1223	goto retry;
   1224
   1225pi_faulted:
   1226
   1227	ret = fault_in_user_writeable(uaddr);
   1228	if (!ret)
   1229		goto retry;
   1230
   1231	return ret;
   1232}
   1233