cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

core.c (32131B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 *  Fast Userspace Mutexes (which I call "Futexes!").
      4 *  (C) Rusty Russell, IBM 2002
      5 *
      6 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
      7 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
      8 *
      9 *  Removed page pinning, fix privately mapped COW pages and other cleanups
     10 *  (C) Copyright 2003, 2004 Jamie Lokier
     11 *
     12 *  Robust futex support started by Ingo Molnar
     13 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
     14 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
     15 *
     16 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
     17 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
     18 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
     19 *
     20 *  PRIVATE futexes by Eric Dumazet
     21 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
     22 *
     23 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
     24 *  Copyright (C) IBM Corporation, 2009
     25 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
     26 *
     27 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
     28 *  enough at me, Linus for the original (flawed) idea, Matthew
     29 *  Kirkwood for proof-of-concept implementation.
     30 *
     31 *  "The futexes are also cursed."
     32 *  "But they come in a choice of three flavours!"
     33 */
     34#include <linux/compat.h>
     35#include <linux/jhash.h>
     36#include <linux/pagemap.h>
     37#include <linux/memblock.h>
     38#include <linux/fault-inject.h>
     39#include <linux/slab.h>
     40
     41#include "futex.h"
     42#include "../locking/rtmutex_common.h"
     43
     44/*
     45 * The base of the bucket array and its size are always used together
     46 * (after initialization only in futex_hash()), so ensure that they
     47 * reside in the same cacheline.
     48 */
     49static struct {
     50	struct futex_hash_bucket *queues;
     51	unsigned long            hashsize;
     52} __futex_data __read_mostly __aligned(2*sizeof(long));
     53#define futex_queues   (__futex_data.queues)
     54#define futex_hashsize (__futex_data.hashsize)
     55
     56
     57/*
     58 * Fault injections for futexes.
     59 */
     60#ifdef CONFIG_FAIL_FUTEX
     61
     62static struct {
     63	struct fault_attr attr;
     64
     65	bool ignore_private;
     66} fail_futex = {
     67	.attr = FAULT_ATTR_INITIALIZER,
     68	.ignore_private = false,
     69};
     70
     71static int __init setup_fail_futex(char *str)
     72{
     73	return setup_fault_attr(&fail_futex.attr, str);
     74}
     75__setup("fail_futex=", setup_fail_futex);
     76
     77bool should_fail_futex(bool fshared)
     78{
     79	if (fail_futex.ignore_private && !fshared)
     80		return false;
     81
     82	return should_fail(&fail_futex.attr, 1);
     83}
     84
     85#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
     86
     87static int __init fail_futex_debugfs(void)
     88{
     89	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
     90	struct dentry *dir;
     91
     92	dir = fault_create_debugfs_attr("fail_futex", NULL,
     93					&fail_futex.attr);
     94	if (IS_ERR(dir))
     95		return PTR_ERR(dir);
     96
     97	debugfs_create_bool("ignore-private", mode, dir,
     98			    &fail_futex.ignore_private);
     99	return 0;
    100}
    101
    102late_initcall(fail_futex_debugfs);
    103
    104#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
    105
    106#endif /* CONFIG_FAIL_FUTEX */
    107
    108/**
    109 * futex_hash - Return the hash bucket in the global hash
    110 * @key:	Pointer to the futex key for which the hash is calculated
    111 *
    112 * We hash on the keys returned from get_futex_key (see below) and return the
    113 * corresponding hash bucket in the global hash.
    114 */
    115struct futex_hash_bucket *futex_hash(union futex_key *key)
    116{
    117	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
    118			  key->both.offset);
    119
    120	return &futex_queues[hash & (futex_hashsize - 1)];
    121}
    122
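
A minimal standalone sketch (not part of core.c) of the bucket selection in futex_hash() above: futex_hashsize is always a power of two (see futex_init() at the end of this file), so masking with futex_hashsize - 1 is equivalent to reducing the hash modulo the table size. The hash value below is just a stand-in for the jhash2() result.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t hash = 0xdeadbeef;      /* stand-in for jhash2(key, ...) */
	unsigned long hashsize = 2048;   /* must be a power of two */

	/* hash & (hashsize - 1) == hash % hashsize when hashsize is 2^n */
	printf("bucket %lu of %lu\n", hash & (hashsize - 1), hashsize);
	return 0;
}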
    123
    124/**
    125 * futex_setup_timer - set up the sleeping hrtimer.
    126 * @time:	ptr to the given timeout value
    127 * @timeout:	the hrtimer_sleeper structure to be set up
    128 * @flags:	futex flags
    129 * @range_ns:	optional range in ns
    130 *
    131 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
    132 *	   value given
    133 */
    134struct hrtimer_sleeper *
    135futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
    136		  int flags, u64 range_ns)
    137{
    138	if (!time)
    139		return NULL;
    140
    141	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
    142				      CLOCK_REALTIME : CLOCK_MONOTONIC,
    143				      HRTIMER_MODE_ABS);
    144	/*
    145	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
    146	 * effectively the same as calling hrtimer_set_expires().
    147	 */
    148	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
    149
    150	return timeout;
    151}
    152
    153/*
    154 * Generate a machine wide unique identifier for this inode.
    155 *
     156 * This relies on u64 not wrapping in the lifetime of the machine, which with
    157 * 1ns resolution means almost 585 years.
    158 *
    159 * This further relies on the fact that a well formed program will not unmap
    160 * the file while it has a (shared) futex waiting on it. This mapping will have
    161 * a file reference which pins the mount and inode.
    162 *
    163 * If for some reason an inode gets evicted and read back in again, it will get
    164 * a new sequence number and will _NOT_ match, even though it is the exact same
    165 * file.
    166 *
     167 * It is important that futex_match() never has a false positive, especially
     168 * for PI futexes, where a false match can corrupt the state. The above argues
     169 * that false negatives are only possible for malformed programs.
    170 */
    171static u64 get_inode_sequence_number(struct inode *inode)
    172{
    173	static atomic64_t i_seq;
    174	u64 old;
    175
    176	/* Does the inode already have a sequence number? */
    177	old = atomic64_read(&inode->i_sequence);
    178	if (likely(old))
    179		return old;
    180
    181	for (;;) {
    182		u64 new = atomic64_add_return(1, &i_seq);
    183		if (WARN_ON_ONCE(!new))
    184			continue;
    185
    186		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
    187		if (old)
    188			return old;
    189		return new;
    190	}
    191}
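
A quick standalone check (not part of core.c) of the "almost 585 years" figure in the comment above: even consuming one sequence number per nanosecond, a u64 counter does not wrap for roughly 584 years.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* one year ~= 365.25 days worth of nanoseconds */
	double ns_per_year = 365.25 * 24 * 3600 * 1e9;

	/* ~584.5 years until a u64 nanosecond counter wraps */
	printf("%.1f years\n", (double)UINT64_MAX / ns_per_year);
	return 0;
}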
    192
    193/**
    194 * get_futex_key() - Get parameters which are the keys for a futex
    195 * @uaddr:	virtual address of the futex
    196 * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
    197 * @key:	address where result is stored.
    198 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
    199 *              FUTEX_WRITE)
    200 *
    201 * Return: a negative error code or 0
    202 *
    203 * The key words are stored in @key on success.
    204 *
    205 * For shared mappings (when @fshared), the key is:
    206 *
    207 *   ( inode->i_sequence, page->index, offset_within_page )
    208 *
    209 * [ also see get_inode_sequence_number() ]
    210 *
    211 * For private mappings (or when !@fshared), the key is:
    212 *
    213 *   ( current->mm, address, 0 )
    214 *
    215 * This allows (cross process, where applicable) identification of the futex
    216 * without keeping the page pinned for the duration of the FUTEX_WAIT.
    217 *
    218 * lock_page() might sleep, the caller should not hold a spinlock.
    219 */
    220int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
    221		  enum futex_access rw)
    222{
    223	unsigned long address = (unsigned long)uaddr;
    224	struct mm_struct *mm = current->mm;
    225	struct page *page, *tail;
    226	struct address_space *mapping;
    227	int err, ro = 0;
    228
    229	/*
    230	 * The futex address must be "naturally" aligned.
    231	 */
    232	key->both.offset = address % PAGE_SIZE;
    233	if (unlikely((address % sizeof(u32)) != 0))
    234		return -EINVAL;
    235	address -= key->both.offset;
    236
    237	if (unlikely(!access_ok(uaddr, sizeof(u32))))
    238		return -EFAULT;
    239
    240	if (unlikely(should_fail_futex(fshared)))
    241		return -EFAULT;
    242
    243	/*
    244	 * PROCESS_PRIVATE futexes are fast.
    245	 * As the mm cannot disappear under us and the 'key' only needs
     246	 * virtual address, we don't even have to find the underlying vma.
    247	 * Note : We do have to check 'uaddr' is a valid user address,
    248	 *        but access_ok() should be faster than find_vma()
    249	 */
    250	if (!fshared) {
    251		key->private.mm = mm;
    252		key->private.address = address;
    253		return 0;
    254	}
    255
    256again:
    257	/* Ignore any VERIFY_READ mapping (futex common case) */
    258	if (unlikely(should_fail_futex(true)))
    259		return -EFAULT;
    260
    261	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
    262	/*
    263	 * If write access is not required (eg. FUTEX_WAIT), try
    264	 * and get read-only access.
    265	 */
    266	if (err == -EFAULT && rw == FUTEX_READ) {
    267		err = get_user_pages_fast(address, 1, 0, &page);
    268		ro = 1;
    269	}
    270	if (err < 0)
    271		return err;
    272	else
    273		err = 0;
    274
    275	/*
    276	 * The treatment of mapping from this point on is critical. The page
    277	 * lock protects many things but in this context the page lock
    278	 * stabilizes mapping, prevents inode freeing in the shared
    279	 * file-backed region case and guards against movement to swap cache.
    280	 *
    281	 * Strictly speaking the page lock is not needed in all cases being
     282	 * considered here, and the page lock forces unnecessary serialization.
     283	 * From this point on, the mapping will be re-verified if necessary and
     284	 * the page lock will be acquired only if it is unavoidable.
    285	 *
    286	 * Mapping checks require the head page for any compound page so the
    287	 * head page and mapping is looked up now. For anonymous pages, it
    288	 * does not matter if the page splits in the future as the key is
    289	 * based on the address. For filesystem-backed pages, the tail is
    290	 * required as the index of the page determines the key. For
    291	 * base pages, there is no tail page and tail == page.
    292	 */
    293	tail = page;
    294	page = compound_head(page);
    295	mapping = READ_ONCE(page->mapping);
    296
    297	/*
    298	 * If page->mapping is NULL, then it cannot be a PageAnon
    299	 * page; but it might be the ZERO_PAGE or in the gate area or
    300	 * in a special mapping (all cases which we are happy to fail);
    301	 * or it may have been a good file page when get_user_pages_fast
    302	 * found it, but truncated or holepunched or subjected to
    303	 * invalidate_complete_page2 before we got the page lock (also
    304	 * cases which we are happy to fail).  And we hold a reference,
    305	 * so refcount care in invalidate_inode_page's remove_mapping
    306	 * prevents drop_caches from setting mapping to NULL beneath us.
    307	 *
    308	 * The case we do have to guard against is when memory pressure made
    309	 * shmem_writepage move it from filecache to swapcache beneath us:
    310	 * an unlikely race, but we do need to retry for page->mapping.
    311	 */
    312	if (unlikely(!mapping)) {
    313		int shmem_swizzled;
    314
    315		/*
    316		 * Page lock is required to identify which special case above
    317		 * applies. If this is really a shmem page then the page lock
    318		 * will prevent unexpected transitions.
    319		 */
    320		lock_page(page);
    321		shmem_swizzled = PageSwapCache(page) || page->mapping;
    322		unlock_page(page);
    323		put_page(page);
    324
    325		if (shmem_swizzled)
    326			goto again;
    327
    328		return -EFAULT;
    329	}
    330
    331	/*
    332	 * Private mappings are handled in a simple way.
    333	 *
    334	 * If the futex key is stored on an anonymous page, then the associated
    335	 * object is the mm which is implicitly pinned by the calling process.
    336	 *
    337	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
    338	 * it's a read-only handle, it's expected that futexes attach to
    339	 * the object not the particular process.
    340	 */
    341	if (PageAnon(page)) {
    342		/*
    343		 * A RO anonymous page will never change and thus doesn't make
    344		 * sense for futex operations.
    345		 */
    346		if (unlikely(should_fail_futex(true)) || ro) {
    347			err = -EFAULT;
    348			goto out;
    349		}
    350
    351		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
    352		key->private.mm = mm;
    353		key->private.address = address;
    354
    355	} else {
    356		struct inode *inode;
    357
    358		/*
    359		 * The associated futex object in this case is the inode and
    360		 * the page->mapping must be traversed. Ordinarily this should
    361		 * be stabilised under page lock but it's not strictly
    362		 * necessary in this case as we just want to pin the inode, not
    363		 * update the radix tree or anything like that.
    364		 *
    365		 * The RCU read lock is taken as the inode is finally freed
    366		 * under RCU. If the mapping still matches expectations then the
    367		 * mapping->host can be safely accessed as being a valid inode.
    368		 */
    369		rcu_read_lock();
    370
    371		if (READ_ONCE(page->mapping) != mapping) {
    372			rcu_read_unlock();
    373			put_page(page);
    374
    375			goto again;
    376		}
    377
    378		inode = READ_ONCE(mapping->host);
    379		if (!inode) {
    380			rcu_read_unlock();
    381			put_page(page);
    382
    383			goto again;
    384		}
    385
    386		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
    387		key->shared.i_seq = get_inode_sequence_number(inode);
    388		key->shared.pgoff = page_to_pgoff(tail);
    389		rcu_read_unlock();
    390	}
    391
    392out:
    393	put_page(page);
    394	return err;
    395}
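
For reference, a minimal userspace sketch (not part of core.c, error handling omitted) of how the fshared distinction reaches get_futex_key(): the FUTEX_*_PRIVATE operations take the fast private-key path above, while the plain operations allow the futex word to live in a MAP_SHARED mapping and end up with an inode-based key.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stddef.h>

/* Wait on *uaddr while it still holds @val; @private selects the
 * PROCESS_PRIVATE fast path in the kernel. */
static long futex_wait(uint32_t *uaddr, uint32_t val, int private)
{
	int op = private ? FUTEX_WAIT_PRIVATE : FUTEX_WAIT;

	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

int main(void)
{
	uint32_t word = 1;

	/* *uaddr != val, so this returns immediately with EAGAIN */
	return futex_wait(&word, 0, 1) == -1 ? 0 : 1;
}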
    396
    397/**
    398 * fault_in_user_writeable() - Fault in user address and verify RW access
    399 * @uaddr:	pointer to faulting user space address
    400 *
    401 * Slow path to fixup the fault we just took in the atomic write
    402 * access to @uaddr.
    403 *
    404 * We have no generic implementation of a non-destructive write to the
    405 * user address. We know that we faulted in the atomic pagefault
     406 * disabled section so we might as well avoid the #PF overhead by
    407 * calling get_user_pages() right away.
    408 */
    409int fault_in_user_writeable(u32 __user *uaddr)
    410{
    411	struct mm_struct *mm = current->mm;
    412	int ret;
    413
    414	mmap_read_lock(mm);
    415	ret = fixup_user_fault(mm, (unsigned long)uaddr,
    416			       FAULT_FLAG_WRITE, NULL);
    417	mmap_read_unlock(mm);
    418
    419	return ret < 0 ? ret : 0;
    420}
    421
    422/**
    423 * futex_top_waiter() - Return the highest priority waiter on a futex
    424 * @hb:		the hash bucket the futex_q's reside in
    425 * @key:	the futex key (to distinguish it from other futex futex_q's)
    426 *
    427 * Must be called with the hb lock held.
    428 */
    429struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
    430{
    431	struct futex_q *this;
    432
    433	plist_for_each_entry(this, &hb->chain, list) {
    434		if (futex_match(&this->key, key))
    435			return this;
    436	}
    437	return NULL;
    438}
    439
    440int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
    441{
    442	int ret;
    443
    444	pagefault_disable();
    445	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
    446	pagefault_enable();
    447
    448	return ret;
    449}
    450
    451int futex_get_value_locked(u32 *dest, u32 __user *from)
    452{
    453	int ret;
    454
    455	pagefault_disable();
    456	ret = __get_user(*dest, from);
    457	pagefault_enable();
    458
    459	return ret ? -EFAULT : 0;
    460}
    461
    462/**
    463 * wait_for_owner_exiting - Block until the owner has exited
    464 * @ret: owner's current futex lock status
    465 * @exiting:	Pointer to the exiting task
    466 *
    467 * Caller must hold a refcount on @exiting.
    468 */
    469void wait_for_owner_exiting(int ret, struct task_struct *exiting)
    470{
    471	if (ret != -EBUSY) {
    472		WARN_ON_ONCE(exiting);
    473		return;
    474	}
    475
    476	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
    477		return;
    478
    479	mutex_lock(&exiting->futex_exit_mutex);
    480	/*
    481	 * No point in doing state checking here. If the waiter got here
    482	 * while the task was in exec()->exec_futex_release() then it can
    483	 * have any FUTEX_STATE_* value when the waiter has acquired the
     484	 * mutex: OK if still running, EXITING or DEAD if it reached exit()
    485	 * already. Highly unlikely and not a problem. Just one more round
    486	 * through the futex maze.
    487	 */
    488	mutex_unlock(&exiting->futex_exit_mutex);
    489
    490	put_task_struct(exiting);
    491}
    492
    493/**
    494 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
    495 * @q:	The futex_q to unqueue
    496 *
    497 * The q->lock_ptr must not be NULL and must be held by the caller.
    498 */
    499void __futex_unqueue(struct futex_q *q)
    500{
    501	struct futex_hash_bucket *hb;
    502
    503	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
    504		return;
    505	lockdep_assert_held(q->lock_ptr);
    506
    507	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
    508	plist_del(&q->list, &hb->chain);
    509	futex_hb_waiters_dec(hb);
    510}
    511
    512/* The key must be already stored in q->key. */
    513struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
    514	__acquires(&hb->lock)
    515{
    516	struct futex_hash_bucket *hb;
    517
    518	hb = futex_hash(&q->key);
    519
    520	/*
    521	 * Increment the counter before taking the lock so that
     522	 * a potential waker won't miss a task that is about to sleep and is
     523	 * still waiting for the spinlock. This is safe as all futex_q_lock()
    524	 * users end up calling futex_queue(). Similarly, for housekeeping,
    525	 * decrement the counter at futex_q_unlock() when some error has
    526	 * occurred and we don't end up adding the task to the list.
    527	 */
    528	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
    529
    530	q->lock_ptr = &hb->lock;
    531
    532	spin_lock(&hb->lock);
    533	return hb;
    534}
    535
    536void futex_q_unlock(struct futex_hash_bucket *hb)
    537	__releases(&hb->lock)
    538{
    539	spin_unlock(&hb->lock);
    540	futex_hb_waiters_dec(hb);
    541}
    542
    543void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
    544{
    545	int prio;
    546
    547	/*
    548	 * The priority used to register this element is
    549	 * - either the real thread-priority for the real-time threads
    550	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
    551	 * - or MAX_RT_PRIO for non-RT threads.
    552	 * Thus, all RT-threads are woken first in priority order, and
    553	 * the others are woken last, in FIFO order.
    554	 */
    555	prio = min(current->normal_prio, MAX_RT_PRIO);
    556
    557	plist_node_init(&q->list, prio);
    558	plist_add(&q->list, &hb->chain);
    559	q->task = current;
    560}
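
A small standalone illustration (not part of core.c) of the clamping above; the exact numbers are an assumption based on the usual kernel conventions (MAX_RT_PRIO == 100, a SCHED_FIFO thread with rtprio 50 has normal_prio 99 - 50 = 49, a nice-0 SCHED_OTHER thread has normal_prio 120).

#include <stdio.h>

#define MAX_RT_PRIO 100  /* assumed to match the kernel's value */

static int min_int(int a, int b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* The RT waiter sorts ahead (lower plist value); all non-RT waiters
	 * collapse to MAX_RT_PRIO and therefore stay in FIFO order. */
	printf("SCHED_FIFO rtprio 50 -> plist prio %d\n", min_int(49, MAX_RT_PRIO));
	printf("SCHED_OTHER nice 0   -> plist prio %d\n", min_int(120, MAX_RT_PRIO));
	return 0;
}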
    561
    562/**
    563 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
    564 * @q:	The futex_q to unqueue
    565 *
    566 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
    567 * be paired with exactly one earlier call to futex_queue().
    568 *
    569 * Return:
     570 *  - 1 - if the futex_q was still queued (and we unqueued it);
    571 *  - 0 - if the futex_q was already removed by the waking thread
    572 */
    573int futex_unqueue(struct futex_q *q)
    574{
    575	spinlock_t *lock_ptr;
    576	int ret = 0;
    577
    578	/* In the common case we don't take the spinlock, which is nice. */
    579retry:
    580	/*
    581	 * q->lock_ptr can change between this read and the following spin_lock.
    582	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
    583	 * optimizing lock_ptr out of the logic below.
    584	 */
    585	lock_ptr = READ_ONCE(q->lock_ptr);
    586	if (lock_ptr != NULL) {
    587		spin_lock(lock_ptr);
    588		/*
    589		 * q->lock_ptr can change between reading it and
    590		 * spin_lock(), causing us to take the wrong lock.  This
    591		 * corrects the race condition.
    592		 *
    593		 * Reasoning goes like this: if we have the wrong lock,
    594		 * q->lock_ptr must have changed (maybe several times)
    595		 * between reading it and the spin_lock().  It can
    596		 * change again after the spin_lock() but only if it was
    597		 * already changed before the spin_lock().  It cannot,
    598		 * however, change back to the original value.  Therefore
    599		 * we can detect whether we acquired the correct lock.
    600		 */
    601		if (unlikely(lock_ptr != q->lock_ptr)) {
    602			spin_unlock(lock_ptr);
    603			goto retry;
    604		}
    605		__futex_unqueue(q);
    606
    607		BUG_ON(q->pi_state);
    608
    609		spin_unlock(lock_ptr);
    610		ret = 1;
    611	}
    612
    613	return ret;
    614}
    615
    616/*
    617 * PI futexes can not be requeued and must remove themselves from the
    618 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
    619 */
    620void futex_unqueue_pi(struct futex_q *q)
    621{
    622	__futex_unqueue(q);
    623
    624	BUG_ON(!q->pi_state);
    625	put_pi_state(q->pi_state);
    626	q->pi_state = NULL;
    627}
    628
    629/* Constants for the pending_op argument of handle_futex_death */
    630#define HANDLE_DEATH_PENDING	true
    631#define HANDLE_DEATH_LIST	false
    632
    633/*
    634 * Process a futex-list entry, check whether it's owned by the
    635 * dying task, and do notification if so:
    636 */
    637static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
    638			      bool pi, bool pending_op)
    639{
    640	u32 uval, nval, mval;
    641	int err;
    642
    643	/* Futex address must be 32bit aligned */
    644	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
    645		return -1;
    646
    647retry:
    648	if (get_user(uval, uaddr))
    649		return -1;
    650
    651	/*
    652	 * Special case for regular (non PI) futexes. The unlock path in
    653	 * user space has two race scenarios:
    654	 *
    655	 * 1. The unlock path releases the user space futex value and
    656	 *    before it can execute the futex() syscall to wake up
    657	 *    waiters it is killed.
    658	 *
    659	 * 2. A woken up waiter is killed before it can acquire the
    660	 *    futex in user space.
    661	 *
    662	 * In both cases the TID validation below prevents a wakeup of
    663	 * potential waiters which can cause these waiters to block
    664	 * forever.
    665	 *
    666	 * In both cases the following conditions are met:
    667	 *
    668	 *	1) task->robust_list->list_op_pending != NULL
    669	 *	   @pending_op == true
    670	 *	2) User space futex value == 0
    671	 *	3) Regular futex: @pi == false
    672	 *
    673	 * If these conditions are met, it is safe to attempt waking up a
    674	 * potential waiter without touching the user space futex value and
    675	 * trying to set the OWNER_DIED bit. The user space futex value is
    676	 * uncontended and the rest of the user space mutex state is
    677	 * consistent, so a woken waiter will just take over the
    678	 * uncontended futex. Setting the OWNER_DIED bit would create
    679	 * inconsistent state and malfunction of the user space owner died
    680	 * handling.
    681	 */
    682	if (pending_op && !pi && !uval) {
    683		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
    684		return 0;
    685	}
    686
    687	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
    688		return 0;
    689
    690	/*
    691	 * Ok, this dying thread is truly holding a futex
    692	 * of interest. Set the OWNER_DIED bit atomically
    693	 * via cmpxchg, and if the value had FUTEX_WAITERS
    694	 * set, wake up a waiter (if any). (We have to do a
    695	 * futex_wake() even if OWNER_DIED is already set -
    696	 * to handle the rare but possible case of recursive
    697	 * thread-death.) The rest of the cleanup is done in
    698	 * userspace.
    699	 */
    700	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
    701
    702	/*
    703	 * We are not holding a lock here, but we want to have
    704	 * the pagefault_disable/enable() protection because
    705	 * we want to handle the fault gracefully. If the
    706	 * access fails we try to fault in the futex with R/W
    707	 * verification via get_user_pages. get_user() above
    708	 * does not guarantee R/W access. If that fails we
    709	 * give up and leave the futex locked.
    710	 */
    711	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
    712		switch (err) {
    713		case -EFAULT:
    714			if (fault_in_user_writeable(uaddr))
    715				return -1;
    716			goto retry;
    717
    718		case -EAGAIN:
    719			cond_resched();
    720			goto retry;
    721
    722		default:
    723			WARN_ON_ONCE(1);
    724			return err;
    725		}
    726	}
    727
    728	if (nval != uval)
    729		goto retry;
    730
    731	/*
    732	 * Wake robust non-PI futexes here. The wakeup of
    733	 * PI futexes happens in exit_pi_state():
    734	 */
    735	if (!pi && (uval & FUTEX_WAITERS))
    736		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
    737
    738	return 0;
    739}
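
A standalone illustration (not part of core.c) of the value computed above, using the UAPI bit definitions from <linux/futex.h>: the owner TID is dropped, FUTEX_WAITERS is preserved and FUTEX_OWNER_DIED is set.

#include <linux/futex.h>  /* FUTEX_WAITERS, FUTEX_OWNER_DIED, FUTEX_TID_MASK */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t uval = 1234 | FUTEX_WAITERS;  /* held by TID 1234, with waiters */
	uint32_t mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	printf("uval 0x%08x -> mval 0x%08x (TID %u dropped)\n",
	       (unsigned)uval, (unsigned)mval, (unsigned)(uval & FUTEX_TID_MASK));
	return 0;
}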
    740
    741/*
    742 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
    743 */
    744static inline int fetch_robust_entry(struct robust_list __user **entry,
    745				     struct robust_list __user * __user *head,
    746				     unsigned int *pi)
    747{
    748	unsigned long uentry;
    749
    750	if (get_user(uentry, (unsigned long __user *)head))
    751		return -EFAULT;
    752
    753	*entry = (void __user *)(uentry & ~1UL);
    754	*pi = uentry & 1;
    755
    756	return 0;
    757}
    758
    759/*
    760 * Walk curr->robust_list (very carefully, it's a userspace list!)
    761 * and mark any locks found there dead, and notify any waiters.
    762 *
    763 * We silently return on any sign of list-walking problem.
    764 */
    765static void exit_robust_list(struct task_struct *curr)
    766{
    767	struct robust_list_head __user *head = curr->robust_list;
    768	struct robust_list __user *entry, *next_entry, *pending;
    769	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
    770	unsigned int next_pi;
    771	unsigned long futex_offset;
    772	int rc;
    773
    774	/*
    775	 * Fetch the list head (which was registered earlier, via
    776	 * sys_set_robust_list()):
    777	 */
    778	if (fetch_robust_entry(&entry, &head->list.next, &pi))
    779		return;
    780	/*
    781	 * Fetch the relative futex offset:
    782	 */
    783	if (get_user(futex_offset, &head->futex_offset))
    784		return;
    785	/*
    786	 * Fetch any possibly pending lock-add first, and handle it
    787	 * if it exists:
    788	 */
    789	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
    790		return;
    791
    792	next_entry = NULL;	/* avoid warning with gcc */
    793	while (entry != &head->list) {
    794		/*
    795		 * Fetch the next entry in the list before calling
    796		 * handle_futex_death:
    797		 */
    798		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
    799		/*
    800		 * A pending lock might already be on the list, so
    801		 * don't process it twice:
    802		 */
    803		if (entry != pending) {
    804			if (handle_futex_death((void __user *)entry + futex_offset,
    805						curr, pi, HANDLE_DEATH_LIST))
    806				return;
    807		}
    808		if (rc)
    809			return;
    810		entry = next_entry;
    811		pi = next_pi;
    812		/*
    813		 * Avoid excessively long or circular lists:
    814		 */
    815		if (!--limit)
    816			break;
    817
    818		cond_resched();
    819	}
    820
    821	if (pending) {
    822		handle_futex_death((void __user *)pending + futex_offset,
    823				   curr, pip, HANDLE_DEATH_PENDING);
    824	}
    825}
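
For context, a minimal userspace sketch (not part of core.c) of the structure this walker consumes. In practice glibc registers and maintains the list for robust pthread mutexes; this only registers an empty head via set_robust_list(2).

#include <linux/futex.h>   /* struct robust_list_head */
#include <sys/syscall.h>
#include <unistd.h>
#include <stddef.h>

int main(void)
{
	static struct robust_list_head head;

	head.list.next = &head.list;  /* empty circular list */
	head.futex_offset = 0;        /* offset of the lock word inside each entry */
	head.list_op_pending = NULL;

	/* exit_robust_list() above walks this list when the thread dies */
	return syscall(SYS_set_robust_list, &head, sizeof(head)) ? 1 : 0;
}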
    826
    827#ifdef CONFIG_COMPAT
    828static void __user *futex_uaddr(struct robust_list __user *entry,
    829				compat_long_t futex_offset)
    830{
    831	compat_uptr_t base = ptr_to_compat(entry);
    832	void __user *uaddr = compat_ptr(base + futex_offset);
    833
    834	return uaddr;
    835}
    836
    837/*
    838 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
    839 */
    840static inline int
    841compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
    842		   compat_uptr_t __user *head, unsigned int *pi)
    843{
    844	if (get_user(*uentry, head))
    845		return -EFAULT;
    846
    847	*entry = compat_ptr((*uentry) & ~1);
    848	*pi = (unsigned int)(*uentry) & 1;
    849
    850	return 0;
    851}
    852
    853/*
    854 * Walk curr->robust_list (very carefully, it's a userspace list!)
    855 * and mark any locks found there dead, and notify any waiters.
    856 *
    857 * We silently return on any sign of list-walking problem.
    858 */
    859static void compat_exit_robust_list(struct task_struct *curr)
    860{
    861	struct compat_robust_list_head __user *head = curr->compat_robust_list;
    862	struct robust_list __user *entry, *next_entry, *pending;
    863	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
    864	unsigned int next_pi;
    865	compat_uptr_t uentry, next_uentry, upending;
    866	compat_long_t futex_offset;
    867	int rc;
    868
    869	/*
    870	 * Fetch the list head (which was registered earlier, via
    871	 * sys_set_robust_list()):
    872	 */
    873	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
    874		return;
    875	/*
    876	 * Fetch the relative futex offset:
    877	 */
    878	if (get_user(futex_offset, &head->futex_offset))
    879		return;
    880	/*
    881	 * Fetch any possibly pending lock-add first, and handle it
    882	 * if it exists:
    883	 */
    884	if (compat_fetch_robust_entry(&upending, &pending,
    885			       &head->list_op_pending, &pip))
    886		return;
    887
    888	next_entry = NULL;	/* avoid warning with gcc */
    889	while (entry != (struct robust_list __user *) &head->list) {
    890		/*
    891		 * Fetch the next entry in the list before calling
    892		 * handle_futex_death:
    893		 */
    894		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
    895			(compat_uptr_t __user *)&entry->next, &next_pi);
    896		/*
    897		 * A pending lock might already be on the list, so
     898		 * don't process it twice:
    899		 */
    900		if (entry != pending) {
    901			void __user *uaddr = futex_uaddr(entry, futex_offset);
    902
    903			if (handle_futex_death(uaddr, curr, pi,
    904					       HANDLE_DEATH_LIST))
    905				return;
    906		}
    907		if (rc)
    908			return;
    909		uentry = next_uentry;
    910		entry = next_entry;
    911		pi = next_pi;
    912		/*
    913		 * Avoid excessively long or circular lists:
    914		 */
    915		if (!--limit)
    916			break;
    917
    918		cond_resched();
    919	}
    920	if (pending) {
    921		void __user *uaddr = futex_uaddr(pending, futex_offset);
    922
    923		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
    924	}
    925}
    926#endif
    927
    928#ifdef CONFIG_FUTEX_PI
    929
    930/*
    931 * This task is holding PI mutexes at exit time => bad.
    932 * Kernel cleans up PI-state, but userspace is likely hosed.
    933 * (Robust-futex cleanup is separate and might save the day for userspace.)
    934 */
    935static void exit_pi_state_list(struct task_struct *curr)
    936{
    937	struct list_head *next, *head = &curr->pi_state_list;
    938	struct futex_pi_state *pi_state;
    939	struct futex_hash_bucket *hb;
    940	union futex_key key = FUTEX_KEY_INIT;
    941
    942	/*
    943	 * We are a ZOMBIE and nobody can enqueue itself on
    944	 * pi_state_list anymore, but we have to be careful
    945	 * versus waiters unqueueing themselves:
    946	 */
    947	raw_spin_lock_irq(&curr->pi_lock);
    948	while (!list_empty(head)) {
    949		next = head->next;
    950		pi_state = list_entry(next, struct futex_pi_state, list);
    951		key = pi_state->key;
    952		hb = futex_hash(&key);
    953
    954		/*
    955		 * We can race against put_pi_state() removing itself from the
    956		 * list (a waiter going away). put_pi_state() will first
    957		 * decrement the reference count and then modify the list, so
    958		 * its possible to see the list entry but fail this reference
     959		 * it's possible to see the list entry but fail this reference
    960		 *
    961		 * In that case; drop the locks to let put_pi_state() make
    962		 * progress and retry the loop.
    963		 */
    964		if (!refcount_inc_not_zero(&pi_state->refcount)) {
    965			raw_spin_unlock_irq(&curr->pi_lock);
    966			cpu_relax();
    967			raw_spin_lock_irq(&curr->pi_lock);
    968			continue;
    969		}
    970		raw_spin_unlock_irq(&curr->pi_lock);
    971
    972		spin_lock(&hb->lock);
    973		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    974		raw_spin_lock(&curr->pi_lock);
    975		/*
    976		 * We dropped the pi-lock, so re-check whether this
    977		 * task still owns the PI-state:
    978		 */
    979		if (head->next != next) {
    980			/* retain curr->pi_lock for the loop invariant */
    981			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
    982			spin_unlock(&hb->lock);
    983			put_pi_state(pi_state);
    984			continue;
    985		}
    986
    987		WARN_ON(pi_state->owner != curr);
    988		WARN_ON(list_empty(&pi_state->list));
    989		list_del_init(&pi_state->list);
    990		pi_state->owner = NULL;
    991
    992		raw_spin_unlock(&curr->pi_lock);
    993		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    994		spin_unlock(&hb->lock);
    995
    996		rt_mutex_futex_unlock(&pi_state->pi_mutex);
    997		put_pi_state(pi_state);
    998
    999		raw_spin_lock_irq(&curr->pi_lock);
   1000	}
   1001	raw_spin_unlock_irq(&curr->pi_lock);
   1002}
   1003#else
   1004static inline void exit_pi_state_list(struct task_struct *curr) { }
   1005#endif
   1006
   1007static void futex_cleanup(struct task_struct *tsk)
   1008{
   1009	if (unlikely(tsk->robust_list)) {
   1010		exit_robust_list(tsk);
   1011		tsk->robust_list = NULL;
   1012	}
   1013
   1014#ifdef CONFIG_COMPAT
   1015	if (unlikely(tsk->compat_robust_list)) {
   1016		compat_exit_robust_list(tsk);
   1017		tsk->compat_robust_list = NULL;
   1018	}
   1019#endif
   1020
   1021	if (unlikely(!list_empty(&tsk->pi_state_list)))
   1022		exit_pi_state_list(tsk);
   1023}
   1024
   1025/**
   1026 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
   1027 * @tsk:	task to set the state on
   1028 *
    1029 * Set the futex exit state of the task locklessly. The futex waiter code
   1030 * observes that state when a task is exiting and loops until the task has
   1031 * actually finished the futex cleanup. The worst case for this is that the
   1032 * waiter runs through the wait loop until the state becomes visible.
   1033 *
   1034 * This is called from the recursive fault handling path in make_task_dead().
   1035 *
   1036 * This is best effort. Either the futex exit code has run already or
   1037 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
   1038 * take it over. If not, the problem is pushed back to user space. If the
   1039 * futex exit code did not run yet, then an already queued waiter might
   1040 * block forever, but there is nothing which can be done about that.
   1041 */
   1042void futex_exit_recursive(struct task_struct *tsk)
   1043{
   1044	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
   1045	if (tsk->futex_state == FUTEX_STATE_EXITING)
   1046		mutex_unlock(&tsk->futex_exit_mutex);
   1047	tsk->futex_state = FUTEX_STATE_DEAD;
   1048}
   1049
   1050static void futex_cleanup_begin(struct task_struct *tsk)
   1051{
   1052	/*
   1053	 * Prevent various race issues against a concurrent incoming waiter
   1054	 * including live locks by forcing the waiter to block on
   1055	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
   1056	 * attach_to_pi_owner().
   1057	 */
   1058	mutex_lock(&tsk->futex_exit_mutex);
   1059
   1060	/*
   1061	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
   1062	 *
   1063	 * This ensures that all subsequent checks of tsk->futex_state in
   1064	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
   1065	 * tsk->pi_lock held.
   1066	 *
   1067	 * It guarantees also that a pi_state which was queued right before
   1068	 * the state change under tsk->pi_lock by a concurrent waiter must
   1069	 * be observed in exit_pi_state_list().
   1070	 */
   1071	raw_spin_lock_irq(&tsk->pi_lock);
   1072	tsk->futex_state = FUTEX_STATE_EXITING;
   1073	raw_spin_unlock_irq(&tsk->pi_lock);
   1074}
   1075
   1076static void futex_cleanup_end(struct task_struct *tsk, int state)
   1077{
   1078	/*
   1079	 * Lockless store. The only side effect is that an observer might
   1080	 * take another loop until it becomes visible.
   1081	 */
   1082	tsk->futex_state = state;
   1083	/*
   1084	 * Drop the exit protection. This unblocks waiters which observed
   1085	 * FUTEX_STATE_EXITING to reevaluate the state.
   1086	 */
   1087	mutex_unlock(&tsk->futex_exit_mutex);
   1088}
   1089
   1090void futex_exec_release(struct task_struct *tsk)
   1091{
   1092	/*
   1093	 * The state handling is done for consistency, but in the case of
   1094	 * exec() there is no way to prevent further damage as the PID stays
   1095	 * the same. But for the unlikely and arguably buggy case that a
    1096	 * futex is held on exec(), this provides as much state consistency
    1097	 * protection as is possible.
   1098	 */
   1099	futex_cleanup_begin(tsk);
   1100	futex_cleanup(tsk);
   1101	/*
    1102	 * Reset the state to FUTEX_STATE_OK. The task is alive and about to
    1103	 * exec a new binary.
   1104	 */
   1105	futex_cleanup_end(tsk, FUTEX_STATE_OK);
   1106}
   1107
   1108void futex_exit_release(struct task_struct *tsk)
   1109{
   1110	futex_cleanup_begin(tsk);
   1111	futex_cleanup(tsk);
   1112	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
   1113}
   1114
   1115static int __init futex_init(void)
   1116{
   1117	unsigned int futex_shift;
   1118	unsigned long i;
   1119
   1120#if CONFIG_BASE_SMALL
   1121	futex_hashsize = 16;
   1122#else
   1123	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
   1124#endif
   1125
   1126	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
   1127					       futex_hashsize, 0,
   1128					       futex_hashsize < 256 ? HASH_SMALL : 0,
   1129					       &futex_shift, NULL,
   1130					       futex_hashsize, futex_hashsize);
   1131	futex_hashsize = 1UL << futex_shift;
   1132
   1133	for (i = 0; i < futex_hashsize; i++) {
   1134		atomic_set(&futex_queues[i].waiters, 0);
   1135		plist_head_init(&futex_queues[i].chain);
   1136		spin_lock_init(&futex_queues[i].lock);
   1137	}
   1138
   1139	return 0;
   1140}
   1141core_initcall(futex_init);
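
A standalone illustration (not part of core.c) of the sizing in futex_init() above: with, say, 6 possible CPUs the initial request is 256 * 6 = 1536 buckets, which roundup_pow_of_two() turns into 2048; alloc_large_system_hash() then reports the actual shift and futex_hashsize becomes 1UL << futex_shift.

#include <stdio.h>

/* same rounding behaviour as roundup_pow_of_two() for the values used here */
static unsigned long roundup_pow2(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long cpus = 6;

	printf("%lu possible CPUs -> %lu hash buckets\n",
	       cpus, roundup_pow2(256 * cpus));  /* 1536 -> 2048 */
	return 0;
}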