cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

rwsem.c (45916B)


      1// SPDX-License-Identifier: GPL-2.0
      2/* kernel/rwsem.c: R/W semaphores, public implementation
      3 *
      4 * Written by David Howells (dhowells@redhat.com).
      5 * Derived from asm-i386/semaphore.h
      6 *
      7 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
      8 * and Michel Lespinasse <walken@google.com>
      9 *
     10 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
     11 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
     12 *
     13 * Rwsem count bit fields re-definition and rwsem rearchitecture by
     14 * Waiman Long <longman@redhat.com> and
     15 * Peter Zijlstra <peterz@infradead.org>.
     16 */
     17
     18#include <linux/types.h>
     19#include <linux/kernel.h>
     20#include <linux/sched.h>
     21#include <linux/sched/rt.h>
     22#include <linux/sched/task.h>
     23#include <linux/sched/debug.h>
     24#include <linux/sched/wake_q.h>
     25#include <linux/sched/signal.h>
     26#include <linux/sched/clock.h>
     27#include <linux/export.h>
     28#include <linux/rwsem.h>
     29#include <linux/atomic.h>
     30#include <trace/events/lock.h>
     31
     32#ifndef CONFIG_PREEMPT_RT
     33#include "lock_events.h"
     34
     35/*
     36 * The least significant 2 bits of the owner value have the following
     37 * meanings when set.
     38 *  - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
     39 *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
     40 *
     41 * When the rwsem is reader-owned and a spinning writer has timed out,
     42 * the nonspinnable bit will be set to disable optimistic spinning.
     43 *
     44 * When a writer acquires a rwsem, it puts its task_struct pointer
     45 * into the owner field. It is cleared after an unlock.
     46 *
     47 * When a reader acquires a rwsem, it will also put its task_struct
     48 * pointer into the owner field with the RWSEM_READER_OWNED bit set.
     49 * On unlock, the owner field will largely be left untouched. So
     50 * for a free or reader-owned rwsem, the owner value may contain
     51 * information about the last reader that acquired the rwsem.
     52 *
     53 * That information may be helpful in debugging cases where the system
     54 * seems to hang on a reader-owned rwsem, especially if only one reader
     55 * is involved. Ideally we would like to track all the readers that own
     56 * a rwsem, but the overhead is simply too big.
     57 *
     58 * Fast-path reader optimistic lock stealing is supported when the rwsem
     59 * was previously owned by a writer and the following conditions are met:
     60 *  - rwsem is not currently writer owned
     61 *  - the handoff isn't set.
     62 */
     63#define RWSEM_READER_OWNED	(1UL << 0)
     64#define RWSEM_NONSPINNABLE	(1UL << 1)
     65#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
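       /*
        * For illustration: after a writer acquires the lock, the owner field
        * holds the writer's task_struct pointer with both flag bits clear;
        * after a reader acquires it, the owner field holds that reader's
        * task_struct pointer with RWSEM_READER_OWNED (and possibly
        * RWSEM_NONSPINNABLE) set in the low bits.
        */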
     66
     67#ifdef CONFIG_DEBUG_RWSEMS
     68# define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
     69	if (!debug_locks_silent &&				\
     70	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
     71		#c, atomic_long_read(&(sem)->count),		\
     72		(unsigned long) sem->magic,			\
     73		atomic_long_read(&(sem)->owner), (long)current,	\
     74		list_empty(&(sem)->wait_list) ? "" : "not "))	\
     75			debug_locks_off();			\
     76	} while (0)
     77#else
     78# define DEBUG_RWSEMS_WARN_ON(c, sem)
     79#endif
     80
     81/*
     82 * On 64-bit architectures, the bit definitions of the count are:
     83 *
     84 * Bit  0    - writer locked bit
     85 * Bit  1    - waiters present bit
     86 * Bit  2    - lock handoff bit
     87 * Bits 3-7  - reserved
     88 * Bits 8-62 - 55-bit reader count
     89 * Bit  63   - read fail bit
     90 *
     91 * On 32-bit architectures, the bit definitions of the count are:
     92 *
     93 * Bit  0    - writer locked bit
     94 * Bit  1    - waiters present bit
     95 * Bit  2    - lock handoff bit
     96 * Bits 3-7  - reserved
     97 * Bits 8-30 - 23-bit reader count
     98 * Bit  31   - read fail bit
     99 *
    100 * It is not likely that the most significant bit (read fail bit) will ever
    101 * be set. This guard bit is still checked anyway in the down_read() fastpath
    102 * just in case we need to use up more of the reader bits for other purposes
    103 * in the future.
    104 *
    105 * atomic_long_fetch_add() is used to obtain reader lock, whereas
    106 * atomic_long_cmpxchg() will be used to obtain writer lock.
    107 *
    108 * There are three places where the lock handoff bit may be set or cleared.
    109 * 1) rwsem_mark_wake() for readers		-- set, clear
    110 * 2) rwsem_try_write_lock() for writers	-- set, clear
    111 * 3) rwsem_del_waiter()			-- clear
    112 *
    113 * For all the above cases, wait_lock will be held. A writer must also
    114 * be the first one in the wait_list to be eligible for setting the handoff
    115 * bit. So concurrent setting/clearing of handoff bit is not possible.
    116 */
    117#define RWSEM_WRITER_LOCKED	(1UL << 0)
    118#define RWSEM_FLAG_WAITERS	(1UL << 1)
    119#define RWSEM_FLAG_HANDOFF	(1UL << 2)
    120#define RWSEM_FLAG_READFAIL	(1UL << (BITS_PER_LONG - 1))
    121
    122#define RWSEM_READER_SHIFT	8
    123#define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)
    124#define RWSEM_READER_MASK	(~(RWSEM_READER_BIAS - 1))
    125#define RWSEM_WRITER_MASK	RWSEM_WRITER_LOCKED
    126#define RWSEM_LOCK_MASK		(RWSEM_WRITER_MASK|RWSEM_READER_MASK)
    127#define RWSEM_READ_FAILED_MASK	(RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
    128				 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
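       /*
        * For illustration, on a 64-bit kernel:
        *   count == 0x001 : writer-locked, no waiters
        *   count == 0x100 : one active reader
        *   count == 0x302 : three active readers with RWSEM_FLAG_WAITERS set
        */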
    129
    130/*
    131 * All writes to owner are protected by WRITE_ONCE() to make sure that
    132 * store tearing can't happen as optimistic spinners may read and use
    133 * the owner value concurrently without the lock. Reads from owner, however,
    134 * may not need READ_ONCE() as long as the pointer value is only used
    135 * for comparison and isn't being dereferenced.
    136 */
    137static inline void rwsem_set_owner(struct rw_semaphore *sem)
    138{
    139	atomic_long_set(&sem->owner, (long)current);
    140}
    141
    142static inline void rwsem_clear_owner(struct rw_semaphore *sem)
    143{
    144	atomic_long_set(&sem->owner, 0);
    145}
    146
    147/*
    148 * Test the flags in the owner field.
    149 */
    150static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
    151{
    152	return atomic_long_read(&sem->owner) & flags;
    153}
    154
    155/*
    156 * The task_struct pointer of the last owning reader will be left in
    157 * the owner field.
    158 *
    159 * Note that the owner value just indicates the task has owned the rwsem
    160 * previously; it may not be the real owner or one of the real owners
    161 * anymore when that field is examined, so take it with a grain of salt.
    162 *
    163 * The reader non-spinnable bit is preserved.
    164 */
    165static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
    166					    struct task_struct *owner)
    167{
    168	unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
    169		(atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
    170
    171	atomic_long_set(&sem->owner, val);
    172}
    173
    174static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
    175{
    176	__rwsem_set_reader_owned(sem, current);
    177}
    178
    179/*
    180 * Return true if the rwsem is owned by a reader.
    181 */
    182static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
    183{
    184#ifdef CONFIG_DEBUG_RWSEMS
    185	/*
    186	 * Check the count to see if it is write-locked.
    187	 */
    188	long count = atomic_long_read(&sem->count);
    189
    190	if (count & RWSEM_WRITER_MASK)
    191		return false;
    192#endif
    193	return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
    194}
    195
    196#ifdef CONFIG_DEBUG_RWSEMS
    197/*
    198 * With CONFIG_DEBUG_RWSEMS configured, this makes sure that if there
    199 * is a task pointer in the owner field of a reader-owned rwsem, it is the
    200 * real owner or one of the real owners. The only exception is when the
    201 * unlock is done by up_read_non_owner().
    202 */
    203static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
    204{
    205	unsigned long val = atomic_long_read(&sem->owner);
    206
    207	while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
    208		if (atomic_long_try_cmpxchg(&sem->owner, &val,
    209					    val & RWSEM_OWNER_FLAGS_MASK))
    210			return;
    211	}
    212}
    213#else
    214static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
    215{
    216}
    217#endif
    218
    219/*
    220 * Set the RWSEM_NONSPINNABLE bit if the RWSEM_READER_OWNED flag
    221 * remains set. Otherwise, the operation will be aborted.
    222 */
    223static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
    224{
    225	unsigned long owner = atomic_long_read(&sem->owner);
    226
    227	do {
    228		if (!(owner & RWSEM_READER_OWNED))
    229			break;
    230		if (owner & RWSEM_NONSPINNABLE)
    231			break;
    232	} while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
    233					  owner | RWSEM_NONSPINNABLE));
    234}
    235
    236static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
    237{
    238	*cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
    239
    240	if (WARN_ON_ONCE(*cntp < 0))
    241		rwsem_set_nonspinnable(sem);
    242
    243	if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
    244		rwsem_set_reader_owned(sem);
    245		return true;
    246	}
    247
    248	return false;
    249}
    250
    251static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
    252{
    253	long tmp = RWSEM_UNLOCKED_VALUE;
    254
    255	if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
    256		rwsem_set_owner(sem);
    257		return true;
    258	}
    259
    260	return false;
    261}
    262
    263/*
    264 * Return just the real task structure pointer of the owner
    265 */
    266static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
    267{
    268	return (struct task_struct *)
    269		(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
    270}
    271
    272/*
    273 * Return the real task structure pointer of the owner and the embedded
    274 * flags in the owner. pflags must be non-NULL.
    275 */
    276static inline struct task_struct *
    277rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
    278{
    279	unsigned long owner = atomic_long_read(&sem->owner);
    280
    281	*pflags = owner & RWSEM_OWNER_FLAGS_MASK;
    282	return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
    283}
    284
    285/*
    286 * Guide to the rw_semaphore's count field.
    287 *
    288 * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
    289 * by a writer.
    290 *
    291 * The lock is owned by readers when
    292 * (1) the RWSEM_WRITER_LOCKED isn't set in count,
    293 * (2) some of the reader bits are set in count, and
    294 * (3) the owner field has the RWSEM_READER_OWNED bit set.
    295 *
    296 * Having some reader bits set is not enough to guarantee a reader-owned
    297 * lock as the readers may be in the process of backing out from the count
    298 * and a writer has just released the lock. So another writer may steal
    299 * the lock immediately after that.
    300 */
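       /*
        * For illustration, the reader-owned condition above corresponds to:
        *   !(count & RWSEM_WRITER_LOCKED) && (count & RWSEM_READER_MASK) &&
        *   (atomic_long_read(&sem->owner) & RWSEM_READER_OWNED)
        */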
    301
    302/*
    303 * Initialize an rwsem:
    304 */
    305void __init_rwsem(struct rw_semaphore *sem, const char *name,
    306		  struct lock_class_key *key)
    307{
    308#ifdef CONFIG_DEBUG_LOCK_ALLOC
    309	/*
    310	 * Make sure we are not reinitializing a held semaphore:
    311	 */
    312	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
    313	lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
    314#endif
    315#ifdef CONFIG_DEBUG_RWSEMS
    316	sem->magic = sem;
    317#endif
    318	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
    319	raw_spin_lock_init(&sem->wait_lock);
    320	INIT_LIST_HEAD(&sem->wait_list);
    321	atomic_long_set(&sem->owner, 0L);
    322#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
    323	osq_lock_init(&sem->osq);
    324#endif
    325}
    326EXPORT_SYMBOL(__init_rwsem);
    327
    328enum rwsem_waiter_type {
    329	RWSEM_WAITING_FOR_WRITE,
    330	RWSEM_WAITING_FOR_READ
    331};
    332
    333struct rwsem_waiter {
    334	struct list_head list;
    335	struct task_struct *task;
    336	enum rwsem_waiter_type type;
    337	unsigned long timeout;
    338
    339	/* Writer only, not initialized in reader */
    340	bool handoff_set;
    341};
    342#define rwsem_first_waiter(sem) \
    343	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
    344
    345enum rwsem_wake_type {
    346	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */
    347	RWSEM_WAKE_READERS,	/* Wake readers only */
    348	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
    349};
    350
    351/*
    352 * The typical HZ value is either 250 or 1000. So set the minimum waiting
    353 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
    354 * queue before initiating the handoff protocol.
    355 */
    356#define RWSEM_WAIT_TIMEOUT	DIV_ROUND_UP(HZ, 250)
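       /* e.g. HZ=1000 -> 4 jiffies (4ms), HZ=250 -> 1 jiffy (4ms), HZ=100 -> 1 jiffy (10ms) */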
    357
    358/*
    359 * Magic number to batch-wakeup waiting readers, even when writers are
    360 * also present in the queue. This both limits the amount of work the
    361 * waking thread must do and also prevents any potential counter overflow,
    362 * however unlikely.
    363 */
    364#define MAX_READERS_WAKEUP	0x100
    365
    366static inline void
    367rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
    368{
    369	lockdep_assert_held(&sem->wait_lock);
    370	list_add_tail(&waiter->list, &sem->wait_list);
    371	/* caller will set RWSEM_FLAG_WAITERS */
    372}
    373
    374/*
    375 * Remove a waiter from the wait_list and clear flags.
    376 *
    377 * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
    378 * this function. Modify with care.
    379 *
    380 * Return: true if wait_list isn't empty and false otherwise
    381 */
    382static inline bool
    383rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
    384{
    385	lockdep_assert_held(&sem->wait_lock);
    386	list_del(&waiter->list);
    387	if (likely(!list_empty(&sem->wait_list)))
    388		return true;
    389
    390	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
    391	return false;
    392}
    393
    394/*
    395 * handle the lock release when there are processes blocked on it that can now run
    396 * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
    397 *   have been set.
    398 * - there must be someone on the queue
    399 * - the wait_lock must be held by the caller
    400 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
    401 *   to actually wakeup the blocked task(s) and drop the reference count,
    402 *   preferably when the wait_lock is released
    403 * - woken waiter blocks are discarded from the list after having their task pointer zeroed
    404 * - writers are only marked woken if downgrading is false
    405 *
    406 * Implies rwsem_del_waiter() for all woken readers.
    407 */
    408static void rwsem_mark_wake(struct rw_semaphore *sem,
    409			    enum rwsem_wake_type wake_type,
    410			    struct wake_q_head *wake_q)
    411{
    412	struct rwsem_waiter *waiter, *tmp;
    413	long oldcount, woken = 0, adjustment = 0;
    414	struct list_head wlist;
    415
    416	lockdep_assert_held(&sem->wait_lock);
    417
    418	/*
    419	 * Take a peek at the queue head waiter such that we can determine
    420	 * the wakeup(s) to perform.
    421	 */
    422	waiter = rwsem_first_waiter(sem);
    423
    424	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
    425		if (wake_type == RWSEM_WAKE_ANY) {
    426			/*
    427			 * Mark writer at the front of the queue for wakeup.
    428			 * Until the task is actually awoken later by
    429			 * the caller, other writers are able to steal it.
    430			 * Readers, on the other hand, will block as they
    431			 * will notice the queued writer.
    432			 */
    433			wake_q_add(wake_q, waiter->task);
    434			lockevent_inc(rwsem_wake_writer);
    435		}
    436
    437		return;
    438	}
    439
    440	/*
    441	 * No reader wakeup if there are too many of them already.
    442	 */
    443	if (unlikely(atomic_long_read(&sem->count) < 0))
    444		return;
    445
    446	/*
    447	 * Writers might steal the lock before we grant it to the next reader.
    448	 * We prefer to do the first reader grant before counting readers
    449	 * so we can bail out early if a writer stole the lock.
    450	 */
    451	if (wake_type != RWSEM_WAKE_READ_OWNED) {
    452		struct task_struct *owner;
    453
    454		adjustment = RWSEM_READER_BIAS;
    455		oldcount = atomic_long_fetch_add(adjustment, &sem->count);
    456		if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
    457			/*
    458			 * When we've been waiting "too" long (for writers
    459			 * to give up the lock), request a HANDOFF to
    460			 * force the issue.
    461			 */
    462			if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
    463			    time_after(jiffies, waiter->timeout)) {
    464				adjustment -= RWSEM_FLAG_HANDOFF;
    465				lockevent_inc(rwsem_rlock_handoff);
    466			}
    467
    468			atomic_long_add(-adjustment, &sem->count);
    469			return;
    470		}
    471		/*
    472		 * Set it to reader-owned to give spinners an early
    473		 * indication that readers now have the lock.
    474		 * The reader nonspinnable bit seen at slowpath entry of
    475		 * the reader is copied over.
    476		 */
    477		owner = waiter->task;
    478		__rwsem_set_reader_owned(sem, owner);
    479	}
    480
    481	/*
    482	 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
    483	 * queue. We know that the woken will be at least 1 as we accounted
    484	 * for above. Note we increment the 'active part' of the count by the
    485	 * number of readers before waking any processes up.
    486	 *
    487	 * This is an adaptation of the phase-fair R/W locks where at the
    488	 * reader phase (first waiter is a reader), all readers are eligible
    489	 * to acquire the lock at the same time irrespective of their order
    490	 * in the queue. The writers acquire the lock according to their
    491	 * order in the queue.
    492	 *
    493	 * We have to do wakeup in 2 passes to prevent the possibility that
    494	 * the reader count may be decremented before it is incremented. It
    495	 * is because the to-be-woken waiter may not have slept yet. So it
    496	 * may see waiter->task cleared, finish its critical section and
    497	 * do an unlock before the reader count is incremented.
    498	 *
    499	 * 1) Collect the read-waiters in a separate list, count them and
    500	 *    fully increment the reader count in rwsem.
    501	 * 2) For each waiter in the new list, clear waiter->task and
    502	 *    put them into wake_q to be woken up later.
    503	 */
    504	INIT_LIST_HEAD(&wlist);
    505	list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
    506		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
    507			continue;
    508
    509		woken++;
    510		list_move_tail(&waiter->list, &wlist);
    511
    512		/*
    513		 * Limit # of readers that can be woken up per wakeup call.
    514		 */
    515		if (unlikely(woken >= MAX_READERS_WAKEUP))
    516			break;
    517	}
    518
    519	adjustment = woken * RWSEM_READER_BIAS - adjustment;
    520	lockevent_cond_inc(rwsem_wake_reader, woken);
    521
    522	oldcount = atomic_long_read(&sem->count);
    523	if (list_empty(&sem->wait_list)) {
    524		/*
    525		 * Combined with list_move_tail() above, this implies
    526		 * rwsem_del_waiter().
    527		 */
    528		adjustment -= RWSEM_FLAG_WAITERS;
    529		if (oldcount & RWSEM_FLAG_HANDOFF)
    530			adjustment -= RWSEM_FLAG_HANDOFF;
    531	} else if (woken) {
    532		/*
    533		 * When we've woken a reader, we no longer need to force
    534		 * writers to give up the lock and we can clear HANDOFF.
    535		 */
    536		if (oldcount & RWSEM_FLAG_HANDOFF)
    537			adjustment -= RWSEM_FLAG_HANDOFF;
    538	}
    539
    540	if (adjustment)
    541		atomic_long_add(adjustment, &sem->count);
    542
    543	/* 2nd pass */
    544	list_for_each_entry_safe(waiter, tmp, &wlist, list) {
    545		struct task_struct *tsk;
    546
    547		tsk = waiter->task;
    548		get_task_struct(tsk);
    549
    550		/*
    551		 * Ensure calling get_task_struct() before setting the reader
    552		 * waiter to nil such that rwsem_down_read_slowpath() cannot
    553		 * race with do_exit() by always holding a reference count
    554		 * to the task to wakeup.
    555		 */
    556		smp_store_release(&waiter->task, NULL);
    557		/*
    558		 * Ensure issuing the wakeup (either by us or someone else)
    559		 * after setting the reader waiter to nil.
    560		 */
    561		wake_q_add_safe(wake_q, tsk);
    562	}
    563}
    564
    565/*
    566 * Remove a waiter and try to wake up other waiters in the wait queue.
    567 * This function is called from the out_nolock path of both the reader and
    568 * writer slowpaths with wait_lock held. It releases the wait_lock and
    569 * optionally wakes up waiters before it returns.
    570 */
    571static inline void
    572rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
    573		      struct wake_q_head *wake_q)
    574		      __releases(&sem->wait_lock)
    575{
    576	bool first = rwsem_first_waiter(sem) == waiter;
    577
    578	wake_q_init(wake_q);
    579
    580	/*
    581	 * If the wait_list isn't empty and the waiter to be deleted is
    582	 * the first waiter, we wake up the remaining waiters as they may
    583	 * be eligible to acquire or spin on the lock.
    584	 */
    585	if (rwsem_del_waiter(sem, waiter) && first)
    586		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
    587	raw_spin_unlock_irq(&sem->wait_lock);
    588	if (!wake_q_empty(wake_q))
    589		wake_up_q(wake_q);
    590}
    591
    592/*
    593 * This function must be called with the sem->wait_lock held to prevent
    594 * race conditions between checking the rwsem wait list and setting the
    595 * sem->count accordingly.
    596 *
    597 * Implies rwsem_del_waiter() on success.
    598 */
    599static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
    600					struct rwsem_waiter *waiter)
    601{
    602	bool first = rwsem_first_waiter(sem) == waiter;
    603	long count, new;
    604
    605	lockdep_assert_held(&sem->wait_lock);
    606
    607	count = atomic_long_read(&sem->count);
    608	do {
    609		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
    610
    611		if (has_handoff) {
    612			if (!first)
    613				return false;
    614
    615			/* First waiter inherits a previously set handoff bit */
    616			waiter->handoff_set = true;
    617		}
    618
    619		new = count;
    620
    621		if (count & RWSEM_LOCK_MASK) {
    622			if (has_handoff || (!rt_task(waiter->task) &&
    623					    !time_after(jiffies, waiter->timeout)))
    624				return false;
    625
    626			new |= RWSEM_FLAG_HANDOFF;
    627		} else {
    628			new |= RWSEM_WRITER_LOCKED;
    629			new &= ~RWSEM_FLAG_HANDOFF;
    630
    631			if (list_is_singular(&sem->wait_list))
    632				new &= ~RWSEM_FLAG_WAITERS;
    633		}
    634	} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
    635
    636	/*
    637	 * We have either acquired the lock with handoff bit cleared or
    638	 * set the handoff bit.
    639	 */
    640	if (new & RWSEM_FLAG_HANDOFF) {
    641		waiter->handoff_set = true;
    642		lockevent_inc(rwsem_wlock_handoff);
    643		return false;
    644	}
    645
    646	/*
    647	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
    648	 * success.
    649	 */
    650	list_del(&waiter->list);
    651	rwsem_set_owner(sem);
    652	return true;
    653}
    654
    655/*
    656 * The rwsem_spin_on_owner() function returns the following 4 values
    657 * depending on the lock owner state.
    658 *   OWNER_NULL  : owner is currently NULL
    659 *   OWNER_WRITER: when owner changes and is a writer
    660 *   OWNER_READER: when owner changes and the new owner may be a reader.
    661 *   OWNER_NONSPINNABLE:
    662 *		   when optimistic spinning has to stop because either the
    663 *		   owner stops running, is unknown, or its timeslice has
    664 *		   been used up.
    665 */
    666enum owner_state {
    667	OWNER_NULL		= 1 << 0,
    668	OWNER_WRITER		= 1 << 1,
    669	OWNER_READER		= 1 << 2,
    670	OWNER_NONSPINNABLE	= 1 << 3,
    671};
    672
    673#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
    674/*
    675 * Try to acquire the write lock before the writer has been put on the wait queue.
    676 */
    677static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
    678{
    679	long count = atomic_long_read(&sem->count);
    680
    681	while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
    682		if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
    683					count | RWSEM_WRITER_LOCKED)) {
    684			rwsem_set_owner(sem);
    685			lockevent_inc(rwsem_opt_lock);
    686			return true;
    687		}
    688	}
    689	return false;
    690}
    691
    692static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
    693{
    694	struct task_struct *owner;
    695	unsigned long flags;
    696	bool ret = true;
    697
    698	if (need_resched()) {
    699		lockevent_inc(rwsem_opt_fail);
    700		return false;
    701	}
    702
    703	preempt_disable();
    704	/*
    705	 * Disabling preemption is equivalent to an RCU read-side critical section,
    706	 * thus the task_struct structure won't go away.
    707	 */
    708	owner = rwsem_owner_flags(sem, &flags);
    709	/*
    710	 * Don't check the read-owner as the entry may be stale.
    711	 */
    712	if ((flags & RWSEM_NONSPINNABLE) ||
    713	    (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
    714		ret = false;
    715	preempt_enable();
    716
    717	lockevent_cond_inc(rwsem_opt_fail, !ret);
    718	return ret;
    719}
    720
    721#define OWNER_SPINNABLE		(OWNER_NULL | OWNER_WRITER | OWNER_READER)
    722
    723static inline enum owner_state
    724rwsem_owner_state(struct task_struct *owner, unsigned long flags)
    725{
    726	if (flags & RWSEM_NONSPINNABLE)
    727		return OWNER_NONSPINNABLE;
    728
    729	if (flags & RWSEM_READER_OWNED)
    730		return OWNER_READER;
    731
    732	return owner ? OWNER_WRITER : OWNER_NULL;
    733}
    734
    735static noinline enum owner_state
    736rwsem_spin_on_owner(struct rw_semaphore *sem)
    737{
    738	struct task_struct *new, *owner;
    739	unsigned long flags, new_flags;
    740	enum owner_state state;
    741
    742	lockdep_assert_preemption_disabled();
    743
    744	owner = rwsem_owner_flags(sem, &flags);
    745	state = rwsem_owner_state(owner, flags);
    746	if (state != OWNER_WRITER)
    747		return state;
    748
    749	for (;;) {
    750		/*
    751		 * When a waiting writer sets the handoff flag, it may spin
    752		 * on the owner as well. Once that writer acquires the lock,
    753		 * we can spin on it. So we don't need to quit even when the
    754		 * handoff bit is set.
    755		 */
    756		new = rwsem_owner_flags(sem, &new_flags);
    757		if ((new != owner) || (new_flags != flags)) {
    758			state = rwsem_owner_state(new, new_flags);
    759			break;
    760		}
    761
    762		/*
    763		 * Ensure we emit the owner->on_cpu dereference _after_
    764		 * checking that sem->owner still matches owner. If that fails,
    765		 * owner might point to free()d memory. If it still matches,
    766		 * our spinning context has already disabled preemption, which is
    767		 * equivalent to an RCU read-side critical section and ensures the
    768		 * memory stays valid.
    769		 */
    770		barrier();
    771
    772		if (need_resched() || !owner_on_cpu(owner)) {
    773			state = OWNER_NONSPINNABLE;
    774			break;
    775		}
    776
    777		cpu_relax();
    778	}
    779
    780	return state;
    781}
    782
    783/*
    784 * Calculate reader-owned rwsem spinning threshold for writer
    785 *
    786 * The more readers own the rwsem, the longer it will take for them to
    787 * wind down and free the rwsem. So the empirical formula used to
    788 * determine the actual spinning time limit here is:
    789 *
    790 *   Spinning threshold = (10 + nr_readers/2)us
    791 *
    792 * The limit is capped to a maximum of 25us (30 readers). This is just
    793 * a heuristic and is subject to change in the future.
    794 */
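       /*
        * For example, with 10 readers the returned deadline is
        * sched_clock() + (20 + 10) * NSEC_PER_USEC / 2, i.e. a 15us
        * spinning budget, matching the (10 + nr_readers/2)us formula above.
        */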
    795static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
    796{
    797	long count = atomic_long_read(&sem->count);
    798	int readers = count >> RWSEM_READER_SHIFT;
    799	u64 delta;
    800
    801	if (readers > 30)
    802		readers = 30;
    803	delta = (20 + readers) * NSEC_PER_USEC / 2;
    804
    805	return sched_clock() + delta;
    806}
    807
    808static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
    809{
    810	bool taken = false;
    811	int prev_owner_state = OWNER_NULL;
    812	int loop = 0;
    813	u64 rspin_threshold = 0;
    814
    815	preempt_disable();
    816
    817	/* sem->wait_lock should not be held when doing optimistic spinning */
    818	if (!osq_lock(&sem->osq))
    819		goto done;
    820
    821	/*
    822	 * Optimistically spin on the owner field and attempt to acquire the
    823	 * lock whenever the owner changes. Spinning will be stopped when:
    824	 *  1) the owning writer isn't running; or
    825	 *  2) readers own the lock and spinning time has exceeded limit.
    826	 */
    827	for (;;) {
    828		enum owner_state owner_state;
    829
    830		owner_state = rwsem_spin_on_owner(sem);
    831		if (!(owner_state & OWNER_SPINNABLE))
    832			break;
    833
    834		/*
    835		 * Try to acquire the lock
    836		 */
    837		taken = rwsem_try_write_lock_unqueued(sem);
    838
    839		if (taken)
    840			break;
    841
    842		/*
    843		 * Time-based reader-owned rwsem optimistic spinning
    844		 */
    845		if (owner_state == OWNER_READER) {
    846			/*
    847			 * Re-initialize rspin_threshold every time
    848			 * the owner state changes from non-reader to reader.
    849			 * This allows a writer to steal the lock in between
    850			 * 2 reader phases and have the threshold reset at
    851			 * the beginning of the 2nd reader phase.
    852			 */
    853			if (prev_owner_state != OWNER_READER) {
    854				if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
    855					break;
    856				rspin_threshold = rwsem_rspin_threshold(sem);
    857				loop = 0;
    858			}
    859
    860			/*
    861			 * Check time threshold once every 16 iterations to
    862			 * avoid calling sched_clock() too frequently so
    863			 * as to reduce the average latency between the times
    864			 * when the lock becomes free and when the spinner
    865			 * is ready to do a trylock.
    866			 */
    867			else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
    868				rwsem_set_nonspinnable(sem);
    869				lockevent_inc(rwsem_opt_nospin);
    870				break;
    871			}
    872		}
    873
    874		/*
    875		 * An RT task cannot do optimistic spinning if it cannot
    876		 * be sure the lock holder is running or live-lock may
    877		 * happen if the current task and the lock holder happen
    878		 * to run on the same CPU. However, aborting optimistic
    879		 * spinning while a NULL owner is detected may miss some
    880		 * opportunity where spinning can continue without causing
    881		 * a problem.
    882		 *
    883		 * There are 2 possible cases where an RT task may be able
    884		 * to continue spinning.
    885		 *
    886		 * 1) The lock owner is in the process of releasing the
    887		 *    lock, sem->owner is cleared but the lock has not
    888		 *    been released yet.
    889		 * 2) The lock was free and owner cleared, but another
    890		 *    task just comes in and acquires the lock before
    891		 *    we try to get it. The new owner may be a spinnable
    892		 *    writer.
    893		 *
    894		 * To take advantage of the two scenarios listed above, the RT
    895		 * task is made to retry one more time to see if it can
    896		 * acquire the lock or continue spinning on the new owning
    897		 * writer. Of course, if the time lag is long enough or the
    898		 * new owner is not a writer or spinnable, the RT task will
    899		 * quit spinning.
    900		 *
    901		 * If the owner is a writer, the need_resched() check is
    902		 * done inside rwsem_spin_on_owner(). If the owner is not
    903		 * a writer, need_resched() check needs to be done here.
    904		 */
    905		if (owner_state != OWNER_WRITER) {
    906			if (need_resched())
    907				break;
    908			if (rt_task(current) &&
    909			   (prev_owner_state != OWNER_WRITER))
    910				break;
    911		}
    912		prev_owner_state = owner_state;
    913
    914		/*
    915		 * The cpu_relax() call is a compiler barrier which forces
    916		 * everything in this loop to be re-loaded. We don't need
    917		 * memory barriers as we'll eventually observe the right
    918		 * values at the cost of a few extra spins.
    919		 */
    920		cpu_relax();
    921	}
    922	osq_unlock(&sem->osq);
    923done:
    924	preempt_enable();
    925	lockevent_cond_inc(rwsem_opt_fail, !taken);
    926	return taken;
    927}
    928
    929/*
    930 * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
    931 * only be called when the reader count reaches 0.
    932 */
    933static inline void clear_nonspinnable(struct rw_semaphore *sem)
    934{
    935	if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
    936		atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
    937}
    938
    939#else
    940static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
    941{
    942	return false;
    943}
    944
    945static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
    946{
    947	return false;
    948}
    949
    950static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
    951
    952static inline enum owner_state
    953rwsem_spin_on_owner(struct rw_semaphore *sem)
    954{
    955	return OWNER_NONSPINNABLE;
    956}
    957#endif
    958
    959/*
    960 * Prepare to wake up waiter(s) in the wait queue by putting them into the
    961 * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
    962 * reader-owned, wake up read lock waiters in queue front or wake up any
    963 * front waiter otherwise.
    964 *
    965 * This is being called from both reader and writer slow paths.
    966 */
    967static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
    968					  struct wake_q_head *wake_q)
    969{
    970	enum rwsem_wake_type wake_type;
    971
    972	if (count & RWSEM_WRITER_MASK)
    973		return;
    974
    975	if (count & RWSEM_READER_MASK) {
    976		wake_type = RWSEM_WAKE_READERS;
    977	} else {
    978		wake_type = RWSEM_WAKE_ANY;
    979		clear_nonspinnable(sem);
    980	}
    981	rwsem_mark_wake(sem, wake_type, wake_q);
    982}
    983
    984/*
    985 * Wait for the read lock to be granted
    986 */
    987static struct rw_semaphore __sched *
    988rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
    989{
    990	long adjustment = -RWSEM_READER_BIAS;
    991	long rcnt = (count >> RWSEM_READER_SHIFT);
    992	struct rwsem_waiter waiter;
    993	DEFINE_WAKE_Q(wake_q);
    994
    995	/*
    996	 * To prevent a constant stream of readers from starving a sleeping
    997	 * waiter, don't attempt optimistic lock stealing if the lock is
    998	 * currently owned by readers.
    999	 */
   1000	if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
   1001	    (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
   1002		goto queue;
   1003
   1004	/*
   1005	 * Reader optimistic lock stealing.
   1006	 */
   1007	if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
   1008		rwsem_set_reader_owned(sem);
   1009		lockevent_inc(rwsem_rlock_steal);
   1010
   1011		/*
   1012		 * Wake up other readers in the wait queue if it is
   1013		 * the first reader.
   1014		 */
   1015		if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
   1016			raw_spin_lock_irq(&sem->wait_lock);
   1017			if (!list_empty(&sem->wait_list))
   1018				rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
   1019						&wake_q);
   1020			raw_spin_unlock_irq(&sem->wait_lock);
   1021			wake_up_q(&wake_q);
   1022		}
   1023		return sem;
   1024	}
   1025
   1026queue:
   1027	waiter.task = current;
   1028	waiter.type = RWSEM_WAITING_FOR_READ;
   1029	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
   1030
   1031	raw_spin_lock_irq(&sem->wait_lock);
   1032	if (list_empty(&sem->wait_list)) {
   1033		/*
   1034		 * In case the wait queue is empty and the lock isn't owned
   1035		 * by a writer, this reader can exit the slowpath and return
   1036		 * immediately as its RWSEM_READER_BIAS has already been set
   1037		 * in the count.
   1038		 */
   1039		if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
   1040			/* Provide lock ACQUIRE */
   1041			smp_acquire__after_ctrl_dep();
   1042			raw_spin_unlock_irq(&sem->wait_lock);
   1043			rwsem_set_reader_owned(sem);
   1044			lockevent_inc(rwsem_rlock_fast);
   1045			return sem;
   1046		}
   1047		adjustment += RWSEM_FLAG_WAITERS;
   1048	}
   1049	rwsem_add_waiter(sem, &waiter);
   1050
   1051	/* we're now waiting on the lock, but no longer actively locking */
   1052	count = atomic_long_add_return(adjustment, &sem->count);
   1053
   1054	rwsem_cond_wake_waiter(sem, count, &wake_q);
   1055	raw_spin_unlock_irq(&sem->wait_lock);
   1056
   1057	if (!wake_q_empty(&wake_q))
   1058		wake_up_q(&wake_q);
   1059
   1060	trace_contention_begin(sem, LCB_F_READ);
   1061
   1062	/* wait to be given the lock */
   1063	for (;;) {
   1064		set_current_state(state);
   1065		if (!smp_load_acquire(&waiter.task)) {
   1066			/* Matches rwsem_mark_wake()'s smp_store_release(). */
   1067			break;
   1068		}
   1069		if (signal_pending_state(state, current)) {
   1070			raw_spin_lock_irq(&sem->wait_lock);
   1071			if (waiter.task)
   1072				goto out_nolock;
   1073			raw_spin_unlock_irq(&sem->wait_lock);
   1074			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
   1075			break;
   1076		}
   1077		schedule();
   1078		lockevent_inc(rwsem_sleep_reader);
   1079	}
   1080
   1081	__set_current_state(TASK_RUNNING);
   1082	lockevent_inc(rwsem_rlock);
   1083	trace_contention_end(sem, 0);
   1084	return sem;
   1085
   1086out_nolock:
   1087	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
   1088	__set_current_state(TASK_RUNNING);
   1089	lockevent_inc(rwsem_rlock_fail);
   1090	trace_contention_end(sem, -EINTR);
   1091	return ERR_PTR(-EINTR);
   1092}
   1093
   1094/*
   1095 * Wait until we successfully acquire the write lock
   1096 */
   1097static struct rw_semaphore __sched *
   1098rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
   1099{
   1100	struct rwsem_waiter waiter;
   1101	DEFINE_WAKE_Q(wake_q);
   1102
   1103	/* do optimistic spinning and steal lock if possible */
   1104	if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
   1105		/* rwsem_optimistic_spin() implies ACQUIRE on success */
   1106		return sem;
   1107	}
   1108
   1109	/*
   1110	 * Optimistic spinning failed, proceed to the slowpath
   1111	 * and block until we can acquire the sem.
   1112	 */
   1113	waiter.task = current;
   1114	waiter.type = RWSEM_WAITING_FOR_WRITE;
   1115	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
   1116	waiter.handoff_set = false;
   1117
   1118	raw_spin_lock_irq(&sem->wait_lock);
   1119	rwsem_add_waiter(sem, &waiter);
   1120
   1121	/* we're now waiting on the lock */
   1122	if (rwsem_first_waiter(sem) != &waiter) {
   1123		rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
   1124				       &wake_q);
   1125		if (!wake_q_empty(&wake_q)) {
   1126			/*
   1127			 * We want to minimize wait_lock hold time especially
   1128			 * when a large number of readers are to be woken up.
   1129			 */
   1130			raw_spin_unlock_irq(&sem->wait_lock);
   1131			wake_up_q(&wake_q);
   1132			raw_spin_lock_irq(&sem->wait_lock);
   1133		}
   1134	} else {
   1135		atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
   1136	}
   1137
   1138	/* wait until we successfully acquire the lock */
   1139	set_current_state(state);
   1140	trace_contention_begin(sem, LCB_F_WRITE);
   1141
   1142	for (;;) {
   1143		if (rwsem_try_write_lock(sem, &waiter)) {
   1144			/* rwsem_try_write_lock() implies ACQUIRE on success */
   1145			break;
   1146		}
   1147
   1148		raw_spin_unlock_irq(&sem->wait_lock);
   1149
   1150		if (signal_pending_state(state, current))
   1151			goto out_nolock;
   1152
   1153		/*
   1154		 * After setting the handoff bit and failing to acquire
   1155		 * the lock, attempt to spin on owner to accelerate lock
   1156		 * transfer. If the previous owner is an on-cpu writer and it
   1157		 * has just released the lock, OWNER_NULL will be returned.
   1158		 * In this case, we attempt to acquire the lock again
   1159		 * without sleeping.
   1160		 */
   1161		if (waiter.handoff_set) {
   1162			enum owner_state owner_state;
   1163
   1164			preempt_disable();
   1165			owner_state = rwsem_spin_on_owner(sem);
   1166			preempt_enable();
   1167
   1168			if (owner_state == OWNER_NULL)
   1169				goto trylock_again;
   1170		}
   1171
   1172		schedule();
   1173		lockevent_inc(rwsem_sleep_writer);
   1174		set_current_state(state);
   1175trylock_again:
   1176		raw_spin_lock_irq(&sem->wait_lock);
   1177	}
   1178	__set_current_state(TASK_RUNNING);
   1179	raw_spin_unlock_irq(&sem->wait_lock);
   1180	lockevent_inc(rwsem_wlock);
   1181	trace_contention_end(sem, 0);
   1182	return sem;
   1183
   1184out_nolock:
   1185	__set_current_state(TASK_RUNNING);
   1186	raw_spin_lock_irq(&sem->wait_lock);
   1187	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
   1188	lockevent_inc(rwsem_wlock_fail);
   1189	trace_contention_end(sem, -EINTR);
   1190	return ERR_PTR(-EINTR);
   1191}
   1192
   1193/*
   1194 * handle waking up a waiter on the semaphore
   1195 * - up_read/up_write has decremented the active part of count if we come here
   1196 */
   1197static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
   1198{
   1199	unsigned long flags;
   1200	DEFINE_WAKE_Q(wake_q);
   1201
   1202	raw_spin_lock_irqsave(&sem->wait_lock, flags);
   1203
   1204	if (!list_empty(&sem->wait_list))
   1205		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
   1206
   1207	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
   1208	wake_up_q(&wake_q);
   1209
   1210	return sem;
   1211}
   1212
   1213/*
   1214 * downgrade a write lock into a read lock
   1215 * - caller incremented waiting part of count and discovered it still negative
   1216 * - just wake up any readers at the front of the queue
   1217 */
   1218static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
   1219{
   1220	unsigned long flags;
   1221	DEFINE_WAKE_Q(wake_q);
   1222
   1223	raw_spin_lock_irqsave(&sem->wait_lock, flags);
   1224
   1225	if (!list_empty(&sem->wait_list))
   1226		rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
   1227
   1228	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
   1229	wake_up_q(&wake_q);
   1230
   1231	return sem;
   1232}
   1233
   1234/*
   1235 * lock for reading
   1236 */
   1237static inline int __down_read_common(struct rw_semaphore *sem, int state)
   1238{
   1239	long count;
   1240
   1241	if (!rwsem_read_trylock(sem, &count)) {
   1242		if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
   1243			return -EINTR;
   1244		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
   1245	}
   1246	return 0;
   1247}
   1248
   1249static inline void __down_read(struct rw_semaphore *sem)
   1250{
   1251	__down_read_common(sem, TASK_UNINTERRUPTIBLE);
   1252}
   1253
   1254static inline int __down_read_interruptible(struct rw_semaphore *sem)
   1255{
   1256	return __down_read_common(sem, TASK_INTERRUPTIBLE);
   1257}
   1258
   1259static inline int __down_read_killable(struct rw_semaphore *sem)
   1260{
   1261	return __down_read_common(sem, TASK_KILLABLE);
   1262}
   1263
   1264static inline int __down_read_trylock(struct rw_semaphore *sem)
   1265{
   1266	long tmp;
   1267
   1268	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
   1269
   1270	tmp = atomic_long_read(&sem->count);
   1271	while (!(tmp & RWSEM_READ_FAILED_MASK)) {
   1272		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
   1273						    tmp + RWSEM_READER_BIAS)) {
   1274			rwsem_set_reader_owned(sem);
   1275			return 1;
   1276		}
   1277	}
   1278	return 0;
   1279}
   1280
   1281/*
   1282 * lock for writing
   1283 */
   1284static inline int __down_write_common(struct rw_semaphore *sem, int state)
   1285{
   1286	if (unlikely(!rwsem_write_trylock(sem))) {
   1287		if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
   1288			return -EINTR;
   1289	}
   1290
   1291	return 0;
   1292}
   1293
   1294static inline void __down_write(struct rw_semaphore *sem)
   1295{
   1296	__down_write_common(sem, TASK_UNINTERRUPTIBLE);
   1297}
   1298
   1299static inline int __down_write_killable(struct rw_semaphore *sem)
   1300{
   1301	return __down_write_common(sem, TASK_KILLABLE);
   1302}
   1303
   1304static inline int __down_write_trylock(struct rw_semaphore *sem)
   1305{
   1306	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
   1307	return rwsem_write_trylock(sem);
   1308}
   1309
   1310/*
   1311 * unlock after reading
   1312 */
   1313static inline void __up_read(struct rw_semaphore *sem)
   1314{
   1315	long tmp;
   1316
   1317	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
   1318	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
   1319
   1320	rwsem_clear_reader_owned(sem);
   1321	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
   1322	DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
   1323	if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
   1324		      RWSEM_FLAG_WAITERS)) {
   1325		clear_nonspinnable(sem);
   1326		rwsem_wake(sem);
   1327	}
   1328}
   1329
   1330/*
   1331 * unlock after writing
   1332 */
   1333static inline void __up_write(struct rw_semaphore *sem)
   1334{
   1335	long tmp;
   1336
   1337	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
   1338	/*
   1339	 * sem->owner may differ from current if the ownership is transferred
   1340	 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
   1341	 */
   1342	DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
   1343			    !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
   1344
   1345	rwsem_clear_owner(sem);
   1346	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
   1347	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
   1348		rwsem_wake(sem);
   1349}
   1350
   1351/*
   1352 * downgrade write lock to read lock
   1353 */
   1354static inline void __downgrade_write(struct rw_semaphore *sem)
   1355{
   1356	long tmp;
   1357
   1358	/*
   1359	 * When downgrading from exclusive to shared ownership,
   1360	 * anything inside the write-locked region cannot leak
   1361	 * into the read side. In contrast, anything in the
   1362	 * read-locked region is ok to be re-ordered into the
   1363	 * write side. As such, rely on RELEASE semantics.
   1364	 */
   1365	DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
   1366	tmp = atomic_long_fetch_add_release(
   1367		-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
   1368	rwsem_set_reader_owned(sem);
   1369	if (tmp & RWSEM_FLAG_WAITERS)
   1370		rwsem_downgrade_wake(sem);
   1371}
   1372
   1373#else /* !CONFIG_PREEMPT_RT */
   1374
   1375#define RT_MUTEX_BUILD_MUTEX
   1376#include "rtmutex.c"
   1377
   1378#define rwbase_set_and_save_current_state(state)	\
   1379	set_current_state(state)
   1380
   1381#define rwbase_restore_current_state()			\
   1382	__set_current_state(TASK_RUNNING)
   1383
   1384#define rwbase_rtmutex_lock_state(rtm, state)		\
   1385	__rt_mutex_lock(rtm, state)
   1386
   1387#define rwbase_rtmutex_slowlock_locked(rtm, state)	\
   1388	__rt_mutex_slowlock_locked(rtm, NULL, state)
   1389
   1390#define rwbase_rtmutex_unlock(rtm)			\
   1391	__rt_mutex_unlock(rtm)
   1392
   1393#define rwbase_rtmutex_trylock(rtm)			\
   1394	__rt_mutex_trylock(rtm)
   1395
   1396#define rwbase_signal_pending_state(state, current)	\
   1397	signal_pending_state(state, current)
   1398
   1399#define rwbase_schedule()				\
   1400	schedule()
   1401
   1402#include "rwbase_rt.c"
   1403
   1404void __init_rwsem(struct rw_semaphore *sem, const char *name,
   1405		  struct lock_class_key *key)
   1406{
   1407	init_rwbase_rt(&(sem)->rwbase);
   1408
   1409#ifdef CONFIG_DEBUG_LOCK_ALLOC
   1410	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
   1411	lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
   1412#endif
   1413}
   1414EXPORT_SYMBOL(__init_rwsem);
   1415
   1416static inline void __down_read(struct rw_semaphore *sem)
   1417{
   1418	rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
   1419}
   1420
   1421static inline int __down_read_interruptible(struct rw_semaphore *sem)
   1422{
   1423	return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
   1424}
   1425
   1426static inline int __down_read_killable(struct rw_semaphore *sem)
   1427{
   1428	return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
   1429}
   1430
   1431static inline int __down_read_trylock(struct rw_semaphore *sem)
   1432{
   1433	return rwbase_read_trylock(&sem->rwbase);
   1434}
   1435
   1436static inline void __up_read(struct rw_semaphore *sem)
   1437{
   1438	rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
   1439}
   1440
   1441static inline void __sched __down_write(struct rw_semaphore *sem)
   1442{
   1443	rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
   1444}
   1445
   1446static inline int __sched __down_write_killable(struct rw_semaphore *sem)
   1447{
   1448	return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
   1449}
   1450
   1451static inline int __down_write_trylock(struct rw_semaphore *sem)
   1452{
   1453	return rwbase_write_trylock(&sem->rwbase);
   1454}
   1455
   1456static inline void __up_write(struct rw_semaphore *sem)
   1457{
   1458	rwbase_write_unlock(&sem->rwbase);
   1459}
   1460
   1461static inline void __downgrade_write(struct rw_semaphore *sem)
   1462{
   1463	rwbase_write_downgrade(&sem->rwbase);
   1464}
   1465
   1466/* Debug stubs for the common API */
   1467#define DEBUG_RWSEMS_WARN_ON(c, sem)
   1468
   1469static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
   1470					    struct task_struct *owner)
   1471{
   1472}
   1473
   1474static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
   1475{
   1476	int count = atomic_read(&sem->rwbase.readers);
   1477
   1478	return count < 0 && count != READER_BIAS;
   1479}
   1480
   1481#endif /* CONFIG_PREEMPT_RT */
   1482
   1483/*
   1484 * lock for reading
   1485 */
   1486void __sched down_read(struct rw_semaphore *sem)
   1487{
   1488	might_sleep();
   1489	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
   1490
   1491	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
   1492}
   1493EXPORT_SYMBOL(down_read);
   1494
   1495int __sched down_read_interruptible(struct rw_semaphore *sem)
   1496{
   1497	might_sleep();
   1498	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
   1499
   1500	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
   1501		rwsem_release(&sem->dep_map, _RET_IP_);
   1502		return -EINTR;
   1503	}
   1504
   1505	return 0;
   1506}
   1507EXPORT_SYMBOL(down_read_interruptible);
   1508
   1509int __sched down_read_killable(struct rw_semaphore *sem)
   1510{
   1511	might_sleep();
   1512	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
   1513
   1514	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
   1515		rwsem_release(&sem->dep_map, _RET_IP_);
   1516		return -EINTR;
   1517	}
   1518
   1519	return 0;
   1520}
   1521EXPORT_SYMBOL(down_read_killable);
   1522
   1523/*
   1524 * trylock for reading -- returns 1 if successful, 0 if contention
   1525 */
   1526int down_read_trylock(struct rw_semaphore *sem)
   1527{
   1528	int ret = __down_read_trylock(sem);
   1529
   1530	if (ret == 1)
   1531		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
   1532	return ret;
   1533}
   1534EXPORT_SYMBOL(down_read_trylock);
   1535
   1536/*
   1537 * lock for writing
   1538 */
   1539void __sched down_write(struct rw_semaphore *sem)
   1540{
   1541	might_sleep();
   1542	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
   1543	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
   1544}
   1545EXPORT_SYMBOL(down_write);
   1546
   1547/*
   1548 * lock for writing
   1549 */
   1550int __sched down_write_killable(struct rw_semaphore *sem)
   1551{
   1552	might_sleep();
   1553	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
   1554
   1555	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
   1556				  __down_write_killable)) {
   1557		rwsem_release(&sem->dep_map, _RET_IP_);
   1558		return -EINTR;
   1559	}
   1560
   1561	return 0;
   1562}
   1563EXPORT_SYMBOL(down_write_killable);
   1564
   1565/*
   1566 * trylock for writing -- returns 1 if successful, 0 if contention
   1567 */
   1568int down_write_trylock(struct rw_semaphore *sem)
   1569{
   1570	int ret = __down_write_trylock(sem);
   1571
   1572	if (ret == 1)
   1573		rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
   1574
   1575	return ret;
   1576}
   1577EXPORT_SYMBOL(down_write_trylock);
   1578
   1579/*
   1580 * release a read lock
   1581 */
   1582void up_read(struct rw_semaphore *sem)
   1583{
   1584	rwsem_release(&sem->dep_map, _RET_IP_);
   1585	__up_read(sem);
   1586}
   1587EXPORT_SYMBOL(up_read);
   1588
   1589/*
   1590 * release a write lock
   1591 */
   1592void up_write(struct rw_semaphore *sem)
   1593{
   1594	rwsem_release(&sem->dep_map, _RET_IP_);
   1595	__up_write(sem);
   1596}
   1597EXPORT_SYMBOL(up_write);
   1598
   1599/*
   1600 * downgrade write lock to read lock
   1601 */
   1602void downgrade_write(struct rw_semaphore *sem)
   1603{
   1604	lock_downgrade(&sem->dep_map, _RET_IP_);
   1605	__downgrade_write(sem);
   1606}
   1607EXPORT_SYMBOL(downgrade_write);
   1608
   1609#ifdef CONFIG_DEBUG_LOCK_ALLOC
   1610
   1611void down_read_nested(struct rw_semaphore *sem, int subclass)
   1612{
   1613	might_sleep();
   1614	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
   1615	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
   1616}
   1617EXPORT_SYMBOL(down_read_nested);
   1618
   1619int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
   1620{
   1621	might_sleep();
   1622	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
   1623
   1624	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
   1625		rwsem_release(&sem->dep_map, _RET_IP_);
   1626		return -EINTR;
   1627	}
   1628
   1629	return 0;
   1630}
   1631EXPORT_SYMBOL(down_read_killable_nested);
   1632
   1633void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
   1634{
   1635	might_sleep();
   1636	rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
   1637	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
   1638}
   1639EXPORT_SYMBOL(_down_write_nest_lock);
   1640
   1641void down_read_non_owner(struct rw_semaphore *sem)
   1642{
   1643	might_sleep();
   1644	__down_read(sem);
   1645	__rwsem_set_reader_owned(sem, NULL);
   1646}
   1647EXPORT_SYMBOL(down_read_non_owner);
   1648
   1649void down_write_nested(struct rw_semaphore *sem, int subclass)
   1650{
   1651	might_sleep();
   1652	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
   1653	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
   1654}
   1655EXPORT_SYMBOL(down_write_nested);
   1656
   1657int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
   1658{
   1659	might_sleep();
   1660	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
   1661
   1662	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
   1663				  __down_write_killable)) {
   1664		rwsem_release(&sem->dep_map, _RET_IP_);
   1665		return -EINTR;
   1666	}
   1667
   1668	return 0;
   1669}
   1670EXPORT_SYMBOL(down_write_killable_nested);
   1671
   1672void up_read_non_owner(struct rw_semaphore *sem)
   1673{
   1674	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
   1675	__up_read(sem);
   1676}
   1677EXPORT_SYMBOL(up_read_non_owner);
   1678
   1679#endif