From 8698a745d800c59cd5a576398bdeccd578ac66f1 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Mar 2014 18:09:12 +0800 Subject: sched, treewide: Replace hardcoded nice values with MIN_NICE/MAX_NICE Replace various -20/+19 hardcoded nice values with MIN_NICE/MAX_NICE. Signed-off-by: Dongsheng Yang Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ff13819fd09b7a5dba5ab5ae797f2e7019bdfa17.1394532288.git.yangds.fnst@cn.fujitsu.com Cc: devel@driverdev.osuosl.org Cc: devicetree@vger.kernel.org Cc: fcoe-devel@open-fcoe.org Cc: linux390@de.ibm.com Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: nbd-general@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Cc: openipmi-developer@lists.sourceforge.net Cc: qla2xxx-upstream@qlogic.com Cc: linux-arch@vger.kernel.org [ Consolidated the patches, twiddled the changelog. ] Signed-off-by: Ingo Molnar --- kernel/locking/locktorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..23343be46e91 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg) static DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1); -- cgit v1.2.3-71-gd317 From 1413c03893332366e5b4d1e26f942ada25f3e82a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 8 Jan 2014 14:21:46 -0500 Subject: lockdep: Increase static allocations Fuzzing a recent kernel with a large configuration hits the static allocation limits and disables lockdep. This patch doubles the limits. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389208906-24338-1-git-send-email-sasha.levin@oracle.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_internals.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..51c4b24b6328 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_ENTRIES 32768UL -#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_LOCKDEP_CHAINS_BITS 16 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -65,7 +65,7 @@ enum { * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ -#define MAX_STACK_TRACE_ENTRIES 262144UL +#define MAX_STACK_TRACE_ENTRIES 524288UL extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -- cgit v1.2.3-71-gd317 From 3cf2f34e1a3d4d5ff209d087925cf950e52f4805 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 2 May 2014 12:53:57 -0700 Subject: rwsem: Add comments to explain the meaning of the rwsem's count field It took me quite a while to understand how rwsem's count field mainifested itself in different scenarios. Add comments to provide a quick reference to the the rwsem's count field for each scenario where readers and writers are contending for the lock. Hopefully it will be useful for future maintenance of the code and for people to get up to speed on how the logic in the code works. Signed-off-by: Tim Chen Cc: Davidlohr Bueso Cc: Alex Shi Cc: Michel Lespinasse Cc: Rik van Riel Cc: Peter Hurley Cc: Paul E.McKenney Cc: Jason Low Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/1399060437.2970.146.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 1d66e08e897d..b4219ff87b8c 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -11,6 +11,55 @@ #include #include +/* + * Guide to the rw_semaphore's count field for common values. + * (32-bit case illustrated, similar for 64-bit) + * + * 0x0000000X (1) X readers active or attempting lock, no writer waiting + * X = #active_readers + #readers attempting to lock + * (X*ACTIVE_BIAS) + * + * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or + * attempting to read lock or write lock. + * + * 0xffff000X (1) X readers active or attempting lock, with waiters for lock + * X = #active readers + # readers attempting lock + * (X*ACTIVE_BIAS + WAITING_BIAS) + * (2) 1 writer attempting lock, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * (3) 1 writer active, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * + * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock + * (WAITING_BIAS + ACTIVE_BIAS) + * (2) 1 writer active or attempting lock, no waiters for lock + * (ACTIVE_WRITE_BIAS) + * + * 0xffff0000 (1) There are writers or readers queued but none active + * or in the process of attempting lock. + * (WAITING_BIAS) + * Note: writer can attempt to steal lock for this count by adding + * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count + * + * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. + * (ACTIVE_WRITE_BIAS + WAITING_BIAS) + * + * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking + * the count becomes more than 0 for successful lock acquisition, + * i.e. the case where there are only readers or nobody has lock. + * (1st and 2nd case above). + * + * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and + * checking the count becomes ACTIVE_WRITE_BIAS for successful lock + * acquisition (i.e. nobody else has lock or attempts lock). If + * unsuccessful, in rwsem_down_write_failed, we'll check to see if there + * are only waiters but none active (5th case above), and attempt to + * steal the lock. + * + */ + /* * Initialize an rwsem: */ -- cgit v1.2.3-71-gd317 From da601c63fd3a3e6c30f85eefd5ee46397b5b439d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Feb 2014 12:14:51 -0800 Subject: torture: Intensify locking test The current lock_torture_writer() spends too much time sleeping and not enough time hammering locks, as in an eight-CPU test will often only be utilizing a CPU or two. This commit therefore makes lock_torture_writer() sleep less and hammer more. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..b0d3e3c50672 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -219,7 +219,8 @@ static int lock_torture_writer(void *arg) set_user_nice(current, 19); do { - schedule_timeout_uninterruptible(1); + if ((torture_random(&rand) & 0xfffff) == 0) + schedule_timeout_uninterruptible(1); cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_write_lock_fail++; -- cgit v1.2.3-71-gd317 From d065eacfdb9d47010f120e9310d7fc8ef2eba272 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Apr 2014 17:17:35 -0700 Subject: locktorture: Remove reference to nonexistent Kconfig parameter The locktorture module references CONFIG_LOCK_TORTURE_TEST_RUNNABLE, which does not exist. Which is a good thing, because otherwise randconfig testing could enable both rcutorture and locktorture concurrently, which the torture tests are not set up for. This commit therefore removes the reference, so that test is runnable immediately only when inserted as a module. Reported-by: Paul Bolle Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index b0d3e3c50672..1952466c7db5 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -82,14 +82,14 @@ struct lock_writer_stress_stats { }; static struct lock_writer_stress_stats *lwsa; -#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) +#if defined(MODULE) #define LOCKTORTURE_RUNNABLE_INIT 1 #else #define LOCKTORTURE_RUNNABLE_INIT 0 #endif int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; module_param(locktorture_runnable, int, 0444); -MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); /* Forward reference. */ static void lock_torture_cleanup(void); -- cgit v1.2.3-71-gd317 From 5228084eed8d54c426c7abde3be66daf8e1b0e57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Apr 2014 09:14:11 -0700 Subject: torture: Check for multiple concurrent torture tests The torture tests are designed to run in isolation, but do not enforce this isolation. This commit therefore checks for concurrent torture tests, and refuses to start new tests while old tests are running. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 3 ++- kernel/rcu/rcutorture.c | 3 ++- kernel/torture.c | 13 +++++++++++-- 4 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/include/linux/torture.h b/include/linux/torture.h index b2e2b468e511..f998574247fd 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -81,7 +81,7 @@ void stutter_wait(const char *title); int torture_stutter_init(int s); /* Initialization and cleanup. */ -void torture_init_begin(char *ttype, bool v, int *runnable); +bool torture_init_begin(char *ttype, bool v, int *runnable); void torture_init_end(void); bool torture_cleanup(void); bool torture_must_stop(void); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 1952466c7db5..dbafeac18e4d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -355,7 +355,8 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, }; - torture_init_begin(torture_type, verbose, &locktorture_runnable); + if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) + return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4b7b97ff1195..7fa34f86e5ba 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1536,7 +1536,8 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, }; - torture_init_begin(torture_type, verbose, &rcutorture_runnable); + if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) + return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { diff --git a/kernel/torture.c b/kernel/torture.c index ae1723a4c751..0ed0b49d2ce1 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -599,14 +599,20 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -void __init torture_init_begin(char *ttype, bool v, int *runnable) +bool __init torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); + if (torture_type != NULL) { + pr_alert("torture_init_begin: refusing %s init: %s running", + ttype, torture_type); + mutex_unlock(&fullstop_mutex); + return false; + } torture_type = ttype; verbose = v; torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; - + return true; } EXPORT_SYMBOL_GPL(torture_init_begin); @@ -645,6 +651,9 @@ bool torture_cleanup(void) torture_shuffle_cleanup(); torture_stutter_cleanup(); torture_onoff_cleanup(); + mutex_lock(&fullstop_mutex); + torture_type = NULL; + mutex_unlock(&fullstop_mutex); return false; } EXPORT_SYMBOL_GPL(torture_cleanup); -- cgit v1.2.3-71-gd317 From 397335f004f41e5fcf7a795e94eb3ab83411a17c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 May 2014 03:25:39 +0000 Subject: rtmutex: Fix deadlock detector for real The current deadlock detection logic does not work reliably due to the following early exit path: /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting * mode! */ if (top_waiter && (!task_has_pi_waiters(task) || top_waiter != task_top_pi_waiter(task))) goto out_unlock_pi; So this not only exits when the task has no waiters, it also exits unconditionally when the current waiter is not the top priority waiter of the task. So in a nested locking scenario, it might abort the lock chain walk and therefor miss a potential deadlock. Simple fix: Continue the chain walk, when deadlock detection is enabled. We also avoid the whole enqueue, if we detect the deadlock right away (A-A). It's an optimization, but also prevents that another waiter who comes in after the detection and before the task has undone the damage observes the situation and detects the deadlock and returns -EDEADLOCK, which is wrong as the other task is not in a deadlock situation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Reviewed-by: Steven Rostedt Cc: Lai Jiangshan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20140522031949.725272460@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index aa4dff04b594..a620d4d08ca6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * top_waiter can be NULL, when we are in the deboosting * mode! */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; + if (top_waiter) { + if (!task_has_pi_waiters(task)) + goto out_unlock_pi; + /* + * If deadlock detection is off, we stop here if we + * are not the top pi waiter of the task. + */ + if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) + goto out_unlock_pi; + } /* * When deadlock detection is off then we check, if further @@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto retry; } - /* Deadlock detection */ + /* + * Deadlock detection. If the lock is the same as the original + * lock which caused us to walk the lock chain or if the + * current lock is owned by the task which initiated the chain + * walk, we detected a deadlock. + */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); @@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; + /* + * Early deadlock detection. We really don't want the task to + * enqueue on itself just to untangle the mess later. It's not + * only an optimization. We drop the locks, so another waiter + * can come in before the chain walk detects the deadlock. So + * the other will detect the deadlock and return -EDEADLOCK, + * which is wrong, as the other waiter is not in a deadlock + * situation. + */ + if (detect_deadlock && owner == task) + return -EDEADLK; + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); waiter->task = task; -- cgit v1.2.3-71-gd317 From 4fc828e24cd9c385d3a44e1b499ec7fc70239d8a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 2 May 2014 11:24:15 -0700 Subject: locking/rwsem: Support optimistic spinning We have reached the point where our mutexes are quite fine tuned for a number of situations. This includes the use of heuristics and optimistic spinning, based on MCS locking techniques. Exclusive ownership of read-write semaphores are, conceptually, just about the same as mutexes, making them close cousins. To this end we need to make them both perform similarly, and right now, rwsems are simply not up to it. This was discovered by both reverting commit 4fc3f1d6 (mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable) and similarly, converting some other mutexes (ie: i_mmap_mutex) to rwsems. This creates a situation where users have to choose between a rwsem and mutex taking into account this important performance difference. Specifically, biggest difference between both locks is when we fail to acquire a mutex in the fastpath, optimistic spinning comes in to play and we can avoid a large amount of unnecessary sleeping and overhead of moving tasks in and out of wait queue. Rwsems do not have such logic. This patch, based on the work from Tim Chen and I, adds support for write-side optimistic spinning when the lock is contended. It also includes support for the recently added cancelable MCS locking for adaptive spinning. Note that is is only applicable to the xadd method, and the spinlock rwsem variant remains intact. Allowing optimistic spinning before putting the writer on the wait queue reduces wait queue contention and provided greater chance for the rwsem to get acquired. With these changes, rwsem is on par with mutex. The performance benefits can be seen on a number of workloads. For instance, on a 8 socket, 80 core 64bit Westmere box, aim7 shows the following improvements in throughput: +--------------+---------------------+-----------------+ | Workload | throughput-increase | number of users | +--------------+---------------------+-----------------+ | alltests | 20% | >1000 | | custom | 27%, 60% | 10-100, >1000 | | high_systime | 36%, 30% | >100, >1000 | | shared | 58%, 29% | 10-100, >1000 | +--------------+---------------------+-----------------+ There was also improvement on smaller systems, such as a quad-core x86-64 laptop running a 30Gb PostgreSQL (pgbench) workload for up to +60% in throughput for over 50 clients. Additionally, benefits were also noticed in exim (mail server) workloads. Furthermore, no performance regression have been seen at all. Based-on-work-from: Tim Chen Signed-off-by: Davidlohr Bueso [peterz: rej fixup due to comment patches, sched/rt.h header] Signed-off-by: Peter Zijlstra Cc: Alex Shi Cc: Andi Kleen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Peter Hurley Cc: "Paul E.McKenney" Cc: Jason Low Cc: Aswin Chandramouleeswaran Cc: Andrew Morton Cc: Linus Torvalds Cc: "Scott J Norton" Cc: Andrea Arcangeli Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/1399055055.6275.15.camel@buesod1.americas.hpqcorp.net Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 25 ++++- kernel/locking/rwsem-xadd.c | 225 ++++++++++++++++++++++++++++++++++++++------ kernel/locking/rwsem.c | 31 +++++- 3 files changed, 248 insertions(+), 33 deletions(-) (limited to 'kernel/locking') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 03f3b05e8ec1..3e108f154cb6 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,6 +16,7 @@ #include +struct optimistic_spin_queue; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -23,9 +24,17 @@ struct rw_semaphore; #else /* All arch specific implementations share the same struct */ struct rw_semaphore { - long count; - raw_spinlock_t wait_lock; - struct list_head wait_list; + long count; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_SMP + /* + * Write owner. Used as a speculative check to see + * if the owner is running on the cpu. + */ + struct task_struct *owner; + struct optimistic_spin_queue *osq; /* spinner MCS lock */ +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) # define __RWSEM_DEP_MAP_INIT(lockname) #endif +#ifdef CONFIG_SMP +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, \ + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list), \ + NULL, /* owner */ \ + NULL /* mcs lock */ \ + __RWSEM_DEP_MAP_INIT(name) } +#else #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, \ __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ LIST_HEAD_INIT((name).wait_list) \ __RWSEM_DEP_MAP_INIT(name) } +#endif #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b4219ff87b8c..4a75278142cd 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -5,11 +5,17 @@ * * Writer lock-stealing by Alex Shi * and Michel Lespinasse + * + * Optimistic spinning by Tim Chen + * and Davidlohr Bueso . Based on mutexes. */ #include #include #include #include +#include + +#include "mcs_spinlock.h" /* * Guide to the rw_semaphore's count field for common values. @@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_SMP + sem->owner = NULL; + sem->osq = NULL; +#endif } EXPORT_SYMBOL(__init_rwsem); @@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) } /* - * wait for the read lock to be granted + * Wait for the read lock to be granted */ __visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) @@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + if (!(count & RWSEM_ACTIVE_MASK)) { + /* try acquiring the write lock */ + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; + } + } + return false; +} + +#ifdef CONFIG_SMP /* - * wait until we successfully acquire the write lock + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long old, count = ACCESS_ONCE(sem->count); + + while (true) { + if (!(count == 0 || count == RWSEM_WAITING_BIAS)) + return false; + + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + if (old == count) + return true; + + count = old; + } +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool on_cpu = true; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = ACCESS_ONCE(sem->owner); + if (owner) + on_cpu = owner->on_cpu; + rcu_read_unlock(); + + /* + * If sem->owner is not set, the rwsem owner may have + * just acquired it and not set the owner yet or the rwsem + * has been released. + */ + return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, + struct task_struct *owner) +{ + if (sem->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * sem->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(sem, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() or when the + * owner changed, which is a sign for heavy contention. Return + * success only when sem->owner is NULL. + */ + return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + while (true) { + owner = ACCESS_ONCE(sem->owner); + if (owner && !rwsem_spin_on_owner(sem, owner)) + break; + + /* wait_lock will be acquired if write_lock is obtained */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock */ __visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count; + bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; - struct task_struct *tsk = current; - /* set up my own style of waitqueue */ - waiter.task = tsk; + /* undo write bias from down_write operation, stop active locking */ + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; + waiting = false; + list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + if (waiting) { + count = ACCESS_ONCE(sem->count); - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + /* + * If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. + */ + if (count > RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + } else + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - + if (rwsem_try_write_lock(count, sem)) + break; raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ do { schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } + __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..42f806de49d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,6 +12,27 @@ #include +#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + /* * lock for reading */ @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_owner(sem); + } + return ret; } @@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_clear_owner(sem); __up_write(sem); } @@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ + rwsem_clear_owner(sem); __downgrade_write(sem); } @@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); -- cgit v1.2.3-71-gd317 From 0cc3d01164aba483edd8232aa5c781136843c367 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 20:19:48 +0200 Subject: locking/rwsem: Fix checkpatch.pl warnings WARNING: line over 80 characters #205: FILE: kernel/locking/rwsem-xadd.c:275: + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); WARNING: line over 80 characters #376: FILE: kernel/locking/rwsem-xadd.c:434: + * If there were already threads queued before us and there are no WARNING: line over 80 characters #377: FILE: kernel/locking/rwsem-xadd.c:435: + * active writers, the lock must be read owned; so we try to wake total: 0 errors, 3 warnings, 417 lines checked Signed-off-by: Andrew Morton Signed-off-by: Peter Zijlstra Cc: "H. Peter Anvin" Cc: Davidlohr Bueso Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-pn6pslaplw031lykweojsn8c@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 4a75278142cd..dacc32142fcc 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,9 +433,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = ACCESS_ONCE(sem->count); /* - * If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. + * If there were already threads queued before us and there are + * no active writers, the lock must be read owned; so we try to + * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); -- cgit v1.2.3-71-gd317 From 70af2f8a4f48d6cebdf92d533d3aef37853ce6de Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 3 Feb 2014 13:18:49 +0100 Subject: locking/rwlocks: Introduce 'qrwlocks' - fair, queued rwlocks This rwlock uses the arch_spin_lock_t as a waitqueue, and assuming the arch_spin_lock_t is a fair lock (ticket,mcs etc..) the resulting rwlock is a fair lock. It fits in the same 8 bytes as the regular rwlock_t by folding the reader and writer count into a single integer, using the remaining 4 bytes for the arch_spinlock_t. Architectures that can single-copy adress bytes can optimize queue_write_unlock() with a 0 write to the LSB (the write count). Performance as measured by Davidlohr Bueso (rwlock_t -> qrwlock_t): +--------------+-------------+---------------+ | Workload | #users | delta | +--------------+-------------+---------------+ | alltests | > 1400 | -4.83% | | custom | 0-100,> 100 | +1.43%,-1.57% | | high_systime | > 1000 | -2.61 | | shared | all | +0.32 | +--------------+-------------+---------------+ http://www.stgolabs.net/qrwlock-stuff/aim7-results-vs-rwsem_optsin/ Signed-off-by: Waiman Long [peterz: near complete rewrite] Signed-off-by: Peter Zijlstra Cc: Arnd Bergmann Cc: Linus Torvalds Cc: "Paul E.McKenney" Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-gac1nnl3wvs2ij87zv2xkdzq@git.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/qrwlock.h | 166 ++++++++++++++++++++++++++++++++++++ include/asm-generic/qrwlock_types.h | 21 +++++ kernel/Kconfig.locks | 7 ++ kernel/locking/Makefile | 1 + kernel/locking/qrwlock.c | 133 +++++++++++++++++++++++++++++ 5 files changed, 328 insertions(+) create mode 100644 include/asm-generic/qrwlock.h create mode 100644 include/asm-generic/qrwlock_types.h create mode 100644 kernel/locking/qrwlock.c (limited to 'kernel/locking') diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h new file mode 100644 index 000000000000..6383d54bf983 --- /dev/null +++ b/include/asm-generic/qrwlock.h @@ -0,0 +1,166 @@ +/* + * Queue read/write lock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long + */ +#ifndef __ASM_GENERIC_QRWLOCK_H +#define __ASM_GENERIC_QRWLOCK_H + +#include +#include +#include + +#include + +/* + * Writer states & reader shift and bias + */ +#define _QW_WAITING 1 /* A writer is waiting */ +#define _QW_LOCKED 0xff /* A writer holds the lock */ +#define _QW_WMASK 0xff /* Writer mask */ +#define _QR_SHIFT 8 /* Reader count shift */ +#define _QR_BIAS (1U << _QR_SHIFT) + +/* + * External function declarations + */ +extern void queue_read_lock_slowpath(struct qrwlock *lock); +extern void queue_write_lock_slowpath(struct qrwlock *lock); + +/** + * queue_read_can_lock- would read_trylock() succeed? + * @lock: Pointer to queue rwlock structure + */ +static inline int queue_read_can_lock(struct qrwlock *lock) +{ + return !(atomic_read(&lock->cnts) & _QW_WMASK); +} + +/** + * queue_write_can_lock- would write_trylock() succeed? + * @lock: Pointer to queue rwlock structure + */ +static inline int queue_write_can_lock(struct qrwlock *lock) +{ + return !atomic_read(&lock->cnts); +} + +/** + * queue_read_trylock - try to acquire read lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static inline int queue_read_trylock(struct qrwlock *lock) +{ + u32 cnts; + + cnts = atomic_read(&lock->cnts); + if (likely(!(cnts & _QW_WMASK))) { + cnts = (u32)atomic_add_return(_QR_BIAS, &lock->cnts); + if (likely(!(cnts & _QW_WMASK))) + return 1; + atomic_sub(_QR_BIAS, &lock->cnts); + } + return 0; +} + +/** + * queue_write_trylock - try to acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static inline int queue_write_trylock(struct qrwlock *lock) +{ + u32 cnts; + + cnts = atomic_read(&lock->cnts); + if (unlikely(cnts)) + return 0; + + return likely(atomic_cmpxchg(&lock->cnts, + cnts, cnts | _QW_LOCKED) == cnts); +} +/** + * queue_read_lock - acquire read lock of a queue rwlock + * @lock: Pointer to queue rwlock structure + */ +static inline void queue_read_lock(struct qrwlock *lock) +{ + u32 cnts; + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts); + if (likely(!(cnts & _QW_WMASK))) + return; + + /* The slowpath will decrement the reader count, if necessary. */ + queue_read_lock_slowpath(lock); +} + +/** + * queue_write_lock - acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +static inline void queue_write_lock(struct qrwlock *lock) +{ + /* Optimize for the unfair lock case where the fair flag is 0. */ + if (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0) + return; + + queue_write_lock_slowpath(lock); +} + +/** + * queue_read_unlock - release read lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +static inline void queue_read_unlock(struct qrwlock *lock) +{ + /* + * Atomically decrement the reader count + */ + smp_mb__before_atomic(); + atomic_sub(_QR_BIAS, &lock->cnts); +} + +#ifndef queue_write_unlock +/** + * queue_write_unlock - release write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +static inline void queue_write_unlock(struct qrwlock *lock) +{ + /* + * If the writer field is atomic, it can be cleared directly. + * Otherwise, an atomic subtraction will be used to clear it. + */ + smp_mb__before_atomic(); + atomic_sub(_QW_LOCKED, &lock->cnts); +} +#endif + +/* + * Remapping rwlock architecture specific functions to the corresponding + * queue rwlock functions. + */ +#define arch_read_can_lock(l) queue_read_can_lock(l) +#define arch_write_can_lock(l) queue_write_can_lock(l) +#define arch_read_lock(l) queue_read_lock(l) +#define arch_write_lock(l) queue_write_lock(l) +#define arch_read_trylock(l) queue_read_trylock(l) +#define arch_write_trylock(l) queue_write_trylock(l) +#define arch_read_unlock(l) queue_read_unlock(l) +#define arch_write_unlock(l) queue_write_unlock(l) + +#endif /* __ASM_GENERIC_QRWLOCK_H */ diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h new file mode 100644 index 000000000000..4d76f24df518 --- /dev/null +++ b/include/asm-generic/qrwlock_types.h @@ -0,0 +1,21 @@ +#ifndef __ASM_GENERIC_QRWLOCK_TYPES_H +#define __ASM_GENERIC_QRWLOCK_TYPES_H + +#include +#include + +/* + * The queue read/write lock data structure + */ + +typedef struct qrwlock { + atomic_t cnts; + arch_spinlock_t lock; +} arch_rwlock_t; + +#define __ARCH_RW_LOCK_UNLOCKED { \ + .cnts = ATOMIC_INIT(0), \ + .lock = __ARCH_SPIN_LOCK_UNLOCKED, \ +} + +#endif /* __ASM_GENERIC_QRWLOCK_TYPES_H */ diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index d2b32ac27a39..35536d9c0964 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -223,3 +223,10 @@ endif config MUTEX_SPIN_ON_OWNER def_bool y depends on SMP && !DEBUG_MUTEXES + +config ARCH_USE_QUEUE_RWLOCK + bool + +config QUEUE_RWLOCK + def_bool y if ARCH_USE_QUEUE_RWLOCK + depends on SMP diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index b8bdcd4785b7..8541bfdfd232 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c new file mode 100644 index 000000000000..fb5b8ac411a5 --- /dev/null +++ b/kernel/locking/qrwlock.c @@ -0,0 +1,133 @@ +/* + * Queue read/write lock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long + */ +#include +#include +#include +#include +#include +#include +#include + +/** + * rspin_until_writer_unlock - inc reader count & spin until writer is gone + * @lock : Pointer to queue rwlock structure + * @writer: Current queue rwlock writer status byte + * + * In interrupt context or at the head of the queue, the reader will just + * increment the reader count & wait until the writer releases the lock. + */ +static __always_inline void +rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) +{ + while ((cnts & _QW_WMASK) == _QW_LOCKED) { + arch_mutex_cpu_relax(); + cnts = smp_load_acquire((u32 *)&lock->cnts); + } +} + +/** + * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * @lock: Pointer to queue rwlock structure + */ +void queue_read_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* + * Readers come here when they cannot get the lock without waiting + */ + if (unlikely(in_interrupt())) { + /* + * Readers in interrupt context will spin until the lock is + * available without waiting in the queue. + */ + cnts = smp_load_acquire((u32 *)&lock->cnts); + rspin_until_writer_unlock(lock, cnts); + return; + } + atomic_sub(_QR_BIAS, &lock->cnts); + + /* + * Put the reader into the wait queue + */ + arch_spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state + * goes to 0 and then try to increment the reader count and get + * the lock. It is possible that an incoming writer may steal the + * lock in the interim, so it is necessary to check the writer byte + * to make sure that the write lock isn't taken. + */ + while (atomic_read(&lock->cnts) & _QW_WMASK) + arch_mutex_cpu_relax(); + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + rspin_until_writer_unlock(lock, cnts); + + /* + * Signal the next one in queue to become queue head + */ + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_read_lock_slowpath); + +/** + * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +void queue_write_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* Put the writer into the wait queue */ + arch_spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present */ + if (!atomic_read(&lock->cnts) && + (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + goto unlock; + + /* + * Set the waiting flag to notify readers that a writer is pending, + * or wait for a previous writer to go away. + */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if (!(cnts & _QW_WMASK) && + (atomic_cmpxchg(&lock->cnts, cnts, + cnts | _QW_WAITING) == cnts)) + break; + + arch_mutex_cpu_relax(); + } + + /* When no more readers, set the locked flag */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if ((cnts == _QW_WAITING) && + (atomic_cmpxchg(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) + break; + + arch_mutex_cpu_relax(); + } +unlock: + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_write_lock_slowpath); -- cgit v1.2.3-71-gd317 From 3d5c9340d1949733eb37616abd15db36aef9a57c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2014 12:34:23 +0200 Subject: rtmutex: Handle deadlock detection smarter Even in the case when deadlock detection is not requested by the caller, we can detect deadlocks. Right now the code stops the lock chain walk and keeps the waiter enqueued, even on itself. Silly not to yell when such a scenario is detected and to keep the waiter enqueued. Return -EDEADLK unconditionally and handle it at the call sites. The futex calls return -EDEADLK. The non futex ones dequeue the waiter, throw a warning and put the task into a schedule loop. Tagged for stable as it makes the code more robust. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Brad Mouring Link: http://lkml.kernel.org/r/20140605152801.836501969@linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex-debug.h | 5 +++++ kernel/locking/rtmutex.c | 33 ++++++++++++++++++++++++++++----- kernel/locking/rtmutex.h | 5 +++++ 3 files changed, 38 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d78..ab29b6a22669 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, { return (waiter != NULL); } + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + debug_rt_mutex_print_deadlock(w); +} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a620d4d08ca6..eb7a46327798 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -314,7 +314,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } put_task_struct(task); - return deadlock_detect ? -EDEADLK : 0; + return -EDEADLK; } retry: /* @@ -377,7 +377,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); - ret = deadlock_detect ? -EDEADLK : 0; + ret = -EDEADLK; goto out_unlock_pi; } @@ -548,7 +548,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * which is wrong, as the other waiter is not in a deadlock * situation. */ - if (detect_deadlock && owner == task) + if (owner == task) return -EDEADLK; raw_spin_lock_irqsave(&task->pi_lock, flags); @@ -763,6 +763,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_waiter *w) +{ + /* + * If the result is not -EDEADLOCK or the caller requested + * deadlock detection, nothing to do here. + */ + if (res != -EDEADLOCK || detect_deadlock) + return; + + /* + * Yell lowdly and stop the task right here. + */ + rt_mutex_print_deadlock(w); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } +} + /* * Slow path lock function: */ @@ -802,8 +822,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(TASK_RUNNING); - if (unlikely(ret)) + if (unlikely(ret)) { remove_waiter(lock, &waiter); + rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); + } /* * try_to_take_rt_mutex() sets the waiter bit @@ -1112,7 +1134,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); if (ret && !rt_mutex_owner(lock)) { /* diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421d..f6a1f3c133b1 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -24,3 +24,8 @@ #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + WARN(1, "rtmutex deadlock detected\n"); +} -- cgit v1.2.3-71-gd317 From 82084984383babe728e6e3c9a8e5c46278091315 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2014 11:16:12 +0200 Subject: rtmutex: Detect changes in the pi lock chain When we walk the lock chain, we drop all locks after each step. So the lock chain can change under us before we reacquire the locks. That's harmless in principle as we just follow the wrong lock path. But it can lead to a false positive in the dead lock detection logic: T0 holds L0 T0 blocks on L1 held by T1 T1 blocks on L2 held by T2 T2 blocks on L3 held by T3 T4 blocks on L4 held by T4 Now we walk the chain lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> drop locks T2 times out and blocks on L0 Now we continue: lock T2 -> lock L0 -> deadlock detected, but it's not a deadlock at all. Brad tried to work around that in the deadlock detection logic itself, but the more I looked at it the less I liked it, because it's crystal ball magic after the fact. We actually can detect a chain change very simple: lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> next_lock = T2->pi_blocked_on->lock; drop locks T2 times out and blocks on L0 Now we continue: lock T2 -> if (next_lock != T2->pi_blocked_on->lock) return; So if we detect that T2 is now blocked on a different lock we stop the chain walk. That's also correct in the following scenario: lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> next_lock = T2->pi_blocked_on->lock; drop locks T3 times out and drops L3 T2 acquires L3 and blocks on L4 now Now we continue: lock T2 -> if (next_lock != T2->pi_blocked_on->lock) return; We don't have to follow up the chain at that point, because T2 propagated our priority up to T4 already. [ Folded a cleanup patch from peterz ] Signed-off-by: Thomas Gleixner Reported-by: Brad Mouring Cc: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140605152801.930031935@linutronix.de Cc: stable@vger.kernel.org --- kernel/locking/rtmutex.c | 95 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 24 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index eb7a46327798..a8a83a22bb91 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -260,27 +260,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task) */ int max_lock_depth = 1024; +static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +{ + return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; +} + /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. * - * @task: the task owning the mutex (owner) for which a chain walk is probably - * needed + * @task: the task owning the mutex (owner) for which a chain walk is + * probably needed * @deadlock_detect: do we have to carry out deadlock detection? - * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck - * things for a task that has just got its priority adjusted, and - * is waiting on a mutex) + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @next_lock: the mutex on which the owner of @orig_lock was blocked before + * we dropped its pi_lock. Is never dereferenced, only used for + * comparison to detect lock chain changes. * @orig_waiter: rt_mutex_waiter struct for the task that has just donated - * its priority to the mutex owner (can be NULL in the case - * depicted above or if the top waiter is gone away and we are - * actually deboosting the owner) - * @top_task: the current top waiter + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter * * Returns 0 or -EDEADLK. */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int deadlock_detect, struct rt_mutex *orig_lock, + struct rt_mutex *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { @@ -338,6 +347,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; + /* + * We dropped all locks after taking a refcount on @task, so + * the task might have moved on in the lock chain or even left + * the chain completely and blocks now on an unrelated lock or + * on @orig_lock. + * + * We stored the lock on which @task was blocked in @next_lock, + * so we can detect the chain change. + */ + if (next_lock != waiter->lock) + goto out_unlock_pi; + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting @@ -422,11 +443,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } + /* + * Check whether the task which owns the current lock is pi + * blocked itself. If yes we store a pointer to the lock for + * the lock chain change detection above. After we dropped + * task->pi_lock next_lock cannot be dereferenced anymore. + */ + next_lock = task_blocked_on_lock(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); raw_spin_unlock(&lock->wait_lock); + /* + * We reached the end of the lock chain. Stop right here. No + * point to go back just to figure that out. + */ + if (!next_lock) + goto out_put_task; + if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -536,8 +572,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; + struct rt_mutex *next_lock; int chain_walk = 0, res; + unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -569,20 +606,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (!owner) return 0; + raw_spin_lock_irqsave(&owner->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { chain_walk = 1; + } - if (!chain_walk) + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); + + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + /* + * Even if full deadlock detection is on, if the owner is not + * blocked itself, we can avoid finding this out in the chain + * walk. + */ + if (!chain_walk || !next_lock) return 0; /* @@ -594,8 +639,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - task); + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + next_lock, waiter, task); raw_spin_lock(&lock->wait_lock); @@ -644,8 +689,8 @@ static void remove_waiter(struct rt_mutex *lock, { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex *next_lock = NULL; unsigned long flags; - int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); rt_mutex_dequeue(lock, waiter); @@ -669,13 +714,13 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) - chain_walk = 1; + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - if (!chain_walk) + if (!next_lock) return; /* gets dropped in rt_mutex_adjust_prio_chain()! */ @@ -683,7 +728,7 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); raw_spin_lock(&lock->wait_lock); } @@ -696,6 +741,7 @@ static void remove_waiter(struct rt_mutex *lock, void rt_mutex_adjust_pi(struct task_struct *task) { struct rt_mutex_waiter *waiter; + struct rt_mutex *next_lock; unsigned long flags; raw_spin_lock_irqsave(&task->pi_lock, flags); @@ -706,12 +752,13 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - + next_lock = waiter->lock; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); + + rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); } /** -- cgit v1.2.3-71-gd317 From 27e35715df54cbc4f2d044f681802ae30479e7fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Jun 2014 18:44:04 +0000 Subject: rtmutex: Plug slow unlock race When the rtmutex fast path is enabled the slow unlock function can create the following situation: spin_lock(foo->m->wait_lock); foo->m->owner = NULL; rt_mutex_lock(foo->m); <-- fast path free = atomic_dec_and_test(foo->refcnt); rt_mutex_unlock(foo->m); <-- fast path if (free) kfree(foo); spin_unlock(foo->m->wait_lock); <--- Use after free. Plug the race by changing the slow unlock to the following scheme: while (!rt_mutex_has_waiters(m)) { /* Clear the waiters bit in m->owner */ clear_rt_mutex_waiters(m); owner = rt_mutex_owner(m); spin_unlock(m->wait_lock); if (cmpxchg(m->owner, owner, 0) == owner) return; spin_lock(m->wait_lock); } So in case of a new waiter incoming while the owner tries the slow path unlock we have two situations: unlock(wait_lock); lock(wait_lock); cmpxchg(p, owner, 0) == owner mark_rt_mutex_waiters(lock); acquire(lock); Or: unlock(wait_lock); lock(wait_lock); mark_rt_mutex_waiters(lock); cmpxchg(p, owner, 0) != owner enqueue_waiter(); unlock(wait_lock); lock(wait_lock); wakeup_next waiter(); unlock(wait_lock); lock(wait_lock); acquire(lock); If the fast path is disabled, then the simple m->owner = NULL; unlock(m->wait_lock); is sufficient as all access to m->owner is serialized via m->wait_lock; Also document and clarify the wakeup_next_waiter function as suggested by Oleg Nesterov. Reported-by: Steven Rostedt Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140611183852.937945560@linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 115 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 6 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a8a83a22bb91..fc605941b9b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) owner = *p; } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); } + +/* + * Safe fastpath aware unlock: + * 1) Clear the waiters bit + * 2) Drop lock->wait_lock + * 3) Try to unlock the lock with cmpxchg + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + struct task_struct *owner = rt_mutex_owner(lock); + + clear_rt_mutex_waiters(lock); + raw_spin_unlock(&lock->wait_lock); + /* + * If a new waiter comes in between the unlock and the cmpxchg + * we have two situations: + * + * unlock(wait_lock); + * lock(wait_lock); + * cmpxchg(p, owner, 0) == owner + * mark_rt_mutex_waiters(lock); + * acquire(lock); + * or: + * + * unlock(wait_lock); + * lock(wait_lock); + * mark_rt_mutex_waiters(lock); + * + * cmpxchg(p, owner, 0) != owner + * enqueue_waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * wake waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * acquire(lock); + */ + return rt_mutex_cmpxchg(lock, owner, NULL); +} + #else # define rt_mutex_cmpxchg(l,c,n) (0) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) @@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); } + +/* + * Simple slow path only version: lock->owner is protected by lock->wait_lock. + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return true; +} #endif static inline int @@ -650,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and wake it up. + * Remove the top waiter from the current tasks pi waiter list and + * wake it up. * * Called with lock->wait_lock held. */ @@ -671,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock) */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_set_owner(lock, NULL); + /* + * As we are waking up the top waiter, and the waiter stays + * queued on the lock until it gets the lock, this lock + * obviously has waiters. Just set the bit here and this has + * the added benefit of forcing all new tasks into the + * slow path making sure no task of lower priority than + * the top waiter can steal this lock. + */ + lock->owner = (void *) RT_MUTEX_HAS_WAITERS; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + /* + * It's safe to dereference waiter as it cannot go away as + * long as we hold lock->wait_lock. The waiter task needs to + * acquire it in order to dequeue the waiter. + */ wake_up_process(waiter->task); } @@ -928,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock) rt_mutex_deadlock_account_unlock(current); - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); - return; + /* + * We must be careful here if the fast path is enabled. If we + * have no waiters queued we cannot set owner to NULL here + * because of: + * + * foo->lock->owner = NULL; + * rtmutex_lock(foo->lock); <- fast path + * free = atomic_dec_and_test(foo->refcnt); + * rtmutex_unlock(foo->lock); <- fast path + * if (free) + * kfree(foo); + * raw_spin_unlock(foo->lock->wait_lock); + * + * So for the fastpath enabled kernel: + * + * Nothing can set the waiters bit as long as we hold + * lock->wait_lock. So we do the following sequence: + * + * owner = rt_mutex_owner(lock); + * clear_rt_mutex_waiters(lock); + * raw_spin_unlock(&lock->wait_lock); + * if (cmpxchg(&lock->owner, owner, 0) == owner) + * return; + * goto retry; + * + * The fastpath disabled variant is simple as all access to + * lock->owner is serialized by lock->wait_lock: + * + * lock->owner = NULL; + * raw_spin_unlock(&lock->wait_lock); + */ + while (!rt_mutex_has_waiters(lock)) { + /* Drops lock->wait_lock ! */ + if (unlock_rt_mutex_safe(lock) == true) + return; + /* Relock the rtmutex and try again */ + raw_spin_lock(&lock->wait_lock); } + /* + * The wakeup next waiter path does not suffer from the above + * race. See the comments there. + */ wakeup_next_waiter(lock); raw_spin_unlock(&lock->wait_lock); -- cgit v1.2.3-71-gd317 From 37e9562453b813d2ea527bd9531fef2c3c592847 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 4 Jul 2014 20:49:32 -0700 Subject: locking/rwsem: Allow conservative optimistic spinning when readers have lock Commit 4fc828e24cd9 ("locking/rwsem: Support optimistic spinning") introduced a major performance regression for workloads such as xfs_repair which mix read and write locking of the mmap_sem across many threads. The result was xfs_repair ran 5x slower on 3.16-rc2 than on 3.15 and using 20x more system CPU time. Perf profiles indicate in some workloads that significant time can be spent spinning on !owner. This is because we don't set the lock owner when readers(s) obtain the rwsem. In this patch, we'll modify rwsem_can_spin_on_owner() such that we'll return false if there is no lock owner. The rationale is that if we just entered the slowpath, yet there is no lock owner, then there is a possibility that a reader has the lock. To be conservative, we'll avoid spinning in these situations. This patch reduced the total run time of the xfs_repair workload from about 4 minutes 24 seconds down to approximately 1 minute 26 seconds, back to close to the same performance as on 3.15. Retesting of AIM7, which were some of the workloads used to test the original optimistic spinning code, confirmed that we still get big performance gains with optimistic spinning, even with this additional regression fix. Davidlohr found that while the 'custom' workload took a performance hit of ~-14% to throughput for >300 users with this additional patch, the overall gain with optimistic spinning is still ~+45%. The 'disk' workload even improved by ~+15% at >1000 users. Tested-by: Dave Chinner Acked-by: Davidlohr Bueso Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1404532172.2572.30.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index dacc32142fcc..c40c7d28661d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; - bool on_cpu = true; + bool on_cpu = false; if (need_resched()) - return 0; + return false; rcu_read_lock(); owner = ACCESS_ONCE(sem->owner); @@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_unlock(); /* - * If sem->owner is not set, the rwsem owner may have - * just acquired it and not set the owner yet or the rwsem - * has been released. + * If sem->owner is not set, yet we have just recently entered the + * slowpath, then there is a possibility reader(s) may have the lock. + * To be safe, avoid spinning in these situations. */ return on_cpu; } -- cgit v1.2.3-71-gd317 From 046a619d8e9746fa4c0e29e8c6b78e16efc008a8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 14 Jul 2014 10:27:48 -0700 Subject: locking/spinlocks/mcs: Rename optimistic_spin_queue() to optimistic_spin_node() Currently, the per-cpu nodes structure for the cancellable MCS spinlock is named "optimistic_spin_queue". However, in a follow up patch in the series we will be introducing a new structure that serves as the new "handle" for the lock. It would make more sense if that structure is named "optimistic_spin_queue". Additionally, since the current use of the "optimistic_spin_queue" structure are "nodes", it might be better if we rename them to "node" anyway. This preparatory patch renames all current "optimistic_spin_queue" to "optimistic_spin_node". Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Scott Norton Cc: "Paul E. McKenney" Cc: Dave Chinner Cc: Waiman Long Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Cc: Chris Mason Cc: Heiko Carstens Cc: Josef Bacik Link: http://lkml.kernel.org/r/1405358872-3732-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 4 ++-- include/linux/rwsem.h | 4 ++-- kernel/locking/mcs_spinlock.c | 24 ++++++++++++------------ kernel/locking/mcs_spinlock.h | 8 ++++---- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel/locking') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 11692dea18aa..885f3f56a77f 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -46,7 +46,7 @@ * - detects multi-task circular deadlocks and prints out all affected * locks and tasks (and only those tasks) */ -struct optimistic_spin_queue; +struct optimistic_spin_node; struct mutex { /* 1: unlocked, 0: locked, negative: locked, possible waiters */ atomic_t count; @@ -56,7 +56,7 @@ struct mutex { struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - struct optimistic_spin_queue *osq; /* Spinner MCS lock */ + struct optimistic_spin_node *osq; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8d79708146aa..ba3f108ddea1 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,7 +16,7 @@ #include -struct optimistic_spin_queue; +struct optimistic_spin_node; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -33,7 +33,7 @@ struct rw_semaphore { * if the owner is running on the cpu. */ struct task_struct *owner; - struct optimistic_spin_queue *osq; /* spinner MCS lock */ + struct optimistic_spin_node *osq; /* spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index 838dc9e00669..e9866f70e828 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -14,18 +14,18 @@ * called from interrupt context and we have preemption disabled while * spinning. */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. * Can return NULL in case we were the last queued and we updated @lock instead. */ -static inline struct optimistic_spin_queue * -osq_wait_next(struct optimistic_spin_queue **lock, - struct optimistic_spin_queue *node, - struct optimistic_spin_queue *prev) +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_node **lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev) { - struct optimistic_spin_queue *next = NULL; + struct optimistic_spin_node *next = NULL; for (;;) { if (*lock == node && cmpxchg(lock, node, prev) == node) { @@ -59,10 +59,10 @@ osq_wait_next(struct optimistic_spin_queue **lock, return next; } -bool osq_lock(struct optimistic_spin_queue **lock) +bool osq_lock(struct optimistic_spin_node **lock) { - struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_queue *prev, *next; + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *prev, *next; node->locked = 0; node->next = NULL; @@ -149,10 +149,10 @@ unqueue: return false; } -void osq_unlock(struct optimistic_spin_queue **lock) +void osq_unlock(struct optimistic_spin_node **lock) { - struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_queue *next; + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *next; /* * Fast path for the uncontended case. diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index a2dbac4aca6b..c99dc0052f49 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -118,12 +118,12 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) * mutex_lock()/rwsem_down_{read,write}() etc. */ -struct optimistic_spin_queue { - struct optimistic_spin_queue *next, *prev; +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; int locked; /* 1 if lock acquired */ }; -extern bool osq_lock(struct optimistic_spin_queue **lock); -extern void osq_unlock(struct optimistic_spin_queue **lock); +extern bool osq_lock(struct optimistic_spin_node **lock); +extern void osq_unlock(struct optimistic_spin_node **lock); #endif /* __LINUX_MCS_SPINLOCK_H */ -- cgit v1.2.3-71-gd317 From 90631822c5d307b5410500806e8ac3e63928aa3e Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 14 Jul 2014 10:27:49 -0700 Subject: locking/spinlocks/mcs: Convert osq lock to atomic_t to reduce overhead The cancellable MCS spinlock is currently used to queue threads that are doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining the lock would access and queue the local node corresponding to the CPU that it's running on. Currently, the cancellable MCS lock is implemented by using pointers to these nodes. In this patch, instead of operating on pointers to the per-cpu nodes, we store the CPU numbers in which the per-cpu nodes correspond to in atomic_t. A similar concept is used with the qspinlock. By operating on the CPU # of the nodes using atomic_t instead of pointers to those nodes, this can reduce the overhead of the cancellable MCS spinlock by 32 bits (on 64 bit systems). Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Scott Norton Cc: "Paul E. McKenney" Cc: Dave Chinner Cc: Waiman Long Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Cc: Chris Mason Cc: Heiko Carstens Cc: Josef Bacik Link: http://lkml.kernel.org/r/1405358872-3732-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 4 ++-- include/linux/osq_lock.h | 19 ++++++++++++++++++ include/linux/rwsem.h | 7 +++---- kernel/locking/mcs_spinlock.c | 46 ++++++++++++++++++++++++++++++++++++------- kernel/locking/mcs_spinlock.h | 5 +++-- kernel/locking/mutex.c | 2 +- kernel/locking/rwsem-xadd.c | 2 +- 7 files changed, 68 insertions(+), 17 deletions(-) create mode 100644 include/linux/osq_lock.h (limited to 'kernel/locking') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 885f3f56a77f..42aa9b9ecd5f 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * Simple, straightforward mutexes with strict semantics: @@ -46,7 +47,6 @@ * - detects multi-task circular deadlocks and prints out all affected * locks and tasks (and only those tasks) */ -struct optimistic_spin_node; struct mutex { /* 1: unlocked, 0: locked, negative: locked, possible waiters */ atomic_t count; @@ -56,7 +56,7 @@ struct mutex { struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - struct optimistic_spin_node *osq; /* Spinner MCS lock */ + struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h new file mode 100644 index 000000000000..b001682bf7cb --- /dev/null +++ b/include/linux/osq_lock.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_OSQ_LOCK_H +#define __LINUX_OSQ_LOCK_H + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + */ + +#define OSQ_UNLOCKED_VAL (0) + +struct optimistic_spin_queue { + /* + * Stores an encoded value of the CPU # of the tail node in the queue. + * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL. + */ + atomic_t tail; +}; + +#endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index ba3f108ddea1..9fdcdd03507d 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -13,10 +13,9 @@ #include #include #include - #include +#include -struct optimistic_spin_node; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -33,7 +32,7 @@ struct rw_semaphore { * if the owner is running on the cpu. */ struct task_struct *owner; - struct optimistic_spin_node *osq; /* spinner MCS lock */ + struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ LIST_HEAD_INIT((name).wait_list), \ NULL, /* owner */ \ - NULL /* mcs lock */ \ + { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \ __RWSEM_DEP_MAP_INIT(name) } #else #define __RWSEM_INITIALIZER(name) \ diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index e9866f70e828..32fc16c0a545 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -16,19 +16,45 @@ */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); +/* + * We use the value 0 to represent "no CPU", thus the encoded value + * will be the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + + return per_cpu_ptr(&osq_node, cpu_nr); +} + /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. * Can return NULL in case we were the last queued and we updated @lock instead. */ static inline struct optimistic_spin_node * -osq_wait_next(struct optimistic_spin_node **lock, +osq_wait_next(struct optimistic_spin_queue *lock, struct optimistic_spin_node *node, struct optimistic_spin_node *prev) { struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(smp_processor_id()); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; for (;;) { - if (*lock == node && cmpxchg(lock, node, prev) == node) { + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg(&lock->tail, curr, old) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its @@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_node **lock, return next; } -bool osq_lock(struct optimistic_spin_node **lock) +bool osq_lock(struct optimistic_spin_queue *lock) { struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(smp_processor_id()); + int old; node->locked = 0; node->next = NULL; + node->cpu = curr; - node->prev = prev = xchg(lock, node); - if (likely(prev == NULL)) + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) return true; + prev = decode_cpu(old); + node->prev = prev; ACCESS_ONCE(prev->next) = node; /* @@ -149,15 +180,16 @@ unqueue: return false; } -void osq_unlock(struct optimistic_spin_node **lock) +void osq_unlock(struct optimistic_spin_queue *lock) { struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); struct optimistic_spin_node *next; + int curr = encode_cpu(smp_processor_id()); /* * Fast path for the uncontended case. */ - if (likely(cmpxchg(lock, node, NULL) == node)) + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) return; /* diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index c99dc0052f49..74356dc0ce29 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -121,9 +121,10 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) struct optimistic_spin_node { struct optimistic_spin_node *next, *prev; int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # value */ }; -extern bool osq_lock(struct optimistic_spin_node **lock); -extern void osq_unlock(struct optimistic_spin_node **lock); +extern bool osq_lock(struct optimistic_spin_queue *lock); +extern void osq_unlock(struct optimistic_spin_queue *lock); #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index bc73d33c6760..d9b313906caa 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->osq = NULL; + atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL); #endif debug_mutex_init(lock, name, key); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index c40c7d28661d..b77a6230bbf6 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_SMP sem->owner = NULL; - sem->osq = NULL; + atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL); #endif } -- cgit v1.2.3-71-gd317 From 4d9d951e6b5df85ccfca2c5bd8b4f5c71d256b65 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 14 Jul 2014 10:27:50 -0700 Subject: locking/spinlocks/mcs: Introduce and use init macro and function for osq locks Currently, we initialize the osq lock by directly setting the lock's values. It would be preferable if we use an init macro to do the initialization like we do with other locks. This patch introduces and uses a macro and function for initializing the osq lock. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Scott Norton Cc: "Paul E. McKenney" Cc: Dave Chinner Cc: Waiman Long Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/1405358872-3732-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/osq_lock.h | 8 ++++++++ include/linux/rwsem.h | 2 +- kernel/locking/mutex.c | 2 +- kernel/locking/rwsem-xadd.c | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index b001682bf7cb..90230d5811c5 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -16,4 +16,12 @@ struct optimistic_spin_queue { atomic_t tail; }; +/* Init macro and function. */ +#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } + +static inline void osq_lock_init(struct optimistic_spin_queue *lock) +{ + atomic_set(&lock->tail, OSQ_UNLOCKED_VAL); +} + #endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9fdcdd03507d..25cd9aa2f3d7 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -69,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ LIST_HEAD_INIT((name).wait_list), \ NULL, /* owner */ \ - { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \ + OSQ_LOCK_UNLOCKED /* osq */ \ __RWSEM_DEP_MAP_INIT(name) } #else #define __RWSEM_INITIALIZER(name) \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d9b313906caa..acca2c1a3c5e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL); + osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b77a6230bbf6..7190592c2645 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_SMP sem->owner = NULL; - atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL); + osq_lock_init(&sem->osq); #endif } -- cgit v1.2.3-71-gd317 From 33ecd2083a9560fbc1ef1b1279ef3ecb4c012a4f Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 14 Jul 2014 10:27:51 -0700 Subject: locking/spinlocks/mcs: Micro-optimize osq_unlock() In the unlock function of the cancellable MCS spinlock, the first thing we do is to retrive the current CPU's osq node. However, due to the changes made in the previous patch, in the common case where the lock is not contended, we wouldn't need to access the current CPU's osq node anymore. This patch optimizes this by only retriving this CPU's osq node after we attempt the initial cmpxchg to unlock the osq and found that its contended. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Scott Norton Cc: "Paul E. McKenney" Cc: Dave Chinner Cc: Waiman Long Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1405358872-3732-5-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mcs_spinlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index 32fc16c0a545..be9ee1559fca 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -182,8 +182,7 @@ unqueue: void osq_unlock(struct optimistic_spin_queue *lock) { - struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_node *next; + struct optimistic_spin_node *node, *next; int curr = encode_cpu(smp_processor_id()); /* @@ -195,6 +194,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) /* * Second most likely case. */ + node = this_cpu_ptr(&osq_node); next = xchg(&node->next, NULL); if (next) { ACCESS_ONCE(next->locked) = 1; -- cgit v1.2.3-71-gd317 From 13b9a962a2594ee880c5d50d7f70964da1d4fe5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Jul 2014 14:54:55 +0200 Subject: locking/rwsem: Rename 'activity' to 'count' There are two definitions of struct rw_semaphore, one in linux/rwsem.h and one in linux/rwsem-spinlock.h. For some reason they have different names for the initial field. This makes it impossible to use C99 named initialization for __RWSEM_INITIALIZER() -- or we have to duplicate that entire thing along with the structure definitions. The simpler patch is renaming the rwsem-spinlock variant to match the regular rwsem. This allows us to switch to C99 named initialization. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-bmrZolsbGmautmzrerog27io@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/rwsem-spinlock.h | 8 ++++---- kernel/locking/rwsem-spinlock.c | 28 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel/locking') diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index d5b13bc07a0b..561e8615528d 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -15,13 +15,13 @@ #ifdef __KERNEL__ /* * the rw-semaphore definition - * - if activity is 0 then there are no active readers or writers - * - if activity is +ve then that is the number of active readers - * - if activity is -1 then there is one active writer + * - if count is 0 then there are no active readers or writers + * - if count is +ve then that is the number of active readers + * - if count is -1 then there is one active writer * - if wait_list is not empty, then there are processes waiting for the semaphore */ struct rw_semaphore { - __s32 activity; + __s32 count; raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 9be8a9144978..2c93571162cb 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem) unsigned long flags; if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->activity != 0); + ret = (sem->count != 0); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } return ret; @@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->activity = 0; + sem->count = 0; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } @@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(next, struct rwsem_waiter, list); } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - sem->activity += woken; + sem->count += woken; out: return sem; @@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + if (sem->count >= 0 && list_empty(&sem->wait_list)) { /* granted */ - sem->activity++; + sem->count++; raw_spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + if (sem->count >= 0 && list_empty(&sem->wait_list)) { /* granted */ - sem->activity++; + sem->count++; ret = 1; } @@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) * itself into sleep and waiting for system woke it or someone * else in the head of the wait list up. */ - if (sem->activity == 0) + if (sem->count == 0) break; set_task_state(tsk, TASK_UNINTERRUPTIBLE); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ - sem->activity = -1; + sem->count = -1; list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity == 0) { + if (sem->count == 0) { /* got the lock */ - sem->activity = -1; + sem->count = -1; ret = 1; } @@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) + if (--sem->count == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - sem->activity = 0; + sem->count = 0; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 1); @@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - sem->activity = 1; + sem->count = 1; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 0); -- cgit v1.2.3-71-gd317 From 5db6c6fefb1ca0e81e3bd6dd8998bf51c453d823 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 11 Jul 2014 14:00:06 -0700 Subject: locking/rwsem: Add CONFIG_RWSEM_SPIN_ON_OWNER Just like with mutexes (CONFIG_MUTEX_SPIN_ON_OWNER), encapsulate the dependencies for rwsem optimistic spinning. No logical changes here as it continues to depend on both SMP and the XADD algorithm variant. Signed-off-by: Davidlohr Bueso Acked-by: Jason Low [ Also make it depend on ARCH_SUPPORTS_ATOMIC_RMW. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1405112406-13052-2-git-send-email-davidlohr@hp.com Cc: aswin@hp.com Cc: Chris Mason Cc: Davidlohr Bueso Cc: Josef Bacik Cc: Linus Torvalds Cc: Waiman Long Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 6 ++++-- kernel/Kconfig.locks | 4 ++++ kernel/locking/rwsem-xadd.c | 4 ++-- kernel/locking/rwsem.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 716807f0eb2d..035d3c57fc8a 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -14,7 +14,9 @@ #include #include #include +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include +#endif struct rw_semaphore; @@ -26,7 +28,7 @@ struct rw_semaphore { long count; struct list_head wait_list; raw_spinlock_t wait_lock; -#ifdef CONFIG_SMP +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ /* * Write owner. Used as a speculative check to see @@ -63,7 +65,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) # define __RWSEM_DEP_MAP_INIT(lockname) #endif -#if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK) +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL #else #define __RWSEM_OPT_INIT(lockname) diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 81907941d921..76768ee812b2 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -227,6 +227,10 @@ config MUTEX_SPIN_ON_OWNER def_bool y depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW +config RWSEM_SPIN_ON_OWNER + def_bool y + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 7190592c2645..a2391ac135c8 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -82,7 +82,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#ifdef CONFIG_SMP +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER sem->owner = NULL; osq_lock_init(&sem->osq); #endif @@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) return false; } -#ifdef CONFIG_SMP +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * Try to acquire write lock before the writer has been put on wait queue. */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 42f806de49d4..e2d3bc7f03b4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,7 +12,7 @@ #include -#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER static inline void rwsem_set_owner(struct rw_semaphore *sem) { sem->owner = current; -- cgit v1.2.3-71-gd317