From 80127a39681bd68c959f0953f84a830cbd7c3b1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2016 20:08:46 +0200 Subject: locking/percpu-rwsem: Optimize readers and reduce global impact Currently the percpu-rwsem switches to (global) atomic ops while a writer is waiting; which could be quite a while and slows down releasing the readers. This patch cures this problem by ordering the reader-state vs reader-count (see the comments in __percpu_down_read() and percpu_down_write()). This changes a global atomic op into a full memory barrier, which doesn't have the global cacheline contention. This also enables using the percpu-rwsem with rcu_sync disabled in order to bias the implementation differently, reducing the writer latency by adding some cost to readers. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org [ Fixed modular build. ] Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 84 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c2fa3ecb0dce..146efefde2a1 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -10,30 +10,96 @@ struct percpu_rw_semaphore { struct rcu_sync rss; - unsigned int __percpu *fast_read_ctr; + unsigned int __percpu *read_count; struct rw_semaphore rw_sem; - atomic_t slow_read_ctr; - wait_queue_head_t write_waitq; + wait_queue_head_t writer; + int readers_block; }; -extern void percpu_down_read(struct percpu_rw_semaphore *); -extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); -extern void percpu_up_read(struct percpu_rw_semaphore *); +extern int __percpu_down_read(struct percpu_rw_semaphore *, int); +extern void __percpu_up_read(struct percpu_rw_semaphore *); + +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + might_sleep(); + + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); + + preempt_disable(); + /* + * We are in an RCU-sched read-side critical section, so the writer + * cannot both change sem->state from readers_fast and start checking + * counters while we are here. So if we see !sem->state, we know that + * the writer won't be checking until we're past the preempt_enable() + * and that one the synchronize_sched() is done, the writer will see + * anything we did within this RCU-sched read-size critical section. + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + __percpu_down_read(sem, false); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ +} + +static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) +{ + int ret = 1; + + preempt_disable(); + /* + * Same as in percpu_down_read(). + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. 
+ */ + + if (ret) + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); + + return ret; +} + +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +{ + /* + * The barrier() in preempt_disable() prevents the compiler from + * bleeding the critical section out. + */ + preempt_disable(); + /* + * Same as in percpu_down_read(). + */ + if (likely(rcu_sync_is_idle(&sem->rss))) + __this_cpu_dec(*sem->read_count); + else + __percpu_up_read(sem); /* Unconditional memory barrier */ + preempt_enable(); + + rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); +} extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); + extern void percpu_free_rwsem(struct percpu_rw_semaphore *); -#define percpu_init_rwsem(brw) \ +#define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ - __percpu_init_rwsem(brw, #brw, &rwsem_key); \ + __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) - #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, -- cgit v1.2.3-71-gd317 From 3942a9bd7b5842a924e99ee6ec1350b8006c94ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2016 18:54:13 +0200 Subject: locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write() The current percpu-rwsem read side is entirely free of serializing insns at the cost of having a synchronize_sched() in the write path. The latency of the synchronize_sched() is too high for cgroups. The commit 1ed1328792ff talks about the write path being a fairly cold path but this is not the case for Android which moves task to the foreground cgroup and back around binder IPC calls from foreground processes to background processes, so it is significantly hotter than human initiated operations. Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the problem, hopefully it should not be that slow after another commit: 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact"). We could just add rcu_sync_enter() into cgroup_init() but we do not want another synchronize_sched() at boot time, so this patch adds the new helper which doesn't block but currently can only be called before the first use. Reported-by: John Stultz Reported-by: Dmitry Shmidt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Colin Cross Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rom Lemarchand Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Link: http://lkml.kernel.org/r/20160811165413.GA22807@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rcu_sync.h | 1 + kernel/cgroup.c | 6 ++++++ kernel/rcu/sync.c | 12 ++++++++++++ 3 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index a63a33e6196e..ece7ed9a4a70 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..9f51cdf58f5a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5606,6 +5606,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 198473d90f81..50d1861f7759 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -84,6 +84,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) rsp->gp_type = type; } +/** + * Must be called after rcu_sync_init() and before first use. + * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. + */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + /** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization -- cgit v1.2.3-71-gd317 From 11d9684ca638aad99f740ef3abcba2aa4c9290bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2015 14:16:31 +0200 Subject: locking/percpu-rwsem: Add DEFINE_STATIC_PERCPU_RWSEMand percpu_rwsem_assert_held() Provide a static init and a standard locking assertion method. 
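[ Illustration, not part of this patch: a minimal sketch of how the static initializer and the assertion are meant to be used. The names my_rwsem, my_count, my_count_peek(), my_read_side() and my_write_side() are made up. ]

  #include <linux/percpu-rwsem.h>

  /* Static initialisation: no runtime __percpu_init_rwsem() call is needed. */
  DEFINE_STATIC_PERCPU_RWSEM(my_rwsem);

  static unsigned long my_count;

  /* Helper that relies on the caller holding my_rwsem in either mode. */
  static unsigned long my_count_peek(void)
  {
          percpu_rwsem_assert_held(&my_rwsem);    /* lockdep-only check */
          return my_count;
  }

  static unsigned long my_read_side(void)
  {
          unsigned long val;

          percpu_down_read(&my_rwsem);
          val = my_count_peek();
          percpu_up_read(&my_rwsem);

          return val;
  }

  static void my_write_side(void)
  {
          percpu_down_write(&my_rwsem);
          my_count++;
          percpu_up_write(&my_rwsem);
  }
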
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: tj@kernel.org Cc: viro@ZenIV.linux.org.uk Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 146efefde2a1..d402d3924a91 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -16,6 +16,15 @@ struct percpu_rw_semaphore { int readers_block; }; +#define DEFINE_STATIC_PERCPU_RWSEM(name) \ +static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ +static struct percpu_rw_semaphore name = { \ + .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \ + .read_count = &__percpu_rwsem_rc_##name, \ + .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ + .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \ +} + extern int __percpu_down_read(struct percpu_rw_semaphore *, int); extern void __percpu_up_read(struct percpu_rw_semaphore *); @@ -102,6 +111,9 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) +#define percpu_rwsem_assert_held(sem) \ + lockdep_assert_held(&(sem)->rw_sem) + static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, bool read, unsigned long ip) { -- cgit v1.2.3-71-gd317 From 259d69b7f056bc9a543c7d184e791ef6c2775081 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Nov 2015 15:23:55 +0100 Subject: locking/percpu-rwsem: Add down_read_preempt_disable() Provide a down_read()/up_read() variant that keeps preemption disabled over the whole thing, when possible. This avoids a needless preemption point for constructs such as: percpu_down_read(&global_rwsem); spin_lock(&lock); ... spin_unlock(&lock); percpu_up_read(&global_rwsem); Which perturbs timings. In particular it was found to cure a performance regression in a follow up patch in fs/locks.c Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index d402d3924a91..5b2e6159b744 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -28,7 +28,7 @@ static struct percpu_rw_semaphore name = { \ extern int __percpu_down_read(struct percpu_rw_semaphore *, int); extern void __percpu_up_read(struct percpu_rw_semaphore *); -static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem) { might_sleep(); @@ -46,13 +46,19 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) __this_cpu_inc(*sem->read_count); if (unlikely(!rcu_sync_is_idle(&sem->rss))) __percpu_down_read(sem, false); /* Unconditional memory barrier */ - preempt_enable(); + barrier(); /* - * The barrier() from preempt_enable() prevents the compiler from + * The barrier() prevents the compiler from * bleeding the critical section out. 
*/ } +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + percpu_down_read_preempt_disable(sem); + preempt_enable(); +} + static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { int ret = 1; @@ -76,13 +82,13 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) return ret; } -static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem) { /* - * The barrier() in preempt_disable() prevents the compiler from + * The barrier() prevents the compiler from * bleeding the critical section out. */ - preempt_disable(); + barrier(); /* * Same as in percpu_down_read(). */ @@ -95,6 +101,12 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); } +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +{ + preempt_disable(); + percpu_up_read_preempt_enable(sem); +} + extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); -- cgit v1.2.3-71-gd317 From e6253970413d99f416f7de8bd516e5f1834d8216 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 21 Nov 2015 19:11:48 +0100 Subject: stop_machine: Remove stop_cpus_lock and lg_double_lock/unlock() stop_two_cpus() and stop_cpus() use stop_cpus_lock to avoid the deadlock, we need to ensure that the stopper functions can't be queued "backwards" from one another. This doesn't look nice; if we use lglock then we do not really need stopper->lock, cpu_stop_queue_work() could use lg_local_lock() under local_irq_save(). OTOH it would be even better to avoid lglock in stop_machine.c and remove lg_double_lock(). This patch adds "bool stop_cpus_in_progress" set/cleared by queue_stop_cpus_work(), and changes cpu_stop_queue_two_works() to busy wait until it is cleared. queue_stop_cpus_work() sets stop_cpus_in_progress = T lockless, but after it queues a work on CPU1 it must be visible to stop_two_cpus(CPU1, CPU2) which checks it under the same lock. And since stop_two_cpus() holds the 2nd lock too, queue_stop_cpus_work() can not clear stop_cpus_in_progress if it is also going to queue a work on CPU2, it needs to take that 2nd lock to do this. 
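[ Illustration, not part of this patch: the ordering trick reduced to a stand-alone sketch. The names bulk_in_progress, lock_a, lock_b, bulk_queue() and pair_queue() are made up, two fixed locks stand in for the per-CPU stopper locks, and irq disabling is left out for brevity. ]

  #include <linux/errno.h>
  #include <linux/preempt.h>
  #include <linux/spinlock.h>
  #include <linux/types.h>

  static DEFINE_SPINLOCK(lock_a);
  static DEFINE_SPINLOCK(lock_b);

  /* Set while the "queue work everywhere" path is in progress. */
  static bool bulk_in_progress;

  /* The stop_cpus()-like side: publish intent, then take one lock at a time. */
  static void bulk_queue(void)
  {
          preempt_disable();
          bulk_in_progress = true;

          spin_lock(&lock_a);
          /* ... queue work behind lock_a ... */
          spin_unlock(&lock_a);

          spin_lock(&lock_b);
          /* ... queue work behind lock_b ... */
          spin_unlock(&lock_b);

          bulk_in_progress = false;
          preempt_enable();
  }

  /*
   * The stop_two_cpus()-like side holds both locks, so if bulk_queue() has
   * already queued behind lock_a but not yet behind lock_b, it cannot have
   * cleared the flag either; seeing the flag set, back off and retry
   * instead of queueing in the opposite order and deadlocking.
   */
  static int pair_queue(void)
  {
          int err;
  retry:
          spin_lock(&lock_a);
          spin_lock_nested(&lock_b, SINGLE_DEPTH_NESTING);

          err = -EDEADLK;
          if (unlikely(bulk_in_progress))
                  goto unlock;

          err = 0;
          /* ... queue both works here, in a fixed order ... */
  unlock:
          spin_unlock(&lock_b);
          spin_unlock(&lock_a);

          if (unlikely(err == -EDEADLK)) {
                  while (bulk_in_progress)
                          cpu_relax();
                  goto retry;
          }
          return err;
  }
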
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151121181148.GA433@redhat.com Signed-off-by: Ingo Molnar --- include/linux/lglock.h | 5 ----- kernel/locking/lglock.c | 22 ---------------------- kernel/stop_machine.c | 42 ++++++++++++++++++++++++++---------------- 3 files changed, 26 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/linux/lglock.h b/include/linux/lglock.h index c92ebd100d9b..0081f000e34b 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -52,15 +52,10 @@ struct lglock { static struct lglock name = { .lock = &name ## _lock } void lg_lock_init(struct lglock *lg, char *name); - void lg_local_lock(struct lglock *lg); void lg_local_unlock(struct lglock *lg); void lg_local_lock_cpu(struct lglock *lg, int cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu); - -void lg_double_lock(struct lglock *lg, int cpu1, int cpu2); -void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2); - void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c index 951cfcd10b4a..86ae2aebf004 100644 --- a/kernel/locking/lglock.c +++ b/kernel/locking/lglock.c @@ -60,28 +60,6 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) } EXPORT_SYMBOL(lg_local_unlock_cpu); -void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) -{ - BUG_ON(cpu1 == cpu2); - - /* lock in cpu order, just like lg_global_lock */ - if (cpu2 < cpu1) - swap(cpu1, cpu2); - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); -} - -void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) -{ - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); - preempt_enable(); -} - void lg_global_lock(struct lglock *lg) { int i; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..ae6f41fb9cba 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,7 +20,6 @@ #include #include #include -#include #include /* @@ -47,13 +46,9 @@ struct cpu_stopper { static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; -/* - * Avoids a race between stop_two_cpus and global stop_cpus, where - * the stoppers could get queued up in reverse order, leading to - * system deadlock. Using an lglock means stop_two_cpus remains - * relatively cheap. - */ -DEFINE_STATIC_LGLOCK(stop_cpus_lock); +/* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); +static bool stop_cpus_in_progress; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { @@ -230,14 +225,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); int err; - - lg_double_lock(&stop_cpus_lock, cpu1, cpu2); +retry: spin_lock_irq(&stopper1->lock); spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) goto unlock; + /* + * Ensure that if we race with __stop_cpus() the stoppers won't get + * queued up in reverse order leading to system deadlock. 
+ * + * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has + * queued a work on cpu1 but not on cpu2, we hold both locks. + * + * It can be falsely true but it is safe to spin until it is cleared, + * queue_stop_cpus_work() does everything under preempt_disable(). + */ + err = -EDEADLK; + if (unlikely(stop_cpus_in_progress)) + goto unlock; err = 0; __cpu_stop_queue_work(stopper1, work1); @@ -245,8 +252,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, unlock: spin_unlock(&stopper2->lock); spin_unlock_irq(&stopper1->lock); - lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + if (unlikely(err == -EDEADLK)) { + while (stop_cpus_in_progress) + cpu_relax(); + goto retry; + } return err; } /** @@ -316,9 +327,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, return cpu_stop_queue_work(cpu, work_buf); } -/* static data for stop_cpus */ -static DEFINE_MUTEX(stop_cpus_mutex); - static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) @@ -332,7 +340,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ - lg_global_lock(&stop_cpus_lock); + preempt_disable(); + stop_cpus_in_progress = true; for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; @@ -341,7 +350,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, if (cpu_stop_queue_work(cpu, work)) queued = true; } - lg_global_unlock(&stop_cpus_lock); + stop_cpus_in_progress = false; + preempt_enable(); return queued; } -- cgit v1.2.3-71-gd317 From d32cdbfb0ba319e44f75437afde868f7cafdc467 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Nov 2015 18:36:16 +0100 Subject: locking/lglock: Remove lglock implementation It is now unused, remove it before someone else thinks its a good idea to use this. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- Documentation/locking/lglock.txt | 166 --------------------------------------- include/linux/lglock.h | 76 ------------------ kernel/locking/Makefile | 1 - kernel/locking/lglock.c | 89 --------------------- 4 files changed, 332 deletions(-) delete mode 100644 Documentation/locking/lglock.txt delete mode 100644 include/linux/lglock.h delete mode 100644 kernel/locking/lglock.c (limited to 'include') diff --git a/Documentation/locking/lglock.txt b/Documentation/locking/lglock.txt deleted file mode 100644 index a6971e34fabe..000000000000 --- a/Documentation/locking/lglock.txt +++ /dev/null @@ -1,166 +0,0 @@ -lglock - local/global locks for mostly local access patterns ------------------------------------------------------------- - -Origin: Nick Piggin's VFS scalability series introduced during - 2.6.35++ [1] [2] -Location: kernel/locking/lglock.c - include/linux/lglock.h -Users: currently only the VFS and stop_machine related code - -Design Goal: ------------- - -Improve scalability of globally used large data sets that are -distributed over all CPUs as per_cpu elements. - -To manage global data structures that are partitioned over all CPUs -as per_cpu elements but can be mostly handled by CPU local actions -lglock will be used where the majority of accesses are cpu local -reading and occasional cpu local writing with very infrequent -global write access. 
- - -* deal with things locally whenever possible - - very fast access to the local per_cpu data - - reasonably fast access to specific per_cpu data on a different - CPU -* while making global action possible when needed - - by expensive access to all CPUs locks - effectively - resulting in a globally visible critical section. - -Design: -------- - -Basically it is an array of per_cpu spinlocks with the -lg_local_lock/unlock accessing the local CPUs lock object and the -lg_local_lock_cpu/unlock_cpu accessing a remote CPUs lock object -the lg_local_lock has to disable preemption as migration protection so -that the reference to the local CPUs lock does not go out of scope. -Due to the lg_local_lock/unlock only touching cpu-local resources it -is fast. Taking the local lock on a different CPU will be more -expensive but still relatively cheap. - -One can relax the migration constraints by acquiring the current -CPUs lock with lg_local_lock_cpu, remember the cpu, and release that -lock at the end of the critical section even if migrated. This should -give most of the performance benefits without inhibiting migration -though needs careful considerations for nesting of lglocks and -consideration of deadlocks with lg_global_lock. - -The lg_global_lock/unlock locks all underlying spinlocks of all -possible CPUs (including those off-line). The preemption disable/enable -are needed in the non-RT kernels to prevent deadlocks like: - - on cpu 1 - - task A task B - lg_global_lock - got cpu 0 lock - <<<< preempt <<<< - lg_local_lock_cpu for cpu 0 - spin on cpu 0 lock - -On -RT this deadlock scenario is resolved by the arch_spin_locks in the -lglocks being replaced by rt_mutexes which resolve the above deadlock -by boosting the lock-holder. - - -Implementation: ---------------- - -The initial lglock implementation from Nick Piggin used some complex -macros to generate the lglock/brlock in lglock.h - they were later -turned into a set of functions by Andi Kleen [7]. The change to functions -was motivated by the presence of multiple lock users and also by them -being easier to maintain than the generating macros. This change to -functions is also the basis to eliminated the restriction of not -being initializeable in kernel modules (the remaining problem is that -locks are not explicitly initialized - see lockdep-design.txt) - -Declaration and initialization: -------------------------------- - - #include - - DEFINE_LGLOCK(name) - or: - DEFINE_STATIC_LGLOCK(name); - - lg_lock_init(&name, "lockdep_name_string"); - - on UP this is mapped to DEFINE_SPINLOCK(name) in both cases, note - also that as of 3.18-rc6 all declaration in use are of the _STATIC_ - variant (and it seems that the non-static was never in use). - lg_lock_init is initializing the lockdep map only. - -Usage: ------- - -From the locking semantics it is a spinlock. It could be called a -locality aware spinlock. lg_local_* behaves like a per_cpu -spinlock and lg_global_* like a global spinlock. -No surprises in the API. - - lg_local_lock(*lglock); - access to protected per_cpu object on this CPU - lg_local_unlock(*lglock); - - lg_local_lock_cpu(*lglock, cpu); - access to protected per_cpu object on other CPU cpu - lg_local_unlock_cpu(*lglock, cpu); - - lg_global_lock(*lglock); - access all protected per_cpu objects on all CPUs - lg_global_unlock(*lglock); - - There are no _trylock variants of the lglocks. 
- -Note that the lg_global_lock/unlock has to iterate over all possible -CPUs rather than the actually present CPUs or a CPU could go off-line -with a held lock [4] and that makes it very expensive. A discussion on -these issues can be found at [5] - -Constraints: ------------- - - * currently the declaration of lglocks in kernel modules is not - possible, though this should be doable with little change. - * lglocks are not recursive. - * suitable for code that can do most operations on the CPU local - data and will very rarely need the global lock - * lg_global_lock/unlock is *very* expensive and does not scale - * on UP systems all lg_* primitives are simply spinlocks - * in PREEMPT_RT the spinlock becomes an rt-mutex and can sleep but - does not change the tasks state while sleeping [6]. - * in PREEMPT_RT the preempt_disable/enable in lg_local_lock/unlock - is downgraded to a migrate_disable/enable, the other - preempt_disable/enable are downgraded to barriers [6]. - The deadlock noted for non-RT above is resolved due to rt_mutexes - boosting the lock-holder in this case which arch_spin_locks do - not do. - -lglocks were designed for very specific problems in the VFS and probably -only are the right answer in these corner cases. Any new user that looks -at lglocks probably wants to look at the seqlock and RCU alternatives as -her first choice. There are also efforts to resolve the RCU issues that -currently prevent using RCU in place of view remaining lglocks. - -Note on brlock history: ------------------------ - -The 'Big Reader' read-write spinlocks were originally introduced by -Ingo Molnar in 2000 (2.4/2.5 kernel series) and removed in 2003. They -later were introduced by the VFS scalability patch set in 2.6 series -again as the "big reader lock" brlock [2] variant of lglock which has -been replaced by seqlock primitives or by RCU based primitives in the -3.13 kernel series as was suggested in [3] in 2003. The brlock was -entirely removed in the 3.13 kernel series. - -Link: 1 http://lkml.org/lkml/2010/8/2/81 -Link: 2 http://lwn.net/Articles/401738/ -Link: 3 http://lkml.org/lkml/2003/3/9/205 -Link: 4 https://lkml.org/lkml/2011/8/24/185 -Link: 5 http://lkml.org/lkml/2011/12/18/189 -Link: 6 https://www.kernel.org/pub/linux/kernel/projects/rt/ - patch series - lglocks-rt.patch.patch -Link: 7 http://lkml.org/lkml/2012/3/5/26 diff --git a/include/linux/lglock.h b/include/linux/lglock.h deleted file mode 100644 index 0081f000e34b..000000000000 --- a/include/linux/lglock.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Specialised local-global spinlock. Can only be declared as global variables - * to avoid overhead and keep things simple (and we don't want to start using - * these inside dynamically allocated structures). - * - * "local/global locks" (lglocks) can be used to: - * - * - Provide fast exclusive access to per-CPU data, with exclusive access to - * another CPU's data allowed but possibly subject to contention, and to - * provide very slow exclusive access to all per-CPU data. - * - Or to provide very fast and scalable read serialisation, and to provide - * very slow exclusive serialisation of data (not necessarily per-CPU data). - * - * Brlocks are also implemented as a short-hand notation for the latter use - * case. - * - * Copyright 2009, 2010, Nick Piggin, Novell Inc. 
- */ -#ifndef __LINUX_LGLOCK_H -#define __LINUX_LGLOCK_H - -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#define LOCKDEP_INIT_MAP lockdep_init_map -#else -#define LOCKDEP_INIT_MAP(a, b, c, d) -#endif - -struct lglock { - arch_spinlock_t __percpu *lock; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lock_class_key lock_key; - struct lockdep_map lock_dep_map; -#endif -}; - -#define DEFINE_LGLOCK(name) \ - static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ - = __ARCH_SPIN_LOCK_UNLOCKED; \ - struct lglock name = { .lock = &name ## _lock } - -#define DEFINE_STATIC_LGLOCK(name) \ - static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ - = __ARCH_SPIN_LOCK_UNLOCKED; \ - static struct lglock name = { .lock = &name ## _lock } - -void lg_lock_init(struct lglock *lg, char *name); -void lg_local_lock(struct lglock *lg); -void lg_local_unlock(struct lglock *lg); -void lg_local_lock_cpu(struct lglock *lg, int cpu); -void lg_local_unlock_cpu(struct lglock *lg, int cpu); -void lg_global_lock(struct lglock *lg); -void lg_global_unlock(struct lglock *lg); - -#else -/* When !CONFIG_SMP, map lglock to spinlock */ -#define lglock spinlock -#define DEFINE_LGLOCK(name) DEFINE_SPINLOCK(name) -#define DEFINE_STATIC_LGLOCK(name) static DEFINE_SPINLOCK(name) -#define lg_lock_init(lg, name) spin_lock_init(lg) -#define lg_local_lock spin_lock -#define lg_local_unlock spin_unlock -#define lg_local_lock_cpu(lg, cpu) spin_lock(lg) -#define lg_local_unlock_cpu(lg, cpu) spin_unlock(lg) -#define lg_global_lock spin_lock -#define lg_global_unlock spin_unlock -#endif - -#endif diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o -obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 86ae2aebf004..000000000000 --- a/kernel/locking/lglock.c +++ /dev/null @@ -1,89 +0,0 @@ -/* See include/linux/lglock.h for description */ -#include -#include -#include -#include - -/* - * Note there is no uninit, so lglocks cannot be defined in - * modules (but it's fine to use them from there) - * Could be added though, just undo lg_lock_init - */ - -void lg_lock_init(struct lglock *lg, char *name) -{ - LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); -} -EXPORT_SYMBOL(lg_lock_init); - -void lg_local_lock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock); - -void lg_local_unlock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock); - -void lg_local_lock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock_cpu); - -void lg_local_unlock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - 
lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock_cpu); - -void lg_global_lock(struct lglock *lg) -{ - int i; - - preempt_disable(); - lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); - } -} -EXPORT_SYMBOL(lg_global_lock); - -void lg_global_unlock(struct lglock *lg) -{ - int i; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); - } - preempt_enable(); -} -EXPORT_SYMBOL(lg_global_unlock); -- cgit v1.2.3-71-gd317
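
[ Illustration, not part of any patch above: with lglock gone, the percpu rw-semaphore is what remains for the "mostly local, rarely global" pattern, and the down_read_preempt_disable() variants let the motivating construct from the earlier changelog keep preemption disabled across the whole section. The names global_rwsem, my_lock, my_data and my_update() are made up. ]

  #include <linux/percpu-rwsem.h>
  #include <linux/spinlock.h>

  DEFINE_STATIC_PERCPU_RWSEM(global_rwsem);
  static DEFINE_SPINLOCK(my_lock);
  static unsigned long my_data;

  static void my_update(void)
  {
          percpu_down_read_preempt_disable(&global_rwsem);
          spin_lock(&my_lock);
          my_data++;
          spin_unlock(&my_lock);
          percpu_up_read_preempt_enable(&global_rwsem);
  }

Compared with the plain percpu_down_read()/spin_lock() sequence shown in that changelog, the preempt_enable()/preempt_disable() pair between the two lock operations disappears, which is exactly the preemption point the patch set out to remove.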