From 2a01bb3885c9145dbb7583d5aa5f5d5504f6f46f Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Wed, 11 Apr 2012 08:15:29 -0400 Subject: panic: Make panic_on_oops configurable Several distros set this by default by patching panic_on_oops. It seems to fit with the BOOTPARAM_{HARD,SOFT}_PANIC options though, so let's add a Kconfig entry and reduce some more upstream delta. Signed-off-by: Kyle McMartin Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120411121529.GH26688@redacted.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a175d79..b6215b7ce99d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,7 +27,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -int panic_on_oops; +int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; -- cgit v1.2.3-71-gd317 From 62be73eafaa045d3233337303fb140f7f8a61135 Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Tue, 15 May 2012 17:35:09 -0400 Subject: kdump: Execute kmsg_dump(KMSG_DUMP_PANIC) after smp_send_stop() This patch moves kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop(), to serialize the crash-logging process via smp_send_stop() and to thus retrieve a more stable crash image of all CPUs stopped. Signed-off-by: Seiji Aguchi Acked-by: Don Zickus Cc: dle-develop@lists.sourceforge.net Cc: Satoru Moriya Cc: Tony Luck Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/5C4C569E8A4B9B42A84A977CF070A35B2E4D7A5CE2@USINDEVS01.corp.hds.com Signed-off-by: Ingo Molnar --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b6215b7ce99d..d2a5f4ecc6dd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -108,8 +108,6 @@ void panic(const char *fmt, ...) */ crash_kexec(NULL); - kmsg_dump(KMSG_DUMP_PANIC); - /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic @@ -117,6 +115,8 @@ void panic(const char *fmt, ...) */ smp_send_stop(); + kmsg_dump(KMSG_DUMP_PANIC); + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); bust_spinlocks(0); -- cgit v1.2.3-71-gd317 From e73f8959af0439d114847eab5a8a5ce48f1217c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 May 2012 10:59:07 +1000 Subject: task_work_add: generic process-context callbacks Provide a simple mechanism that allows running code in the (nonatomic) context of the arbitrary task. The caller does task_work_add(task, task_work) and this task executes task_work->func() either from do_notify_resume() or from do_exit(). The callback can rely on PF_EXITING to detect the latter case. "struct task_work" can be embedded in another struct, still it has "void *data" to handle the most common/simple case. This allows us to kill the ->replacement_session_keyring hack, and potentially this can have more users. Performance-wise, this adds 2 "unlikely(!hlist_empty())" checks into tracehook_notify_resume() and do_exit(). But at the same time we can remove the "replacement_session_keyring != NULL" checks from arch/*/signal.c and exit_creds(). Note: task_work_add/task_work_run abuses ->pi_lock. This is only because this lock is already used by lookup_pi_state() to synchronize with do_exit() setting PF_EXITING. 
Fortunately the scope of this lock in task_work.c is really tiny, and the code is unlikely anyway. Signed-off-by: Oleg Nesterov Acked-by: David Howells Cc: Thomas Gleixner Cc: Richard Kuo Cc: Linus Torvalds Cc: Alexander Gordeev Cc: Chris Zankel Cc: David Smith Cc: "Frank Ch. Eigler" Cc: Geert Uytterhoeven Cc: Larry Woodman Cc: Peter Zijlstra Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/sched.h | 2 ++ include/linux/task_work.h | 33 +++++++++++++++++++ include/linux/tracehook.h | 11 +++++++ kernel/Makefile | 2 +- kernel/exit.c | 5 ++- kernel/fork.c | 1 + kernel/task_work.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 include/linux/task_work.h create mode 100644 kernel/task_work.c (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ea8baea9387..7930131abc1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,6 +1400,8 @@ struct task_struct { int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; + struct hlist_head task_works; + struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; diff --git a/include/linux/task_work.h b/include/linux/task_work.h new file mode 100644 index 000000000000..294d5d5e90b1 --- /dev/null +++ b/include/linux/task_work.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_TASK_WORK_H +#define _LINUX_TASK_WORK_H + +#include +#include + +struct task_work; +typedef void (*task_work_func_t)(struct task_work *); + +struct task_work { + struct hlist_node hlist; + task_work_func_t func; + void *data; +}; + +static inline void +init_task_work(struct task_work *twork, task_work_func_t func, void *data) +{ + twork->func = func; + twork->data = data; +} + +int task_work_add(struct task_struct *task, struct task_work *twork, bool); +struct task_work *task_work_cancel(struct task_struct *, task_work_func_t); +void task_work_run(void); + +static inline void exit_task_work(struct task_struct *task) +{ + if (unlikely(!hlist_empty(&task->task_works))) + task_work_run(); +} + +#endif /* _LINUX_TASK_WORK_H */ diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index b9ca903bb553..b2dd0917ca0d 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -49,6 +49,7 @@ #include #include #include +#include struct linux_binprm; /* @@ -164,8 +165,10 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, */ static inline void set_notify_resume(struct task_struct *task) { +#ifdef TIF_NOTIFY_RESUME if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME)) kick_process(task); +#endif } /** @@ -185,6 +188,14 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) { if (current->replacement_session_keyring) key_replace_session_keyring(); + /* + * The caller just cleared TIF_NOTIFY_RESUME. 
This barrier + * pairs with task_work_add()->set_notify_resume() after + * hlist_add_head(task->task_works); + */ + smp_mb__after_clear_bit(); + if (unlikely(!hlist_empty(¤t->task_works))) + task_work_run(); } #endif /* */ diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..bf1034008aca 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o \ + signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ diff --git a/kernel/exit.c b/kernel/exit.c index 910a0716e17a..3d93325e0b1a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -946,11 +946,14 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * an exiting task cleaning up the robust pi futexes, and in + * task_work_add() to avoid the race with exit_task_work(). */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); + exit_task_work(tsk); + exit_irq_thread(); if (unlikely(in_atomic())) diff --git a/kernel/fork.c b/kernel/fork.c index 05c813dc9ecc..a46db217a589 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1411,6 +1411,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); + INIT_HLIST_HEAD(&p->task_works); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 000000000000..82d1c794066d --- /dev/null +++ b/kernel/task_work.c @@ -0,0 +1,84 @@ +#include +#include +#include + +int +task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +{ + unsigned long flags; + int err = -ESRCH; + +#ifndef TIF_NOTIFY_RESUME + if (notify) + return -ENOTSUPP; +#endif + /* + * We must not insert the new work if the task has already passed + * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() + * and check PF_EXITING under pi_lock. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (likely(!(task->flags & PF_EXITING))) { + hlist_add_head(&twork->hlist, &task->task_works); + err = 0; + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). 
*/ + if (likely(!err) && notify) + set_notify_resume(task); + return err; +} + +struct task_work * +task_work_cancel(struct task_struct *task, task_work_func_t func) +{ + unsigned long flags; + struct task_work *twork; + struct hlist_node *pos; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + hlist_for_each_entry(twork, pos, &task->task_works, hlist) { + if (twork->func == func) { + hlist_del(&twork->hlist); + goto found; + } + } + twork = NULL; + found: + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + return twork; +} + +void task_work_run(void) +{ + struct task_struct *task = current; + struct hlist_head task_works; + struct hlist_node *pos; + + raw_spin_lock_irq(&task->pi_lock); + hlist_move_list(&task->task_works, &task_works); + raw_spin_unlock_irq(&task->pi_lock); + + if (unlikely(hlist_empty(&task_works))) + return; + /* + * We use hlist to save the space in task_struct, but we want fifo. + * Find the last entry, the list should be short, then process them + * in reverse order. + */ + for (pos = task_works.first; pos->next; pos = pos->next) + ; + + for (;;) { + struct hlist_node **pprev = pos->pprev; + struct task_work *twork = container_of(pos, struct task_work, + hlist); + twork->func(twork); + + if (pprev == &task_works.first) + break; + pos = container_of(pprev, struct hlist_node, next); + } +} -- cgit v1.2.3-71-gd317 From 4d1d61a6b203d957777d73fcebf19d90b038b5b2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 May 2012 10:59:08 +1000 Subject: genirq: reimplement exit_irq_thread() hook via task_work_add() exit_irq_thread() and task->irq_thread are needed to handle the unexpected (and unlikely) exit of irq-thread. We can use task_work instead and make this all private to kernel/irq/manage.c, cleanup plus micro-optimization. 1. rename exit_irq_thread() to irq_thread_dtor(), make it static, and move it up before irq_thread(). 2. change irq_thread() to do task_work_add(irq_thread_dtor) at the start and task_work_cancel() before return. tracehook_notify_resume() can never play with kthreads, only do_exit()->exit_task_work() can call the callback and this is what we want. 3. remove task_struct->irq_thread and the special hook in do_exit(). Signed-off-by: Oleg Nesterov Reviewed-by: Thomas Gleixner Cc: David Howells Cc: Richard Kuo Cc: Linus Torvalds Cc: Alexander Gordeev Cc: Chris Zankel Cc: David Smith Cc: "Frank Ch. 
Eigler" Cc: Geert Uytterhoeven Cc: Larry Woodman Cc: Peter Zijlstra Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/interrupt.h | 4 --- include/linux/sched.h | 10 ++----- kernel/exit.c | 2 -- kernel/irq/manage.c | 68 +++++++++++++++++++++++------------------------ 4 files changed, 35 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c91171599cb6..e68a8e53bb59 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -142,8 +142,6 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, extern int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id); - -extern void exit_irq_thread(void); #else extern int __must_check @@ -177,8 +175,6 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, { return request_irq(irq, handler, 0, devname, percpu_dev_id); } - -static inline void exit_irq_thread(void) { } #endif extern void free_irq(unsigned int, void *); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7930131abc1a..da013853a622 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1301,11 +1301,6 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; -#ifdef CONFIG_GENERIC_HARDIRQS - /* IRQ handler threads */ - unsigned irq_thread:1; -#endif - pid_t pid; pid_t tgid; @@ -1313,10 +1308,9 @@ struct task_struct { /* Canary value for the -fstack-protector gcc feature */ unsigned long stack_canary; #endif - - /* + /* * pointers to (original) parent process, youngest child, younger sibling, - * older sibling, respectively. (p->father can be replaced with + * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ struct task_struct __rcu *real_parent; /* real parent process */ diff --git a/kernel/exit.c b/kernel/exit.c index 3d93325e0b1a..3ecd096e5d4d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -954,8 +954,6 @@ void do_exit(long code) exit_task_work(tsk); - exit_irq_thread(); - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bb32326afe87..4d1f8f897414 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internals.h" @@ -773,11 +774,39 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } +static void irq_thread_dtor(struct task_work *unused) +{ + struct task_struct *tsk = current; + struct irq_desc *desc; + struct irqaction *action; + + if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) + return; + + action = kthread_data(tsk); + + pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + + + desc = irq_to_desc(action->irq); + /* + * If IRQTF_RUNTHREAD is set, we need to decrement + * desc->threads_active and wake possible waiters. 
+ */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + wake_threads_waitq(desc); + + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action); +} + /* * Interrupt handler thread */ static int irq_thread(void *data) { + struct task_work on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; @@ -793,7 +822,9 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); - current->irq_thread = 1; + + init_task_work(&on_exit_work, irq_thread_dtor, NULL); + task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { irqreturn_t action_ret; @@ -815,44 +846,11 @@ static int irq_thread(void *data) * cannot touch the oneshot mask at this point anymore as * __setup_irq() might have given out currents thread_mask * again. - * - * Clear irq_thread. Otherwise exit_irq_thread() would make - * fuzz about an active irq thread going into nirvana. */ - current->irq_thread = 0; + task_work_cancel(current, irq_thread_dtor); return 0; } -/* - * Called from do_exit() - */ -void exit_irq_thread(void) -{ - struct task_struct *tsk = current; - struct irq_desc *desc; - struct irqaction *action; - - if (!tsk->irq_thread) - return; - - action = kthread_data(tsk); - - pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); - - desc = irq_to_desc(action->irq); - - /* - * If IRQTF_RUNTHREAD is set, we need to decrement - * desc->threads_active and wake possible waiters. - */ - if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) - wake_threads_waitq(desc); - - /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action); -} - static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) -- cgit v1.2.3-71-gd317 From f23ca335462e3c84f13270b9e65f83936068ec2c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 May 2012 10:59:09 +1000 Subject: keys: kill task_struct->replacement_session_keyring Kill the no longer used task_struct->replacement_session_keyring, update copy_creds() and exit_creds(). Signed-off-by: Oleg Nesterov Acked-by: David Howells Cc: Thomas Gleixner Cc: Richard Kuo Cc: Linus Torvalds Cc: Alexander Gordeev Cc: Chris Zankel Cc: David Smith Cc: "Frank Ch. 
Eigler" Cc: Geert Uytterhoeven Cc: Larry Woodman Cc: Peter Zijlstra Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/sched.h | 2 -- kernel/cred.c | 9 --------- 2 files changed, 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index da013853a622..17c6c929ee94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1357,8 +1357,6 @@ struct task_struct { * credentials (COW) */ const struct cred __rcu *cred; /* effective (overridable) subjective task * credentials (COW) */ - struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */ - char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) diff --git a/kernel/cred.c b/kernel/cred.c index 430557ea488f..de728ac50d82 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -207,13 +207,6 @@ void exit_creds(struct task_struct *tsk) validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); - - cred = (struct cred *) tsk->replacement_session_keyring; - if (cred) { - tsk->replacement_session_keyring = NULL; - validate_creds(cred); - put_cred(cred); - } } /** @@ -396,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - p->replacement_session_keyring = NULL; - if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && -- cgit v1.2.3-71-gd317 From 23812b9d9e497580d38c62ebdc6f308733b0a32a Mon Sep 17 00:00:00 2001 From: Ning Jiang Date: Tue, 22 May 2012 00:19:20 +0800 Subject: genirq: Add IRQS_PENDING for nested and simple irq Every interrupt which is an active wakeup source needs the ability to abort suspend if there is a pending irq. Right now only edge and level irqs can do that. | +---------+ | INTC | +---------+ | GPIO_IRQ +------------+ | gpio-exp | +------------+ | | GPIO0_IRQ GPIO1_IRQ In the above diagram, gpio expander has irq number GPIO_IRQ, it is connected with two sub GPIO pins, GPIO0 and GPIO1. During suspend, we set IRQF_NO_SUSPEND for GPIO_IRQ so that gpio expander driver can handle the sub irq GPIO0_IRQ and GPIO1_IRQ, and these two irqs themselves can further be handled by simple or nested irq in some drivers(typically gpio and mfd driver). If they are used as wakeup sources during suspend, we want them to be able to abort suspend too. Setting IRQS_PENDING flag in handle_nested_irq() and handle_simple_irq() when the irq is disabled allows check_wakeup_irqs() to identify such irqs as source for aborting suspend. 
Signed-off-by: Ning Jiang Cc: rjw@sisk.pl Link: http://lkml.kernel.org/r/CAH3Oq6T905%2B3fkF43NAMMFvJvq7dsk_so6T2vQ8ZJrA5xiU3YA@mail.gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc275e4f629b..eebd6d5cfb44 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } handle_irq_event(desc); -- cgit v1.2.3-71-gd317 From 818b0f3bfb236ae66cac3ff38e86b9e47f24b7aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 30 Mar 2012 23:11:34 +0800 Subject: genirq: Introduce irq_do_set_affinity() to reduce duplicated code All invocations of chip->irq_set_affinity() are doing the same return value checks. Let them all use a common function. [ tglx: removed the silly likely while at it ] Signed-off-by: Jiang Liu Cc: Jiang Liu Cc: Keping Chen Link: http://lkml.kernel.org/r/1333120296-13563-3-git-send-email-jiang.liu@huawei.com Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 39 ++++++++++++++++++++++----------------- kernel/irq/migration.c | 13 ++----------- 3 files changed, 27 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d9..001fa5bab490 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); +extern int irq_do_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force); + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bb32326afe87..a1b903380bcf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -139,6 +139,25 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif +int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + struct irq_chip *chip = irq_data_get_irq_chip(data); + int ret; + + ret = chip->irq_set_affinity(data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(data->affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + ret = 0; + } + + return ret; +} + int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -149,14 +168,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = chip->irq_set_affinity(data, mask, false); - switch (ret) { - 
case IRQ_SET_MASK_OK: - cpumask_copy(data->affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - ret = 0; - } + ret = irq_do_set_affinity(data, mask, false); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -280,9 +292,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; - int ret, node = desc->irq_data.node; + int node = desc->irq_data.node; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -308,13 +319,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(mask, nodemask)) cpumask_and(mask, mask, nodemask); } - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } + irq_do_set_affinity(&desc->irq_data, mask, false); return 0; } #else diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b327..ca3f4aaff707 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - int ret = chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, desc->pending_mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } - } + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) + irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); cpumask_clear(desc->pending_mask); } -- cgit v1.2.3-71-gd317 From ee74d13229fb606353ff56f4927fa93b37e95bbe Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 20:40:55 +0530 Subject: smpboot, idle: Optimize calls to smp_processor_id() in idle_threads_init() While trying to initialize idle threads for all cpus, idle_threads_init() calls smp_processor_id() in a loop, which is unnecessary. The intent is to initialize idle threads for all non-boot cpus. So just use a variable to note the boot cpu and use it in the loop. Signed-off-by: Srivatsa S. Bhat Cc: suresh.b.siddha@intel.com Cc: venki@google.com Cc: nikunj@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20120524151055.2549.64309.stgit@srivatsabhat.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/smpboot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e1a797e028a3..0f2162f808a7 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -52,10 +52,12 @@ static inline void idle_init(unsigned int cpu) */ void __init idle_threads_init(void) { - unsigned int cpu; + unsigned int cpu, boot_cpu; + + boot_cpu = smp_processor_id(); for_each_possible_cpu(cpu) { - if (cpu != smp_processor_id()) + if (cpu != boot_cpu) idle_init(cpu); } } -- cgit v1.2.3-71-gd317 From 4a70d2d9909b43ed88043b98cabe2c7fbd563021 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 20:41:00 +0530 Subject: smpboot, idle: Fix comment mismatch over idle_threads_init() The comment over idle_threads_init() really talks about the functionality of idle_init(). 
Move that comment to idle_init(), and add a suitable comment over idle_threads_init(). Signed-off-by: Srivatsa S. Bhat Cc: suresh.b.siddha@intel.com Cc: venki@google.com Cc: nikunj@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20120524151100.2549.66501.stgit@srivatsabhat.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/smpboot.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 0f2162f808a7..98f60c5caa1b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void) per_cpu(idle_threads, smp_processor_id()) = current; } +/** + * idle_init - Initialize the idle thread for a cpu + * @cpu: The cpu for which the idle thread should be initialized + * + * Creates the thread if it does not exist. + */ static inline void idle_init(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); @@ -45,10 +51,7 @@ static inline void idle_init(unsigned int cpu) } /** - * idle_thread_init - Initialize the idle thread for a cpu - * @cpu: The cpu for which the idle thread should be initialized - * - * Creates the thread if it does not exist. + * idle_threads_init - Initialize idle threads for all cpus */ void __init idle_threads_init(void) { -- cgit v1.2.3-71-gd317 From 5307c9556bc17e3cd26d4e94fc3b2565921834de Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 8 May 2012 12:20:58 +0200 Subject: tick: Add tick skew boot option Let the user decide whether power consumption or jitter is the more important consideration for their machines. Quoting removal commit af5ab277ded04bd9bc6b048c5a2f0e7d70ef0867: "Historically, Linux has tried to make the regular timer tick on the various CPUs not happen at the same time, to avoid contention on xtime_lock. Nowadays, with the tickless kernel, this contention no longer happens since time keeping and updating are done differently. In addition, this skew is actually hurting power consumption in a measurable way on many-core systems." Problems: - Contrary to the above, systems do encounter contention on both xtime_lock and RCU structure locks when the tick is synchronized. - Moderate sized RT systems suffer intolerable jitter due to the tick being synchronized. - SGI reports the same for their large systems. - Fully utilized systems reap no power saving benefit from skew removal, but do suffer from resulting induced lock contention. - 0209f649 rcu: limit rcu_node leaf-level fanout This patch was born to combat lock contention which testing showed to have been _induced by_ skew removal. Skew the tick, contention disappeared virtually completely. Signed-off-by: Mike Galbraith Link: http://lkml.kernel.org/r/1336472458.21924.78.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 9 +++++++++ kernel/time/tick-sched.c | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b69cfdc12112..ea38cd1f0aba 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2532,6 +2532,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. sched_debug [KNL] Enables verbose scheduler debug messages. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate + xtime_lock contention on larger systems, and/or RCU lock + contention on all systems with CONFIG_MAXSMP set. 
+			Format: { "0" | "1" }
+			0 -- disable. (may be 1 via CONFIG_CMDLINE="skew_tick=1")
+			1 -- enable.
+			Note: increases power consumption, thus should only be
+			enabled if running jitter sensitive (HPC/RT) workloads.
+
 	security=	[SECURITY] Choose a security module to enable at boot.
 			If this boot parameter is not specified, only the first
 			security module asking for security registration will be

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9ff561..4eddbb5ea9c5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -814,6 +814,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	return HRTIMER_RESTART;
 }
 
+static int sched_skew_tick;
+
 /**
  * tick_setup_sched_timer - setup the tick emulation timer
  */
@@ -831,6 +833,14 @@ void tick_setup_sched_timer(void)
 	/* Get the next period (per cpu) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
 
+	/* Offset the tick to avert xtime_lock contention. */
+	if (sched_skew_tick) {
+		u64 offset = ktime_to_ns(tick_period) >> 1;
+		do_div(offset, num_possible_cpus());
+		offset *= smp_processor_id();
+		hrtimer_add_expires_ns(&ts->sched_timer, offset);
+	}
+
 	for (;;) {
 		hrtimer_forward(&ts->sched_timer, now, tick_period);
 		hrtimer_start_expires(&ts->sched_timer,
@@ -910,3 +920,11 @@ int tick_check_oneshot_change(int allow_nohz)
 	tick_nohz_switch_to_nohz();
 	return 0;
 }
+
+static int __init skew_tick(char *str)
+{
+	get_option(&str, &sched_skew_tick);
+
+	return 0;
+}
+early_param("skew_tick", skew_tick);
-- cgit v1.2.3-71-gd317


From e5400321a6f15ce0fe77c8455954f213ef7dcc54 Mon Sep 17 00:00:00 2001
From: Magnus Damm
Date: Wed, 9 May 2012 23:39:34 +0900
Subject: clockevents: Make clockevents_config() a global symbol

Make clockevents_config() into a global symbol to allow it to be used
by compiled-in clockevent drivers. This is needed by drivers that want
to update the timer frequency after registration time.
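
For illustration only: with the symbol exported to the rest of the
kernel, a compiled-in driver whose input clock is reprogrammed at
runtime could do something along these lines (the rate-change hook is
hypothetical; only clockevents_config() comes from this patch).

	/* Recompute mult/shift and the min/max delta for the new rate. */
	static void my_timer_rate_changed(struct clock_event_device *ced,
					  u32 new_freq)
	{
		clockevents_config(ced, new_freq);
	}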
Signed-off-by: Magnus Damm Tested-by: Simon Horman Cc: arnd@arndb.de Cc: johnstul@us.ibm.com Cc: rjw@sisk.pl Cc: lethal@linux-sh.org Cc: gregkh@linuxfoundation.org Cc: olof@lixom.net Cc: Magnus Damm Link: http://lkml.kernel.org/r/20120509143934.27521.46553.sendpatchset@w520 Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + kernel/time/clockevents.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 81e803e90aa4..acba894374a1 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -132,6 +132,7 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); +extern void clockevents_config(struct clock_event_device *dev, u32 freq); extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -static void clockevents_config(struct clock_event_device *dev, - u32 freq) +void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; -- cgit v1.2.3-71-gd317 From 62cf20b32aee4ae889a2eb40fd41c0eab73de970 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 14:08:57 +0200 Subject: tick: Move skew_tick option into the HIGH_RES_TIMER section commit 5307c95 (tick: Add tick skew boot option) broke the !CONFIG_HIGH_RES_TIMERS build. Move the boot option parsing into the CONFIG_HIGH_RES_TIMERS section. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Mike Galbraith --- kernel/time/tick-sched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4eddbb5ea9c5..efd386667536 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -816,6 +816,14 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) static int sched_skew_tick; +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -920,11 +928,3 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } - -static int __init skew_tick(char *str) -{ - get_option(&str, &sched_skew_tick); - - return 0; -} -early_param("skew_tick", skew_tick); -- cgit v1.2.3-71-gd317 From fa980ca87d15bb8a1317853f257a505990f3ffde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 May 2012 08:24:39 -0700 Subject: cgroup: superblock can't be released with active dentries 48ddbe1946 "cgroup: make css->refcnt clearing on cgroup removal optional" allowed a css to linger after the associated cgroup is removed. As a css holds a reference on the cgroup's dentry, it means that cgroup dentries may linger for a while. 
cgroup_create() does grab an active reference on the superblock to prevent it from going away while there are !root cgroups; however, the reference is put from cgroup_diput() which is invoked on cgroup removal, so cgroup dentries which are removed but persisting due to lingering csses already have released their superblock active refs allowing superblock to be killed while those dentries are around. Given the right condition, this makes cgroup_kill_sb() call kill_litter_super() with dentries with non-zero d_count leading to BUG() in shrink_dcache_for_umount_subtree(). Fix it by adding cgroup_dops->d_release() operation and moving deactivate_super() to it. cgroup_diput() now marks dentry->d_fsdata with itself if superblock should be deactivated and cgroup_d_release() deactivates the superblock on dentry release. Signed-off-by: Tejun Heo Reported-by: Sasha Levin Tested-by: Sasha Levin LKML-Reference: Acked-by: Li Zefan --- kernel/cgroup.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad8eae5bb801..e887b55f1f29 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -896,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * Drop the active superblock reference that we took when we - * created the cgroup + * We want to drop the active superblock reference from the + * cgroup creation after all the dentry refs are gone - + * kill_sb gets mighty unhappy otherwise. Mark + * dentry->d_fsdata with cgroup_diput() to tell + * cgroup_d_release() to call deactivate_super(). */ - deactivate_super(cgrp->root->sb); + dentry->d_fsdata = cgroup_diput; /* * if we're getting rid of the cgroup, refcount should ensure @@ -925,6 +928,13 @@ static int cgroup_delete(const struct dentry *d) return 1; } +static void cgroup_d_release(struct dentry *dentry) +{ + /* did cgroup_diput() tell me to deactivate super? */ + if (dentry->d_fsdata == cgroup_diput) + deactivate_super(dentry->d_sb); +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1532,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, + .d_release = cgroup_d_release, }; struct inode *inode = -- cgit v1.2.3-71-gd317 From eea62f831b8030b0eeea8314eed73b6132d1de26 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 8 May 2012 13:32:24 +0930 Subject: brlocks/lglocks: turn into functions lglocks and brlocks are currently generated with some complicated macros in lglock.h. But there's no reason to not just use common utility functions and put all the data into a common data structure. Since there are at least two users it makes sense to share this code in a library. This is also easier maintainable than a macro forest. 
This will also make it later possible to dynamically allocate lglocks and also use them in modules (this would both still need some additional, but now straightforward, code) [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Andi Kleen Cc: Al Viro Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Rusty Russell Signed-off-by: Al Viro --- fs/file_table.c | 1 - fs/internal.h | 2 +- include/linux/lglock.h | 125 ++++++++++--------------------------------------- kernel/Makefile | 2 +- kernel/lglock.c | 89 +++++++++++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 102 deletions(-) create mode 100644 kernel/lglock.c (limited to 'kernel') diff --git a/fs/file_table.c b/fs/file_table.c index 70f2a0fd6aec..f5c67c59ec10 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -34,7 +34,6 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -DECLARE_LGLOCK(files_lglock); DEFINE_LGLOCK(files_lglock); /* SLAB cache for file structures */ diff --git a/fs/internal.h b/fs/internal.h index 9962c59ba280..8040af489c78 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -DECLARE_BRLOCK(vfsmount_lock); +extern struct lglock vfsmount_lock; /* diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0fdd821e77b7..f01e5f6d1f07 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -23,26 +23,17 @@ #include #include #include +#include /* can make br locks by using local lock for read side, global lock for write */ -#define br_lock_init(name) name##_lock_init() -#define br_read_lock(name) name##_local_lock() -#define br_read_unlock(name) name##_local_unlock() -#define br_write_lock(name) name##_global_lock() -#define br_write_unlock(name) name##_global_unlock() +#define br_lock_init(name) lg_lock_init(name, #name) +#define br_read_lock(name) lg_local_lock(name) +#define br_read_unlock(name) lg_local_unlock(name) +#define br_write_lock(name) lg_global_lock(name) +#define br_write_unlock(name) lg_global_unlock(name) -#define DECLARE_BRLOCK(name) DECLARE_LGLOCK(name) #define DEFINE_BRLOCK(name) DEFINE_LGLOCK(name) - -#define lg_lock_init(name) name##_lock_init() -#define lg_local_lock(name) name##_local_lock() -#define lg_local_unlock(name) name##_local_unlock() -#define lg_local_lock_cpu(name, cpu) name##_local_lock_cpu(cpu) -#define lg_local_unlock_cpu(name, cpu) name##_local_unlock_cpu(cpu) -#define lg_global_lock(name) name##_global_lock() -#define lg_global_unlock(name) name##_global_unlock() - #ifdef CONFIG_DEBUG_LOCK_ALLOC #define LOCKDEP_INIT_MAP lockdep_init_map @@ -57,90 +48,26 @@ #define DEFINE_LGLOCK_LOCKDEP(name) #endif - -#define DECLARE_LGLOCK(name) \ - extern void name##_lock_init(void); \ - extern void name##_local_lock(void); \ - extern void name##_local_unlock(void); \ - extern void name##_local_lock_cpu(int cpu); \ - extern void name##_local_unlock_cpu(int cpu); \ - extern void name##_global_lock(void); \ - extern void name##_global_unlock(void); \ +struct lglock { + arch_spinlock_t __percpu *lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lock_key; + struct lockdep_map lock_dep_map; +#endif +}; #define DEFINE_LGLOCK(name) \ - \ - DEFINE_SPINLOCK(name##_cpu_lock); \ - DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ - DEFINE_LGLOCK_LOCKDEP(name); \ - \ - void name##_lock_init(void) { \ - int i; \ - LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \ - for_each_possible_cpu(i) { \ - 
arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \ - } \ - } \ - EXPORT_SYMBOL(name##_lock_init); \ - \ - void name##_local_lock(void) { \ - arch_spinlock_t *lock; \ - preempt_disable(); \ - rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \ - lock = &__get_cpu_var(name##_lock); \ - arch_spin_lock(lock); \ - } \ - EXPORT_SYMBOL(name##_local_lock); \ - \ - void name##_local_unlock(void) { \ - arch_spinlock_t *lock; \ - rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \ - lock = &__get_cpu_var(name##_lock); \ - arch_spin_unlock(lock); \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_local_unlock); \ - \ - void name##_local_lock_cpu(int cpu) { \ - arch_spinlock_t *lock; \ - preempt_disable(); \ - rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \ - lock = &per_cpu(name##_lock, cpu); \ - arch_spin_lock(lock); \ - } \ - EXPORT_SYMBOL(name##_local_lock_cpu); \ - \ - void name##_local_unlock_cpu(int cpu) { \ - arch_spinlock_t *lock; \ - rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \ - lock = &per_cpu(name##_lock, cpu); \ - arch_spin_unlock(lock); \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_local_unlock_cpu); \ - \ - void name##_global_lock(void) { \ - int i; \ - preempt_disable(); \ - rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_possible_cpu(i) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - arch_spin_lock(lock); \ - } \ - } \ - EXPORT_SYMBOL(name##_global_lock); \ - \ - void name##_global_unlock(void) { \ - int i; \ - rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_possible_cpu(i) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - arch_spin_unlock(lock); \ - } \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_global_unlock); + DEFINE_LGLOCK_LOCKDEP(name); \ + DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ + = __ARCH_SPIN_LOCK_UNLOCKED; \ + struct lglock name = { .lock = &name ## _lock } + +void lg_lock_init(struct lglock *lg, char *name); +void lg_local_lock(struct lglock *lg); +void lg_local_unlock(struct lglock *lg); +void lg_local_lock_cpu(struct lglock *lg, int cpu); +void lg_local_unlock_cpu(struct lglock *lg, int cpu); +void lg_global_lock(struct lglock *lg); +void lg_global_unlock(struct lglock *lg); + #endif diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..296132c19a57 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o + async.o range.o groups.o lglock.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/lglock.c b/kernel/lglock.c new file mode 100644 index 000000000000..6535a667a5a7 --- /dev/null +++ b/kernel/lglock.c @@ -0,0 +1,89 @@ +/* See include/linux/lglock.h for description */ +#include +#include +#include +#include + +/* + * Note there is no uninit, so lglocks cannot be defined in + * modules (but it's fine to use them from there) + * Could be added though, just undo lg_lock_init + */ + +void lg_lock_init(struct lglock *lg, char *name) +{ + LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); +} +EXPORT_SYMBOL(lg_lock_init); + +void lg_local_lock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = 
this_cpu_ptr(lg->lock); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock); + +void lg_local_unlock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock); + +void lg_local_lock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock_cpu); + +void lg_local_unlock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock_cpu); + +void lg_global_lock(struct lglock *lg) +{ + int i; + + preempt_disable(); + rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_lock(lock); + } +} +EXPORT_SYMBOL(lg_global_lock); + +void lg_global_unlock(struct lglock *lg) +{ + int i; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_unlock(lock); + } + preempt_enable(); +} +EXPORT_SYMBOL(lg_global_unlock); -- cgit v1.2.3-71-gd317 From 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 17:15:29 +0200 Subject: sched/nohz: Fix rq->cpu_load calculations some more Follow up on commit 556061b00 ("sched/nohz: Fix rq->cpu_load[] calculations") since while that fixed the busy case it regressed the mostly idle case. Add a callback from the nohz exit to also age the rq->cpu_load[] array. This closes the hole where either there was no nohz load balance pass during the nohz, or there was a 'significant' amount of idle time between the last nohz balance and the nohz exit. So we'll update unconditionally from the tick to not insert any accidental 0 load periods while busy, and we try and catch up from nohz idle balance and nohz exit. Both these are still prone to missing a jiffy, but that has always been the case. 
Signed-off-by: Peter Zijlstra Cc: pjt@google.com Cc: Venkatesh Pallipadi Link: http://lkml.kernel.org/n/tip-kt0trz0apodbf84ucjfdbr1a@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 53 +++++++++++++++++++++++++++++++++++++++--------- kernel/time/tick-sched.c | 1 + 3 files changed, 45 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f45c0b280b5d..d61e5977e517 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -145,6 +145,7 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(unsigned long ticks); +extern void update_cpu_load_nohz(void); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc38..75844a8f9aeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2517,25 +2517,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = jiffies; + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = this_rq->load.weight; unsigned long pending_updates; /* - * Bloody broken means of dealing with nohz, but better than nothing.. - * jiffies is updated by one cpu, another cpu can drift wrt the jiffy - * update and see 0 difference the one time and 2 the next, even though - * we ticked at roughtly the same rate. - * - * Hence we only use this from nohz_idle_balance() and skip this - * nonsense when called from the scheduler_tick() since that's - * guaranteed a stable rate. + * bail if there's load or we're actually up-to-date. */ if (load || curr_jiffies == this_rq->last_load_update_tick) return; @@ -2546,13 +2553,39 @@ void update_idle_cpu_load(struct rq *this_rq) __update_cpu_load(this_rq, load, pending_updates); } +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + /* * Called from scheduler_tick() */ static void update_cpu_load_active(struct rq *this_rq) { /* - * See the mess in update_idle_cpu_load(). + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 
*/ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, this_rq->load.weight, 1); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..0c927cd85345 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* -- cgit v1.2.3-71-gd317 From 2ea45800d8e1c3c51c45a233d6bd6289a297a386 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 May 2012 09:26:43 +0200 Subject: sched: Don't try allocating memory from offline nodes Allocators don't appreciate it when you try and allocate memory from offline nodes. Reported-and-tested-by: Tony Luck Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-epfc1io9whb7o22bcujf31vn@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75844a8f9aeb..55733616baaa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6436,7 +6436,7 @@ static void sched_init_numa(void) return; for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!mask) return; -- cgit v1.2.3-71-gd317 From 74a5ce20e6eeeb3751340b390e7ac1d1d07bbf55 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 May 2012 18:00:43 +0200 Subject: sched: Fix SD_OVERLAP SD_OVERLAP exists to allow overlapping groups, overlapping groups appear in NUMA topologies that aren't fully connected. The typical result of not fully connected NUMA is that each cpu (or rather node) will have different spans for a particular distance. However due to how sched domains are traversed -- only the first cpu in the mask goes one level up -- the next level only cares about the spans of the cpus that went up. Due to this two things were observed to be broken: - build_overlap_sched_groups() -- since its possible the cpu we're building the groups for exists in multiple (or all) groups, the selection criteria of the first group didn't ensure there was a cpu for which is was true that cpumask_first(span) == cpu. Thus load- balancing would terminate. - update_group_power() -- assumed that the cpu span of the first group of the domain was covered by all groups of the child domain. The above explains why this isn't true, so deal with it. 
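
A worked example on a hypothetical topology: four nodes connected in a
ring, each node at distance 1 only from its two neighbours.

	node:             0        1        2        3
	distance-1 span:  {3,0,1}  {0,1,2}  {1,2,3}  {2,3,0}

Each node sees a different span for the same distance, so the groups
built from these spans overlap without being equal: a group whose data
was taken over from another cpu's span need not satisfy
cpumask_first(span) == cpu, and the child domain's groups need not
cover the current group -- the two cases handled below.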
Signed-off-by: Peter Zijlstra Cc: David Rientjes Link: http://lkml.kernel.org/r/1337788843.9783.14.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++-- kernel/sched/fair.c | 25 ++++++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55733616baaa..3a69374fb427 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6030,11 +6030,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + sg->sgp = *per_cpu_ptr(sdd->sgp, i); atomic_inc(&sg->sgp->ref); - if (cpumask_test_cpu(cpu, sg_span)) + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + cpumask_first(sg_span) == cpu) { + WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); groups = sg; + } if (!first) first = sg; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf96..f0380d4987b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3574,11 +3574,26 @@ void update_group_power(struct sched_domain *sd, int cpu) power = 0; - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ + + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. + */ + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + } sdg->sgp->power = power; } -- cgit v1.2.3-71-gd317 From b654f7de41b0e3903ee2b51d3b8db77fe52ce728 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 May 2012 14:04:28 +0200 Subject: sched: Make sure to not re-read variables after validation We could re-read rq->rt_avg after we validated it was smaller than total, invalidating the check and resulting in an unintended negative. Signed-off-by: Peter Zijlstra Cc: David Rientjes Link: http://lkml.kernel.org/r/1337688268.9698.29.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f0380d4987b3..2b449a762074 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available; + u64 total, available, age_stamp, avg; - total = sched_avg_period() + (rq->clock - rq->age_stamp); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. 
+ */ + age_stamp = ACCESS_ONCE(rq->age_stamp); + avg = ACCESS_ONCE(rq->rt_avg); + + total = sched_avg_period() + (rq->clock - age_stamp); - if (unlikely(total < rq->rt_avg)) { + if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ available = 0; } else { - available = total - rq->rt_avg; + available = total - avg; } if (unlikely((s64)total < SCHED_POWER_SCALE)) -- cgit v1.2.3-71-gd317 From 29baa7478ba47d746e3625c91d3b2afbf46b4312 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Apr 2012 12:11:21 +0200 Subject: sched: Move nr_cpus_allowed out of 'struct sched_rt_entity' Since nr_cpus_allowed is used outside of sched/rt.c and wants to be used outside of there more, move it to a more natural site. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kr61f02y9brwzkh6x53pdptm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/blackfin/kernel/process.c | 2 +- include/linux/init_task.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 36 +++++++++++++++++++++--------------- 6 files changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 2e3994b20169..62bcea7dcc6d 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -173,7 +173,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) unsigned long newsp; #ifdef __ARCH_SYNC_CORE_DCACHE - if (current->rt.nr_cpus_allowed == num_possible_cpus()) + if (current->nr_cpus_allowed == num_possible_cpus()) set_cpus_allowed_ptr(current, cpumask_of(smp_processor_id())); #endif diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e4baff5f7ff4..9e65eff6af3b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,6 +149,7 @@ extern struct cred init_cred; .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ + .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ .se = { \ @@ -157,7 +158,6 @@ extern struct cred init_cred; .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ .time_slice = RR_TIMESLICE, \ - .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ INIT_PUSHABLE_TASKS(tsk) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d61e5977e517..0f50e78f7f44 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1188,7 +1188,6 @@ struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned int time_slice; - int nr_cpus_allowed; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED @@ -1253,6 +1252,7 @@ struct task_struct { #endif unsigned int policy; + int nr_cpus_allowed; cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a69374fb427..70cc36a6073f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5015,7 +5015,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b449a762074..b2a2d236f27b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed 
== 1) + if (p->nr_cpus_allowed == 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515f..295da737b6fe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1276,10 +1282,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && 
!test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && + p->nr_cpus_allowed > 1 && rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Only update if the process changes its state from whether it * can migrate or not. */ - if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; rq = task_rq(p); -- cgit v1.2.3-71-gd317 From 454c79999f7eaedcdf4c15c449e43902980cbdf5 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Wed, 16 May 2012 21:34:23 -0700 Subject: sched/rt: Fix SCHED_RR across cgroups task_tick_rt() has an optimization to only reschedule SCHED_RR tasks if they were the only element on their rq. However, with cgroups a SCHED_RR task could be the only element on its per-cgroup rq but still be competing with other SCHED_RR tasks in its parent's cgroup. In this case, the SCHED_RR task in the child cgroup would never yield at the end of its timeslice. If the child cgroup rt_runtime_us was the same as the parent cgroup rt_runtime_us, the task in the parent cgroup would starve completely. Modify task_tick_rt() to check that the task is the only task on its rq, and that each of the scheduling entities of its ancestors is also the only entity on its rq. Signed-off-by: Colin Cross Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1337229266-15798-1-git-send-email-ccross@android.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 295da737b6fe..2a4e8dffbd6b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1985,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { + struct sched_rt_entity *rt_se = &p->rt; + update_curr_rt(rq); watchdog(rq, p); @@ -2002,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = RR_TIMESLICE; /* - * Requeue to the end of queue if we are not the only element - * on the queue: + * Requeue to the end of queue if we (and all of our ancestors) are the + * only element on the queue */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } } } -- cgit v1.2.3-71-gd317 From 1292531f6f27af909e713671dd9cc3bcab8114b7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 25 May 2012 15:41:54 +0900 Subject: sched: Make sched_feat_names const The strings sched_feat_names are never changed.
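As an illustrative aside (a sketch, not part of the patch): the doubled const is what lets the whole table live in read-only storage, since both the characters and the pointer slots become immutable:

    /* illustrative sketch only; the strings are placeholders */
    static const char * const names[] = { "one", "two" };

    void frob(void)
    {
    	/* names[0] = "three"; -- rejected: the pointer slots are const */
    	/* names[0][0] = 'x';  -- rejected: the characters are const */
    }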
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBF29B2.9030904@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 70cc36a6073f..c1679a098fc7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -142,7 +142,7 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = { #include "features.h" NULL }; -- cgit v1.2.3-71-gd317 From 7997a456ef841bb78eb6f881d7cc2c17c2f9b35e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 25 May 2012 15:42:47 +0900 Subject: sched: Remove the last NULL entry from sched_feat_names No need to have the last NULL entry. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBF29E7.5020805@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1679a098fc7..94d598ac5e64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -144,7 +144,6 @@ const_debug unsigned int sysctl_sched_features = static const char * const sched_feat_names[] = { #include "features.h" - NULL }; #undef SCHED_FEAT -- cgit v1.2.3-71-gd317 From 6a4c96eef42f835734a82c6b512abf9881b7c55d Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Wed, 23 May 2012 14:44:11 +0530 Subject: sched: Remove NULL assignment of dattr_cur Remove explicit NULL assignment of static pointer dattr_cur from init_sched_domains(). Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120523091411.GG5005@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94d598ac5e64..c46958e26121 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6726,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); -- cgit v1.2.3-71-gd317 From cb7225feec627e91d598198996429e9ee6804f8d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:51:44 +0900 Subject: perf: Remove duplicate invocation on perf_event_for_each The @func callback was invoked twice for the group leader when perf_event_for_each() was called. It seems commit 75f937f24bd9 ("perf_counter: Fix ctx->mutex vs counter ->mutex inversion") introduced the mistake during that change.
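For reference, a condensed sketch of the corrected traversal (simplified from the diff below; the ctx->mutex handling is elided):

    /* sketch: visit the leader once, then each sibling once */
    static void group_walk(struct perf_event *event,
    		       void (*func)(struct perf_event *))
    {
    	struct perf_event *sibling;

    	event = event->group_leader;
    	perf_event_for_each_child(event, func);	/* leader handled here */
    	list_for_each_entry(sibling, &event->sibling_list, group_entry)
    		perf_event_for_each_child(sibling, func);
    	/* the dropped func(event) was a second visit to the leader */
    }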
Signed-off-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338443506-25009-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b06cbbf6931..f85c0154b333 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3181,7 +3181,6 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); -- cgit v1.2.3-71-gd317 From ee5e5683d8ac3fec876cb6c26792212f773d5898 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 May 2012 16:26:05 -0700 Subject: kernel/resource.c: correct the comment of allocate_resource() In the comment of allocate_resource(), the explanation of the parameters max and min is not correct. Actually, these two parameters are used to specify the range of the resource that will be allocated, not the min/max size that will be allocated. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7e8ea66a8c01..e1d2b8ee76d5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -515,8 +515,8 @@ out: * @root: root resource descriptor * @new: resource descriptor desired by caller * @size: requested resource region size - * @min: minimum size to allocate - * @max: maximum size to allocate + * @min: minimum boundary to allocate + * @max: maximum boundary to allocate * @align: alignment requested, in bytes * @alignf: alignment function, optional, called if not NULL * @alignf_data: arbitrary data to pass to the @alignf function -- cgit v1.2.3-71-gd317 From 499eea6bf9c06df3bf4549954aee6fb3427946ed Mon Sep 17 00:00:00 2001 From: Sasikantha babu Date: Thu, 31 May 2012 16:26:07 -0700 Subject: sethostname/setdomainname: notify userspace when there is a change in uts_kern_table sethostname() and setdomainname() notify userspace on failure (without modifying uts_kern_table). Change things so that we only notify userspace on success, when uts_kern_table was actually modified.
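As a hedged userspace illustration (this assumes the notification is consumed through the usual poll() support on /proc/sys/kernel/hostname; error handling trimmed), a watcher now wakes up only when the name has really changed:

    /* sketch: block until a successful sethostname() occurs */
    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	char buf[256];
    	int fd = open("/proc/sys/kernel/hostname", O_RDONLY);
    	struct pollfd pfd = { .fd = fd, .events = POLLERR | POLLPRI };

    	if (fd < 0)
    		return 1;
    	read(fd, buf, sizeof(buf));	/* arm the sysctl poll */
    	poll(&pfd, 1, -1);		/* no longer fires on failed calls */
    	puts("hostname changed");
    	return 0;
    }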
Signed-off-by: Sasikantha babu Cc: Paul Gortmaker Cc: Greg Kroah-Hartman Cc: WANG Cong Reviewed-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 6df42624e454..8b71cef3bf1a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1378,8 +1378,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; + uts_proc_notify(UTS_PROC_HOSTNAME); } - uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1429,8 +1429,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; + uts_proc_notify(UTS_PROC_DOMAINNAME); } - uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } -- cgit v1.2.3-71-gd317 From 97fd75b7b8e0f4e6d3f06b819c89b2555f626fcf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 31 May 2012 16:26:07 -0700 Subject: kernel/irq/manage.c: use the pr_foo() infrastructure to prefix printks Use the module-wide pr_fmt() mechanism rather than open-coding "genirq: " everywhere. Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bb32326afe87..7c475cd3f6e6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -7,6 +7,8 @@ * This file contains driver APIs to the irq subsystem. */ +#define pr_fmt(fmt) "genirq: " fmt + #include #include #include @@ -565,7 +567,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", irq, chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -600,7 +602,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, ret = 0; break; default: - pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } if (unmask) @@ -837,7 +839,7 @@ void exit_irq_thread(void) action = kthread_data(tsk); - pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, action->irq); desc = irq_to_desc(action->irq); @@ -1044,7 +1046,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * has. The type flags are unreliable as the * underlying chip implementation can override them. 
*/ - pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", + pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", irq); ret = -EINVAL; goto out_mask; @@ -1095,7 +1097,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", + pr_warning("irq %d uses trigger mode %u; requested %u\n", irq, nmsk, omsk); } @@ -1133,7 +1135,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { - pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", + pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); -- cgit v1.2.3-71-gd317 From d84970bbaf9a09b3fc60c18ee6d59bc9cb4c3b8a Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 31 May 2012 16:26:07 -0700 Subject: kernel/cpu_pm.c: fix various typos Signed-off-by: Nicolas Pitre Acked-by: Colin Cross Acked-by: Santosh Shilimkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu_pm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 249152e15308..9656a3c36503 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); /** - * cpm_pm_enter - CPU low power entry notifier + * cpu_pm_enter - CPU low power entry notifier * * Notifies listeners that a single CPU is entering a low power state that may * cause some blocks in the same power domain as the cpu to reset. @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); * Must be called on the affected CPU with interrupts disabled. Platform is * responsible for ensuring that cpu_pm_enter is not called twice on the same * CPU before cpu_pm_exit is called. Notified drivers can include VFP - * co-processor, interrupt controller and it's PM extensions, local CPU + * co-processor, interrupt controller and its PM extensions, local CPU * timers context save/restore which shouldn't be interrupted. Hence it * must be called with interrupts disabled. * @@ -115,13 +115,13 @@ int cpu_pm_enter(void) EXPORT_SYMBOL_GPL(cpu_pm_enter); /** - * cpm_pm_exit - CPU low power exit notifier + * cpu_pm_exit - CPU low power exit notifier * * Notifies listeners that a single CPU is exiting a low power state that may * have caused some blocks in the same power domain as the cpu to reset. * * Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Return conditions are same as __raw_notifier_call_chain. @@ -139,7 +139,7 @@ int cpu_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_pm_exit); /** - * cpm_cluster_pm_enter - CPU cluster low power entry notifier + * cpu_cluster_pm_enter - CPU cluster low power entry notifier * * Notifies listeners that all cpus in a power domain are entering a low power * state that may cause some blocks in the same power domain to reset. 
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); * Must be called after cpu_pm_enter has been called on all cpus in the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Must be called with interrupts disabled. @@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); /** - * cpm_cluster_pm_exit - CPU cluster low power exit notifier + * cpu_cluster_pm_exit - CPU cluster low power exit notifier * * Notifies listeners that all cpus in a power domain are exiting form a * low power state that may have caused some blocks in the same power domain @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * Must be called after cpu_pm_exit has been called on all cpus in the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Return conditions are same as __raw_notifier_call_chain. -- cgit v1.2.3-71-gd317 From ae3cef7300e9fddc35ad251dd5f27c5b88c8594a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 31 May 2012 16:26:14 -0700 Subject: kmod: unexport call_usermodehelper_freeinfo() call_usermodehelper_freeinfo() is not used outside of kmod.c. So unexport it, and make it static to kmod.c Signed-off-by: Boaz Harrosh Cc: Oleg Nesterov Cc: Tetsuo Handa Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 4 ---- kernel/kmod.c | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index dd99c329e161..f07f9a4e10ff 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -79,10 +79,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, /* Actually execute the sub-process */ int call_usermodehelper_exec(struct subprocess_info *info, int wait); -/* Free the subprocess_info. 
This is only needed if you're not going - to call call_usermodehelper_exec */ -void call_usermodehelper_freeinfo(struct subprocess_info *info); - static inline int call_usermodehelper_fns(char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), diff --git a/kernel/kmod.c b/kernel/kmod.c index 05698a7415fe..21a0f8e99102 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -221,13 +221,12 @@ fail: return 0; } -void call_usermodehelper_freeinfo(struct subprocess_info *info) +static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } -EXPORT_SYMBOL(call_usermodehelper_freeinfo); static void umh_complete(struct subprocess_info *sub_info) { -- cgit v1.2.3-71-gd317 From 81ab6e7b26b453a795d46f2616ed0e31d97f05b9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 31 May 2012 16:26:15 -0700 Subject: kmod: convert two call sites to call_usermodehelper_fns() Both kernel/sys.c && security/keys/request_key.c were inlining the exact same code as call_usermodehelper_fns(), so simply convert these sites to directly use call_usermodehelper_fns(). Signed-off-by: Boaz Harrosh Cc: Oleg Nesterov Cc: Tetsuo Handa Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 19 ++++++++----------- security/keys/request_key.c | 13 +++---------- 2 files changed, 11 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 8b71cef3bf1a..6e81aa7e4688 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2114,7 +2114,6 @@ int orderly_poweroff(bool force) NULL }; int ret = -ENOMEM; - struct subprocess_info *info; if (argv == NULL) { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", @@ -2122,18 +2121,16 @@ int orderly_poweroff(bool force) goto out; } - info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); - if (info == NULL) { - argv_free(argv); - goto out; - } - - call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + NULL, argv_cleanup, NULL); +out: + if (likely(!ret)) + return 0; - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + if (ret == -ENOMEM) + argv_free(argv); - out: - if (ret && force) { + if (force) { printk(KERN_WARNING "Failed to start orderly shutdown: " "forcing the issue\n"); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index cc3790315d2f..000e75017520 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -93,16 +93,9 @@ static void umh_keys_cleanup(struct subprocess_info *info) static int call_usermodehelper_keys(char *path, char **argv, char **envp, struct key *session_keyring, int wait) { - gfp_t gfp_mask = (wait == UMH_NO_WAIT) ?
GFP_ATOMIC : GFP_KERNEL; - struct subprocess_info *info = - call_usermodehelper_setup(path, argv, envp, gfp_mask); - - if (!info) - return -ENOMEM; - - call_usermodehelper_setfns(info, umh_keys_init, umh_keys_cleanup, - key_get(session_keyring)); - return call_usermodehelper_exec(info, wait); + return call_usermodehelper_fns(path, argv, envp, wait, + umh_keys_init, umh_keys_cleanup, + key_get(session_keyring)); } /* -- cgit v1.2.3-71-gd317 From 785042f2e275089e22c36b462f6495ce8d91732d Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 31 May 2012 16:26:15 -0700 Subject: kmod: move call_usermodehelper_fns() to .c file and unexport all its helpers If we move call_usermodehelper_fns() to kmod.c file and EXPORT_SYMBOL it we can avoid exporting all its helper functions: call_usermodehelper_setup call_usermodehelper_setfns call_usermodehelper_exec And make all of them static to kmod.c. Since the optimizer will see all these as a single call site it will inline them inside call_usermodehelper_fns(). So we lose the call to _fns but gain 3 calls to the helpers. (Not that it matters) Signed-off-by: Boaz Harrosh Cc: Oleg Nesterov Cc: Tetsuo Handa Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 30 ++---------------------------- kernel/kmod.c | 25 ++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index f07f9a4e10ff..5398d5807075 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -66,36 +66,10 @@ struct subprocess_info { void *data; }; -/* Allocate a subprocess_info structure */ -struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, - char **envp, gfp_t gfp_mask); - -/* Set various pieces of state into the subprocess_info structure */ -void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data); - -/* Actually execute the sub-process */ -int call_usermodehelper_exec(struct subprocess_info *info, int wait); - -static inline int +extern int call_usermodehelper_fns(char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *), void *data) -{ - struct subprocess_info *info; - gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - - info = call_usermodehelper_setup(path, argv, envp, gfp_mask); - - if (info == NULL) - return -ENOMEM; - - call_usermodehelper_setfns(info, init, cleanup, data); - - return call_usermodehelper_exec(info, wait); -} + void (*cleanup)(struct subprocess_info *), void *data); static inline int call_usermodehelper(char *path, char **argv, char **envp, int wait) diff --git a/kernel/kmod.c b/kernel/kmod.c index 21a0f8e99102..1f596e4de306 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -478,6 +478,7 @@ static void helper_unlock(void) * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure.
*/ +static struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask) { @@ -493,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, out: return sub_info; } -EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_setfns - set a cleanup/init function @@ -511,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ +static void call_usermodehelper_setfns(struct subprocess_info *info, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), @@ -520,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, info->init = init; info->data = data; } -EXPORT_SYMBOL(call_usermodehelper_setfns); /** * call_usermodehelper_exec - start a usermode application @@ -534,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ +static int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); @@ -575,7 +576,25 @@ unlock: helper_unlock(); return retval; } -EXPORT_SYMBOL(call_usermodehelper_exec); + +int call_usermodehelper_fns( + char *path, char **argv, char **envp, int wait, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data) +{ + struct subprocess_info *info; + gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + + info = call_usermodehelper_setup(path, argv, envp, gfp_mask); + + if (info == NULL) + return -ENOMEM; + + call_usermodehelper_setfns(info, init, cleanup, data); + + return call_usermodehelper_exec(info, wait); +} +EXPORT_SYMBOL(call_usermodehelper_fns); static int proc_cap_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3-71-gd317 From 9b3c98cd663750c33434572ff76ba306505eba5a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 31 May 2012 16:26:15 -0700 Subject: kmod.c: fix kernel-doc warning Warning(kernel/kmod.c:419): No description found for parameter 'depth' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 1f596e4de306..ff2c7cb86d77 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -409,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. - * depth: New value to assign to usermodehelper_disabled. + * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. -- cgit v1.2.3-71-gd317 From 43e13cc107cf6cd3c15fbe1cef849435c2223d50 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 May 2012 16:26:16 -0700 Subject: cred: remove task_is_dead() from __task_cred() validation Commit 8f92054e7ca1 ("CRED: Fix __task_cred()'s lockdep check and banner comment") added the following validation condition: task->exit_state >= 0 to permit the access if the target task is dead and therefore unable to change its own credentials. OK, but afaics currently this can only help wait_task_zombie() which calls __task_cred() without rcu lock.
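In other words, after this change the only sanctioned pattern is the plain RCU one, which is exactly what helpers like task_uid() wrap (a minimal sketch):

    /* sketch: reading another task's objective uid under RCU */
    static kuid_t sketch_task_uid(struct task_struct *p)
    {
    	kuid_t uid;

    	rcu_read_lock();
    	uid = __task_cred(p)->uid;	/* valid only inside the RCU section */
    	rcu_read_unlock();
    	return uid;
    }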
Remove this validation and change wait_task_zombie() to use task_uid() instead. This means we do rcu_read_lock() only to shut up the lockdep, but we already do the same in, say, wait_task_stopped(). task_is_dead() should die, task->exit_state != 0 means that this task has passed exit_notify(), only do_wait-like code paths should use this. Unfortunately, we can't kill task_is_dead() right now, it has already acquired buggy users in drivers/staging. The fix already exists. Signed-off-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Acked-by: David Howells Cc: Jiri Olsa Cc: Paul E. McKenney Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 10 +++------- kernel/exit.c | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index 917dc5aeb1d4..ebbed2ce6637 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -277,17 +277,13 @@ static inline void put_cred(const struct cred *_cred) * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU - * readlock or the task must be dead and unable to change its own credentials. + * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ -#define __task_cred(task) \ - ({ \ - const struct task_struct *__t = (task); \ - rcu_dereference_check(__t->real_cred, \ - task_is_dead(__t)); \ - }) +#define __task_cred(task) \ + rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials diff --git a/kernel/exit.c b/kernel/exit.c index 910a0716e17a..3281493ce7ad 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1214,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); - uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); + uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; if (!likely(wo->wo_flags & WEXITED)) -- cgit v1.2.3-71-gd317 From 168eeccbc956d2ec083c3a513f7706784ee0dc5f Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Thu, 31 May 2012 16:26:16 -0700 Subject: stack usage: add pid to warning printk in check_stack_usage In embedded systems, sometimes the same program (busybox) is the cause of multiple warnings. Outputting the pid with the program name in the warning printk helps distinguish which instances of a program are using the stack most. This is a small patch, but useful. 
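Illustrative output after the change (values invented, format taken from the printk below):

    busybox (1289) used greatest stack depth: 1304 bytes left
    busybox (1411) used greatest stack depth: 1176 bytes left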
Signed-off-by: Tim Bird Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3281493ce7ad..6d85655353e9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -884,9 +884,9 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " - "left\n", - current->comm, free); + printk(KERN_WARNING "%s (%d) used greatest stack depth: " + "%lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); -- cgit v1.2.3-71-gd317 From f7505d64f2db5da2d7d94873ddf2cd2524847061 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 31 May 2012 16:26:21 -0700 Subject: fork: call complete_vfork_done() after clearing child_tid and flushing rss-counters Child should wake up the parent from vfork() only after finishing all operations with shared mm. There is no sense in using CLONE_CHILD_CLEARTID together with CLONE_VFORK, but it looks more accurate now. Signed-off-by: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Konstantin Khlebnikov Cc: Markus Trippelsdorf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 017fb23d5983..2254fbf23567 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -787,9 +787,6 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - if (tsk->vfork_done) - complete_vfork_done(tsk); - /* * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave @@ -810,6 +807,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) } tsk->clear_child_tid = NULL; } + + /* + * All done, finally we can wake up parent and return this mm to him. + * Also kthread_stop() uses this completion for synchronization. + */ + if (tsk->vfork_done) + complete_vfork_done(tsk); } /* -- cgit v1.2.3-71-gd317 From cb79295e20a8088a2fd6a9b3cb5f2d889ec36b4d Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 31 May 2012 16:26:22 -0700 Subject: cpu: introduce clear_tasks_mm_cpumask() helper Many architectures clear tasks' mm_cpumask like this: read_lock(&tasklist_lock); for_each_process(p) { if (p->mm) cpumask_clear_cpu(cpu, mm_cpumask(p->mm)); } read_unlock(&tasklist_lock); Depending on the context, the code above may have several problems, such as: 1. Working with task->mm w/o getting mm or grabbing the task lock is dangerous as ->mm might disappear (exit_mm() assigns NULL under task_lock(), so tasklist lock is not enough). 2. Checking for process->mm is not enough because process' main thread may exit or detach its mm via use_mm(), but other threads may still have a valid mm. This patch implements a small helper function that does things correctly, i.e.: 1. We take the task's lock while we handle its mm (we can't use get_task_mm()/mmput() pair as mmput() might sleep); 2. To catch the exited main thread case, we use find_lock_task_mm(), which walks up all threads and returns an appropriate task (with task lock held).
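With the helper in place, the open-coded loop above collapses to one call; a hedged sketch of a converted CPU-teardown path (the surrounding function is illustrative, not from this patch):

    /* sketch: arch CPU-down path, after the cpu is marked offline */
    static void sketch_platform_cpu_die(unsigned int cpu)
    {
    	clear_tasks_mm_cpumask(cpu);	/* replaces the tasklist_lock loop */
    	/* ... arch-specific teardown continues ... */
    }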
Also, Per Peter Zijlstra's idea, now we don't grab tasklist_lock in the new helper, instead we take the rcu read lock. We can do this because the function is called after the cpu is taken down and marked offline, so no new tasks will get this cpu set in their mm mask. Signed-off-by: Anton Vorontsov Cc: Richard Weinberger Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Mike Frysinger Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 1 + kernel/cpu.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 7230bb59a06f..2e9b9ebbeb78 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -177,6 +177,7 @@ extern void put_online_cpus(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) +void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE diff --git a/kernel/cpu.c b/kernel/cpu.c index 0e6353cf147a..0575197deb4a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -173,6 +175,30 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void clear_tasks_mm_cpumask(int cpu) +{ + struct task_struct *p; + + /* + * This function is called after the cpu is taken down and marked + * offline, so its not like new tasks will ever get this cpu set in + * their mm mask. -- Peter Zijlstra + * Thus, we may use rcu_read_lock() here, instead of grabbing + * full-fledged tasklist_lock. + */ + rcu_read_lock(); + for_each_process(p) { + struct task_struct *t; + + t = find_lock_task_mm(p); + if (!t) + continue; + cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); + task_unlock(t); + } + rcu_read_unlock(); +} + static inline void check_for_tasks(int cpu) { struct task_struct *p; -- cgit v1.2.3-71-gd317 From e4cc2f873ad0833aa5c4aca56bebe15b9603a1e7 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 31 May 2012 16:26:26 -0700 Subject: kernel/cpu.c: document clear_tasks_mm_cpumask() Add more comments on clear_tasks_mm_cpumask, plus adds a runtime check: the function is only suitable for offlined CPUs, and if called inappropriately, the kernel should scream aloud. [akpm@linux-foundation.org: tweak comment: s/walks up/walks/, use 80 cols] Suggested-by: Andrew Morton Suggested-by: Peter Zijlstra Signed-off-by: Anton Vorontsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0575197deb4a..a4eb5227a19e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -175,6 +176,18 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +/** + * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU + * @cpu: a CPU id + * + * This function walks all processes, finds a valid mm struct for each one and + * then clears a corresponding bit in mm's cpumask. While this all sounds + * trivial, there are various non-obvious corner cases, which this function + * tries to solve in a safe manner. 
+ * + * Also note that the function uses a somewhat relaxed locking scheme, so it may + * be called only for an already offlined CPU. + */ void clear_tasks_mm_cpumask(int cpu) { struct task_struct *p; @@ -186,10 +199,15 @@ void clear_tasks_mm_cpumask(int cpu) * Thus, we may use rcu_read_lock() here, instead of grabbing * full-fledged tasklist_lock. */ + WARN_ON(cpu_online(cpu)); rcu_read_lock(); for_each_process(p) { struct task_struct *t; + /* + * Main thread might exit, but other threads may still have + * a valid mm. Find one. + */ t = find_lock_task_mm(p); if (!t) continue; -- cgit v1.2.3-71-gd317 From 3208450488ae724196f1efffc457e4265957c04e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 31 May 2012 16:26:39 -0700 Subject: pidns: use task_active_pid_ns in do_notify_parent Using task_active_pid_ns is more robust because it works even after we have called exit_namespaces. This change allows us to have parent processes that are zombies. Normally a zombie parent process is crazy and the last thing you would want to have, but in the case of not letting the init process of a pid namespace be reaped until all of its children are dead and reaped, a zombie parent process is exactly what we want. Signed-off-by: Eric W. Biederman Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Cc: Louis Rilling Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f7b418217633..08dfbd748cd2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1656,19 +1656,18 @@ bool do_notify_parent(struct task_struct *tsk, int sig) info.si_signo = sig; info.si_errno = 0; /* - * we are under tasklist_lock here so our parent is tied to - * us and cannot exit and release its namespace. + * We are under tasklist_lock here so our parent is tied to + * us and cannot change. * - * the only it can is to switch its nsproxy with sys_unshare, - * bu uncharing pid namespaces is not allowed, so we'll always - * see relevant namespace + * task_active_pid_ns will always return the same pid namespace + * until a task passes through release_task. * * write_lock() currently calls preempt_disable() which is the * same as rcu_read_lock(), but according to Oleg, this is not * correct to rely on this */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), task_uid(tsk)); rcu_read_unlock(); -- cgit v1.2.3-71-gd317 From 00c10bc13cdb58447d6bb2a003afad7bd60f5a5f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 31 May 2012 16:26:40 -0700 Subject: pidns: make killed children autoreap Force SIGCHLD handling to SIG_IGN so that signals are not generated and so that the children autoreap. This increases the parallelism and in general the speed of network namespace shutdown. Note that self-reaping children can exist past zap_pid_ns_processes but they will all be reaped before we allow the pid namespace init task with pid == 1 to be reaped. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Eric W.
Biederman Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Cc: Louis Rilling Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 57bc1fd35b3c..fd3c44986191 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; - struct task_struct *task; + struct task_struct *task, *me = current; + + /* Ignore SIGCHLD causing any terminated children to autoreap */ + spin_lock_irq(&me->sighand->siglock); + me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; + spin_unlock_irq(&me->sighand->siglock); /* * The last thread in the cgroup-init thread group is terminating. -- cgit v1.2.3-71-gd317 From 98ed57eef9f67dfe541be0bca34660ffc88365b2 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:42 -0700 Subject: sysctl: make kernel.ns_last_pid control dependent on CHECKPOINT_RESTORE For those who don't need C/R functionality there is no need to control last pid, ie the pid for the next fork() call. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Tejun Heo Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index fd3c44986191..16b20e38c4a1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -196,6 +196,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -223,8 +224,8 @@ static struct ctl_table pid_ns_ctl_table[] = { }, { } }; - static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +#endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { @@ -258,7 +259,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + +#ifdef CONFIG_CHECKPOINT_RESTORE + register_sysctl_paths(kern_path, pid_ns_ctl_table); +#endif return 0; } -- cgit v1.2.3-71-gd317 From d97b46a64674a267bc41c9e16132ee2a98c3347d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:44 -0700 Subject: syscalls, x86: add __NR_kcmp syscall While doing the checkpoint-restore in the user space one needs to determine whether various kernel objects (like mm_struct-s or file_struct-s) are shared between tasks and restore this state. The 2nd step can be solved by using appropriate CLONE_ flags and the unshare syscall, while there's currently no way of solving the 1st one. One of the ways for checking whether two tasks share e.g. mm_struct is to provide some mm_struct ID of a task to its proc file, but showing such info is considered not that good for security reasons. Thus after some debates we ended up concluding that a dedicated 'comparison' syscall might be the best candidate. So here it is -- __NR_kcmp. It takes up to 5 arguments - the pids of the two tasks (whose characteristics should be compared), the comparison type and (in case of comparison of files) two file descriptors. Lookups for pids are done in the caller's PID namespace only.
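A hedged userspace sketch of a typical query (mirroring the selftest added below; glibc has no wrapper, so a raw syscall is used):

    /* sketch: do two tasks share one mm_struct? */
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/kcmp.h>

    static long sys_kcmp(pid_t pid1, pid_t pid2, int type,
    		     unsigned long idx1, unsigned long idx2)
    {
    	return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
    }

    /* 0 means equal; 1 or 2 give a stable ordering; -1 with errno set
     * (e.g. EPERM without ptrace rights over both tasks) is an error */
    static int same_vm(pid_t p1, pid_t p2)
    {
    	return sys_kcmp(p1, p2, KCMP_VM, 0, 0) == 0;
    }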
At the moment only x86 is supported and tested. [akpm@linux-foundation.org: fix up selftests, warnings] [akpm@linux-foundation.org: include errno.h] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Cyrill Gorcunov Acked-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Glauber Costa Cc: Andi Kleen Cc: Tejun Heo Cc: Matt Helsley Cc: Pekka Enberg Cc: Eric Dumazet Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Valdis.Kletnieks@vt.edu Cc: Michal Marek Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 2 + include/linux/kcmp.h | 17 +++ include/linux/syscalls.h | 2 + kernel/Makefile | 3 + kernel/kcmp.c | 196 +++++++++++++++++++++++++++++++ kernel/sys_ni.c | 3 + tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/kcmp/Makefile | 29 +++++ tools/testing/selftests/kcmp/kcmp_test.c | 94 +++++++++++++++ 10 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 include/linux/kcmp.h create mode 100644 kernel/kcmp.c create mode 100644 tools/testing/selftests/kcmp/Makefile create mode 100644 tools/testing/selftests/kcmp/kcmp_test.c (limited to 'kernel') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 29f9f0554f7d..7a35a6e71d44 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -355,3 +355,4 @@ 346 i386 setns sys_setns 347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index dd29a9ea27c5..51171aeff0dc 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -318,6 +318,8 @@ 309 common getcpu sys_getcpu 310 64 process_vm_readv sys_process_vm_readv 311 64 process_vm_writev sys_process_vm_writev +312 64 kcmp sys_kcmp + # # x32-specific system call numbers start at 512 to avoid cache impact # for native 64-bit operation.
diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h new file mode 100644 index 000000000000..2dcd1b3aafc8 --- /dev/null +++ b/include/linux/kcmp.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_KCMP_H +#define _LINUX_KCMP_H + +/* Comparison type */ +enum kcmp_type { + KCMP_FILE, + KCMP_VM, + KCMP_FILES, + KCMP_FS, + KCMP_SIGHAND, + KCMP_IO, + KCMP_SYSVSEM, + + KCMP_TYPES, +}; + +#endif /* _LINUX_KCMP_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3de3acb84a95..19439c75c5b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid, unsigned long riovcnt, unsigned long flags); +asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, + unsigned long idx1, unsigned long idx2); #endif diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..80be6ca0cc75 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -25,6 +25,9 @@ endif obj-y += sched/ obj-y += power/ +ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) +obj-$(CONFIG_X86) += kcmp.o +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * We don't expose the real in-memory order of objects for security reasons. + * But still the comparison results should be suitable for sorting. So we + * obfuscate kernel pointers values and compare the production instead. + * + * The obfuscation is done in two steps. First we xor the kernel pointer with + * a random value, which puts pointer into a new position in a reordered space. + * Secondly we multiply the xor production with a large odd random number to + * permute its bits even more (the odd multiplier guarantees that the product + * is unique ever after the high bits are truncated, since any odd number is + * relative prime to 2^n). + * + * Note also that the obfuscation itself is invisible to userspace and if needed + * it can be changed to an alternate scheme. + */ +static unsigned long cookies[KCMP_TYPES][2] __read_mostly; + +static long kptr_obfuscate(long v, int type) +{ + return (v ^ cookies[type][0]) * cookies[type][1]; +} + +/* + * 0 - equal, i.e. v1 = v2 + * 1 - less than, i.e. v1 < v2 + * 2 - greater than, i.e. 
v1 > v2 + * 3 - not equal but ordering unavailable (reserved for future) + */ +static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) +{ + long ret; + + ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + + return (ret < 0) | ((ret > 0) << 1); +} + +/* The caller must have pinned the task */ +static struct file * +get_file_raw_ptr(struct task_struct *task, unsigned int idx) +{ + struct file *file = NULL; + + task_lock(task); + rcu_read_lock(); + + if (task->files) + file = fcheck_files(task->files, idx); + + rcu_read_unlock(); + task_unlock(task); + + return file; +} + +static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +{ + if (likely(m2 != m1)) + mutex_unlock(m2); + mutex_unlock(m1); +} + +static int kcmp_lock(struct mutex *m1, struct mutex *m2) +{ + int err; + + if (m2 > m1) + swap(m1, m2); + + err = mutex_lock_killable(m1); + if (!err && likely(m1 != m2)) { + err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + if (err) + mutex_unlock(m1); + } + + return err; +} + +SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, + unsigned long, idx1, unsigned long, idx2) +{ + struct task_struct *task1, *task2; + int ret; + + rcu_read_lock(); + + /* + * Tasks are looked up in caller's PID namespace only. + */ + task1 = find_task_by_vpid(pid1); + task2 = find_task_by_vpid(pid2); + if (!task1 || !task2) + goto err_no_task; + + get_task_struct(task1); + get_task_struct(task2); + + rcu_read_unlock(); + + /* + * One should have enough rights to inspect task details. + */ + ret = kcmp_lock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); + if (ret) + goto err; + if (!ptrace_may_access(task1, PTRACE_MODE_READ) || + !ptrace_may_access(task2, PTRACE_MODE_READ)) { + ret = -EPERM; + goto err_unlock; + } + + switch (type) { + case KCMP_FILE: { + struct file *filp1, *filp2; + + filp1 = get_file_raw_ptr(task1, idx1); + filp2 = get_file_raw_ptr(task2, idx2); + + if (filp1 && filp2) + ret = kcmp_ptr(filp1, filp2, KCMP_FILE); + else + ret = -EBADF; + break; + } + case KCMP_VM: + ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); + break; + case KCMP_FILES: + ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); + break; + case KCMP_FS: + ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); + break; + case KCMP_SIGHAND: + ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); + break; + case KCMP_IO: + ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); + break; + case KCMP_SYSVSEM: +#ifdef CONFIG_SYSVIPC + ret = kcmp_ptr(task1->sysvsem.undo_list, + task2->sysvsem.undo_list, + KCMP_SYSVSEM); +#else + ret = -EOPNOTSUPP; +#endif + break; + default: + ret = -EINVAL; + break; + } + +err_unlock: + kcmp_unlock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); +err: + put_task_struct(task1); + put_task_struct(task2); + + return ret; + +err_no_task: + rcu_read_unlock(); + return -ESRCH; +} + +static __init int kcmp_cookies_init(void) +{ + int i; + + get_random_bytes(cookies, sizeof(cookies)); + + for (i = 0; i < KCMP_TYPES; i++) + cookies[i][1] |= (~(~0UL >> 1) | 1); + + return 0; +} +arch_initcall(kcmp_cookies_init); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); cond_syscall(sys_name_to_handle_at); cond_syscall(sys_open_by_handle_at); cond_syscall(compat_sys_open_by_handle_at); + +/* compare kernel pointers */ +cond_syscall(sys_kcmp); diff --git a/tools/testing/selftests/Makefile 
b/tools/testing/selftests/Makefile index 14972017a43e..a4162e15c25f 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = breakpoints mqueue vm +TARGETS = breakpoints kcmp mqueue vm all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile new file mode 100644 index 000000000000..dc79b86ea65c --- /dev/null +++ b/tools/testing/selftests/kcmp/Makefile @@ -0,0 +1,29 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := X86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ +endif +ifeq ($(ARCH),x86_64) + ARCH := X86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ +endif + +CFLAGS += -I../../../../arch/x86/include/generated/ +CFLAGS += -I../../../../include/ +CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../arch/x86/include/ + +all: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) kcmp_test.c -o run_test +else + echo "Not an x86 target, can't build kcmp selftest" +endif + +run-tests: all + ./kcmp_test + +clean: + rm -fr ./run_test + rm -fr ./test-file diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c new file mode 100644 index 000000000000..358cc6bfa35d --- /dev/null +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -0,0 +1,94 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static long sys_kcmp(int pid1, int pid2, int type, int fd1, int fd2) +{ + return syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2); +} + +int main(int argc, char **argv) +{ + const char kpath[] = "kcmp-test-file"; + int pid1, pid2; + int fd1, fd2; + int status; + + fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644); + pid1 = getpid(); + + if (fd1 < 0) { + perror("Can't create file"); + exit(1); + } + + pid2 = fork(); + if (pid2 < 0) { + perror("fork failed"); + exit(1); + } + + if (!pid2) { + int pid2 = getpid(); + int ret; + + fd2 = open(kpath, O_RDWR, 0644); + if (fd2 < 0) { + perror("Can't open file"); + exit(1); + } + + /* An example of output and arguments */ + printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld " + "FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld " + "INV: %2ld\n", + pid1, pid2, + sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd2), + sys_kcmp(pid1, pid2, KCMP_FILES, 0, 0), + sys_kcmp(pid1, pid2, KCMP_VM, 0, 0), + sys_kcmp(pid1, pid2, KCMP_FS, 0, 0), + sys_kcmp(pid1, pid2, KCMP_SIGHAND, 0, 0), + sys_kcmp(pid1, pid2, KCMP_IO, 0, 0), + sys_kcmp(pid1, pid2, KCMP_SYSVSEM, 0, 0), + + /* This one should fail */ + sys_kcmp(pid1, pid2, KCMP_TYPES + 1, 0, 0)); + + /* This one should return same fd */ + ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1); + if (ret) { + printf("FAIL: 0 expected but %d returned\n", ret); + ret = -1; + } else + printf("PASS: 0 returned as expected\n"); + + /* Compare with self */ + ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0); + if (ret) { + printf("FAIL: 0 expected but %li returned\n", ret); + ret = -1; + } else + printf("PASS: 0 returned as expected\n"); + + exit(ret); + } + + waitpid(pid2, &status, P_ALL); + + return 0; +} -- cgit v1.2.3-71-gd317 From fe8c7f5cbf91124987106faa3bdf0c8b955c4cf7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:45 -0700 Subject: c/r: prctl: extend PR_SET_MM to set up more mm_struct entries During checkpoint we dump whole process memory to a file and the dump includes 
From fe8c7f5cbf91124987106faa3bdf0c8b955c4cf7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Thu, 31 May 2012 16:26:45 -0700
Subject: c/r: prctl: extend PR_SET_MM to set up more mm_struct entries

During checkpoint we dump the whole process memory to a file, and the dump includes the process stack memory. But along with the stack data itself, the stack carries additional parameters such as the command line arguments, environment data and the auxiliary vector. So during the restore procedure, once we've restored the stack data itself, we need to set up mm_struct::arg_start/end and env_start/end so that the restored process is able to find the command line arguments and environment data it had at checkpoint time. The same applies to the auxiliary vector.

For this reason the additional PR_SET_MM_(ARG_START | ARG_END | ENV_START | ENV_END | AUXV) codes are introduced.

Signed-off-by: Cyrill Gorcunov
Acked-by: Kees Cook
Cc: Tejun Heo
Cc: Andrew Vagin
Cc: Serge Hallyn
Cc: Pavel Emelyanov
Cc: Vasiliy Kulikov
Cc: KAMEZAWA Hiroyuki
Cc: Michael Kerrisk
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/prctl.h |   5 ++
 kernel/sys.c          | 134 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 88 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 78b76e24cc7e..18d84c4b42d8 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -113,6 +113,11 @@
 # define PR_SET_MM_START_STACK 5
 # define PR_SET_MM_START_BRK 6
 # define PR_SET_MM_BRK 7
+# define PR_SET_MM_ARG_START 8
+# define PR_SET_MM_ARG_END 9
+# define PR_SET_MM_ENV_START 10
+# define PR_SET_MM_ENV_END 11
+# define PR_SET_MM_AUXV 12
 
 /*
  * Set specific pid that is allowed to ptrace the current task.
diff --git a/kernel/sys.c b/kernel/sys.c
index 6e81aa7e4688..8b544972e46e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1784,17 +1784,23 @@ SYSCALL_DEFINE1(umask, int, mask)
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
+static bool vma_flags_mismatch(struct vm_area_struct *vma,
+			       unsigned long required,
+			       unsigned long banned)
+{
+	return (vma->vm_flags & required) != required ||
+	       (vma->vm_flags & banned);
+}
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
 	unsigned long rlim = rlimit(RLIMIT_DATA);
-	unsigned long vm_req_flags;
-	unsigned long vm_bad_flags;
-	struct vm_area_struct *vma;
-	int error = 0;
 	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int error;
 
-	if (arg4 | arg5)
+	if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_RESOURCE))
@@ -1803,58 +1809,23 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (addr >= TASK_SIZE)
 		return -EINVAL;
 
+	error = -EINVAL;
+
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, addr);
 
-	if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
-		/* It must be existing VMA */
-		if (!vma || vma->vm_start > addr)
-			goto out;
-	}
-
-	error = -EINVAL;
 	switch (opt) {
 	case PR_SET_MM_START_CODE:
+		mm->start_code = addr;
+		break;
 	case PR_SET_MM_END_CODE:
-		vm_req_flags = VM_READ | VM_EXEC;
-		vm_bad_flags = VM_WRITE | VM_MAYSHARE;
-
-		if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
-		    (vma->vm_flags & vm_bad_flags))
-			goto out;
-
-		if (opt == PR_SET_MM_START_CODE)
-			mm->start_code = addr;
-		else
-			mm->end_code = addr;
+		mm->end_code = addr;
 		break;
-
 	case PR_SET_MM_START_DATA:
-	case PR_SET_MM_END_DATA:
-		vm_req_flags = VM_READ | VM_WRITE;
-		vm_bad_flags = VM_EXEC | VM_MAYSHARE;
-
-		if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
-		    (vma->vm_flags & vm_bad_flags))
-			goto out;
-
-		if (opt == PR_SET_MM_START_DATA)
-			mm->start_data = addr;
-		else
-			mm->end_data = addr;
+		mm->start_data = addr;
 		break;
-
-	case PR_SET_MM_START_STACK:
-
-#ifdef CONFIG_STACK_GROWSUP
-		vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
-#else
-		vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
-#endif
-		if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
-			goto out;
-
-		mm->start_stack = addr;
+	case PR_SET_MM_END_DATA:
+		mm->end_data = addr;
 		break;
 
 	case PR_SET_MM_START_BRK:
@@ -1881,16 +1852,77 @@ static int prctl_set_mm(int opt, unsigned long addr,
 		mm->brk = addr;
 		break;
 
+	/*
+	 * If command line arguments and environment
+	 * are placed somewhere else on stack, we can
+	 * set them up here, ARG_START/END to setup
+	 * command line arguments and ENV_START/END
+	 * for environment.
+	 */
+	case PR_SET_MM_START_STACK:
+	case PR_SET_MM_ARG_START:
+	case PR_SET_MM_ARG_END:
+	case PR_SET_MM_ENV_START:
+	case PR_SET_MM_ENV_END:
+		if (!vma) {
+			error = -EFAULT;
+			goto out;
+		}
+#ifdef CONFIG_STACK_GROWSUP
+		if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
+#else
+		if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
+#endif
+			goto out;
+		if (opt == PR_SET_MM_START_STACK)
+			mm->start_stack = addr;
+		else if (opt == PR_SET_MM_ARG_START)
+			mm->arg_start = addr;
+		else if (opt == PR_SET_MM_ARG_END)
+			mm->arg_end = addr;
+		else if (opt == PR_SET_MM_ENV_START)
+			mm->env_start = addr;
+		else if (opt == PR_SET_MM_ENV_END)
+			mm->env_end = addr;
+		break;
+
+	/*
+	 * This doesn't move the auxiliary vector itself
+	 * since it's pinned to mm_struct, but allows
+	 * filling the vector with new values. It's up
+	 * to the caller to provide sane values here,
+	 * otherwise user space tools which use this
+	 * vector might be unhappy.
+	 */
+	case PR_SET_MM_AUXV: {
+		unsigned long user_auxv[AT_VECTOR_SIZE];
+
+		if (arg4 > sizeof(user_auxv))
+			goto out;
+		up_read(&mm->mmap_sem);
+
+		if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
+			return -EFAULT;
+
+		/* Make sure the last entry is always AT_NULL */
+		user_auxv[AT_VECTOR_SIZE - 2] = 0;
+		user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+		BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+		task_lock(current);
+		memcpy(mm->saved_auxv, user_auxv, arg4);
+		task_unlock(current);
+
+		return 0;
+	}
 	default:
-		error = -EINVAL;
 		goto out;
 	}
 
 	error = 0;
-
 out:
 	up_read(&mm->mmap_sem);
-
 	return error;
 }
 #else /* CONFIG_CHECKPOINT_RESTORE */
-- cgit v1.2.3-71-gd317
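As an aside, not part of the patch: a checkpoint/restore tool would drive these new codes roughly as follows once the stack contents have been written back. The function below is hypothetical, assumes the caller holds CAP_SYS_RESOURCE, and omits error reporting beyond the return value. Note that PR_SET_MM_AUXV is the only option here that uses arg4 (the vector size in bytes); for the others arg4 and arg5 must be zero, per the check at the top of prctl_set_mm().

#include <sys/prctl.h>	/* PR_SET_MM_* values 8..12 come from the patched linux/prctl.h */

/* Hypothetical restorer step: the addresses must lie inside the
 * already-restored stack vma, which the kernel checks for
 * VM_READ | VM_WRITE | VM_GROWSDOWN (or VM_GROWSUP). */
static int restore_mm_fields(unsigned long arg_start, unsigned long arg_end,
			     unsigned long env_start, unsigned long env_end,
			     unsigned long *auxv, unsigned long auxv_bytes)
{
	if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, arg_start, 0, 0) ||
	    prctl(PR_SET_MM, PR_SET_MM_ARG_END, arg_end, 0, 0) ||
	    prctl(PR_SET_MM, PR_SET_MM_ENV_START, env_start, 0, 0) ||
	    prctl(PR_SET_MM, PR_SET_MM_ENV_END, env_end, 0, 0))
		return -1;

	/* auxv_bytes may not exceed sizeof(mm->saved_auxv); the kernel
	 * forces the trailing AT_NULL entry itself. */
	return prctl(PR_SET_MM, PR_SET_MM_AUXV,
		     (unsigned long)auxv, auxv_bytes, 0);
}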
From b32dfe377102ce668775f8b6b1461f7ad428f8b6 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Thu, 31 May 2012 16:26:46 -0700
Subject: c/r: prctl: add ability to set new mm_struct::exe_file

When we do a restore we would like to have a way to set up the former mm_struct::exe_file so that /proc/pid/exe points to the original executable file the process had at checkpoint time. For this the PR_SET_MM_EXE_FILE code is introduced. This option takes a file descriptor which will be set as the source for the new /proc/$pid/exe symlink.

Note it allows changing /proc/$pid/exe only if there are no VM_EXECUTABLE vmas present for the current process, simply because this feature is special to C/R and mm::num_exe_file_vmas becomes meaningless after that.

To minimize the number of transitions the /proc/pid/exe symlink might go through, this feature is implemented in a one-shot manner: once changed, the symlink can't be changed again. This should help sysadmins monitor the symlinks of all processes running in a system. In particular, one could take a snapshot of processes and raise an alarm on unexpected changes of /proc/pid/exe in a system.

Note -- this feature is available only if CONFIG_CHECKPOINT_RESTORE is set, and the caller must have the CAP_SYS_RESOURCE capability granted; otherwise the request to change the symlink will be rejected.
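As an illustration (not part of the commit): a restorer would exercise this interface roughly like the sketch below. The helper is hypothetical and assumes the old executable's VM_EXECUTABLE mappings have already been unmapped, so mm->exe_file is clear and the one-shot replacement can succeed.

#include <fcntl.h>
#include <unistd.h>
#include <sys/prctl.h>	/* PR_SET_MM_EXE_FILE (13) comes from the patched linux/prctl.h */

/* Hypothetical: make /proc/self/exe point at @path. One-shot by design. */
static int restore_exe_link(const char *path)
{
	int ret;
	int fd = open(path, O_RDONLY);	/* must be a regular file on a non-noexec mount */

	if (fd < 0)
		return -1;
	ret = prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0, 0);
	close(fd);	/* on success the kernel keeps its own reference */
	return ret;
}

Opening the file read-only is sufficient here: the kernel checks S_ISREG(), the mount's MNT_NOEXEC flag and inode_permission(MAY_EXEC) on the descriptor itself, so the caller does not need to map or execute the file first.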
Signed-off-by: Cyrill Gorcunov Reviewed-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Pavel Emelyanov Cc: Kees Cook Cc: Tejun Heo Cc: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 1 + kernel/sys.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 18d84c4b42d8..711e0a30aacc 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -118,6 +118,7 @@ # define PR_SET_MM_ENV_START 10 # define PR_SET_MM_ENV_END 11 # define PR_SET_MM_AUXV 12 +# define PR_SET_MM_EXE_FILE 13 /* * Set specific pid that is allowed to ptrace the current task. diff --git a/kernel/sys.c b/kernel/sys.c index 8b544972e46e..9ff89cb9657a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include #include @@ -1792,6 +1794,57 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma, (vma->vm_flags & banned); } +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +{ + struct file *exe_file; + struct dentry *dentry; + int err; + + /* + * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's + * remain. So perform a quick test first. + */ + if (mm->num_exe_file_vmas) + return -EBUSY; + + exe_file = fget(fd); + if (!exe_file) + return -EBADF; + + dentry = exe_file->f_path.dentry; + + /* + * Because the original mm->exe_file points to executable file, make + * sure that this one is executable as well, to avoid breaking an + * overall picture. + */ + err = -EACCES; + if (!S_ISREG(dentry->d_inode->i_mode) || + exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + err = inode_permission(dentry->d_inode, MAY_EXEC); + if (err) + goto exit; + + /* + * The symlink can be changed only once, just to disallow arbitrary + * transitions malicious software might bring in. This means one + * could make a snapshot over all processes running and monitor + * /proc/pid/exe changes to notice unusual activity if needed. + */ + down_write(&mm->mmap_sem); + if (likely(!mm->exe_file)) + set_mm_exe_file(mm, exe_file); + else + err = -EBUSY; + up_write(&mm->mmap_sem); + +exit: + fput(exe_file); + return err; +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { @@ -1806,6 +1859,9 @@ static int prctl_set_mm(int opt, unsigned long addr, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; + if (opt == PR_SET_MM_EXE_FILE) + return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (addr >= TASK_SIZE) return -EINVAL; -- cgit v1.2.3-71-gd317 From 754421c8cab1a568be844a7069fe04c1cf6391b8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 Apr 2012 18:31:00 -0400 Subject: HAVE_RESTORE_SIGMASK is defined on all architectures now Everyone either defines it in arch thread_info.h or has TIF_RESTORE_SIGMASK and picks default set_restore_sigmask() in linux/thread_info.h. Kill the ifdefs, slap #error in linux/thread_info.h to catch breakage when new ones get merged. 
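For context, the generic fallback these architectures now all rely on is approximately the following (an editorial sketch reconstructed from the surrounding definitions, not a verbatim quote of linux/thread_info.h; details may differ per tree):

/* Sketch: when an arch defines TIF_RESTORE_SIGMASK but no
 * set_restore_sigmask() of its own, linux/thread_info.h supplies
 * roughly this pair; the #error added by this patch fires when
 * neither the arch nor this fallback defines HAVE_SET_RESTORE_SIGMASK. */
#define HAVE_SET_RESTORE_SIGMASK 1

static inline void set_restore_sigmask(void)
{
	set_thread_flag(TIF_RESTORE_SIGMASK);
	/* make sure the signal-delivery path runs before return to user */
	set_thread_flag(TIF_SIGPENDING);
}

static inline void clear_restore_sigmask(void)
{
	clear_thread_flag(TIF_RESTORE_SIGMASK);
}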
Signed-off-by: Al Viro --- fs/compat.c | 4 ---- fs/eventpoll.c | 4 ---- fs/select.c | 4 ---- include/linux/thread_info.h | 4 ++++ kernel/signal.c | 2 -- 5 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/fs/compat.c b/fs/compat.c index 6556a9ce8a28..3adf3d4c2cd9 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1547,7 +1547,6 @@ asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg) compat_ptr(a.exp), compat_ptr(a.tvp)); } -#ifdef HAVE_SET_RESTORE_SIGMASK static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, @@ -1670,11 +1669,9 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, return ret; } -#endif /* HAVE_SET_RESTORE_SIGMASK */ #ifdef CONFIG_EPOLL -#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long compat_sys_epoll_pwait(int epfd, struct compat_epoll_event __user *events, int maxevents, int timeout, @@ -1718,7 +1715,6 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, return err; } -#endif /* HAVE_SET_RESTORE_SIGMASK */ #endif /* CONFIG_EPOLL */ diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 079d1be65ba9..74598f67efeb 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1853,8 +1853,6 @@ error_return: return error; } -#ifdef HAVE_SET_RESTORE_SIGMASK - /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). @@ -1899,8 +1897,6 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return error; } -#endif /* HAVE_SET_RESTORE_SIGMASK */ - static int __init eventpoll_init(void) { struct sysinfo si; diff --git a/fs/select.c b/fs/select.c index 17d33d09fc16..bae321569dfa 100644 --- a/fs/select.c +++ b/fs/select.c @@ -614,7 +614,6 @@ SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, return ret; } -#ifdef HAVE_SET_RESTORE_SIGMASK static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) @@ -686,7 +685,6 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize); } -#endif /* HAVE_SET_RESTORE_SIGMASK */ #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { @@ -941,7 +939,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, return ret; } -#ifdef HAVE_SET_RESTORE_SIGMASK SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) @@ -992,4 +989,3 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return ret; } -#endif /* HAVE_SET_RESTORE_SIGMASK */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index db78775eff3b..eee729428683 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -129,6 +129,10 @@ static inline void set_restore_sigmask(void) } #endif /* TIF_RESTORE_SIGMASK && !HAVE_SET_RESTORE_SIGMASK */ +#ifndef HAVE_SET_RESTORE_SIGMASK +#error "no set_restore_sigmask() provided and default one won't work" +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ diff --git a/kernel/signal.c b/kernel/signal.c index 08dfbd748cd2..95a9d9d8122b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3235,7 +3235,6 @@ SYSCALL_DEFINE0(pause) #endif -#ifdef HAVE_SET_RESTORE_SIGMASK int 
sigsuspend(sigset_t *set) { sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP)); @@ -3248,7 +3247,6 @@ int sigsuspend(sigset_t *set) set_restore_sigmask(); return -ERESTARTNOHAND; } -#endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND /** -- cgit v1.2.3-71-gd317 From a610d6e672d6d3723e8da257ad4a8a288a8f2f89 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 May 2012 23:42:15 -0400 Subject: pull clearing RESTORE_SIGMASK into block_sigmask() Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 5 ----- arch/arm/kernel/signal.c | 18 +++--------------- arch/avr32/kernel/signal.c | 11 ++++------- arch/blackfin/kernel/signal.c | 24 +++++++----------------- arch/c6x/kernel/signal.c | 23 ++++++----------------- arch/cris/arch-v10/kernel/signal.c | 14 ++------------ arch/cris/arch-v32/kernel/signal.c | 15 ++------------- arch/frv/kernel/signal.c | 24 +++++++----------------- arch/h8300/kernel/signal.c | 4 +--- arch/hexagon/kernel/signal.c | 26 ++++++-------------------- arch/ia64/kernel/signal.c | 10 +--------- arch/m32r/kernel/signal.c | 8 +++----- arch/m68k/kernel/signal.c | 2 -- arch/microblaze/kernel/signal.c | 17 +++-------------- arch/mips/kernel/signal.c | 18 +++--------------- arch/mn10300/kernel/signal.c | 18 +++++------------- arch/openrisc/kernel/signal.c | 2 -- arch/parisc/kernel/signal.c | 10 ++++------ arch/powerpc/kernel/signal.c | 7 ------- arch/s390/kernel/compat_signal.c | 10 +++++++--- arch/s390/kernel/entry.h | 2 +- arch/s390/kernel/signal.c | 32 +++++++++++--------------------- arch/score/kernel/signal.c | 24 +++++------------------- arch/sh/kernel/signal_32.c | 26 +++++++------------------- arch/sh/kernel/signal_64.c | 24 ++++++++---------------- arch/sparc/kernel/signal32.c | 15 +++------------ arch/sparc/kernel/signal_32.c | 16 +++------------- arch/sparc/kernel/signal_64.c | 15 +++------------ arch/tile/kernel/signal.c | 25 +++++-------------------- arch/um/kernel/signal.c | 16 ++-------------- arch/unicore32/kernel/signal.c | 18 +++--------------- arch/x86/kernel/signal.c | 31 +++++++++---------------------- arch/xtensa/kernel/signal.c | 1 - kernel/signal.c | 6 ++++++ 34 files changed, 130 insertions(+), 387 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index f1e7d2aa2586..bb45a8813393 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -481,11 +481,6 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, return; } block_sigmask(ka, sig); - /* A signal was successfully delivered, and the - saved sigmask was stored on the signal frame, - and will be restored by sigreturn. So we can - simply clear the restore sigmask flag. */ - clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline void diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 2e66c93973c3..7f9abd75fc2e 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -528,7 +528,7 @@ setup_rt_frame(int usig, struct k_sigaction *ka, siginfo_t *info, /* * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { @@ -559,17 +559,14 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, if (ret != 0) { force_sigsegv(sig, tsk); - return ret; + return; } /* * Block the signal if we were successful. 
*/ block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, 0); - - return 0; } /* @@ -633,16 +630,7 @@ static void do_signal(struct pt_regs *regs, int syscall) clear_thread_flag(TIF_SYSCALL_RESTARTSYS); } - if (handle_signal(signr, &ka, &info, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } + handle_signal(signr, &ka, &info, regs); return; } diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 0e2c0527c9fe..dc7875a0ad79 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -238,16 +238,13 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, */ ret |= !valid_user_regs(regs); - if (ret != 0) { - force_sigsegv(sig, current); - return; - } - /* * Block the signal if we were successful. */ - block_sigmask(ka, sig); - clear_thread_flag(TIF_RESTORE_SIGMASK); + if (ret != 0) + force_sigsegv(sig, current); + else + block_sigmask(ka, sig); } /* diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index 7f4205ddfa4d..b25cbfef8192 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -247,7 +247,7 @@ handle_restart(struct pt_regs *regs, struct k_sigaction *ka, int has_handler) /* * OK, we're invoking a handler */ -static int +static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { @@ -260,11 +260,12 @@ handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, /* set up the stack frame */ ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); + if (ret) + return; - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -290,18 +291,7 @@ asmlinkage void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. 
*/ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } - + handle_signal(signr, &info, &ka, regs); return; } diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index 38bb501eb117..f39346f1f2d6 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -248,7 +248,7 @@ do_restart: /* * handle the actual delivery of a signal to userspace */ -static int handle_signal(int sig, +static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int syscall) { @@ -277,11 +277,10 @@ static int handle_signal(int sig, } /* Set up the stack frame */ - ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0) + return; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, 0); } /* @@ -300,17 +299,7 @@ static void do_signal(struct pt_regs *regs, int syscall) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - if (handle_signal(signr, &info, &ka, regs, syscall) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signr, &info, &ka, regs, 0); - } - + handle_signal(signr, &info, &ka, regs, syscall); return; } diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 09a4cf4eb08a..46c8ca605e4d 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -415,7 +415,7 @@ give_sigsegv: * OK, we're invoking a handler */ -static inline int handle_signal(int canrestart, unsigned long sig, +static inline void handle_signal(int canrestart, unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { @@ -458,8 +458,6 @@ static inline int handle_signal(int canrestart, unsigned long sig, if (ret == 0) block_sigmask(ka, sig); - - return ret; } /* @@ -492,15 +490,7 @@ void do_signal(int canrestart, struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(canrestart, signr, &info, &ka, - regs)) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } + handle_signal(canrestart, signr, &info, &ka, regs); return; } diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index d52276ddae4b..e0431328b7cd 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -434,7 +434,7 @@ give_sigsegv: } /* Invoke a signal handler to, well, handle the signal. 
*/ -static inline int +static inline void handle_signal(int canrestart, unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs * regs) @@ -491,8 +491,6 @@ handle_signal(int canrestart, unsigned long sig, if (ret == 0) block_sigmask(ka, sig); - - return ret; } /* @@ -525,16 +523,7 @@ do_signal(int canrestart, struct pt_regs *regs) if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(canrestart, signr, &info, &ka, - regs)) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } - + handle_signal(canrestart, signr, &info, &ka, regs); return; } diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 22efe8d25038..8dd0492bfb7b 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -426,7 +426,7 @@ give_sigsegv: /* * OK, we're invoking a handler */ -static int handle_signal(unsigned long sig, siginfo_t *info, +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka) { sigset_t *oldset = sigmask_to_save(); @@ -461,11 +461,12 @@ static int handle_signal(unsigned long sig, siginfo_t *info, else ret = setup_frame(sig, ka, oldset); - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + if (ret) + return; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, __frame, + test_thread_flag(TIF_SINGLESTEP)); } /* end handle_signal() */ /*****************************************************************************/ @@ -495,18 +496,7 @@ static void do_signal(void) signr = get_signal_to_deliver(&info, &ka, __frame, NULL); if (signr > 0) { - if (handle_signal(signr, &info, &ka) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signr, &info, &ka, __frame, - test_thread_flag(TIF_SINGLESTEP)); - } - + handle_signal(signr, &info, &ka); return; } diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index d4d2f72672ad..eac26c9ffc44 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -442,10 +442,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, else ret = setup_frame(sig, ka, oldset, regs); - if (!ret) { + if (!ret) block_sigmask(ka, sig); - clear_thread_flag(TIF_RESTORE_SIGMASK); - } } /* diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index f73fcee09bac..5f7d7c8a1328 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -149,11 +149,9 @@ sigsegv: /* * Setup invocation of signal handler */ -static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, +static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - int rc; - /* * If we're handling a signal that aborted a system call, * set up the error return value before adding the signal @@ -186,15 +184,13 @@ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, * Set up the stack frame; not doing the SA_SIGINFO thing. We * only set up the rt_frame flavor. 
*/ - rc = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); - /* If there was an error on setup, no signal was delivered. */ - if (rc) - return rc; + if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0) + return; block_sigmask(ka, sig); - - return 0; + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -215,17 +211,7 @@ static void do_signal(struct pt_regs *regs) signo = get_signal_to_deliver(&info, &sigact, regs, NULL); if (signo > 0) { - if (handle_signal(signo, &info, &sigact, regs) == 0) { - /* - * Successful delivery case. The saved sigmask is - * stored in the signal frame, and will be restored - * by sigreturn. We can clear the TIF flag. - */ - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signo, &info, &sigact, regs, - test_thread_flag(TIF_SINGLESTEP)); - } + handle_signal(signo, &info, &sigact, regs); return; } diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 9fee6d6a3f21..dc6fe6573465 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -501,16 +501,8 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) * Whee! Actually deliver the signal. If the delivery failed, we need to * continue to iterate in this loop so we can deliver the SIGSEGV... */ - if (handle_signal(signr, &ka, &info, scr)) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; + if (handle_signal(signr, &ka, &info, scr)) return; - } } /* Did we come from a system call? */ diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index e0d6d1079f33..970f46dbf24f 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -267,7 +267,7 @@ static int prev_insn(struct pt_regs *regs) * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { @@ -295,10 +295,9 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, /* Set up the stack frame */ if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs)) - return -EFAULT; + return; block_sigmask(ka, sig); - return 0; } /* @@ -333,8 +332,7 @@ static void do_signal(struct pt_regs *regs) */ /* Whee! Actually deliver the signal. 
*/ - if (handle_signal(signr, &ka, &info, regs) == 0) - clear_thread_flag(TIF_RESTORE_SIGMASK); + handle_signal(signr, &ka, &info, regs); return; } diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index c83eb5a8ed8b..6dbee8a167a5 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1147,8 +1147,6 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, regs->sr &= ~0x8000; send_sig(SIGTRAP, current, 1); } - - clear_thread_flag(TIF_RESTORE_SIGMASK); } /* diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index fd2de5718a4e..03641199666e 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -310,7 +310,7 @@ do_restart: * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { @@ -324,11 +324,9 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, ret = setup_rt_frame(sig, ka, NULL, oldset, regs); if (ret) - return ret; + return; block_sigmask(ka, sig); - - return 0; } /* @@ -356,16 +354,7 @@ static void do_signal(struct pt_regs *regs, int in_syscall) /* Whee! Actually deliver the signal. */ if (in_syscall) handle_restart(regs, &ka, 1); - if (!handle_signal(signr, &ka, &info, oldset, regs)) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= - ~TS_RESTORE_SIGMASK; - } + handle_signal(signr, &ka, &info, regs); return; } diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 18355060f241..53c6e90082f0 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -514,7 +514,7 @@ struct mips_abi mips_abi = { .restart = __NR_restart_syscall }; -static int handle_signal(unsigned long sig, siginfo_t *info, +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); @@ -551,11 +551,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info, ka, regs, sig, oldset); if (ret) - return ret; + return; block_sigmask(ka, sig); - - return ret; } static void do_signal(struct pt_regs *regs) @@ -575,17 +573,7 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. 
- */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } - + handle_signal(signr, &info, &ka, regs); return; } diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index 26a1d98c62a1..1715478f4e94 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -462,11 +462,12 @@ static int handle_signal(int sig, ret = setup_rt_frame(sig, ka, info, oldset, regs); else ret = setup_frame(sig, ka, oldset, regs); + if (ret) + return; - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -486,15 +487,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { if (handle_signal(signr, &info, &ka, regs) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLESTEP)); } return; diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 6c41778410e6..aa1105c1618f 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -263,8 +263,6 @@ handle_signal(unsigned long sig, return; block_sigmask(ka, sig); - clear_thread_flag(TIF_RESTORE_SIGMASK); - tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 441b25992846..d6ddc572eba1 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -459,6 +459,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, test_thread_flag(TIF_SINGLESTEP) || test_thread_flag(TIF_BLOCKSTEP)); + DBG(1,KERN_DEBUG "do_signal: Exit (success), regs->gr[28] = %ld\n", + regs->gr[28]); + return 1; } @@ -593,13 +596,8 @@ do_signal(struct pt_regs *regs, long in_syscall) /* Whee! Actually deliver the signal. If the delivery failed, we need to continue to iterate in this loop so we can deliver the SIGSEGV... */ - if (handle_signal(signr, &info, &ka, regs, in_syscall)) { - DBG(1,KERN_DEBUG "do_signal: Exit (success), regs->gr[28] = %ld\n", - regs->gr[28]); - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); + if (handle_signal(signr, &info, &ka, regs, in_syscall)) return; - } } /* end of while(1) looping forever if we can't force a signal */ diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 8e9ddab7ade6..d926d2e4611a 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -159,13 +159,6 @@ static int do_signal(struct pt_regs *regs) regs->trap = 0; if (ret) { block_sigmask(&ka, signr); - - /* - * A signal was successfully delivered; the saved sigmask is in - * its frame, and we can clear the TLF_RESTORE_SIGMASK flag. - */ - current_thread_info()->local_flags &= ~_TLF_RESTORE_SIGMASK; - /* * Let tracing know that we've done the handler setup. 
*/ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 377c096ca4a7..233db1d68eee 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -572,7 +572,7 @@ give_sigsegv: * OK, we're invoking a handler */ -int handle_signal32(unsigned long sig, struct k_sigaction *ka, +void handle_signal32(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { int ret; @@ -583,8 +583,12 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, else ret = setup_frame32(sig, ka, oldset, regs); if (ret) - return ret; + return; block_sigmask(ka, sig); - return 0; + /* + * Let tracing know that we've done the handler setup. + */ + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLE_STEP)); } diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 6cdddac93a2e..f66a229ab0b3 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -31,7 +31,7 @@ void do_per_trap(struct pt_regs *regs); void syscall_trace(struct pt_regs *regs, int entryexit); void kernel_stack_overflow(struct pt_regs * regs); void do_signal(struct pt_regs *regs); -int handle_signal32(unsigned long sig, struct k_sigaction *ka, +void handle_signal32(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs); void do_notify_resume(struct pt_regs *regs); diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index c880c48a09f3..7f9a862a161a 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -367,7 +367,7 @@ give_sigsegv: return -EFAULT; } -static int handle_signal(unsigned long sig, struct k_sigaction *ka, +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { @@ -379,9 +379,13 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka, else ret = setup_frame(sig, ka, oldset, regs); if (ret) - return ret; + return; block_sigmask(ka, sig); - return 0; + /* + * Let tracing know that we've done the handler setup. + */ + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLE_STEP)); } /* @@ -436,24 +440,10 @@ void do_signal(struct pt_regs *regs) /* No longer in a system call */ clear_thread_flag(TIF_SYSCALL); - if ((is_compat_task() ? - handle_signal32(signr, &ka, &info, oldset, regs) : - handle_signal(signr, &ka, &info, oldset, regs)) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - /* - * Let tracing know that we've done the handler setup. 
- */ - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLE_STEP)); - } + if (is_compat_task()) + handle_signal32(signr, &ka, &info, oldset, regs); + else + handle_signal(signr, &ka, &info, oldset, regs); return; } diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index b24dfaf2462f..13e0eed0e301 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -241,11 +241,9 @@ give_sigsegv: return -EFAULT; } -static int handle_signal(unsigned long sig, siginfo_t *info, +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - int ret; - if (regs->is_syscall) { switch (regs->regs[4]) { case ERESTART_RESTARTBLOCK: @@ -269,12 +267,10 @@ static int handle_signal(unsigned long sig, siginfo_t *info, /* * Set up the stack frame */ - ret = setup_rt_frame(ka, regs, sig, sigmask_to_save(), info); - - if (ret == 0) - block_sigmask(ka, sig); + if (setup_rt_frame(ka, regs, sig, sigmask_to_save(), info) < 0) + return; - return ret; + block_sigmask(ka, sig); } static void do_signal(struct pt_regs *regs) @@ -294,17 +290,7 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } - + handle_signal(signr, &info, &ka, regs); return; } diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index bfb3d599f032..2675a97f374f 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -522,7 +522,7 @@ handle_syscall_restart(unsigned long save_r0, struct pt_regs *regs, /* * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs, unsigned int save_r0) { @@ -535,10 +535,11 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, else ret = setup_frame(sig, ka, oldset, regs); - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + if (ret) + return; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -570,20 +571,7 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0) handle_syscall_restart(save_r0, regs, &ka.sa); /* Whee! Actually deliver the signal. 
*/ - if (handle_signal(signr, &ka, &info, - regs, save_r0) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } - + handle_signal(signr, &ka, &info, regs, save_r0); return; } diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index aeeab070aaa9..7075c63bfc6f 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -43,7 +43,7 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -static int +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs * regs); @@ -103,17 +103,7 @@ static void do_signal(struct pt_regs *regs) handle_syscall_restart(regs, &ka.sa); /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* - * If a signal was successfully delivered, the - * saved sigmask is in its frame, and we can - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } + handle_signal(signr, &info, &ka, regs); return; } @@ -648,7 +638,7 @@ give_sigsegv: /* * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs * regs) { @@ -661,10 +651,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, else ret = setup_frame(sig, ka, oldset, regs); - if (ret == 0) - block_sigmask(ka, sig); + if (ret) + return; - return ret; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 88e0d8122d2c..8c93c00922a7 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -775,7 +775,7 @@ sigsegv: return -EFAULT; } -static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka, +static inline void handle_signal32(unsigned long signr, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { @@ -787,12 +787,10 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka, err = setup_frame32(ka, regs, signr, oldset); if (err) - return err; + return; block_sigmask(ka, signr); tracehook_signal_handler(signr, info, ka, regs, 0); - - return 0; } static inline void syscall_restart32(unsigned long orig_i0, struct pt_regs *regs, @@ -841,14 +839,7 @@ void do_signal32(sigset_t *oldset, struct pt_regs * regs) if (signr > 0) { if (restart_syscall) syscall_restart32(orig_i0, regs, &ka.sa); - if (handle_signal32(signr, &ka, &info, oldset, regs) == 0) { - /* A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. 
- */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } + handle_signal32(signr, &ka, &info, oldset, regs); return; } if (restart_syscall && diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 5d74410c787b..ee81b90c532f 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -449,7 +449,7 @@ sigsegv: return -EFAULT; } -static inline int +static inline void handle_signal(unsigned long signr, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { @@ -462,12 +462,10 @@ handle_signal(unsigned long signr, struct k_sigaction *ka, err = setup_frame(ka, regs, signr, oldset); if (err) - return err; + return; block_sigmask(ka, signr); tracehook_signal_handler(signr, info, ka, regs, 0); - - return 0; } static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs, @@ -539,15 +537,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) if (signr > 0) { if (restart_syscall) syscall_restart(orig_i0, regs, &ka.sa); - if (handle_signal(signr, &ka, &info, regs) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } + handle_signal(signr, &ka, &info, regs); return; } if (restart_syscall && diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 088a733f83f9..febbc4b697ba 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -466,7 +466,7 @@ sigsegv: return -EFAULT; } -static inline int handle_signal(unsigned long signr, struct k_sigaction *ka, +static inline void handle_signal(unsigned long signr, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { @@ -475,12 +475,10 @@ static inline int handle_signal(unsigned long signr, struct k_sigaction *ka, err = setup_rt_frame(ka, regs, signr, oldset, (ka->sa.sa_flags & SA_SIGINFO) ? info : NULL); if (err) - return err; + return; block_sigmask(ka, signr); tracehook_signal_handler(signr, info, ka, regs, 0); - - return 0; } static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs, @@ -558,14 +556,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) if (signr > 0) { if (restart_syscall) syscall_restart(orig_i0, regs, &ka.sa); - if (handle_signal(signr, &ka, &info, oldset, regs) == 0) { - /* A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. 
- */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } + handle_signal(signr, &ka, &info, oldset, regs); return; } if (restart_syscall && diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 588c28b2db58..9b71bfd4913d 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -242,7 +242,7 @@ give_sigsegv: * OK, we're invoking a handler */ -static int handle_signal(unsigned long sig, siginfo_t *info, +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { @@ -279,15 +279,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info, else #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret == 0) { - /* This code is only called from system calls or from - * the work_pending path in the return-to-user code, and - * either way we can re-enable interrupts unconditionally. - */ - block_sigmask(ka, sig); - } - - return ret; + if (ret) + return; + block_sigmask(ka, sig); } /* @@ -311,16 +305,7 @@ void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } - + handle_signal(signr, &info, &ka, regs); goto done; } diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 909e9b8d6612..549a51c8e54f 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL(unblock_signals); /* * OK, we're invoking a handler */ -static int handle_signal(struct pt_regs *regs, unsigned long signr, +static void handle_signal(struct pt_regs *regs, unsigned long signr, struct k_sigaction *ka, siginfo_t *info) { sigset_t *oldset = sigmask_to_save(); @@ -66,8 +66,6 @@ static int handle_signal(struct pt_regs *regs, unsigned long signr, force_sigsegv(signr, current); else block_sigmask(ka, signr); - - return err; } static int kern_do_signal(struct pt_regs *regs) @@ -79,17 +77,7 @@ static int kern_do_signal(struct pt_regs *regs) while ((sig = get_signal_to_deliver(&info, &ka_copy, regs, NULL)) > 0) { handled_sig = 1; /* Whee! Actually deliver the signal. */ - if (!handle_signal(regs, sig, &ka_copy, &info)) { - /* - * a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - break; - } + handle_signal(regs, sig, &ka_copy, &info); } /* Did we come from a system call? 
*/ diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index bf23194dc74d..dc41b11f8a57 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -312,7 +312,7 @@ static inline void setup_syscall_restart(struct pt_regs *regs) /* * OK, we're invoking a handler */ -static int handle_signal(unsigned long sig, struct k_sigaction *ka, +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs, int syscall) { struct thread_info *thread = current_thread_info(); @@ -363,15 +363,13 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka, if (ret != 0) { force_sigsegv(sig, tsk); - return ret; + return; } /* * Block the signal if we were successful. */ block_sigmask(ka, sig); - - return 0; } /* @@ -403,17 +401,7 @@ static void do_signal(struct pt_regs *regs, int syscall) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - if (handle_signal(signr, &ka, &info, regs, syscall) - == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } + handle_signal(signr, &ka, &info, regs, syscall); return; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 56f3062c5111..700c49dcd84e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -648,38 +648,27 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, { int usig = signr_convert(sig); sigset_t *set = sigmask_to_save(); - int ret; /* Set up the stack frame */ if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + return ia32_setup_rt_frame(usig, ka, info, set, regs); else - ret = ia32_setup_frame(usig, ka, set, regs); + return ia32_setup_frame(usig, ka, set, regs); #ifdef CONFIG_X86_X32_ABI } else if (is_x32) { - ret = x32_setup_rt_frame(usig, ka, info, + return x32_setup_rt_frame(usig, ka, info, (compat_sigset_t *)set, regs); #endif } else { - ret = __setup_rt_frame(sig, ka, info, set, regs); + return __setup_rt_frame(sig, ka, info, set, regs); } - - if (ret) { - force_sigsegv(sig, current); - return -EFAULT; - } - - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - return ret; } -static int +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - int ret; - /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -710,10 +699,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - ret = setup_rt_frame(sig, ka, info, regs); - - if (ret) - return ret; + if (setup_rt_frame(sig, ka, info, regs) < 0) { + force_sigsegv(sig, current); + return; + } /* * Clear the direction flag as per the ABI for function entry. 
@@ -732,8 +721,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); - - return 0; } #ifdef CONFIG_X86_32 diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index e4b06e2d4eb9..3e83913a3c7c 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -499,7 +499,6 @@ static void do_signal(struct pt_regs *regs) if (ret) return; - clear_thread_flag(TIF_RESTORE_SIGMASK); block_sigmask(&ka, signr); if (current->ptrace & PT_SINGLESTEP) task_pt_regs(current)->icountlevel = 1; diff --git a/kernel/signal.c b/kernel/signal.c index 95a9d9d8122b..b9be7e0fe41a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2382,6 +2382,12 @@ void block_sigmask(struct k_sigaction *ka, int signr) { sigset_t blocked; + /* A signal was successfully delivered, and the + saved sigmask was stored on the signal frame, + and will be restored by sigreturn. So we can + simply clear the restore sigmask flag. */ + clear_restore_sigmask(); + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, signr); -- cgit v1.2.3-71-gd317 From 77097ae503b170120ab66dd1d547f8577193f91f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 27 Apr 2012 13:58:59 -0400 Subject: most of set_current_blocked() callers want SIGKILL/SIGSTOP removed from set Only 3 out of 63 do not. Renamed the current variant to __set_current_blocked(), added set_current_blocked() that will exclude unblockable signals, switched open-coded instances to it. Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 2 -- arch/arm/kernel/signal.c | 6 +----- arch/avr32/kernel/signal.c | 3 --- arch/blackfin/kernel/signal.c | 3 --- arch/c6x/kernel/signal.c | 3 --- arch/cris/arch-v10/kernel/signal.c | 4 ---- arch/cris/arch-v32/kernel/signal.c | 5 ----- arch/frv/kernel/signal.c | 4 ---- arch/h8300/kernel/signal.c | 4 ---- arch/hexagon/kernel/signal.c | 3 --- arch/ia64/kernel/signal.c | 2 -- arch/m32r/kernel/signal.c | 3 --- arch/m68k/kernel/signal.c | 4 ---- arch/microblaze/kernel/signal.c | 3 --- arch/mips/kernel/signal-common.h | 2 -- arch/mips/kernel/signal.c | 2 -- arch/mips/kernel/signal32.c | 2 -- arch/mips/kernel/signal_n32.c | 1 - arch/mn10300/kernel/signal.c | 4 ---- arch/openrisc/kernel/signal.c | 3 --- arch/parisc/kernel/signal.c | 4 ---- arch/parisc/kernel/signal32.c | 2 -- arch/powerpc/kernel/signal.c | 1 - arch/powerpc/kernel/signal.h | 2 -- arch/s390/kernel/compat_signal.c | 4 ---- arch/s390/kernel/signal.c | 5 ----- arch/score/kernel/signal.c | 3 --- arch/sh/kernel/signal_32.c | 4 ---- arch/sh/kernel/signal_64.c | 4 ---- arch/sparc/kernel/signal32.c | 4 ---- arch/sparc/kernel/signal_32.c | 4 ---- arch/sparc/kernel/signal_64.c | 4 ---- arch/tile/kernel/compat_signal.c | 3 --- arch/tile/kernel/signal.c | 3 --- arch/um/include/shared/frame_kern.h | 3 --- arch/um/kernel/signal.c | 4 ---- arch/unicore32/kernel/signal.c | 6 +----- arch/x86/ia32/ia32_signal.c | 2 -- arch/x86/include/asm/sighandling.h | 2 -- arch/x86/kernel/signal.c | 3 --- arch/x86/um/signal.c | 2 -- arch/xtensa/kernel/signal.c | 3 --- include/linux/sched.h | 2 +- include/linux/signal.h | 3 ++- kernel/signal.c | 18 ++++++++++++------ 45 files changed, 17 insertions(+), 141 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index bb45a8813393..48c4df2389ac 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -226,7 +226,6 @@ 
do_sigreturn(struct sigcontext __user *sc, struct pt_regs *regs, if (__get_user(set.sig[0], &sc->sc_mask)) goto give_sigsegv; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(sc, regs, sw)) @@ -261,7 +260,6 @@ do_rt_sigreturn(struct rt_sigframe __user *frame, struct pt_regs *regs, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto give_sigsegv; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(&frame->uc.uc_mcontext, regs, sw)) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 7f9abd75fc2e..c126eba8411d 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -22,8 +22,6 @@ #include "signal.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * For ARM syscalls, we encode the syscall number into the instruction. */ @@ -210,10 +208,8 @@ static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf) int err; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); - if (err == 0) { - sigdelsetmask(&set, ~_BLOCKABLE); + if (err == 0) set_current_blocked(&set); - } __get_user_error(regs->ARM_r0, &sf->uc.uc_mcontext.arm_r0, err); __get_user_error(regs->ARM_r1, &sf->uc.uc_mcontext.arm_r1, err); diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 3ac1a60f9eb6..e883fa5eb845 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -22,8 +22,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { @@ -89,7 +87,6 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index b20d435d084a..463612643821 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -19,8 +19,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* Location of the trace bit in SYSCFG. */ #define TRACE_BITS 0x0001 @@ -98,7 +96,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (rt_restore_sigcontext(regs, &frame->uc.uc_mcontext, &r0)) diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index d599a7fb5d24..eb1b3086ae00 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -20,8 +20,6 @@ #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * Do a signal return, undo the signal stack. 
*/ @@ -87,7 +85,6 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 46c8ca605e4d..cf6380cb9a57 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -31,8 +31,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* a syscall in Linux/CRIS is a break 13 instruction which is 2 bytes */ /* manipulate regs so that upon return, it will be re-executed */ @@ -176,7 +174,6 @@ asmlinkage int sys_sigreturn(long r10, long r11, long r12, long r13, long mof, sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc)) @@ -212,7 +209,6 @@ asmlinkage int sys_rt_sigreturn(long r10, long r11, long r12, long r13, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index e0431328b7cd..07b81ee09f65 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -24,9 +24,6 @@ extern unsigned long cris_signal_return_page; -/* Flag to check if a signal is blockable. */ -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * A syscall in CRIS is really a "break 13" instruction, which is 2 * bytes. The registers is manipulated so upon return the instruction @@ -167,7 +164,6 @@ sys_sigreturn(long r10, long r11, long r12, long r13, long mof, long srp, sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc)) @@ -208,7 +204,6 @@ sys_rt_sigreturn(long r10, long r11, long r12, long r13, long mof, long srp, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 9ec3d2e27b4c..511285fa2461 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -28,8 +28,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - struct fdpic_func_descriptor { unsigned long text; unsigned long GOT; @@ -149,7 +147,6 @@ asmlinkage int sys_sigreturn(void) __copy_from_user(&set.sig[1], &frame->extramask, sizeof(frame->extramask))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(&frame->sc, &gr8)) @@ -172,7 +169,6 @@ asmlinkage int sys_rt_sigreturn(void) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(&frame->uc.uc_mcontext, &gr8)) diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 8fbfc39574f5..aa6f09666915 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -47,8 +47,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * Atomically swap in the new signal mask, and wait for a signal. 
*/ @@ -186,7 +184,6 @@ asmlinkage int do_sigreturn(unsigned long __unused,...) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &er0)) @@ -211,7 +208,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused,...) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &er0)) diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index c9caf7401191..439f11a3a8ef 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -31,8 +31,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - struct rt_sigframe { unsigned long tramp[2]; struct siginfo info; @@ -273,7 +271,6 @@ asmlinkage int sys_rt_sigreturn(void) if (__copy_from_user(&blocked, &frame->uc.uc_sigmask, sizeof(blocked))) goto badframe; - sigdelsetmask(&blocked, ~_BLOCKABLE); set_current_blocked(&blocked); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index dc6fe6573465..c4041c76c07d 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -30,7 +30,6 @@ #define DEBUG_SIG 0 #define STACK_ALIGN 16 /* minimal alignment for stack pointer */ -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) #if _NSIG_WORDS > 1 # define PUT_SIGSET(k,u) __copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t)) @@ -200,7 +199,6 @@ ia64_rt_sigreturn (struct sigscratch *scr) if (GET_SIGSET(&set, &sc->sc_mask)) goto give_sigsegv; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(sc, scr)) diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 7cbfa639fbfa..07f9032576c0 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -28,8 +28,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, unsigned long r2, unsigned long r3, unsigned long r4, @@ -111,7 +109,6 @@ sys_rt_sigreturn(unsigned long r0, unsigned long r1, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &result)) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 6dbee8a167a5..c00caad215a6 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -51,8 +51,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - #ifdef CONFIG_MMU /* @@ -795,7 +793,6 @@ asmlinkage int do_sigreturn(unsigned long __unused) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, frame + 1)) @@ -820,7 +817,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (rt_restore_ucontext(regs, sw, &frame->uc)) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 03641199666e..c662e68671a2 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -41,8 +41,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - asmlinkage 
long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) @@ -106,7 +104,6 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &rval)) diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h index 10263b405981..9c60d09e62a7 100644 --- a/arch/mips/kernel/signal-common.h +++ b/arch/mips/kernel/signal-common.h @@ -19,8 +19,6 @@ # define DEBUGP(fmt, args...) #endif -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * Determine which stack to use.. */ diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 896165757e6f..02e0cba24f82 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -339,7 +339,6 @@ asmlinkage void sys_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_from_user(&blocked, &frame->sf_mask, sizeof(blocked))) goto badframe; - sigdelsetmask(&blocked, ~_BLOCKABLE); set_current_blocked(&blocked); sig = restore_sigcontext(&regs, &frame->sf_sc); @@ -375,7 +374,6 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_from_user(&set, &frame->rs_uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); sig = restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext); diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index b4fe2eacbd5d..da1b56a39ac7 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -465,7 +465,6 @@ asmlinkage void sys32_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_conv_sigset_from_user(&blocked, &frame->sf_mask)) goto badframe; - sigdelsetmask(&blocked, ~_BLOCKABLE); set_current_blocked(&blocked); sig = restore_sigcontext32(&regs, &frame->sf_sc); @@ -503,7 +502,6 @@ asmlinkage void sys32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask)) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); sig = restore_sigcontext32(&regs, &frame->rs_uc.uc_mcontext); diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index 63ffac9af7c5..3574c145511b 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -109,7 +109,6 @@ asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask)) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); sig = restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext); diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index d57013e06ea0..4f6d20763061 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -31,8 +31,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * atomically swap in the new signal mask, and wait for a signal.
*/ @@ -163,7 +161,6 @@ asmlinkage long sys_sigreturn(void) sizeof(frame->extramask))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(current_frame(), &frame->sc, &d0)) @@ -191,7 +188,6 @@ asmlinkage long sys_rt_sigreturn(void) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(current_frame(), &frame->uc.uc_mcontext, &d0)) diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index aa1105c1618f..53972b7260b7 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -33,8 +33,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - asmlinkage long _sys_sigaltstack(const stack_t *uss, stack_t *uoss, struct pt_regs *regs) { @@ -101,7 +99,6 @@ asmlinkage long _sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 7f3c8f2c962d..25161eaf720d 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -48,9 +48,6 @@ #define DBG(LEVEL, ...) #endif - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* gcc will complain if a pointer is cast to an integer of different * size. If you really need to do this (and we do for an ELF32 user * application in an ELF64 kernel) then you have to do a cast to an @@ -131,7 +128,6 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall) goto give_sigsegv; } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); /* Good thing we saved the old gr[30], eh? */ diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c index e14132430762..fd49aeda9eb8 100644 --- a/arch/parisc/kernel/signal32.c +++ b/arch/parisc/kernel/signal32.c @@ -47,8 +47,6 @@ #define DBG(LEVEL, ...) 
#endif -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - inline void sigset_32to64(sigset_t *s64, compat_sigset_t *s32) { diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index d926d2e4611a..3a3413c049c3 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -57,7 +57,6 @@ void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, */ void restore_sigmask(sigset_t *set) { - sigdelsetmask(set, ~_BLOCKABLE); set_current_blocked(set); } diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index 8dde973aaaf5..11439ea18ed4 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -10,8 +10,6 @@ #ifndef _POWERPC_ARCH_SIGNAL_H #define _POWERPC_ARCH_SIGNAL_H -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); extern void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 233db1d68eee..923baa96c0b0 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -32,8 +32,6 @@ #include "compat_ptrace.h" #include "entry.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - typedef struct { __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; @@ -364,7 +362,6 @@ asmlinkage long sys32_sigreturn(void) goto badframe; if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32)) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigregs32(regs, &frame->sregs)) goto badframe; @@ -390,7 +387,6 @@ asmlinkage long sys32_rt_sigreturn(void) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) goto badframe; diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 7f9a862a161a..8332a6943384 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -33,9 +33,6 @@ #include #include "entry.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - - typedef struct { __u8 callee_used_stack[__SIGNAL_FRAMESIZE]; @@ -169,7 +166,6 @@ SYSCALL_DEFINE0(sigreturn) goto badframe; if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE)) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigregs(regs, &frame->sregs)) goto badframe; @@ -189,7 +185,6 @@ SYSCALL_DEFINE0(rt_sigreturn) goto badframe; if (__copy_from_user(&set.sig, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigregs(regs, &frame->uc.uc_mcontext)) goto badframe; diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index 13e0eed0e301..f1b3fef0907b 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -34,8 +34,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - struct rt_sigframe { u32 rs_ass[4]; /* argument save space */ u32 rs_code[2]; /* signal trampoline */ @@ -162,7 +160,6 @@ score_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->rs_uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); sig = restore_sigcontext(regs, &frame->rs_uc.uc_mcontext); diff --git a/arch/sh/kernel/signal_32.c 
b/arch/sh/kernel/signal_32.c index 2675a97f374f..e4a531414e19 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -32,8 +32,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - struct fdpic_func_descriptor { unsigned long text; unsigned long GOT; @@ -226,7 +224,6 @@ asmlinkage int sys_sigreturn(unsigned long r4, unsigned long r5, sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &r0)) @@ -256,7 +253,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long r4, unsigned long r5, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &r0)) diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 7075c63bfc6f..75960ef6c1d1 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -41,8 +41,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs * regs); @@ -330,7 +328,6 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &ret)) @@ -363,7 +360,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ret)) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 8c93c00922a7..ba3dbfcdb28e 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -32,8 +32,6 @@ #include "sigutil.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* This magic should be in g_upper[0] for all upper parts * to be valid. 
*/ @@ -274,7 +272,6 @@ void do_sigreturn32(struct pt_regs *regs) case 2: set.sig[1] = seta[2] + (((long)seta[3]) << 32); case 1: set.sig[0] = seta[0] + (((long)seta[1]) << 32); } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; @@ -376,7 +373,6 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs) case 2: set.sig[1] = seta.sig[2] + (((long)seta.sig[3]) << 32); case 1: set.sig[0] = seta.sig[0] + (((long)seta.sig[1]) << 32); } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; segv: diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index f6722427203d..1bfa854be602 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -29,8 +29,6 @@ #include "sigutil.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - extern void fpsave(unsigned long *fpregs, unsigned long *fsr, void *fpqueue, unsigned long *fpqdepth); extern void fpload(unsigned long *fpregs, unsigned long *fsr); @@ -130,7 +128,6 @@ asmlinkage void do_sigreturn(struct pt_regs *regs) if (err) goto segv_and_exit; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; @@ -197,7 +194,6 @@ asmlinkage void do_rt_sigreturn(struct pt_regs *regs) goto segv; } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; segv: diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index febbc4b697ba..23b60caa6c43 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -38,8 +38,6 @@ #include "systbls.h" #include "sigutil.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* {set, get}context() needed for 64-bit SparcLinux userland. */ asmlinkage void sparc64_set_context(struct pt_regs *regs) { @@ -71,7 +69,6 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs) if (__copy_from_user(&set, &ucp->uc_sigmask, sizeof(sigset_t))) goto do_sigsegv; } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); } if (test_thread_flag(TIF_32BIT)) { @@ -315,7 +312,6 @@ void do_rt_sigreturn(struct pt_regs *regs) /* Prevent syscall restart. 
*/ pt_regs_clear_syscall(regs); - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; segv: diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index cdef6e5ec022..474571b84085 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -118,8 +118,6 @@ struct compat_rt_sigframe { struct compat_ucontext uc; }; -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act, struct compat_sigaction __user *oact, size_t sigsetsize) @@ -302,7 +300,6 @@ long compat_sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 9b71bfd4913d..e068aa0c6dfc 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -37,8 +37,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss, stack_t __user *, uoss, struct pt_regs *, regs) { @@ -96,7 +94,6 @@ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/um/include/shared/frame_kern.h b/arch/um/include/shared/frame_kern.h index 76078490c258..e584e40ee832 100644 --- a/arch/um/include/shared/frame_kern.h +++ b/arch/um/include/shared/frame_kern.h @@ -6,9 +6,6 @@ #ifndef __FRAME_KERN_H_ #define __FRAME_KERN_H_ -#define _S(nr) (1<<((nr)-1)) -#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) - extern int setup_signal_stack_sc(unsigned long stack_top, int sig, struct k_sigaction *ka, struct pt_regs *regs, diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 549a51c8e54f..4ce6ab2d2996 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -15,10 +15,6 @@ EXPORT_SYMBOL(block_signals); EXPORT_SYMBOL(unblock_signals); -#define _S(nr) (1<<((nr)-1)) - -#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) - /* * OK, we're invoking a handler */ diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index af962e57efb2..4d9c4841989d 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -21,8 +21,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * For UniCore syscalls, we encode the syscall number into the instruction. 
*/ @@ -61,10 +59,8 @@ static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf) int err; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); - if (err == 0) { - sigdelsetmask(&set, ~_BLOCKABLE); + if (err == 0) set_current_blocked(&set); - } err |= __get_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00); err |= __get_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 98bd70faccc5..daeca56211e3 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -273,7 +273,6 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) @@ -299,7 +298,6 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index ada93b3b8c66..beff97f7df37 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -7,8 +7,6 @@ #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - #define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 700c49dcd84e..11e206f0f45a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -555,7 +555,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &ax)) @@ -581,7 +580,6 @@ long sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) @@ -915,7 +913,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index bb0fb03b9f85..a508cea13503 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -486,7 +486,6 @@ long sys_sigreturn(struct pt_regs *regs) copy_from_user(&set.sig[1], extramask, sig_size)) goto segfault; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (copy_sc_from_user(&current->thread.regs, sc)) @@ -600,7 +599,6 @@ long sys_rt_sigreturn(struct pt_regs *regs) if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) goto segfault; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext)) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index ca98b86ef9a7..4da3c6f6d929 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -30,8 +30,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - extern struct task_struct *coproc_owners[]; struct rt_sigframe @@ -261,7 +259,6 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1,
long a2, long a3, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, frame)) diff --git a/include/linux/sched.h b/include/linux/sched.h index ded3fb63fb06..f34437e835a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2210,7 +2210,7 @@ extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned lon static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) - set_current_blocked(&current->saved_sigmask); + __set_current_blocked(&current->saved_sigmask); } static inline sigset_t *sigmask_to_save(void) diff --git a/include/linux/signal.h b/include/linux/signal.h index 17046cc484bc..065e76330398 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -250,7 +250,8 @@ extern long do_sigpending(void __user *, unsigned long); extern int do_sigtimedwait(const sigset_t *, siginfo_t *, const struct timespec *); extern int sigprocmask(int, sigset_t *, sigset_t *); -extern void set_current_blocked(const sigset_t *); +extern void set_current_blocked(sigset_t *); +extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern int sigsuspend(sigset_t *); diff --git a/kernel/signal.c b/kernel/signal.c index b9be7e0fe41a..df8d721a9e6f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2524,7 +2524,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) * It is wrong to change ->blocked directly, this helper should be used * to ensure the process can't miss a shared signal we are going to block. */ -void set_current_blocked(const sigset_t *newset) +void set_current_blocked(sigset_t *newset) +{ + struct task_struct *tsk = current; + sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, newset); + spin_unlock_irq(&tsk->sighand->siglock); +} + +void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; @@ -2564,7 +2573,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return -EINVAL; } - set_current_blocked(&newset); + __set_current_blocked(&newset); return 0; } @@ -3138,7 +3147,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, return -EINVAL; } - set_current_blocked(&new_blocked); + __set_current_blocked(&new_blocked); } if (oset) { @@ -3202,7 +3211,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) int old = current->blocked.sig[0]; sigset_t newset; - siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); set_current_blocked(&newset); return old; @@ -3243,8 +3251,6 @@ SYSCALL_DEFINE0(pause) int sigsuspend(sigset_t *set) { - sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - current->saved_sigmask = current->blocked; set_current_blocked(set); -- cgit v1.2.3-71-gd317 From efee984c27b67e3ebef40410f35671997441b57c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 28 Apr 2012 02:04:15 -0400 Subject: new helper: signal_delivered() Does block_sigmask() + tracehook_signal_handler(); called when sigframe has been successfully built. All architectures converted to it; block_sigmask() itself is gone now (merged into this one). I'm still not too happy with the signature, but that's a separate story (IMO we need a structure that would contain signal number + siginfo + k_sigaction, so that get_signal_to_deliver() would fill one, signal_delivered(), handle_signal() and probably setup...frame() - take one).
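As an illustration of the bundling described above (an editor's sketch only; the struct and its name are hypothetical and not code from this series), the delivery parameters could be grouped roughly as:

	struct sig_delivery {			/* hypothetical, for illustration only */
		int sig;			/* signal number being delivered */
		siginfo_t info;			/* siginfo of that signal */
		struct k_sigaction ka;		/* action that chose the handler */
	};

get_signal_to_deliver() would fill one in, and signal_delivered(), handle_signal() and the setup_*frame() helpers would then take a single pointer to it in place of their current argument lists.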
Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 2 +- arch/arm/kernel/signal.c | 7 +------ arch/avr32/kernel/signal.c | 2 +- arch/blackfin/kernel/signal.c | 3 +-- arch/c6x/kernel/signal.c | 3 +-- arch/cris/arch-v10/kernel/signal.c | 2 +- arch/cris/arch-v32/kernel/signal.c | 2 +- arch/frv/kernel/signal.c | 3 +-- arch/h8300/kernel/signal.c | 2 +- arch/hexagon/kernel/signal.c | 3 +-- arch/ia64/kernel/signal.c | 7 +------ arch/m32r/kernel/signal.c | 2 +- arch/m68k/kernel/signal.c | 2 +- arch/microblaze/kernel/signal.c | 2 +- arch/mips/kernel/signal.c | 2 +- arch/mn10300/kernel/signal.c | 3 +-- arch/openrisc/kernel/signal.c | 3 +-- arch/parisc/kernel/signal.c | 4 +--- arch/powerpc/kernel/signal.c | 6 +----- arch/s390/kernel/compat_signal.c | 6 +----- arch/s390/kernel/signal.c | 6 +----- arch/score/kernel/signal.c | 2 +- arch/sh/kernel/signal_32.c | 3 +-- arch/sh/kernel/signal_64.c | 3 +-- arch/sparc/kernel/signal32.c | 3 +-- arch/sparc/kernel/signal_32.c | 3 +-- arch/sparc/kernel/signal_64.c | 3 +-- arch/tile/kernel/signal.c | 2 +- arch/um/kernel/signal.c | 2 +- arch/unicore32/kernel/signal.c | 5 +---- arch/x86/kernel/signal.c | 6 ++---- arch/xtensa/kernel/signal.c | 2 +- include/linux/signal.h | 2 +- kernel/signal.c | 22 +++++++++++++--------- 34 files changed, 47 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 48c4df2389ac..a8c97d42ec8e 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -478,7 +478,7 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, force_sigsegv(sig, current); return; } - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } static inline void diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index c126eba8411d..fd2392a17ac1 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -557,12 +557,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, force_sigsegv(sig, tsk); return; } - - /* - * Block the signal if we were successful. 
- */ - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, 0); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index e883fa5eb845..c140f9b41dce 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -241,7 +241,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, if (ret != 0) force_sigsegv(sig, current); else - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index 463612643821..35459e681483 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -260,8 +260,7 @@ handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index eb1b3086ae00..3d8f3c22a94f 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -276,8 +276,7 @@ static void handle_signal(int sig, /* Set up the stack frame */ if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, 0); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index cf6380cb9a57..0bb477c13a4e 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -453,7 +453,7 @@ static inline void handle_signal(int canrestart, unsigned long sig, ret = setup_frame(sig, ka, oldset, regs); if (ret == 0) - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 07b81ee09f65..b60d1b65a426 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -485,7 +485,7 @@ handle_signal(int canrestart, unsigned long sig, ret = setup_frame(sig, ka, oldset, regs); if (ret == 0) - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 511285fa2461..4e134c7eceea 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -460,8 +460,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, __frame, + signal_delivered(sig, info, ka, __frame, test_thread_flag(TIF_SINGLESTEP)); } /* end handle_signal() */ diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index aa6f09666915..fca10378701b 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -439,7 +439,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, ret = setup_frame(sig, ka, oldset, regs); if (!ret) - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index 439f11a3a8ef..304b0808d072 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -186,8 +186,7 @@ static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, 
regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index c4041c76c07d..a199be1fe619 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -419,12 +419,7 @@ handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, if (!setup_frame(sig, ka, info, sigmask_to_save(), scr)) return 0; - block_sigmask(ka, sig); - - /* - * Let tracing know that we've done the handler setup. - */ - tracehook_signal_handler(sig, info, ka, &scr->pt, + signal_delivered(sig, info, ka, &scr->pt, test_thread_flag(TIF_SINGLESTEP)); return 1; diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 07f9032576c0..f3fb2c029cfc 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -294,7 +294,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs)) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index c00caad215a6..710a528b928b 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1137,7 +1137,7 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, if (err) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); if (test_thread_flag(TIF_DELAYED_TRACE)) { regs->sr &= ~0x8000; diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index c662e68671a2..76b9722557db 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -323,7 +323,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, if (ret) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 02e0cba24f82..f2c09cfc60ac 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -551,7 +551,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info, if (ret) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } static void do_signal(struct pt_regs *regs) diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index 4f6d20763061..6ab0bee2a54f 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -461,8 +461,7 @@ static int handle_signal(int sig, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 53972b7260b7..30110297f4f9 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -259,8 +259,7 @@ handle_signal(unsigned long sig, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 25161eaf720d..594459bde14e 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -449,9 +449,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (!setup_rt_frame(sig, ka, info, oldset, regs, in_syscall)) return 0; - block_sigmask(ka, sig); - - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP) || test_thread_flag(TIF_BLOCKSTEP)); diff --git 
a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 129bdffc6daf..5c023c9cf16e 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -148,11 +148,7 @@ static int do_signal(struct pt_regs *regs) regs->trap = 0; if (ret) { - block_sigmask(&ka, signr); - /* - * Let tracing know that we've done the handler setup. - */ - tracehook_signal_handler(signr, &info, &ka, regs, + signal_delivered(signr, &info, &ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 923baa96c0b0..3c0c19830c37 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -580,11 +580,7 @@ void handle_signal32(unsigned long sig, struct k_sigaction *ka, ret = setup_frame32(sig, ka, oldset, regs); if (ret) return; - block_sigmask(ka, sig); - /* - * Let tracing know that we've done the handler setup. - */ - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLE_STEP)); } diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 8332a6943384..ac565b44aabb 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -375,11 +375,7 @@ static void handle_signal(unsigned long sig, struct k_sigaction *ka, ret = setup_frame(sig, ka, oldset, regs); if (ret) return; - block_sigmask(ka, sig); - /* - * Let tracing know that we've done the handler setup. - */ - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLE_STEP)); } diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index f1b3fef0907b..e382c52ca0d9 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -267,7 +267,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info, if (setup_rt_frame(ka, regs, sig, sigmask_to_save(), info) < 0) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } static void do_signal(struct pt_regs *regs) diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index e4a531414e19..d6b7b6154f87 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -533,8 +533,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 75960ef6c1d1..6b5b3dfe886b 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -650,8 +650,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index ba3dbfcdb28e..a53e0a5fd3a3 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -785,8 +785,7 @@ static inline void handle_signal32(unsigned long signr, struct k_sigaction *ka, if (err) return; - block_sigmask(ka, signr); - tracehook_signal_handler(signr, info, ka, regs, 0); + signal_delivered(signr, info, ka, regs, 0); } static inline void syscall_restart32(unsigned long orig_i0, struct pt_regs *regs, diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 1bfa854be602..68f9c8650af4 100644 --- 
a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -460,8 +460,7 @@ handle_signal(unsigned long signr, struct k_sigaction *ka, if (err) return; - block_sigmask(ka, signr); - tracehook_signal_handler(signr, info, ka, regs, 0); + signal_delivered(signr, info, ka, regs, 0); } static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs, diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 23b60caa6c43..867de2f8189c 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -473,8 +473,7 @@ static inline void handle_signal(unsigned long signr, struct k_sigaction *ka, if (err) return; - block_sigmask(ka, signr); - tracehook_signal_handler(signr, info, ka, regs, 0); + signal_delivered(signr, info, ka, regs, 0); } static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs, diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index e068aa0c6dfc..e29b0553211d 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -278,7 +278,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info, ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 4ce6ab2d2996..7362d58efc29 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -61,7 +61,7 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr, if (err) force_sigsegv(signr, current); else - block_sigmask(ka, signr); + signal_delivered(signr, info, ka, regs, 0); } static int kern_do_signal(struct pt_regs *regs) diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 4d9c4841989d..8adedb37720a 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -362,10 +362,7 @@ static void handle_signal(unsigned long sig, struct k_sigaction *ka, return; } - /* - * Block the signal if we were successful. 
- */ - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 11e206f0f45a..e8a89374d356 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -715,10 +715,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - block_sigmask(ka, sig); - - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); + signal_delivered(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } #ifdef CONFIG_X86_32 diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 4da3c6f6d929..b9f8e5850d3a 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -493,7 +493,7 @@ static void do_signal(struct pt_regs *regs) if (ret) return; - block_sigmask(&ka, signr); + signal_delivered(signr, info, ka, regs, 0); if (current->ptrace & PT_SINGLESTEP) task_pt_regs(current)->icountlevel = 1; diff --git a/include/linux/signal.h b/include/linux/signal.h index 065e76330398..26b424adc842 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -256,7 +256,7 @@ extern int show_unhandled_signals; extern int sigsuspend(sigset_t *); extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); -extern void block_sigmask(struct k_sigaction *ka, int signr); +extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping); extern void exit_signals(struct task_struct *tsk); extern struct kmem_cache *sighand_cachep; diff --git a/kernel/signal.c b/kernel/signal.c index df8d721a9e6f..677102789cf2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2368,17 +2368,20 @@ relock: } /** - * block_sigmask - add @ka's signal mask to current->blocked - * @ka: action for @signr - * @signr: signal that has been successfully delivered + * signal_delivered - + * @sig: number of signal being delivered + * @info: siginfo_t of signal being delivered + * @ka: sigaction setting that chose the handler + * @regs: user register state + * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has succesfully been - * delivered. It adds the mask of signals for @ka to current->blocked - * so that they are blocked during the execution of the signal - * handler. In addition, @signr will be blocked unless %SA_NODEFER is - * set in @ka->sa.sa_flags. + * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * is always blocked, and the signal itself is blocked unless %SA_NODEFER + * is set in @ka->sa.sa_flags. Tracing is notified. 
*/ -void block_sigmask(struct k_sigaction *ka, int signr) +void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, + struct pt_regs *regs, int stepping) { sigset_t blocked; @@ -2390,8 +2393,9 @@ void block_sigmask(struct k_sigaction *ka, int signr) sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, signr); + sigaddset(&blocked, sig); set_current_blocked(&blocked); + tracehook_signal_handler(sig, info, ka, regs, stepping); } /* -- cgit v1.2.3-71-gd317 From fad0c66c4bb836d57a5f125ecd38bed653ca863a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 30 May 2012 10:54:57 -0700 Subject: timekeeping: Fix CLOCK_MONOTONIC inconsistency during leapsecond Commit 6b43ae8a61 (ntp: Fix leap-second hrtimer livelock) broke the leapsecond update of CLOCK_MONOTONIC. The missing leapsecond update to wall_to_monotonic causes discontinuities in CLOCK_MONOTONIC. Adjust wall_to_monotonic when NTP inserts a leapsecond. Reported-by: Richard Cochran Signed-off-by: John Stultz Tested-by: Richard Cochran Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1338400497-12420-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6e46cacf5969..6f46a00a1e8a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } /* Accumulate raw time */ @@ -1077,6 +1078,7 @@ static void update_wall_time(void) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } timekeeping_update(false); -- cgit v1.2.3-71-gd317 From 10717dcde10d09f9fcee53a12a4236af1a82b484 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 6 Jun 2012 14:52:51 +0800 Subject: sched/numa: Load balance between remote nodes Commit cb83b629b ("sched/numa: Rewrite the CONFIG_NUMA sched domain support") removed the NODE sched domain and started checking whether the node distance in the SLIT table is farther than REMOTE_DISTANCE; if so, the load-balance chance at the exec/fork/wake_affine points is lost. But even when the node distance is farther than REMOTE_DISTANCE, modern CPUs have QPI-like interconnects which ensure that memory access between nodes is not too slow. So this change in behavior causes a performance regression on NUMA machines on various benchmarks: hackbench, tbench, netperf, oltp, etc. This patch restores the old scheduler behavior on all my Intel platforms: NHM EP/EX, WSM EP, SNB EP/EP4S (all of which have just two distance values, 10 and 21), and thus fixes the performance regressions.
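To make the before/after concrete, here is an editor's worked example; the constants are the usual include/linux/topology.h defaults of that era (LOCAL_DISTANCE 10, REMOTE_DISTANCE 20, RECLAIM_DISTANCE 30) and are assumed rather than quoted from this patch. On the platforms above the cross-node SLIT distance is 21. Before the patch, sd_local_flags() saw 21 > REMOTE_DISTANCE and returned 0, so SD_BALANCE_EXEC, SD_BALANCE_FORK and SD_WAKE_AFFINE were dropped for that domain level. After the patch it compares against RECLAIM_DISTANCE instead; 21 <= 30 holds, the flags are kept, and exec/fork/wakeup balancing works again. Only levels whose distance exceeds 30 now lose the flags.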
Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338965571-9812-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c46958e26121..6546083af3e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6321,7 +6321,7 @@ static int sched_domains_curr_level; static inline int sd_local_flags(int level) { - if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) return 0; return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -- cgit v1.2.3-71-gd317 From 7f1b43936f0ecad14770634c021cf4a929aec74d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 21:19:46 +0200 Subject: sched/rt: Fix lockdep annotation within find_lock_lowest_rq() Roland Dreier reported spurious, hard-to-trigger lockdep warnings within the scheduler, without any real lockup. This bit gives us the right clue: > [89945.640512] [] double_lock_balance+0x5a/0x90 > [89945.640568] [] push_rt_task+0xc6/0x290 If you look at that code you'll find that the double_lock_balance() in question is the one in find_lock_lowest_rq() [yay for inlining]. Now find_lock_lowest_rq() has a bug: it fails to use double_unlock_balance() in one exit path; if this results in a retry in push_rt_task() we'll call double_lock_balance() again, at which point we run into said lockdep confusion. Reported-by: Roland Dreier Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1337282386.4281.77.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2a4e8dffbd6b..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1562,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->on_rq)) { - raw_spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; break; } -- cgit v1.2.3-71-gd317 From c1174876874dcf8986806e4dad3d7d07af20b439 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 14:47:33 +0200 Subject: sched: Fix domain iteration Weird topologies can lead to asymmetric domain setups. This needs further consideration since these setups are typically non-minimal too. For now, make it work by adding an extra mask selecting which CPUs are allowed to iterate up. The topology that triggered it is the one from David Rientjes (a 4-node SLIT distance table):

10 20 20 30
20 10 20 20
20 20 10 20
30 20 20 10

resulting in boxes that wouldn't even boot. Reported-by: David Rientjes Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3p86l9cuaqnxz7uxsojmz5rm@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 +++++++++ kernel/sched/core.c | 64 ++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/fair.c | 5 ++-- kernel/sched/sched.h | 2 ++ 4 files changed, 72 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..ac321d753470 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -876,6 +876,8 @@ struct sched_group_power { * Number of busy cpus in this group.
*/ atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ }; struct sched_group { @@ -900,6 +902,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + /** * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. * @group: The group whose first cpu is to be returned. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6546083af3e0..781acb91a50a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5994,6 +5994,44 @@ struct sched_domain_topology_level { struct sd_data data; }; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -6012,6 +6050,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); @@ -6019,8 +6063,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); @@ -6030,13 +6072,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); sg->sgp = *per_cpu_ptr(sdd->sgp, i); - atomic_inc(&sg->sgp->ref); + if (atomic_inc_return(&sg->sgp->ref) == 1) + build_group_mask(sd, sg); + + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). 
+ */ if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - cpumask_first(sg_span) == cpu) { - WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); + group_balance_cpu(sg) == cpu) groups = sg; - } if (!first) first = sg; @@ -6109,6 +6156,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; + cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -6150,7 +6198,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sg = sg->next; } while (sg != sd->groups); - if (cpu != group_first_cpu(sg)) + if (cpu != group_balance_cpu(sg)) return; update_group_power(sd, cpu); @@ -6525,7 +6573,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power), + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sgp) return -ENOMEM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27b..54cbaa4e7b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, int i; if (local_group) - balance_cpu = group_first_cpu(group); + balance_cpu = group_balance_cpu(group); /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; @@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { + if (idle_cpu(i) && !first_idle_cpu && + cpumask_test_cpu(i, sched_group_mask(group))) { first_idle_cpu = 1; balance_cpu = i; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24ce..6d52cea7f33d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +extern int group_balance_cpu(struct sched_group *sg); + #endif /* CONFIG_SMP */ #include "stats.h" -- cgit v1.2.3-71-gd317 From c3decf0dfbc95736b7c0ab68fa4e5854c4734da9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 12:05:32 +0200 Subject: sched: Always initialize cpu-power Often when we run into mis-shapen topologies the balance iteration fails to update the cpu power properly and we'll end up in /0 traps. Always initialize the cpu-power to a semi-sane value so that we can at least boot the machine, even if the load-balancer might not function correctly. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3lbhyj25sr169ha7z3qht5na@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++++++++++- kernel/sched/fair.c | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 781acb91a50a..725ee7c1c8cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5604,7 +5604,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->sgp->power) { + /* + * Even though we initialize ->power to something semi-sane, + * we leave power_orig unset. This allows us to detect if + * domain iteration is still funny without causing /0 traps. 
+ */ + if (!group->sgp->power_orig) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6075,6 +6080,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (atomic_inc_return(&sg->sgp->ref) == 1) build_group_mask(sd, sg); + /* + * Initialize sgp->power such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 54cbaa4e7b37..c9fd6d673d05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3602,7 +3602,7 @@ void update_group_power(struct sched_domain *sd, int cpu) } while (group != child->groups); } - sdg->sgp->power = power; + sdg->sgp->power_orig = sdg->sgp->power = power; } /* -- cgit v1.2.3-71-gd317 From d039ac60800fe8ed8522ec3b9ca796aaf748c18b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 21:20:16 +0200 Subject: sched: Validate assumptions in sched_init_numa() Add some code to validate assumptions we're making and output warnings if they are not. If this trigger we want to know about it. Signed-off-by: Peter Zijlstra Cc: Alex Shi Link: http://lkml.kernel.org/n/tip-6uc3wk5s9udxtdl9cnku0vtt@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 99 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 725ee7c1c8cf..2bdd17616437 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_domain_debug_enabled; +static __read_mostly int sched_debug_enabled; -static int __init sched_domain_debug_setup(char *str) +static int __init sched_debug_setup(char *str) { - sched_domain_debug_enabled = 1; + sched_debug_enabled = 1; return 0; } -early_param("sched_debug", sched_domain_debug_setup); +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) @@ -5657,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_domain_debug_enabled) + if (!sched_debug_enabled) return; if (!sd) { @@ -5678,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} #endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) @@ -6373,7 +6382,6 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol #ifdef CONFIG_NUMA static int sched_domains_numa_levels; -static int sched_domains_numa_scale; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; @@ -6438,6 +6446,42 @@ static const struct cpumask *sd_numa_mask(int cpu) return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; } +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < 
nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6445,7 +6489,6 @@ static void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_scale = curr_distance; sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); if (!sched_domains_numa_distance) return; @@ -6456,23 +6499,41 @@ static void sched_init_numa(void) * * Assumes node_distance(0,j) includes all distances in * node_distance(i,j) in order to avoid cubic time. - * - * XXX: could be optimized to O(n log n) by using sort() */ next_distance = curr_distance; for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - int distance = node_distance(0, j); - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; } /* * 'level' contains the number of unique distances, excluding the -- cgit v1.2.3-71-gd317 From a841f8cef4bb124f0f5563314d0beaf2e1249d72 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 5 Jun 2012 13:44:36 -0500 Subject: sched: Fix the relax_domain_level boot parameter It does not get processed because sched_domain_level_max is 0 at the time that setup_relax_domain_level() is run. Simply accept the value as it is, as we don't know the value of sched_domain_level_max until sched domain construction is completed. Fix sched_relax_domain_level in cpuset. The build_sched_domain() routine calls the set_domain_attribute() routine prior to setting the sd->level, however, the set_domain_attribute() routine relies on the sd->level to decide whether idle load balancing will be off/on. 
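A minimal sketch of the corrected ordering in build_sched_domain(), condensed from the diff below for illustration (not a verbatim excerpt):

	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;
	}
	sd->child = child;
	/* Only now is sd->level valid, so attributes that depend on it
	 * (the relax_domain_level comparison) can be applied safely. */
	set_domain_attribute(sd, attr);
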
Signed-off-by: Dimitri Sivanich Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120605184436.GA15668@sgi.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2bdd17616437..d5594a4268d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6268,11 +6268,8 @@ int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -6698,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; @@ -6706,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, child->parent = sd; } sd->child = child; + set_domain_attribute(sd, attr); return sd; } -- cgit v1.2.3-71-gd317 From 967db0ea65b0bf8507a7643ac8f296c4f2c0a834 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Wed, 6 Jun 2012 18:51:35 -0700 Subject: cgroup: make sure that decisions in __css_put are atomic __css_put is using atomic_dec on the ref count, and then looking at the ref count to make decisions. This is prone to races, as someone else may decrement ref count between our decrement and our decision. Instead, we should base our decisions on the value that we decremented the ref count to. (This results in an actual race on Google's kernel which I haven't been able to reproduce on the upstream kernel. Having said that, it's still incorrect by inspection). Signed-off-by: Salman Qazi Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 72fcd3069a90..ceeafe874b3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4984,8 +4984,7 @@ void __css_put(struct cgroup_subsys_state *css) struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - atomic_dec(&css->refcnt); - switch (css_refcnt(css)) { + switch (atomic_dec_return(&css->refcnt)) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.2.3-71-gd317 From f2bf1f6f5f89d031245067512449fc889b2f4bb2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jun 2012 19:50:40 -0400 Subject: tracing: Have tracing_off() actually turn tracing off A recent update to have tracing_on/off() only affect the ftrace ring buffers instead of all ring buffers had a cut and paste error. The tracing_off() did the exact same thing as tracing_on() and would not actually turn off tracing. Unfortunately, tracing_off() is more important to be working than tracing_on() as this is a key development tool, as it lets the developer turn off tracing as soon as a problem is discovered. It is also used by panic and oops code. 
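As an aside on why tracing_off() matters as a development tool: the usual debugging pattern freezes the ring buffer the instant a problem is detected, along the lines of the following sketch (the corruption check is hypothetical):

	if (unlikely(object_is_corrupted(obj))) {	/* hypothetical check */
		tracing_off();	/* freeze the ftrace ring buffer right here */
		WARN_ON_ONCE(1);
	}

With the cut-and-paste bug, that tracing_off() call would re-enable recording instead, so the events leading up to the failure would be overwritten rather than preserved.
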
This bug also breaks the 'echo func:traceoff > set_ftrace_filter' Cc: # 3.4 Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6177db..49249c28690d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); void tracing_off(void) { if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + ring_buffer_record_off(global_trace.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race -- cgit v1.2.3-71-gd317 From 8f5af6f1f2d09fe5eac86a5dc1731a5917c1503a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 May 2012 08:31:53 -0700 Subject: rcu: RCU_FAST_NO_HZ detection of callback adoption In the present implementations of CPU hotplug, the outgoing CPU is guaranteed to run its stop-machine process on the way out, which will guarantee that RCU_FAST_NO_HZ forces the CPU out of dyntick-idle mode. However, new versions of CPU hotplug might not work this way. This commit therefore removes this design constraint by explicitly notifying CPUs when they adopt non-lazy RCU callbacks. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88d92d0..3b0f1337f75b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1397,6 +1397,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; rdp->n_cbs_adopted += rsp->qlen; + if (rsp->qlen_lazy != rsp->qlen) + rcu_idle_count_callbacks_posted(); rsp->qlen_lazy = 0; rsp->qlen = 0; -- cgit v1.2.3-71-gd317 From fd4b352687fd8604d49c190c4c9ea9e369fd42d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 May 2012 19:10:35 -0700 Subject: rcu: Update RCU_FAST_NO_HZ tracing for lazy callbacks In the current code, a short dyntick-idle interval (where there is at least one non-lazy callback on the CPU) and a long dyntick-idle interval (where there are only lazy callbacks on the CPU) are traced identically, which can be less than helpful. This commit therefore emits different event traces in these two cases. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- include/trace/events/rcu.h | 1 + kernel/rcutree_plugin.h | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 1480900c511c..d274734b2aa4 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -289,6 +289,7 @@ TRACE_EVENT(rcu_dyntick, * "In holdoff": Nothing to do, holding off after unsuccessful attempt. * "Begin holdoff": Attempt failed, don't retry until next jiffy. * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. + * "Dyntick with lazy callbacks": Entering dyntick-idle w/lazy callbacks. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! * "Timer": Timer fired to cause CPU to continue processing callbacks. 
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d9869..5449f02c4820 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2165,15 +2165,17 @@ static void rcu_prepare_for_idle(int cpu) !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_GP_DELAY; - else + } else { per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_LAZY_GP_DELAY; + trace_rcu_prep_idle("Dyntick with lazy callbacks"); + } tp = &per_cpu(rcu_idle_gp_timer, cpu); mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); per_cpu(rcu_nonlazy_posted_snap, cpu) = -- cgit v1.2.3-71-gd317 From 5955f7eecd77d6b440db278b266cfecdb72ecd00 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 12:07:05 -0700 Subject: rcu: Move RCU_FAST_NO_HZ per-CPU variables to rcu_dynticks structure The RCU_FAST_NO_HZ code relies on a number of per-CPU variables. This works, but is hidden from someone scanning the data structures in rcutree.h. This commit therefore converts these per-CPU variables to fields in the per-CPU rcu_dynticks structures. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.h | 14 +++++++ kernel/rcutree_plugin.h | 99 ++++++++++++++++++++++--------------------------- 2 files changed, 58 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138dedf5..ea056495783e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,6 +84,20 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_RCU_FAST_NO_HZ + int dyntick_drain; /* Prepare-for-idle state variable. */ + unsigned long dyntick_holdoff; + /* No retries for the jiffy of failure. */ + struct timer_list idle_gp_timer; + /* Wake up CPU sleeping with callbacks. */ + unsigned long idle_gp_timer_expires; + /* When to wake up CPU (for repost). */ + bool idle_first_pass; /* First pass of attempt to go idle? */ + unsigned long nonlazy_posted; + /* # times non-lazy CBs posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* idle-period nonlazy_posted snapshot. */ +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5449f02c4820..6bd9637d5d83 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1962,21 +1962,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* Loop counter for rcu_prepare_for_idle(). */ -static DEFINE_PER_CPU(int, rcu_dyntick_drain); -/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. 
*/ -static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); -/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ -static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); -/* Enable special processing on first attempt to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(bool, rcu_idle_first_pass); -/* Running count of non-lazy callbacks posted, never decremented. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); -/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); - /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no * callbacks on this CPU, (2) this CPU has not yet attempted to enter @@ -1988,13 +1973,15 @@ static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); */ int rcu_needs_cpu(int cpu) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + /* Flag a new idle sojourn to the idle-entry state machine. */ - per_cpu(rcu_idle_first_pass, cpu) = 1; + rdtp->idle_first_pass = 1; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(cpu)) return 0; /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; + return rdtp->dyntick_holdoff == jiffies; } /* @@ -2075,21 +2062,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) */ static void rcu_prepare_for_idle_init(int cpu) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_timer_func, cpu); - per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; - per_cpu(rcu_idle_first_pass, cpu) = 1; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + rdtp->dyntick_holdoff = jiffies - 1; + setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); + rdtp->idle_gp_timer_expires = jiffies - 1; + rdtp->idle_first_pass = 1; } /* * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * is no longer any point to ->idle_gp_timer, so cancel it. This will * do nothing if this timer is not active, so just cancel it unconditionally. */ static void rcu_cleanup_after_idle(int cpu) { - del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); } @@ -2108,42 +2098,41 @@ static void rcu_cleanup_after_idle(int cpu) * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * later. The ->dyntick_drain field controls the sequencing. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle * loop, then don't take any state-machine actions, unless the * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * Instead, repost the ->idle_gp_timer if this CPU has callbacks * pending. 
*/ - if (!per_cpu(rcu_idle_first_pass, cpu) && - (per_cpu(rcu_nonlazy_posted, cpu) == - per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (!rdtp->idle_first_pass && + (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { if (rcu_cpu_has_callbacks(cpu)) { - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); } return; } - per_cpu(rcu_idle_first_pass, cpu) = 0; - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu) - 1; + rdtp->idle_first_pass = 0; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. */ if (!rcu_cpu_has_callbacks(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; + rdtp->dyntick_holdoff = jiffies - 1; + rdtp->dyntick_drain = 0; trace_rcu_prep_idle("No callbacks"); return; } @@ -2152,38 +2141,37 @@ static void rcu_prepare_for_idle(int cpu) * If in holdoff mode, just return. We will presumably have * refrained from disabling the scheduling-clock tick. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + if (rdtp->dyntick_holdoff == jiffies) { trace_rcu_prep_idle("In holdoff"); return; } - /* Check and update the rcu_dyntick_drain sequencing. */ - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* Check and update the ->dyntick_drain sequencing. */ + if (rdtp->dyntick_drain <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; - } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + rdtp->dyntick_drain = RCU_IDLE_FLUSHES; + } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_drain = 0; + rdtp->dyntick_holdoff = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_GP_DELAY; } else { - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_LAZY_GP_DELAY; trace_rcu_prep_idle("Dyntick with lazy callbacks"); } - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; return; /* Nothing more to do immediately. */ - } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + } else if (--(rdtp->dyntick_drain) <= 0) { /* We have hit the limit, so time to give up. */ - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_holdoff = jiffies; trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. 
*/ return; @@ -2229,7 +2217,7 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_idle_count_callbacks_posted(void) { - __this_cpu_add(rcu_nonlazy_posted, 1); + __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ @@ -2240,11 +2228,12 @@ static void rcu_idle_count_callbacks_posted(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct timer_list *tltp = &rdtp->idle_gp_timer; sprintf(cp, "drain=%d %c timer=%lu", - per_cpu(rcu_dyntick_drain, cpu), - per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + rdtp->dyntick_drain, + rdtp->dyntick_holdoff == jiffies ? 'H' : '.', timer_pending(tltp) ? tltp->expires - jiffies : -1); } -- cgit v1.2.3-71-gd317 From aa9b16306e3243229580ff889cc59fd66bf77973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 16:41:44 -0700 Subject: rcu: Precompute RCU_FAST_NO_HZ timer offsets When a CPU is entering dyntick-idle mode, tick_nohz_stop_sched_tick() calls rcu_needs_cpu() see if RCU needs that CPU, and, if not, computes the next wakeup time based on the timer wheels. Only later, when actually entering the idle loop, rcu_prepare_for_idle() will be invoked. In some cases, rcu_prepare_for_idle() will post timers to wake the CPU back up. But all for naught: The next wakeup time for the CPU has already been computed, and posting a timer afterwards does not force that wakeup time to be recomputed. This means that rcu_prepare_for_idle()'s have no effect. This is not a problem on a busy system because something else will wake up the CPU soon enough. However, on lightly loaded systems, the CPU might stay asleep for a considerable length of time. If that CPU has a callback that the rest of the system is waiting on, the system might run very slowly or (in theory) even hang. This commit avoids this problem by having rcu_needs_cpu() give tick_nohz_stop_sched_tick() an estimate of when RCU will need the CPU to wake back up, which tick_nohz_stop_sched_tick() takes into account when programming the CPU's wakeup time. An alternative approach is for rcu_prepare_for_idle() to use hrtimers instead of normal timers, but timers are much more efficient than are hrtimers for frequently and repeatedly posting and cancelling a given timer, which is exactly what RCU_FAST_NO_HZ does. Reported-by: Pascal Chapperon Reported-by: Heiko Carstens Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- include/linux/rcutiny.h | 6 +++-- include/linux/rcutree.h | 2 +- kernel/rcutree_plugin.h | 66 +++++++++++++++++++++++++++++++----------------- kernel/time/tick-sched.c | 7 ++++- 4 files changed, 54 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index adb5e5a38cae..854dc4c5c271 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,8 +87,9 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU -static inline int rcu_needs_cpu(int cpu) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return 0; } @@ -96,8 +97,9 @@ static inline int rcu_needs_cpu(int cpu) int rcu_preempt_needs_cpu(void); -static inline int rcu_needs_cpu(int cpu) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_preempt_needs_cpu(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3c6083cde4fc..952b79339304 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -32,7 +32,7 @@ extern void rcu_init(void); extern void rcu_note_context_switch(int cpu); -extern int rcu_needs_cpu(int cpu); +extern int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies); extern void rcu_cpu_stall_reset(void); /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6bd9637d5d83..5271a020887e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(int cpu) +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } @@ -1962,28 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return rdtp->dyntick_holdoff == jiffies; -} - /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2026,6 +2005,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) rcu_preempt_cpu_has_nonlazy_callbacks(cpu); } +/* + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. 
Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + * + * The delta_jiffies argument is used to store the time when RCU is + * going to need the CPU again if it still has callbacks. The reason + * for this is that rcu_prepare_for_idle() might need to post a timer, + * but if so, it will do so after tick_nohz_stop_sched_tick() has set + * the wakeup time for this CPU. This means that RCU's timer can be + * delayed until the wakeup time, which defeats the purpose of posting + * a timer. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Flag a new idle sojourn to the idle-entry state machine. */ + rdtp->idle_first_pass = 1; + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) { + *delta_jiffies = ULONG_MAX; + return 0; + } + if (rdtp->dyntick_holdoff == jiffies) { + /* RCU recently tried and failed, so don't try again. */ + *delta_jiffies = 1; + return 1; + } + /* Set up for the possibility that RCU will post a timer. */ + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + *delta_jiffies = RCU_IDLE_GP_DELAY; + else + *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + return 0; +} + /* * Handler for smp_call_function_single(). The only point of this * handler is to wake the CPU up, so the handler does only tracing. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..52f5ebbd443b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off -- cgit v1.2.3-71-gd317 From bafb282df29c1524b1617019adebd6d0c3eb7a47 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal A fix for commit b32dfe377102 ("c/r: prctl: add ability to set new mm_struct::exe_file"). After removing mm->num_exe_file_vmas kernel keeps mm->exe_file until final mmput(), it never becomes NULL while task is alive. We can check for other mapped files in mm instead of checking mm->num_exe_file_vmas, and mark mm with flag MMF_EXE_FILE_CHANGED in order to forbid second changing of mm->exe_file. 
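For reference, checkpoint/restore tools drive this path from user space via prctl(). A minimal sketch, assuming CAP_SYS_RESOURCE, CONFIG_CHECKPOINT_RESTORE, and a libc whose headers expose the PR_SET_MM constants (the path is illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/prctl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/restored.exe", O_RDONLY); /* illustrative path */

		if (fd < 0)
			return 1;
		if (prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0, 0))
			perror("PR_SET_MM_EXE_FILE"); /* EBUSY while other files are
							 mapped; EPERM on a second
							 change after this patch */
		close(fd);
		return 0;
	}
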
Signed-off-by: Konstantin Khlebnikov Reviewed-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/sys.c | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..c688d4cc2e40 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -439,6 +439,7 @@ extern int get_dumpable(struct mm_struct *mm); /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ +#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/sys.c b/kernel/sys.c index 9ff89cb9657a..54f20fdee93c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1796,17 +1796,11 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma, static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { + struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; - /* - * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's - * remain. So perform a quick test first. - */ - if (mm->num_exe_file_vmas) - return -EBUSY; - exe_file = fget(fd); if (!exe_file) return -EBADF; @@ -1827,17 +1821,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; + down_write(&mm->mmap_sem); + + /* + * Forbid mm->exe_file change if there are mapped other files. + */ + err = -EBUSY; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file && !path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_unlock; + } + /* * The symlink can be changed only once, just to disallow arbitrary * transitions malicious software might bring in. This means one * could make a snapshot over all processes running and monitor * /proc/pid/exe changes to notice unusual activity if needed. */ - down_write(&mm->mmap_sem); - if (likely(!mm->exe_file)) - set_mm_exe_file(mm, exe_file); - else - err = -EBUSY; + err = -EPERM; + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) + goto exit_unlock; + + set_mm_exe_file(mm, exe_file); +exit_unlock: up_write(&mm->mmap_sem); exit: -- cgit v1.2.3-71-gd317 From 1ad75b9e16280ca4e2501a629a225319cf2eef2e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: c/r: prctl: add minimal address test to PR_SET_MM Make sure the address being set is greater than mmap_min_addr (as suggested by Kees Cook). 
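An illustrative consequence, assuming the common vm.mmap_min_addr default of 65536: a PR_SET_MM request pointing below that floor now fails up front with -EINVAL, where previously this early check only rejected addresses at or above TASK_SIZE:

	/* Hypothetical call: 0x1000 < mmap_min_addr, so prctl() now
	 * returns -EINVAL before any VMA lookup is attempted. */
	if (prctl(PR_SET_MM, PR_SET_MM_START_STACK, 0x1000, 0, 0) < 0)
		perror("PR_SET_MM_START_STACK");
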
Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Serge Hallyn Cc: Tejun Heo Cc: Pavel Emelyanov Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 54f20fdee93c..19a2c7139960 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1869,7 +1869,7 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; -- cgit v1.2.3-71-gd317 From 300f786b2683f8bb1ec0afb6e1851183a479c86d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: c/r: prctl: add ability to get clear_tid_address Zero is written at clear_tid_address when the process exits. This functionality is used by pthread_join(). We already have sys_set_tid_address() to change this address for the current task but there is no way to obtain it from user space. Without the ability to find this address and dump it we can't restore pthread'ed apps which call pthread_join() once they have been restored. This patch introduces the PR_GET_TID_ADDRESS prctl option which allows the current process to obtain own clear_tid_address. This feature is available iif CONFIG_CHECKPOINT_RESTORE is set. [akpm@linux-foundation.org: fix prctl numbering] Signed-off-by: Andrew Vagin Signed-off-by: Cyrill Gorcunov Cc: Pedro Alves Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Tejun Heo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 10 ++++++---- kernel/sys.c | 13 +++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 711e0a30aacc..3988012255dc 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -127,8 +127,8 @@ #define PR_SET_PTRACER 0x59616d61 # define PR_SET_PTRACER_ANY ((unsigned long)-1) -#define PR_SET_CHILD_SUBREAPER 36 -#define PR_GET_CHILD_SUBREAPER 37 +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 /* * If no_new_privs is set, then operations that grant new privileges (i.e. @@ -142,7 +142,9 @@ * asking selinux for a specific new context (e.g. with runcon) will result * in execve returning -EPERM. 
*/ -#define PR_SET_NO_NEW_PRIVS 38 -#define PR_GET_NO_NEW_PRIVS 39 +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 + +#define PR_GET_TID_ADDRESS 40 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 19a2c7139960..0ec1942ba7ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1988,12 +1988,22 @@ out: up_read(&mm->mmap_sem); return error; } + +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return put_user(me->clear_child_tid, tid_addr); +} + #else /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { return -EINVAL; } +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return -EINVAL; +} #endif SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -2131,6 +2141,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; default: return -EINVAL; } -- cgit v1.2.3-71-gd317 From 736f24d5e59d699c6e300c5da7e3bb882eddda67 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: c/r: prctl: drop VMA flags test on PR_SET_MM_ stack data assignment In commit b76437579d13 ("procfs: mark thread stack correctly in proc//maps") the stack allocated via clone() is marked in /proc//maps as [stack:%d] thus it might be out of the former mm->start_stack/end_stack values (and even has some custom VMA flags set). So to be able to restore mm->start_stack/end_stack drop vma flags test, but still require the underlying VMA to exist. As always note this feature is under CONFIG_CHECKPOINT_RESTORE and requires CAP_SYS_RESOURCE to be granted. Signed-off-by: Cyrill Gorcunov Cc: Oleg Nesterov Acked-by: Kees Cook Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 0ec1942ba7ea..f0ec44dcd415 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1786,14 +1786,6 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE -static bool vma_flags_mismatch(struct vm_area_struct *vma, - unsigned long required, - unsigned long banned) -{ - return (vma->vm_flags & required) != required || - (vma->vm_flags & banned); -} - static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct vm_area_struct *vma; @@ -1931,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EFAULT; goto out; } -#ifdef CONFIG_STACK_GROWSUP - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) -#else - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) -#endif - goto out; if (opt == PR_SET_MM_START_STACK) mm->start_stack = addr; else if (opt == PR_SET_MM_ARG_START) -- cgit v1.2.3-71-gd317 From 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:14 -0700 Subject: mm: correctly synchronize rss-counters at exit/exec mm->rss_stat counters have per-task delta: task->rss_stat. Before changing task->mm pointer the kernel must flush this delta with sync_mm_rss(). do_exit() already calls sync_mm_rss() to flush the rss-counters before committing the rss statistics into task->signal->maxrss, taskstats, audit and other stuff. 
Unfortunately the kernel does this before calling mm_release(), which can call put_user() for processing task->clear_child_tid. So at this point we can trigger page-faults and task->rss_stat becomes non-zero again. As a result mm->rss_stat becomes inconsistent and check_mm() will print something like this: | BUG: Bad rss-counter state mm:ffff88020813c380 idx:1 val:-1 | BUG: Bad rss-counter state mm:ffff88020813c380 idx:2 val:1 This patch moves sync_mm_rss() into mm_release(), and moves mm_release() out of do_exit() and calls it earlier. After mm_release() there should be no pagefaults. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Konstantin Khlebnikov Reported-by: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: [3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - kernel/exit.c | 13 ++++++++----- kernel/fork.c | 8 ++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index a79786a8d2c8..b926ed19301e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,7 +819,6 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..804fb6bb8161 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,6 +423,7 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ + mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -640,7 +641,6 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; - mm_release(tsk, mm); if (!mm) return; /* @@ -960,9 +960,13 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); + + /* Set exit_code before complete_vfork_done() in mm_release() */ + tsk->exit_code = code; + + /* Release mm and sync mm's RSS info before statistics gathering */ + mm_release(tsk, tsk->mm); + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -975,7 +979,6 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); - tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..0560781c6904 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,6 +619,14 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } + + /* + * Final rss-counter synchronization. After this point there must be + * no pagefaults into this mm from the current context. Otherwise + * mm->rss_stat will be inconsistent. + */ + if (mm) + sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); -- cgit v1.2.3-71-gd317 From 48d212a2eecaca2e1875925837ad27b2f43f48a3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Jun 2012 17:54:07 -0700 Subject: Revert "mm: correctly synchronize rss-counters at exit/exec" This reverts commit 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94. It's horribly and utterly broken for at least the following reasons: - calling sync_mm_rss() from mmput() is fundamentally wrong, because there's absolutely no reason to believe that the task that does the mmput() always does it on its own VM. 
Example: fork, ptrace, /proc - you name it. - calling it *after* having done mmdrop() on it is doubly insane, since the mm struct may well be gone now. - testing mm against NULL before you call it is insane too, since a NULL mm there would have caused oopses long before. .. and those are just the three bugs I found before I decided to give up looking for me and revert it asap. I should have caught it before I even took it, but I trusted Andrew too much. Cc: Konstantin Khlebnikov Cc: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + kernel/exit.c | 13 +++++-------- kernel/fork.c | 8 -------- 3 files changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index b926ed19301e..a79786a8d2c8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,6 +819,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/kernel/exit.c b/kernel/exit.c index 804fb6bb8161..34867cc5b42a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,7 +423,6 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ - mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -641,6 +640,7 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; + mm_release(tsk, mm); if (!mm) return; /* @@ -960,13 +960,9 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - - /* Set exit_code before complete_vfork_done() in mm_release() */ - tsk->exit_code = code; - - /* Release mm and sync mm's RSS info before statistics gathering */ - mm_release(tsk, tsk->mm); - + /* sync mm's RSS info before statistics gathering */ + if (tsk->mm) + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -979,6 +975,7 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 0560781c6904..ab5211b9e622 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,14 +619,6 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } - - /* - * Final rss-counter synchronization. After this point there must be - * no pagefaults into this mm from the current context. Otherwise - * mm->rss_stat will be inconsistent. - */ - if (mm) - sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); -- cgit v1.2.3-71-gd317 From cd96891d48a945ca2011fbeceda73813d6286195 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Jun 2012 13:18:33 -0700 Subject: sched/fair: fix lots of kernel-doc warnings Fix lots of new kernel-doc warnings in kernel/sched/fair.c: Warning(kernel/sched/fair.c:3625): No description found for parameter 'env' Warning(kernel/sched/fair.c:3625): Excess function parameter 'sd' description in 'update_sg_lb_stats' Warning(kernel/sched/fair.c:3735): No description found for parameter 'env' Warning(kernel/sched/fair.c:3735): Excess function parameter 'sd' description in 'update_sd_pick_busiest' Warning(kernel/sched/fair.c:3735): Excess function parameter 'this_cpu' description in 'update_sd_pick_busiest' .. 
more warnings Signed-off-by: Randy Dunlap Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27b..d5583f9588e7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3632,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. + * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. @@ -3741,11 +3741,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, /** * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked + * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics - * @this_cpu: the current cpu * * Determine if @sg is a busier group than the previously selected * busiest group. @@ -3783,9 +3782,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu + * @env: The load balancing environment. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -3874,10 +3871,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, * Returns 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * - * @sd: The sched_domain whose packing is to be checked. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing. */ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) { @@ -3903,9 +3898,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during * load balancing. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. */ static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) @@ -4048,11 +4042,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * Also calculates the amount of weighted load which should be moved * to restore balance. * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. + * @env: The load balancing environment. 
* @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. -- cgit v1.2.3-71-gd317 From 6ebb017de9d59a18c3ff9648270e8f6abaa93438 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 5 Jun 2012 08:52:34 +0200 Subject: printk: Fix alignment of buf causing crash on ARM EABI Commit 7ff9554bb578ba02166071d2d487b7fc7d860d62, printk: convert byte-buffer to variable-length record buffer, causes systems using EABI to crash very early in the boot cycle. The first entry in struct log is a u64, which for EABI must be 8 byte aligned. Make use of __alignof__() so the compiler to decide the alignment, but allow it to be overridden using CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, for systems which can perform unaligned access and want to save a few bytes of space. Tested on Orion5x and Kirkwood. Signed-off-by: Andrew Lunn Tested-by: Stephen Warren Acked-by: Stephen Warren Acked-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 32462d2b364a..f205c25c37e2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -227,10 +227,10 @@ static u32 clear_idx; #define LOG_LINE_MAX 1024 /* record buffer */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 #else -#define LOG_ALIGN 8 +#define LOG_ALIGN __alignof__(struct log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); -- cgit v1.2.3-71-gd317 From a70270468234749741c5893ae78e5bb524771402 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 13 Jun 2012 09:35:48 -0400 Subject: watchdog: Quiet down the boot messages A bunch of bugzillas have complained how noisy the nmi_watchdog is during boot-up especially with its expected failure cases (like virt and bios resource contention). This is my attempt to quiet them down and keep it less confusing for the end user. What I did is print the message for cpu0 and save it for future comparisons. If future cpus have an identical message as cpu0, then don't print the redundant info. However, if a future cpu has a different message, happily print that loudly. Before the change, you would see something like: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz stepping 0a Performance Events: PEBS fmt0+, Core2 events, Intel PMU driver. ... version: 2 ... bit width: 40 ... generic registers: 2 ... value mask: 000000ffffffffff ... max period: 000000007fffffff ... fixed-purpose events: 3 ... event mask: 0000000700000003 NMI watchdog enabled, takes one hw-pmu counter. Booting Node 0, Processors #1 NMI watchdog enabled, takes one hw-pmu counter. #2 NMI watchdog enabled, takes one hw-pmu counter. #3 Ok. NMI watchdog enabled, takes one hw-pmu counter. Brought up 4 CPUs Total of 4 processors activated (22607.24 BogoMIPS). After the change, it is simplified to: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz stepping 0a Performance Events: PEBS fmt0+, Core2 events, Intel PMU driver. ... version: 2 ... bit width: 40 ... generic registers: 2 ... value mask: 000000ffffffffff ... max period: 000000007fffffff ... fixed-purpose events: 3 ... 
event mask: 0000000700000003 NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. Booting Node 0, Processors #1 #2 #3 Ok. Brought up 4 CPUs V2: little changes based on Joe Perches' feedback V3: printk cleanup based on Ingo's feedback; checkpatch fix V4: keep printk as one long line V5: Ingo fix ups Reported-and-tested-by: Nathan Zimmer Signed-off-by: Don Zickus Cc: nzimmer@sgi.com Cc: joe@perches.com Link: http://lkml.kernel.org/r/1339594548-17227-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85b8c7c..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -372,6 +372,13 @@ static int watchdog(void *unused) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + static int watchdog_nmi_enable(int cpu) { struct perf_event_attr *wd_attr; @@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + if (!IS_ERR(event)) { - pr_info("enabled, takes one hw-pmu counter.\n"); + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) -- cgit v1.2.3-71-gd317 From e2ae715d66bf4becfb85eb84b7150e23cf27df30 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 15 Jun 2012 14:07:51 +0200 Subject: kmsg - kmsg_dump() use iterator to receive log buffer content Provide an iterator to receive the log buffer content, and convert all kmsg_dump() users to it. The structured data in the kmsg buffer now contains binary data, which should no longer be copied verbatim to the kmsg_dump() users. The iterator should provide reliable access to the buffer data, and also supports proper log line-aware chunking of data while iterating. 
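The converted call style, as a minimal sketch of a dumper built on the new line-aware iterator (the output helper is hypothetical):

	static void my_dump(struct kmsg_dumper *dumper,
			    enum kmsg_dump_reason reason)
	{
		static char line[1024];
		size_t len;

		/* Pull the log one formatted record at a time; 'true' asks
		 * for the syslog prefix to be included in each line. */
		while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
			emit_line(line, len);	/* emit_line() is hypothetical */
	}

	static struct kmsg_dumper my_dumper = {
		.dump = my_dump,
	};
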
Signed-off-by: Kay Sievers
Tested-by: Tony Luck
Reported-by: Anton Vorontsov
Tested-by: Anton Vorontsov
Signed-off-by: Greg Kroah-Hartman
---
 arch/powerpc/platforms/pseries/nvram.c | 61 +-------
 arch/x86/platform/mrst/early_printk_mrst.c | 13 +-
 drivers/mtd/mtdoops.c | 22 +--
 fs/pstore/platform.c | 34 ++---
 include/linux/kmsg_dump.h | 45 +++++-
 kernel/printk.c | 220 +++++++++++++++++++++++++----
 6 files changed, 258 insertions(+), 137 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 36f957f31842..8733a86ad52e 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -68,9 +68,7 @@ static const char *pseries_nvram_os_partitions[] = {
 };

 static void oops_to_nvram(struct kmsg_dumper *dumper,
-			  enum kmsg_dump_reason reason,
-			  const char *old_msgs, unsigned long old_len,
-			  const char *new_msgs, unsigned long new_len);
+			  enum kmsg_dump_reason reason);

 static struct kmsg_dumper nvram_kmsg_dumper = {
 	.dump = oops_to_nvram
@@ -503,28 +501,6 @@ int __init pSeries_nvram_init(void)
 	return 0;
 }

-/*
- * Try to capture the last capture_len bytes of the printk buffer.  Return
- * the amount actually captured.
- */
-static size_t capture_last_msgs(const char *old_msgs, size_t old_len,
-				const char *new_msgs, size_t new_len,
-				char *captured, size_t capture_len)
-{
-	if (new_len >= capture_len) {
-		memcpy(captured, new_msgs + (new_len - capture_len),
-		       capture_len);
-		return capture_len;
-	} else {
-		/* Grab the end of old_msgs. */
-		size_t old_tail_len = min(old_len, capture_len - new_len);
-		memcpy(captured, old_msgs + (old_len - old_tail_len),
-		       old_tail_len);
-		memcpy(captured + old_tail_len, new_msgs, new_len);
-		return old_tail_len + new_len;
-	}
-}
-
 /*
  * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
  * would logging this oops/panic overwrite an RTAS event that rtas_errd
@@ -541,27 +517,6 @@ static int clobbering_unread_rtas_event(void)
 		NVRAM_RTAS_READ_TIMEOUT);
 }

-/* Squeeze out each line's <n> severity prefix. */
-static size_t elide_severities(char *buf, size_t len)
-{
-	char *in, *out, *buf_end = buf + len;
-	/* Assume a <n> at the very beginning marks the start of a line. */
-	int newline = 1;
-
-	in = out = buf;
-	while (in < buf_end) {
-		if (newline && in+3 <= buf_end &&
-		    *in == '<' && isdigit(in[1]) && in[2] == '>') {
-			in += 3;
-			newline = 0;
-		} else {
-			newline = (*in == '\n');
-			*out++ = *in++;
-		}
-	}
-	return out - buf;
-}
-
 /* Derived from logfs_compress() */
 static int nvram_compress(const void *in, void *out, size_t inlen, size_t outlen)
@@ -619,9 +574,7 @@ static int zip_oops(size_t text_len)
  * partition.  If that's too much, go back and capture uncompressed text.
  */
 static void oops_to_nvram(struct kmsg_dumper *dumper,
-			  enum kmsg_dump_reason reason,
-			  const char *old_msgs, unsigned long old_len,
-			  const char *new_msgs, unsigned long new_len)
+			  enum kmsg_dump_reason reason)
 {
 	static unsigned int oops_count = 0;
 	static bool panicking = false;
@@ -660,14 +613,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 		return;

 	if (big_oops_buf) {
-		text_len = capture_last_msgs(old_msgs, old_len,
-			new_msgs, new_len, big_oops_buf, big_oops_buf_sz);
-		text_len = elide_severities(big_oops_buf, text_len);
+		kmsg_dump_get_buffer(dumper, false,
+				     big_oops_buf, big_oops_buf_sz, &text_len);
 		rc = zip_oops(text_len);
 	}
 	if (rc != 0) {
-		text_len = capture_last_msgs(old_msgs, old_len,
-				new_msgs, new_len, oops_data, oops_data_sz);
+		kmsg_dump_rewind(dumper);
+		kmsg_dump_get_buffer(dumper, true,
+				     oops_data, oops_data_sz, &text_len);
 		err_type = ERR_TYPE_KERNEL_PANIC;
 		*oops_len = (u16) text_len;
 	}
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 3c6e328483c7..028454f0c3a5 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -110,19 +110,16 @@ static struct kmsg_dumper dw_dumper;
 static int dumper_registered;

 static void dw_kmsg_dump(struct kmsg_dumper *dumper,
-			 enum kmsg_dump_reason reason,
-			 const char *s1, unsigned long l1,
-			 const char *s2, unsigned long l2)
+			 enum kmsg_dump_reason reason)
 {
-	int i;
+	static char line[1024];
+	size_t len;

 	/* When run to this, we'd better re-init the HW */
 	mrst_early_console_init();

-	for (i = 0; i < l1; i++)
-		early_mrst_console.write(&early_mrst_console, s1 + i, 1);
-	for (i = 0; i < l2; i++)
-		early_mrst_console.write(&early_mrst_console, s2 + i, 1);
+	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
+		early_mrst_console.write(&early_mrst_console, line, len);
 }

 /* Set the ratio rate to 115200, 8n1, IRQ disabled */
diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c
index ae36d7e1e913..551e316e4454 100644
--- a/drivers/mtd/mtdoops.c
+++ b/drivers/mtd/mtdoops.c
@@ -304,32 +304,17 @@ static void find_next_position(struct mtdoops_context *cxt)
 }

 static void mtdoops_do_dump(struct kmsg_dumper *dumper,
-			    enum kmsg_dump_reason reason, const char *s1,
-			    unsigned long l1, const char *s2, unsigned long l2)
+			    enum kmsg_dump_reason reason)
 {
 	struct mtdoops_context *cxt = container_of(dumper,
 			struct mtdoops_context, dump);
-	unsigned long s1_start, s2_start;
-	unsigned long l1_cpy, l2_cpy;
-	char *dst;
-
-	if (reason != KMSG_DUMP_OOPS &&
-	    reason != KMSG_DUMP_PANIC)
-		return;

 	/* Only dump oopses if dump_oops is set */
 	if (reason == KMSG_DUMP_OOPS && !dump_oops)
 		return;

-	dst = cxt->oops_buf + MTDOOPS_HEADER_SIZE; /* Skip the header */
-	l2_cpy = min(l2, record_size - MTDOOPS_HEADER_SIZE);
-	l1_cpy = min(l1, record_size - MTDOOPS_HEADER_SIZE - l2_cpy);
-
-	s2_start = l2 - l2_cpy;
-	s1_start = l1 - l1_cpy;
-
-	memcpy(dst, s1 + s1_start, l1_cpy);
-	memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
+	kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE,
+			     record_size - MTDOOPS_HEADER_SIZE, NULL);

 	/* Panics must be written immediately */
 	if (reason != KMSG_DUMP_OOPS)
@@ -375,6 +360,7 @@ static void mtdoops_notify_add(struct mtd_info *mtd)
 		return;
 	}

+	cxt->dump.max_reason = KMSG_DUMP_OOPS;
 	cxt->dump.dump = mtdoops_do_dump;
 	err = kmsg_dump_register(&cxt->dump);
 	if (err) {
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 82c585f715e3..03ce7a9b81cc 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -94,20 +94,15 @@ static const char *get_reason_str(enum kmsg_dump_reason reason)
  * as we can from the end of the buffer.
  */
 static void pstore_dump(struct kmsg_dumper *dumper,
-			enum kmsg_dump_reason reason,
-			const char *s1, unsigned long l1,
-			const char *s2, unsigned long l2)
+			enum kmsg_dump_reason reason)
 {
-	unsigned long s1_start, s2_start;
-	unsigned long l1_cpy, l2_cpy;
-	unsigned long size, total = 0;
-	char *dst;
+	unsigned long total = 0;
 	const char *why;
 	u64 id;
-	int hsize, ret;
 	unsigned int part = 1;
 	unsigned long flags = 0;
 	int is_locked = 0;
+	int ret;

 	why = get_reason_str(reason);

@@ -119,30 +114,25 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 		spin_lock_irqsave(&psinfo->buf_lock, flags);
 	oopscount++;
 	while (total < kmsg_bytes) {
+		char *dst;
+		unsigned long size;
+		int hsize;
+		size_t len;
+
 		dst = psinfo->buf;
 		hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part);
 		size = psinfo->bufsize - hsize;
 		dst += hsize;

-		l2_cpy = min(l2, size);
-		l1_cpy = min(l1, size - l2_cpy);
-
-		if (l1_cpy + l2_cpy == 0)
+		if (!kmsg_dump_get_buffer(dumper, true, dst, size, &len))
 			break;

-		s2_start = l2 - l2_cpy;
-		s1_start = l1 - l1_cpy;
-
-		memcpy(dst, s1 + s1_start, l1_cpy);
-		memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
-
 		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
-				   hsize + l1_cpy + l2_cpy, psinfo);
+				    hsize + len, psinfo);
 		if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
 			pstore_new_entry = 1;
-		l1 -= l1_cpy;
-		l2 -= l2_cpy;
-		total += l1_cpy + l2_cpy;
+
+		total += hsize + len;
 		part++;
 	}
 	if (in_nmi()) {
diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 35f7237ec972..af4eb5a39d9a 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -21,6 +21,7 @@
  * is passed to the kernel.
  */
 enum kmsg_dump_reason {
+	KMSG_DUMP_UNDEF,
 	KMSG_DUMP_PANIC,
 	KMSG_DUMP_OOPS,
 	KMSG_DUMP_EMERG,
@@ -31,23 +32,37 @@ enum kmsg_dump_reason {

 /**
  * struct kmsg_dumper - kernel crash message dumper structure
- * @dump:	The callback which gets called on crashes. The buffer is passed
- *		as two sections, where s1 (length l1) contains the older
- *		messages and s2 (length l2) contains the newer.
  * @list:	Entry in the dumper list (private)
+ * @dump:	Call into dumping code which will retrieve the data
+ *		through the record iterator
+ * @max_reason:	filter for highest reason number that should be dumped
  * @registered:	Flag that specifies if this is already registered
  */
 struct kmsg_dumper {
-	void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason,
-		     const char *s1, unsigned long l1,
-		     const char *s2, unsigned long l2);
 	struct list_head list;
-	int registered;
+	void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason);
+	enum kmsg_dump_reason max_reason;
+	bool active;
+	bool registered;
+
+	/* private state of the kmsg iterator */
+	u32 cur_idx;
+	u32 next_idx;
+	u64 cur_seq;
+	u64 next_seq;
 };

 #ifdef CONFIG_PRINTK
 void kmsg_dump(enum kmsg_dump_reason reason);

+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+			char *line, size_t size, size_t *len);
+
+bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+			  char *buf, size_t size, size_t *len);
+
+void kmsg_dump_rewind(struct kmsg_dumper *dumper);
+
 int kmsg_dump_register(struct kmsg_dumper *dumper);

 int kmsg_dump_unregister(struct kmsg_dumper *dumper);
@@ -56,6 +71,22 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason)
 {
 }

+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+			const char *line, size_t size, size_t *len)
+{
+	return false;
+}
+
+bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+			  char *buf, size_t size, size_t *len)
+{
+	return false;
+}
+
+void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+{
+}
+
 static inline int kmsg_dump_register(struct kmsg_dumper *dumper)
 {
 	return -EINVAL;
diff --git a/kernel/printk.c b/kernel/printk.c
index f205c25c37e2..ceb4a2f775a1 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -909,7 +909,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		/*
 		 * Find first record that fits, including all following records,
 		 * into the user-provided buffer for this dump.
-		 */
+		 */
 		seq = clear_seq;
 		idx = clear_idx;
 		while (seq < log_next_seq) {
@@ -919,6 +919,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 			idx = log_next(idx);
 			seq++;
 		}
+
+		/* move first record forward until length fits into the buffer */
 		seq = clear_seq;
 		idx = clear_idx;
 		while (len > size && seq < log_next_seq) {
@@ -929,7 +931,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 			idx = log_next(idx);
 			seq++;
 		}

-		/* last message in this dump */
+		/* last message fitting into this dump */
 		next_seq = log_next_seq;
 		len = 0;
@@ -2300,48 +2302,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
  * kmsg_dump - dump kernel log to kernel message dumpers.
  * @reason: the reason (oops, panic etc) for dumping
  *
- * Iterate through each of the dump devices and call the oops/panic
- * callbacks with the log buffer.
+ * Call each registered dumper's dump() callback, which can
+ * retrieve the kmsg records with kmsg_dump_get_line() or
+ * kmsg_dump_get_buffer().
  */
 void kmsg_dump(enum kmsg_dump_reason reason)
 {
-	u64 idx;
 	struct kmsg_dumper *dumper;
-	const char *s1, *s2;
-	unsigned long l1, l2;
 	unsigned long flags;

 	if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
 		return;

-	/* Theoretically, the log could move on after we do this, but
-	   there's not a lot we can do about that. The new messages
-	   will overwrite the start of what we dump. */
+	rcu_read_lock();
+	list_for_each_entry_rcu(dumper, &dump_list, list) {
+		if (dumper->max_reason && reason > dumper->max_reason)
+			continue;
+
+		/* initialize iterator with data about the stored records */
+		dumper->active = true;
+
+		raw_spin_lock_irqsave(&logbuf_lock, flags);
+		dumper->cur_seq = clear_seq;
+		dumper->cur_idx = clear_idx;
+		dumper->next_seq = log_next_seq;
+		dumper->next_idx = log_next_idx;
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+		/* invoke dumper which will iterate over records */
+		dumper->dump(dumper, reason);
+
+		/* reset iterator */
+		dumper->active = false;
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+			char *line, size_t size, size_t *len)
+{
+	unsigned long flags;
+	struct log *msg;
+	size_t l = 0;
+	bool ret = false;
+
+	if (!dumper->active)
+		goto out;

 	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	if (syslog_seq < log_first_seq)
-		idx = syslog_idx;
-	else
-		idx = log_first_idx;
+	if (dumper->cur_seq < log_first_seq) {
+		/* messages are gone, move to first available one */
+		dumper->cur_seq = log_first_seq;
+		dumper->cur_idx = log_first_idx;
+	}

-	if (idx > log_next_idx) {
-		s1 = log_buf;
-		l1 = log_next_idx;
+	/* last entry */
+	if (dumper->cur_seq >= log_next_seq) {
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+		goto out;
+	}

-		s2 = log_buf + idx;
-		l2 = log_buf_len - idx;
-	} else {
-		s1 = "";
-		l1 = 0;
+	msg = log_from_idx(dumper->cur_idx);
+	l = msg_print_text(msg, syslog,
+			   line, size);
+
+	dumper->cur_idx = log_next(dumper->cur_idx);
+	dumper->cur_seq++;
+	ret = true;
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+out:
+	if (len)
+		*len = l;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
+
+/**
+ * kmsg_dump_get_buffer - copy kmsg log lines
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @buf: buffer to copy the lines to
+ * @size: maximum size of the buffer
+ * @len: length of data placed into buffer
+ *
+ * Start at the end of the kmsg buffer and fill the provided buffer
+ * with as many of the *youngest* kmsg records that fit into it.
+ * If the buffer is large enough, all available kmsg records will be
+ * copied with a single call.
+ *
+ * Consecutive calls will fill the buffer with the next block of
+ * available older records, not including the earlier retrieved ones.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+			  char *buf, size_t size, size_t *len)
+{
+	unsigned long flags;
+	u64 seq;
+	u32 idx;
+	u64 next_seq;
+	u32 next_idx;
+	size_t l = 0;
+	bool ret = false;
+
+	if (!dumper->active)
+		goto out;
+
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	if (dumper->cur_seq < log_first_seq) {
+		/* messages are gone, move to first available one */
+		dumper->cur_seq = log_first_seq;
+		dumper->cur_idx = log_first_idx;
+	}
+
+	/* last entry */
+	if (dumper->cur_seq >= dumper->next_seq) {
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+		goto out;
+	}
+
+	/* calculate length of entire buffer */
+	seq = dumper->cur_seq;
+	idx = dumper->cur_idx;
+	while (seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);
+
+		l += msg_print_text(msg, true, NULL, 0);
+		idx = log_next(idx);
+		seq++;
+	}
+
+	/* move first record forward until length fits into the buffer */
+	seq = dumper->cur_seq;
+	idx = dumper->cur_idx;
+	while (l > size && seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);

-		s2 = log_buf + idx;
-		l2 = log_next_idx - idx;
+		l -= msg_print_text(msg, true, NULL, 0);
+		idx = log_next(idx);
+		seq++;
 	}
+
+	/* last message in next iteration */
+	next_seq = seq;
+	next_idx = idx;
+
+	l = 0;
+	while (seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);
+
+		l += msg_print_text(msg, syslog,
+				    buf + l, size - l);
+
+		idx = log_next(idx);
+		seq++;
+	}
+
+	dumper->next_seq = next_seq;
+	dumper->next_idx = next_idx;
+	ret = true;
 	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+out:
+	if (len)
+		*len = l;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);

-	rcu_read_lock();
-	list_for_each_entry_rcu(dumper, &dump_list, list)
-		dumper->dump(dumper, reason, s1, l1, s2, l2);
-	rcu_read_unlock();
+/**
+ * kmsg_dump_rewind - reset the iterator
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ */
+void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	dumper->cur_seq = clear_seq;
+	dumper->cur_idx = clear_idx;
+	dumper->next_seq = log_next_seq;
+	dumper->next_idx = log_next_idx;
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 }
+EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
 #endif
-- cgit v1.2.3-71-gd317

From 4a77a5a06ec66ed05199b301e7c25f42f979afdc Mon Sep 17 00:00:00 2001
From: Yuanhan Liu
Date: Sat, 16 Jun 2012 21:21:51 +0800
Subject: printk: use mutex lock to stop syslog_seq from going wild

Although the syslog_seq and log_next_seq stuff is protected by the
logbuf_lock spin lock, that is not enough.  Say we have two processes
A and B, and let syslog_seq = N while log_next_seq = N + 1, and the
two processes both come to syslog_print at almost the same time.  No
matter which process gets the spin lock first, it will increase
syslog_seq by one and then release the spin lock; later, the other
process increases syslog_seq by one again.  In this case, syslog_seq
is bigger than log_next_seq, and later that makes

	wait_event_interruptible(log_wait, syslog_seq != log_next_seq)

not wait any more, even though no new writes have come.  Thus it
introduces an infinite reading loop.

I can easily see this kind of issue by the following steps:

  # cat /proc/kmsg
  # meanwhile, I don't kill rsyslog
  # So they are the two processes.

From 4a77a5a06ec66ed05199b301e7c25f42f979afdc (continued):

  # xinit
  # I added drm.debug=6 in the kernel parameter line,
  # so that it will produce lots of messages and let that
  # issue happen

It's 100% reproducible on my side.  And my disk will be filled up by
/var/log/messages in quite a short time.

So, introduce a mutex_lock to stop syslog_seq from going wild, just
like what devkmsg_read() does.  It does fix this issue as expected.

v2: use mutex_lock_interruptible() instead (comments from Kay)

Signed-off-by: Yuanhan Liu
Reviewed-by: Fengguang Wu
Acked-By: Kay Sievers
Signed-off-by: Greg Kroah-Hartman
---
 kernel/printk.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index ceb4a2f775a1..572730bd8a5c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -414,7 +414,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (!user)
 		return -EBADF;

-	mutex_lock(&user->lock);
+	ret = mutex_lock_interruptible(&user->lock);
+	if (ret)
+		return ret;
 	raw_spin_lock(&logbuf_lock);
 	while (user->seq == log_next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
@@ -976,6 +978,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
 	bool clear = false;
 	static int saved_console_loglevel = -1;
+	static DEFINE_MUTEX(syslog_mutex);
 	int error;

 	error = check_syslog_permissions(type, from_file);
@@ -1002,11 +1005,17 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			error = -EFAULT;
 			goto out;
 		}
+		error = mutex_lock_interruptible(&syslog_mutex);
+		if (error)
+			goto out;
 		error = wait_event_interruptible(log_wait,
 						 syslog_seq != log_next_seq);
-		if (error)
+		if (error) {
+			mutex_unlock(&syslog_mutex);
 			goto out;
+		}
 		error = syslog_print(buf, len);
+		mutex_unlock(&syslog_mutex);
 		break;
 	/* Read/clear last kernel messages */
 	case SYSLOG_ACTION_READ_CLEAR:
-- cgit v1.2.3-71-gd317

From b56a39ac263e5b8cafedd551a49c2105e68b98c2 Mon Sep 17 00:00:00 2001
From: Yuanhan Liu
Date: Sat, 16 Jun 2012 12:40:55 +0800
Subject: printk: return -EINVAL if the message len is bigger than the buf size

Just like what devkmsg_read() does, return -EINVAL if the message len
is bigger than the buf size, or it will trigger a segfault.

Acked-by: Kay Sievers
Acked-by: Fengguang Wu
Signed-off-by: Yuanhan Liu
Signed-off-by: Greg Kroah-Hartman
---
 kernel/printk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 572730bd8a5c..a2276b916769 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -880,7 +880,9 @@ static int syslog_print(char __user *buf, int size)
 		syslog_seq++;
 		raw_spin_unlock_irq(&logbuf_lock);

-		if (len > 0 && copy_to_user(buf, text, len))
+		if (len > size)
+			len = -EINVAL;
+		else if (len > 0 && copy_to_user(buf, text, len))
 			len = -EFAULT;

 	kfree(text);
-- cgit v1.2.3-71-gd317

From 9c5da09d266ca9b32eb16cf940f8161d949c2fe5 Mon Sep 17 00:00:00 2001
From: Salman Qazi
Date: Thu, 14 Jun 2012 15:31:09 -0700
Subject: perf: Use css_tryget() to avoid propping up css refcount

An rmdir pushes a css's ref count to zero.  However, if the associated
directory is open at the time, the dentry ref count is non-zero.  If
the fd for this directory is then passed into perf_event_open, it does
a css_get().  This bounces the ref count back up from zero.  This is a
problem by itself.  But what makes it turn into a crash is the fact
that we end up doing an extra dput, since we perform a dput when
css_put sees the ref count go down to zero.

css_tryget() does not fall into that trap.  So, we use that instead.
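Schematically, the sequence being fixed looks like this (an
illustrative timeline, not text from the patch):

	/*
	 * rmdir(cgroup dir)         css refcount drops to 0, but the dentry
	 *                           stays pinned by the open fd
	 * perf_event_open(..., fd)  css_get() bounces the refcount 0 -> 1
	 * ...
	 * css_put()                 refcount hits 0 a *second* time, so the
	 *                           release path performs an extra dput()
	 *
	 * css_tryget() instead fails once the refcount has already reached
	 * zero, so the open fd can no longer resurrect a dying css.
	 */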
Reproduction test-case for the bug:

 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <linux/unistd.h>
 #include <linux/perf_event.h>
 #include <string.h>
 #include <errno.h>
 #include <stdio.h>

 #define PERF_FLAG_PID_CGROUP (1U << 2)

 int perf_event_open(struct perf_event_attr *hw_event_uptr,
		     pid_t pid, int cpu, int group_fd, unsigned long flags)
 {
	return syscall(__NR_perf_event_open, hw_event_uptr, pid, cpu,
		       group_fd, flags);
 }

 /*
  * Directly poke at the perf_event bug, since it's proving hard to repro
  * depending on where in the kernel tree.  What moved?
  */
 int main(int argc, char **argv)
 {
	int fd;
	struct perf_event_attr attr;
	memset(&attr, 0, sizeof(attr));
	attr.exclude_kernel = 1;
	attr.size = sizeof(attr);
	mkdir("/dev/cgroup/perf_event/blah", 0777);
	fd = open("/dev/cgroup/perf_event/blah", O_RDONLY);
	perror("open");
	rmdir("/dev/cgroup/perf_event/blah");
	sleep(2);
	perf_event_open(&attr, fd, 0, -1, PERF_FLAG_PID_CGROUP);
	perror("perf_event_open");
	close(fd);
	return 0;
 }

Signed-off-by: Salman Qazi
Signed-off-by: Peter Zijlstra
Acked-by: Tejun Heo
Link: http://lkml.kernel.org/r/20120614223108.1025.2503.stgit@dungbeetle.mtv.corp.google.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f85c0154b333..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
 	return !event->cgrp || event->cgrp == cpuctx->cgrp;
 }

-static inline void perf_get_cgroup(struct perf_event *event)
+static inline bool perf_tryget_cgroup(struct perf_event *event)
 {
-	css_get(&event->cgrp->css);
+	return css_tryget(&event->cgrp->css);
 }

 static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 		event->cgrp = cgrp;

 		/* must be done before we fput() the file */
-		perf_get_cgroup(event);
+		if (!perf_tryget_cgroup(event)) {
+			event->cgrp = NULL;
+			ret = -ENOENT;
+			goto out;
+		}

 		/*
 		 * all events in a group must monitor
-- cgit v1.2.3-71-gd317

From 8e3bbf42c6d73881956863cc3305456afe2bc4ea Mon Sep 17 00:00:00 2001
From: Salman Qazi
Date: Thu, 14 Jun 2012 14:55:30 -0700
Subject: cgroups: Account for CSS_DEACT_BIAS in __css_put

When we fixed the race between atomic_dec and css_refcnt, we missed
the fact that css_refcnt internally subtracts CSS_DEACT_BIAS to get
the actual reference count.  This can potentially cause a refcount
leak if __css_put races with cgroup_clear_css_refs.

Signed-off-by: Salman Qazi
Acked-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cgroup.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ceeafe874b3f..2097684cf194 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)

 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);

+static int css_unbias_refcnt(int refcnt)
+{
+	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
+}
+
 /* the current nr of refs, always >= 0 whether @css is deactivated or not */
 static int css_refcnt(struct cgroup_subsys_state *css)
 {
 	int v = atomic_read(&css->refcnt);

-	return v >= 0 ? v : v - CSS_DEACT_BIAS;
+	return css_unbias_refcnt(v);
 }

 /* convenient tests for these bits */
@@ -4982,9 +4987,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
 void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
+	int v;

 	rcu_read_lock();
-	switch (atomic_dec_return(&css->refcnt)) {
+	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
+
+	switch (v) {
 	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
-- cgit v1.2.3-71-gd317

From 4fe7efdbdfb1c7e7a7f31decfd831c0f31d37091 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Wed, 20 Jun 2012 12:53:01 -0700
Subject: mm: correctly synchronize rss-counters at exit/exec

do_exit() and exec_mmap() call sync_mm_rss() before mm_release() does
put_user(clear_child_tid), which can update task->rss_stat and thus
make mm->rss_stat inconsistent.  This triggers the "BUG:" printk in
check_mm().

Let's fix this bug in the safest way, and optimize/clean this up
later.

Reported-by: Markus Trippelsdorf
Signed-off-by: Konstantin Khlebnikov
Cc: Oleg Nesterov
Cc: KAMEZAWA Hiroyuki
Cc: Hugh Dickins
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/exec.c     | 2 +-
 kernel/exit.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index a79786a8d2c8..da27b91ff1e8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -819,10 +819,10 @@ static int exec_mmap(struct mm_struct *mm)
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
-	sync_mm_rss(old_mm);
 	mm_release(tsk, old_mm);

 	if (old_mm) {
+		sync_mm_rss(old_mm);
 		/*
 		 * Make sure that if there is a core dump in progress
 		 * for the old mm, we get out and die instead of going
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..c0277d3f1aaa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -643,6 +643,7 @@ static void exit_mm(struct task_struct * tsk)
 	mm_release(tsk, mm);
 	if (!mm)
 		return;
+	sync_mm_rss(mm);
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_sem around checking core_state
-- cgit v1.2.3-71-gd317

From 6347e90091041e34bea625370794c92f4ce71228 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Wed, 20 Jun 2012 12:53:03 -0700
Subject: pidns: guarantee that the pidns init will be the last pidns process reaped

Today we have a twofold bug.  Sometimes release_task on pid == 1 in a
pid namespace can run before other processes in a pid namespace have
had release_task called.  With the result that pid_ns_release_proc can
be called before the last proc_flush_task() is done using
upid->ns->proc_mnt, resulting in the use of a stale pointer.  This
same set of circumstances can lead to waitpid(...) returning for a
process started with clone(CLONE_NEWPID) before every process in the
pid namespace has actually exited.

To fix this, modify zap_pid_ns_processes to wait until all other
processes in the pid namespace have exited, even EXIT_DEAD zombies.

The delay_group_leader and related tests ensure that the thread group
leader will be the last thread of a process group to be reaped, or to
become EXIT_DEAD and self reap.

With the change to zap_pid_ns_processes we get the guarantee that
pid == 1 in a pid namespace will be the last task that release_task
is called on.

With pid == 1 being the last task to pass through release_task,
pid_ns_release_proc can no longer be called too early, nor can wait
return before all of the EXIT_DEAD tasks in a pid namespace have
exited.

Signed-off-by: Eric W. Biederman
Signed-off-by: Oleg Nesterov
Cc: Louis Rilling
Cc: Mike Galbraith
Acked-by: Pavel Emelyanov
Tested-by: Andrew Wagin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/exit.c          | 14 +++++++++++++-
 kernel/pid_namespace.c | 20 ++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index c0277d3f1aaa..a85efd2348bd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,7 +64,6 @@ static void exit_mm(struct task_struct * tsk);
 static void __unhash_process(struct task_struct *p, bool group_dead)
 {
 	nr_threads--;
-	detach_pid(p, PIDTYPE_PID);
 	if (group_dead) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
@@ -72,7 +71,20 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
+		/*
+		 * If we are the last child process in a pid namespace to be
+		 * reaped, notify the reaper sleeping in zap_pid_ns_processes().
+		 */
+		if (IS_ENABLED(CONFIG_PID_NS)) {
+			struct task_struct *parent = p->real_parent;
+
+			if ((task_active_pid_ns(p)->child_reaper == parent) &&
+			    list_empty(&parent->children) &&
+			    (parent->flags & PF_EXITING))
+				wake_up_process(parent);
+		}
 	}
+	detach_pid(p, PIDTYPE_PID);
 	list_del_rcu(&p->thread_group);
 }
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	}
 	read_unlock(&tasklist_lock);

+	/* Firstly reap the EXIT_ZOMBIE children we may have. */
 	do {
 		clear_thread_flag(TIF_SIGPENDING);
 		rc = sys_wait4(-1, NULL, __WALL, NULL);
 	} while (rc != -ECHILD);

+	/*
+	 * sys_wait4() above can't reap the TASK_DEAD children.
+	 * Make sure they all go away, see __unhash_process().
+	 */
+	for (;;) {
+		bool need_wait = false;
+
+		read_lock(&tasklist_lock);
+		if (!list_empty(&current->children)) {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			need_wait = true;
+		}
+		read_unlock(&tasklist_lock);
+
+		if (!need_wait)
+			break;
+		schedule();
+	}
+
 	if (pid_ns->reboot)
 		current->signal->group_exit_code = pid_ns->reboot;
-- cgit v1.2.3-71-gd317

From 50d75f8daead8a1f850c40a3b6c6575ab19b48cf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 20 Jun 2012 12:53:04 -0700
Subject: pidns: find_new_reaper() can no longer switch to init_pid_ns.child_reaper

find_new_reaper() changes pid_ns->child_reaper, see add0d4df ("pid_ns:
zap_pid_ns_processes: fix the ->child_reaper changing").

The original reason has gone away after the previous patch: the
->children list must be empty after zap_pid_ns_processes().

However now we can not switch to init_pid_ns.child_reaper.
__unhash_process() relies on the "->child_reaper == parent" check, but
this check does not work if the last exiting task is also the child
reaper.

As Eric suggested, we can change __unhash_process() to use the
parent's pid_ns and remove this code.

Also, with this change we can move detach_pid(PIDTYPE_PID) back, where
it was before the previous fix.

Signed-off-by: Oleg Nesterov
Acked-by: "Eric W. Biederman"
Cc: Louis Rilling
Cc: Mike Galbraith
Acked-by: Pavel Emelyanov
Tested-by: Andrew Wagin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/exit.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index a85efd2348bd..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,6 +64,7 @@ static void exit_mm(struct task_struct * tsk);
 static void __unhash_process(struct task_struct *p, bool group_dead)
 {
 	nr_threads--;
+	detach_pid(p, PIDTYPE_PID);
 	if (group_dead) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
@@ -78,13 +79,12 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		if (IS_ENABLED(CONFIG_PID_NS)) {
 			struct task_struct *parent = p->real_parent;

-			if ((task_active_pid_ns(p)->child_reaper == parent) &&
+			if ((task_active_pid_ns(parent)->child_reaper == parent) &&
 			    list_empty(&parent->children) &&
 			    (parent->flags & PF_EXITING))
 				wake_up_process(parent);
 		}
 	}
-	detach_pid(p, PIDTYPE_PID);
 	list_del_rcu(&p->thread_group);
 }

@@ -732,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)

 		zap_pid_ns_processes(pid_ns);
 		write_lock_irq(&tasklist_lock);
-		/*
-		 * We can not clear ->child_reaper or leave it alone.
-		 * There may by stealth EXIT_DEAD tasks on ->children,
-		 * forget_original_parent() must move them somewhere.
-		 */
-		pid_ns->child_reaper = init_pid_ns.child_reaper;
 	} else if (father->signal->has_child_subreaper) {
 		struct task_struct *reaper;
-- cgit v1.2.3-71-gd317

From 5702c5eeab959e86ee2d9b4fe7f2d87e65b25d46 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Wed, 20 Jun 2012 12:53:04 -0700
Subject: c/r: prctl: Move PR_GET_TID_ADDRESS to a proper place

During the merge of the PR_GET_TID_ADDRESS patch the code was
misplaced (it happened to appear under PR_MCE_KILL); as a result no
one can use this option.  Fix it by moving the code snippet to a
proper place.

Signed-off-by: Cyrill Gorcunov
Acked-by: Kees Cook
Cc: Oleg Nesterov
Cc: Pavel Emelyanov
Cc: Andrey Vagin
Cc: Serge Hallyn
Cc: KAMEZAWA Hiroyuki
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sys.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index f0ec44dcd415..e0c8ffc50d7f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2127,9 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				return -EINVAL;
 			break;
-		case PR_GET_TID_ADDRESS:
-			error = prctl_get_tid_address(me, (int __user **)arg2);
-			break;
 		default:
 			return -EINVAL;
 		}
@@ -2147,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_MM:
 			error = prctl_set_mm(arg2, arg3, arg4, arg5);
 			break;
+		case PR_GET_TID_ADDRESS:
+			error = prctl_get_tid_address(me, (int __user **)arg2);
+			break;
 		case PR_SET_CHILD_SUBREAPER:
 			me->signal->is_child_subreaper = !!arg2;
 			error = 0;
-- cgit v1.2.3-71-gd317