From 517a92c4e19fcea815332d3155e9fb7723251274 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 09:02:13 +0100
Subject: panic: print more informative messages on stackprotect failure

pointed out by pageexec@freemail.hu:

we just simply panic() when there's a stackprotector attack - giving
the attacked person no information about what kernel code the attack went
against.

print out the attacked function.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..f236001cc4db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -327,7 +327,8 @@ EXPORT_SYMBOL(warn_on_slowpath);
  */
 void __stack_chk_fail(void)
 {
-	panic("stack-protector: Kernel stack is corrupted");
+	panic("stack-protector: Kernel stack is corrupted in: %p\n",
+		__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
 #endif
-- 
cgit v1.2.3-71-gd317


From 5cb273013e182a35e7db614d3e20a144cba71e53 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 09:07:01 +0100
Subject: panic: print out stacktrace if DEBUG_BUGVERBOSE

if CONFIG_DEBUG_BUGVERBOSE is set then the user most definitely wanted
to see as much information about kernel crashes as possible - so give
them at least a stack dump.

this is particularly useful for stackprotector related panics, where
the stacktrace can give us the exact location of the (attempted)
attack.

Pointed out by pageexec@freemail.hu in the stackprotector breakage
threads.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index f236001cc4db..17aad578a2f2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -80,6 +80,9 @@ NORET_TYPE void panic(const char * fmt, ...)
 	vsnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+	dump_stack();
+#endif
 	bust_spinlocks(0);
 
 	/*
-- 
cgit v1.2.3-71-gd317


From 54371a43a66f4477889769b4fa00df936855dc8f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 15 Feb 2008 15:33:12 -0800
Subject: x86: add CONFIG_CC_STACKPROTECTOR self-test

This patch adds a simple self-test capability to the stackprotector
feature. The test deliberately overflows a stack buffer and then
checks if the canary trap function gets called.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 17aad578a2f2..50cf9257b234 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -324,14 +324,82 @@ EXPORT_SYMBOL(warn_on_slowpath);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
+
+static unsigned long __stack_check_testing;
+/*
+ * Self test function for the stack-protector feature.
+ * This test requires that the local variable absolutely has
+ * a stack slot, hence the barrier()s.
+ */
+static noinline void __stack_chk_test_func(void)
+{
+	unsigned long foo;
+	barrier();
+	/*
+	 * we need to make sure we're not about to clobber the return address,
+	 * while real exploits do this, it's unhealthy on a running system.
+	 * Besides, if we would, the test is already failed anyway so
+	 * time to pull the emergency brake on it.
+	 */
+	if ((unsigned long)__builtin_return_address(0) == 
+					*(((unsigned long *)&foo)+1)) {
+		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
+		return;
+	}
+#ifdef CONFIG_FRAME_POINTER
+	/* We also don't want to clobber the frame pointer */
+	if ((unsigned long)__builtin_return_address(0) == 
+					*(((unsigned long *)&foo)+2)) {
+		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
+		return;
+	}
+#endif
+	barrier();
+	if (current->stack_canary == *(((unsigned long *)&foo)+1))
+		*(((unsigned long *)&foo)+1) = 0;
+	else
+		printk(KERN_ERR "No -fstack-protector canary found\n");
+	barrier();
+}
+
+static int __stack_chk_test(void)
+{
+	printk(KERN_INFO "Testing -fstack-protector-all feature\n");
+	__stack_check_testing = (unsigned long)&__stack_chk_test_func;
+	__stack_chk_test_func();
+	if (__stack_check_testing) {
+		printk(KERN_ERR "-fstack-protector-all test failed\n");
+		WARN_ON(1);
+	}
+	return 0;
+}
 /*
  * Called when gcc's -fstack-protector feature is used, and
  * gcc detects corruption of the on-stack canary value
  */
 void __stack_chk_fail(void)
 {
+	if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) {
+		long delta;
+
+		delta = (unsigned long)__builtin_return_address(0) -
+				__stack_check_testing;
+		/*
+		 * The test needs to happen inside the test function, so
+		 * check if the return address is close to that function.
+		 * The function is only 2 dozen bytes long, but keep a wide
+		 * safety margin to avoid panic()s for normal users regardless
+		 * of the quality of the compiler.
+		 */
+		if (delta >= 0 && delta <= 400) {
+			__stack_check_testing = 0;
+			return;
+		}
+	}
 	panic("stack-protector: Kernel stack is corrupted in: %p\n",
 		__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
+
+late_initcall(__stack_chk_test);
 #endif
-- 
cgit v1.2.3-71-gd317


From b719ac56c0032bc1602914c6ea70b0f1581b08c7 Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Mon, 14 Apr 2008 10:03:50 -0700
Subject: panic.c: fix whitespace additions

trivial: remove white space addition in stack protector

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 50cf9257b234..866be9b72e4f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -341,14 +341,14 @@ static noinline void __stack_chk_test_func(void)
 	 * Besides, if we would, the test is already failed anyway so
 	 * time to pull the emergency brake on it.
 	 */
-	if ((unsigned long)__builtin_return_address(0) == 
+	if ((unsigned long)__builtin_return_address(0) ==
 					*(((unsigned long *)&foo)+1)) {
 		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
 		return;
 	}
 #ifdef CONFIG_FRAME_POINTER
 	/* We also don't want to clobber the frame pointer */
-	if ((unsigned long)__builtin_return_address(0) == 
+	if ((unsigned long)__builtin_return_address(0) ==
 					*(((unsigned long *)&foo)+2)) {
 		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
 		return;
-- 
cgit v1.2.3-71-gd317


From b40a4392a3c262e0d1b5379b4e142a8eefa63439 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 18 Apr 2008 06:16:45 -0700
Subject: stackprotector: turn not having the right gcc into a #warning

If the user selects the stack-protector config option, but does not have
a gcc that has the right bits enabled (for example because it isn't build
with a glibc that supports TLS, as is common for cross-compilers, but also
because it may be too old), then the runtime test fails right now.

This patch adds a warning message for this scenario. This warning accomplishes
two goals
1) the user is informed that the security option he selective isn't available
2) the user is suggested to turn of the CONFIG option that won't work for him,
   and would make the runtime test fail anyway.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Makefile | 2 +-
 kernel/panic.c    | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3cff3c894cf3..c3e0eeeb1dd2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -73,7 +73,7 @@ else
 
         stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
         stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
-                "$(CC)" -fstack-protector )
+                "$(CC)" "-fstack-protector -DGCC_HAS_SP" )
         stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
                 "$(CC)" -fstack-protector-all )
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 866be9b72e4f..6729e3f4ebcb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -325,6 +325,9 @@ EXPORT_SYMBOL(warn_on_slowpath);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 
+#ifndef GCC_HAS_SP
+#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
+#endif
 static unsigned long __stack_check_testing;
 /*
  * Self test function for the stack-protector feature.
-- 
cgit v1.2.3-71-gd317


From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 22 Apr 2008 16:38:23 -0500
Subject: stackprotector: use canary at end of stack to indicate overruns at
 oops time

(Updated with a common max-stack-used checker that knows about
the canary, as suggested by Joe Perches)

Use a canary at the end of the stack to clearly indicate
at oops time whether the stack has ever overflowed.

This is a very simple implementation with a couple of
drawbacks:

1) a thread may legitimately use exactly up to the last
   word on the stack

 -- but the chances of doing this and then oopsing later seem slim

2) it's possible that the stack usage isn't dense enough
   that the canary location could get skipped over

 -- but the worst that happens is that we don't flag the overrun
 -- though this happens fairly often in my testing :(

With the code in place, an intentionally-bloated stack oops might
do:

BUG: unable to handle kernel paging request at ffff8103f84cc680
IP: [<ffffffff810253df>] update_curr+0x9a/0xa8
PGD 8063 PUD 0
Thread overran stack or stack corrupted
Oops: 0000 [1] SMP
CPU 0
...

... unless the stack overrun is so bad that it corrupts some other
thread.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/fault.c   |  7 +++++++
 include/linux/magic.h |  1 +
 include/linux/sched.h | 13 +++++++++++++
 kernel/exit.c         |  5 +----
 kernel/fork.c         |  5 +++++
 kernel/sched.c        |  7 +------
 6 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75a..1f524df68b96 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -25,6 +25,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/magic.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	unsigned long address;
 	int write, si_code;
 	int fault;
+	unsigned long *stackend;
+
 #ifdef CONFIG_X86_64
 	unsigned long flags;
 #endif
@@ -850,6 +853,10 @@ no_context:
 
 	show_fault_oops(regs, error_code, address);
 
+ 	stackend = end_of_stack(tsk);
+	if (*stackend != STACK_END_MAGIC)
+		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
 	tsk->thread.cr2 = address;
 	tsk->thread.trap_no = 14;
 	tsk->thread.error_code = error_code;
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 1fa0c2ce4dec..74e68e201166 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -42,4 +42,5 @@
 #define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
 #define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
 
+#define STACK_END_MAGIC		0x57AC6E9D
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d6a515158783..c5181e77f305 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1969,6 +1969,19 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
 
 extern void thread_info_cache_init(void);
 
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static inline unsigned long stack_not_used(struct task_struct *p)
+{
+	unsigned long *n = end_of_stack(p);
+
+	do { 	/* Skip over canary */
+		n++;
+	} while (!*n);
+
+	return (unsigned long)n - (unsigned long)end_of_stack(p);
+}
+#endif
+
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/kernel/exit.c b/kernel/exit.c
index 8f6185e69b69..fb8de6cbf2c7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -899,12 +899,9 @@ static void check_stack_usage(void)
 {
 	static DEFINE_SPINLOCK(low_water_lock);
 	static int lowest_to_date = THREAD_SIZE;
-	unsigned long *n = end_of_stack(current);
 	unsigned long free;
 
-	while (*n == 0)
-		n++;
-	free = (unsigned long)n - (unsigned long)end_of_stack(current);
+	free = stack_not_used(current);
 
 	if (free >= lowest_to_date)
 		return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..d428336e7aa1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -54,6 +54,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/magic.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 {
 	struct task_struct *tsk;
 	struct thread_info *ti;
+	unsigned long *stackend;
+
 	int err;
 
 	prepare_to_copy(orig);
@@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		goto out;
 
 	setup_thread_stack(tsk, orig);
+	stackend = end_of_stack(tsk);
+	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	tsk->stack_canary = get_random_int();
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..a964ed945094 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5748,12 +5748,7 @@ void sched_show_task(struct task_struct *p)
 		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
-	{
-		unsigned long *n = end_of_stack(p);
-		while (!*n)
-			n++;
-		free = (unsigned long)n - (unsigned long)end_of_stack(p);
-	}
+	free = stack_not_used(p);
 #endif
 	printk(KERN_CONT "%5lu %5d %6d\n", free,
 		task_pid_nr(p), task_pid_nr(p->real_parent));
-- 
cgit v1.2.3-71-gd317


From aa92db14270b79f0f91a9060b547a46f9e2639da Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 11 Jul 2008 05:09:55 -0700
Subject: stackprotector: better self-test

check stackprotector functionality by manipulating the canary briefly
during bootup.

far more robust than trying to overflow the stack. (which is architecture
dependent, etc.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 6729e3f4ebcb..28153aec7100 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -347,22 +347,18 @@ static noinline void __stack_chk_test_func(void)
 	if ((unsigned long)__builtin_return_address(0) ==
 					*(((unsigned long *)&foo)+1)) {
 		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-		return;
 	}
 #ifdef CONFIG_FRAME_POINTER
 	/* We also don't want to clobber the frame pointer */
 	if ((unsigned long)__builtin_return_address(0) ==
 					*(((unsigned long *)&foo)+2)) {
 		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-		return;
 	}
 #endif
-	barrier();
-	if (current->stack_canary == *(((unsigned long *)&foo)+1))
-		*(((unsigned long *)&foo)+1) = 0;
-	else
+	if (current->stack_canary != *(((unsigned long *)&foo)+1))
 		printk(KERN_ERR "No -fstack-protector canary found\n");
-	barrier();
+
+	current->stack_canary = ~current->stack_canary;
 }
 
 static int __stack_chk_test(void)
@@ -373,7 +369,8 @@ static int __stack_chk_test(void)
 	if (__stack_check_testing) {
 		printk(KERN_ERR "-fstack-protector-all test failed\n");
 		WARN_ON(1);
-	}
+	};
+	current->stack_canary = ~current->stack_canary;
 	return 0;
 }
 /*
-- 
cgit v1.2.3-71-gd317


From af9ff7868f0f76d3364351b1641b9dfa99588e77 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Jul 2008 09:36:38 -0700
Subject: x86: simplify stackprotector self-check

Clean up the code by removing no longer needed code;
make sure the pda is updated and kept in sync

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/pda.h |  1 +
 kernel/panic.c        | 29 +++++++----------------------
 2 files changed, 8 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index 62b734986a44..a5ff5bb76299 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -131,4 +131,5 @@ do {									\
 
 #define PDA_STACKOFFSET (5*8)
 
+#define refresh_stack_canary() write_pda(stack_canary, current->stack_canary)
 #endif
diff --git a/kernel/panic.c b/kernel/panic.c
index 28153aec7100..87445a894c3a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -328,37 +328,21 @@ EXPORT_SYMBOL(warn_on_slowpath);
 #ifndef GCC_HAS_SP
 #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
 #endif
+
 static unsigned long __stack_check_testing;
+
 /*
  * Self test function for the stack-protector feature.
  * This test requires that the local variable absolutely has
- * a stack slot, hence the barrier()s.
+ * a stack slot.
  */
 static noinline void __stack_chk_test_func(void)
 {
-	unsigned long foo;
-	barrier();
-	/*
-	 * we need to make sure we're not about to clobber the return address,
-	 * while real exploits do this, it's unhealthy on a running system.
-	 * Besides, if we would, the test is already failed anyway so
-	 * time to pull the emergency brake on it.
-	 */
-	if ((unsigned long)__builtin_return_address(0) ==
-					*(((unsigned long *)&foo)+1)) {
-		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-	}
-#ifdef CONFIG_FRAME_POINTER
-	/* We also don't want to clobber the frame pointer */
-	if ((unsigned long)__builtin_return_address(0) ==
-					*(((unsigned long *)&foo)+2)) {
-		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-	}
-#endif
-	if (current->stack_canary != *(((unsigned long *)&foo)+1))
-		printk(KERN_ERR "No -fstack-protector canary found\n");
+	unsigned long dummy_buffer[64]; /* force gcc to use the canary */
 
 	current->stack_canary = ~current->stack_canary;
+	refresh_stack_canary();
+	dummy_buffer[3] = 1; /* fool gcc into keeping the variable */
 }
 
 static int __stack_chk_test(void)
@@ -371,6 +355,7 @@ static int __stack_chk_test(void)
 		WARN_ON(1);
 	};
 	current->stack_canary = ~current->stack_canary;
+	refresh_stack_canary();
 	return 0;
 }
 /*
-- 
cgit v1.2.3-71-gd317


From 4f962d4d65923d7b722192e729840cfb79af0a5a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 13 Jul 2008 21:42:44 +0200
Subject: stackprotector: remove self-test

turns out gcc generates such stackprotector-failure sequences
in certain circumstances:

        movq    -8(%rbp), %rax  # D.16032,
        xorq    %gs:40, %rax    #,
        jne     .L17    #,
        leave
        ret
.L17:
        call    __stack_chk_fail        #
        .size   __stack_chk_test_func, .-__stack_chk_test_func
        .section        .init.text,"ax",@progbits
        .type   panic_setup, @function
panic_setup:
        pushq   %rbp    #

note that there's no jump back to the failing context after the
call to __stack_chk_fail - i.e. it has a ((noreturn)) attribute.

Which is fair enough in the normal case but kills the self-test.
(as we cannot reliably return in the self-test)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 47 -----------------------------------------------
 1 file changed, 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 87445a894c3a..c35c9eca3eb2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -329,62 +329,15 @@ EXPORT_SYMBOL(warn_on_slowpath);
 #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
 #endif
 
-static unsigned long __stack_check_testing;
-
-/*
- * Self test function for the stack-protector feature.
- * This test requires that the local variable absolutely has
- * a stack slot.
- */
-static noinline void __stack_chk_test_func(void)
-{
-	unsigned long dummy_buffer[64]; /* force gcc to use the canary */
-
-	current->stack_canary = ~current->stack_canary;
-	refresh_stack_canary();
-	dummy_buffer[3] = 1; /* fool gcc into keeping the variable */
-}
-
-static int __stack_chk_test(void)
-{
-	printk(KERN_INFO "Testing -fstack-protector-all feature\n");
-	__stack_check_testing = (unsigned long)&__stack_chk_test_func;
-	__stack_chk_test_func();
-	if (__stack_check_testing) {
-		printk(KERN_ERR "-fstack-protector-all test failed\n");
-		WARN_ON(1);
-	};
-	current->stack_canary = ~current->stack_canary;
-	refresh_stack_canary();
-	return 0;
-}
 /*
  * Called when gcc's -fstack-protector feature is used, and
  * gcc detects corruption of the on-stack canary value
  */
 void __stack_chk_fail(void)
 {
-	if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) {
-		long delta;
-
-		delta = (unsigned long)__builtin_return_address(0) -
-				__stack_check_testing;
-		/*
-		 * The test needs to happen inside the test function, so
-		 * check if the return address is close to that function.
-		 * The function is only 2 dozen bytes long, but keep a wide
-		 * safety margin to avoid panic()s for normal users regardless
-		 * of the quality of the compiler.
-		 */
-		if (delta >= 0 && delta <= 400) {
-			__stack_check_testing = 0;
-			return;
-		}
-	}
 	panic("stack-protector: Kernel stack is corrupted in: %p\n",
 		__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
 
-late_initcall(__stack_chk_test);
 #endif
-- 
cgit v1.2.3-71-gd317


From 490dea45d00f01847ebebd007685d564aaf2cd98 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 24 Nov 2008 17:06:57 +0100
Subject: itimers: remove the per-cpu-ish-ness

Either we bounce once cacheline per cpu per tick, yielding n^2 bounces
or we just bounce a single..

Also, using per-cpu allocations for the thread-groups complicates the
per-cpu allocator in that its currently aimed to be a fixed sized
allocator and the only possible extention to that would be vmap based,
which is seriously constrained on 32 bit archs.

So making the per-cpu memory requirement depend on the number of
processes is an issue.

Lastly, it didn't deal with cpu-hotplug, although admittedly that might
be fixable.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init_task.h |  6 ++++
 include/linux/sched.h     | 29 ++++++++++++--------
 kernel/fork.c             | 15 +++++-----
 kernel/posix-cpu-timers.c | 70 -----------------------------------------------
 kernel/sched_stats.h      | 33 ++++++++++------------
 5 files changed, 46 insertions(+), 107 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2f3c2d4ef73b..ea0ea1a4c36f 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -48,6 +48,12 @@ extern struct fs_struct init_fs;
 	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
+	.cputime	= { .totals = {					\
+		.utime = cputime_zero,					\
+		.stime = cputime_zero,					\
+		.sum_exec_runtime = 0,					\
+		.lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock),	\
+	}, },								\
 }
 
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f8..c20943eabb4c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -450,6 +450,7 @@ struct task_cputime {
 	cputime_t utime;
 	cputime_t stime;
 	unsigned long long sum_exec_runtime;
+	spinlock_t lock;
 };
 /* Alternate field names when used to cache expirations. */
 #define prof_exp	stime
@@ -465,7 +466,7 @@ struct task_cputime {
  * used for thread group CPU clock calculations.
  */
 struct thread_group_cputime {
-	struct task_cputime *totals;
+	struct task_cputime totals;
 };
 
 /*
@@ -2180,24 +2181,30 @@ static inline int spin_needbreak(spinlock_t *lock)
  * Thread group CPU time accounting.
  */
 
-extern int thread_group_cputime_alloc(struct task_struct *);
-extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
-
-static inline void thread_group_cputime_init(struct signal_struct *sig)
+static inline
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
-	sig->cputime.totals = NULL;
+	struct task_cputime *totals = &tsk->signal->cputime.totals;
+	unsigned long flags;
+
+	spin_lock_irqsave(&totals->lock, flags);
+	*times = *totals;
+	spin_unlock_irqrestore(&totals->lock, flags);
 }
 
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
+static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
-	if (curr->signal->cputime.totals)
-		return 0;
-	return thread_group_cputime_alloc(curr);
+	sig->cputime.totals = (struct task_cputime){
+		.utime = cputime_zero,
+		.stime = cputime_zero,
+		.sum_exec_runtime = 0,
+	};
+
+	spin_lock_init(&sig->cputime.totals.lock);
 }
 
 static inline void thread_group_cputime_free(struct signal_struct *sig)
 {
-	free_percpu(sig->cputime.totals);
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 7b8f2a78be3d..7087d8c0e5e2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -820,14 +820,15 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	int ret;
 
 	if (clone_flags & CLONE_THREAD) {
-		ret = thread_group_cputime_clone_thread(current);
-		if (likely(!ret)) {
-			atomic_inc(&current->signal->count);
-			atomic_inc(&current->signal->live);
-		}
-		return ret;
+		atomic_inc(&current->signal->count);
+		atomic_inc(&current->signal->live);
+		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+
+	if (sig)
+		posix_cpu_timers_init_group(sig);
+
 	tsk->signal = sig;
 	if (!sig)
 		return -ENOMEM;
@@ -864,8 +865,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
 	task_unlock(current->group_leader);
 
-	posix_cpu_timers_init_group(sig);
-
 	acct_init_pacct(&sig->pacct);
 
 	tty_audit_fork(sig);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 157de3a47832..fa07da94d7be 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,76 +9,6 @@
 #include <asm/uaccess.h>
 #include <linux/kernel_stat.h>
 
-/*
- * Allocate the thread_group_cputime structure appropriately and fill in the
- * current values of the fields.  Called from copy_signal() via
- * thread_group_cputime_clone_thread() when adding a second or subsequent
- * thread to a thread group.  Assumes interrupts are enabled when called.
- */
-int thread_group_cputime_alloc(struct task_struct *tsk)
-{
-	struct signal_struct *sig = tsk->signal;
-	struct task_cputime *cputime;
-
-	/*
-	 * If we have multiple threads and we don't already have a
-	 * per-CPU task_cputime struct (checked in the caller), allocate
-	 * one and fill it in with the times accumulated so far.  We may
-	 * race with another thread so recheck after we pick up the sighand
-	 * lock.
-	 */
-	cputime = alloc_percpu(struct task_cputime);
-	if (cputime == NULL)
-		return -ENOMEM;
-	spin_lock_irq(&tsk->sighand->siglock);
-	if (sig->cputime.totals) {
-		spin_unlock_irq(&tsk->sighand->siglock);
-		free_percpu(cputime);
-		return 0;
-	}
-	sig->cputime.totals = cputime;
-	cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
-	cputime->utime = tsk->utime;
-	cputime->stime = tsk->stime;
-	cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
-	spin_unlock_irq(&tsk->sighand->siglock);
-	return 0;
-}
-
-/**
- * thread_group_cputime - Sum the thread group time fields across all CPUs.
- *
- * @tsk:	The task we use to identify the thread group.
- * @times:	task_cputime structure in which we return the summed fields.
- *
- * Walk the list of CPUs to sum the per-CPU time fields in the thread group
- * time structure.
- */
-void thread_group_cputime(
-	struct task_struct *tsk,
-	struct task_cputime *times)
-{
-	struct task_cputime *totals, *tot;
-	int i;
-
-	totals = tsk->signal->cputime.totals;
-	if (!totals) {
-		times->utime = tsk->utime;
-		times->stime = tsk->stime;
-		times->sum_exec_runtime = tsk->se.sum_exec_runtime;
-		return;
-	}
-
-	times->stime = times->utime = cputime_zero;
-	times->sum_exec_runtime = 0;
-	for_each_possible_cpu(i) {
-		tot = per_cpu_ptr(totals, i);
-		times->utime = cputime_add(times->utime, tot->utime);
-		times->stime = cputime_add(times->stime, tot->stime);
-		times->sum_exec_runtime += tot->sum_exec_runtime;
-	}
-}
-
 /*
  * Called after updating RLIMIT_CPU to set timer expiration if necessary.
  */
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index f2773b5d1226..8ab0cef8ecab 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -296,6 +296,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 static inline void account_group_user_time(struct task_struct *tsk,
 					   cputime_t cputime)
 {
+	struct task_cputime *times;
 	struct signal_struct *sig;
 
 	/* tsk == current, ensure it is safe to use ->signal */
@@ -303,13 +304,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
 		return;
 
 	sig = tsk->signal;
-	if (sig->cputime.totals) {
-		struct task_cputime *times;
+	times = &sig->cputime.totals;
 
-		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
-		times->utime = cputime_add(times->utime, cputime);
-		put_cpu_no_resched();
-	}
+	spin_lock(&times->lock);
+	times->utime = cputime_add(times->utime, cputime);
+	spin_unlock(&times->lock);
 }
 
 /**
@@ -325,6 +324,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 static inline void account_group_system_time(struct task_struct *tsk,
 					     cputime_t cputime)
 {
+	struct task_cputime *times;
 	struct signal_struct *sig;
 
 	/* tsk == current, ensure it is safe to use ->signal */
@@ -332,13 +332,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
 		return;
 
 	sig = tsk->signal;
-	if (sig->cputime.totals) {
-		struct task_cputime *times;
+	times = &sig->cputime.totals;
 
-		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
-		times->stime = cputime_add(times->stime, cputime);
-		put_cpu_no_resched();
-	}
+	spin_lock(&times->lock);
+	times->stime = cputime_add(times->stime, cputime);
+	spin_unlock(&times->lock);
 }
 
 /**
@@ -354,6 +352,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 static inline void account_group_exec_runtime(struct task_struct *tsk,
 					      unsigned long long ns)
 {
+	struct task_cputime *times;
 	struct signal_struct *sig;
 
 	sig = tsk->signal;
@@ -362,11 +361,9 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (unlikely(!sig))
 		return;
 
-	if (sig->cputime.totals) {
-		struct task_cputime *times;
+	times = &sig->cputime.totals;
 
-		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
-		times->sum_exec_runtime += ns;
-		put_cpu_no_resched();
-	}
+	spin_lock(&times->lock);
+	times->sum_exec_runtime += ns;
+	spin_unlock(&times->lock);
 }
-- 
cgit v1.2.3-71-gd317


From 783adf42cf039083dd3c734c07c3bdc707e2bb15 Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Sun, 11 Jan 2009 01:04:21 -0800
Subject: kernel/fork.c: unused variable 'ret'

Removed the unused variable.

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index e995899ea83f..81da4aae85cb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,7 +817,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 {
 	struct signal_struct *sig;
-	int ret;
 
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
-- 
cgit v1.2.3-71-gd317


From 7f7ace0cda64c99599c23785f8979a072e118058 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 21:58:08 -0800
Subject: cpumask: update irq_desc to use cpumask_var_t

Impact: reduce memory usage, use new cpumask API.

Replace the affinity and pending_masks with cpumask_var_t's.  This adds
to the significant size reduction done with the SPARSE_IRQS changes.

The added functions (init_alloc_desc_masks & init_copy_desc_masks) are
in the include file so they can be inlined (and optimized out for the
!CONFIG_CPUMASKS_OFFSTACK case.)  [Naming chosen to be consistent with
the other init*irq functions, as well as the backwards arg declaration
of "from, to" instead of the more common "to, from" standard.]

Includes a slight change to the declaration of struct irq_desc to embed
the pending_mask within ifdef(CONFIG_SMP) to be consistent with other
references, and some small changes to Xen.

Tested: sparse/non-sparse/cpumask_offstack/non-cpumask_offstack/nonuma/nosmp on x86_64

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: virtualization@lists.osdl.org
Cc: xen-devel@lists.xensource.com
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
---
 arch/x86/kernel/io_apic.c | 20 ++++++------
 arch/x86/kernel/irq_32.c  |  2 +-
 arch/x86/kernel/irq_64.c  |  2 +-
 drivers/xen/events.c      |  4 +--
 include/linux/irq.h       | 81 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/irq/chip.c         |  5 ++-
 kernel/irq/handle.c       | 26 ++++++++-------
 kernel/irq/manage.c       | 12 +++----
 kernel/irq/migration.c    | 12 +++----
 kernel/irq/numa_migrate.c | 12 ++++++-
 kernel/irq/proc.c         |  4 +--
 11 files changed, 135 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1c4a1302536c..1337eab60ecc 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -356,7 +356,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
 
 	if (!cfg->move_in_progress) {
 		/* it means that domain is not changed */
-		if (!cpumask_intersects(&desc->affinity, mask))
+		if (!cpumask_intersects(desc->affinity, mask))
 			cfg->move_desc_pending = 1;
 	}
 }
@@ -579,9 +579,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return BAD_APICID;
 
-	cpumask_and(&desc->affinity, cfg->domain, mask);
+	cpumask_and(desc->affinity, cfg->domain, mask);
 	set_extra_move_desc(desc, mask);
-	return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+	return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
 }
 
 static void
@@ -2383,7 +2383,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 	if (cfg->move_in_progress)
 		send_cleanup_vector(cfg);
 
-	cpumask_copy(&desc->affinity, mask);
+	cpumask_copy(desc->affinity, mask);
 }
 
 static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2405,11 +2405,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq_desc(desc, &desc->pending_mask);
+	migrate_ioapic_irq_desc(desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
-	cpumask_clear(&desc->pending_mask);
+	cpumask_clear(desc->pending_mask);
 
 unmask:
 	unmask_IO_APIC_irq_desc(desc);
@@ -2434,7 +2434,7 @@ static void ir_irq_migration(struct work_struct *work)
 				continue;
 			}
 
-			desc->chip->set_affinity(irq, &desc->pending_mask);
+			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
 	}
@@ -2448,7 +2448,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 {
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(&desc->pending_mask, mask);
+		cpumask_copy(desc->pending_mask, mask);
 		migrate_irq_remapped_level_desc(desc);
 		return;
 	}
@@ -2516,7 +2516,7 @@ static void irq_complete_move(struct irq_desc **descp)
 
 		/* domain has not changed, but affinity did */
 		me = smp_processor_id();
-		if (cpu_isset(me, desc->affinity)) {
+		if (cpumask_test_cpu(me, desc->affinity)) {
 			*descp = desc = move_irq_desc(desc, me);
 			/* get the new one */
 			cfg = desc->chip_data;
@@ -4039,7 +4039,7 @@ void __init setup_ioapic_dest(void)
 			 */
 			if (desc->status &
 			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-				mask = &desc->affinity;
+				mask = desc->affinity;
 			else
 				mask = TARGET_CPUS;
 
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..e0f29be8ab0b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -248,7 +248,7 @@ void fixup_irqs(void)
 		if (irq == 2)
 			continue;
 
-		affinity = &desc->affinity;
+		affinity = desc->affinity;
 		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 			printk("Breaking affinity for irq %i\n", irq);
 			affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..0b21cb1ea11f 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -100,7 +100,7 @@ void fixup_irqs(void)
 		/* interrupt's are disabled at this point */
 		spin_lock(&desc->lock);
 
-		affinity = &desc->affinity;
+		affinity = desc->affinity;
 		if (!irq_has_action(irq) ||
 		    cpumask_equal(affinity, cpu_online_mask)) {
 			spin_unlock(&desc->lock);
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index eb0dfdeaa949..e0767ff35d6c 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -125,7 +125,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 
 	BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-	irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu);
+	cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
 #endif
 
 	__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
@@ -142,7 +142,7 @@ static void init_evtchn_cpu_bindings(void)
 
 	/* By default all event channels notify CPU#0. */
 	for_each_irq_desc(i, desc) {
-		desc->affinity = cpumask_of_cpu(0);
+		cpumask_copy(desc->affinity, cpumask_of(0));
 	}
 #endif
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f899b502f186..fa27210f1dfd 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -182,11 +182,11 @@ struct irq_desc {
 	unsigned int		irqs_unhandled;
 	spinlock_t		lock;
 #ifdef CONFIG_SMP
-	cpumask_t		affinity;
+	cpumask_var_t		affinity;
 	unsigned int		cpu;
-#endif
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	cpumask_t		pending_mask;
+	cpumask_var_t		pending_mask;
+#endif
 #endif
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry	*dir;
@@ -422,4 +422,79 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
 #endif /* !CONFIG_S390 */
 
+#ifdef CONFIG_SMP
+/**
+ * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * @desc:	pointer to irq_desc struct
+ * @boot:	true if need bootmem
+ *
+ * Allocates affinity and pending_mask cpumask if required.
+ * Returns true if successful (or not required).
+ * Side effect: affinity has all bits set, pending_mask has all bits clear.
+ */
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+								bool boot)
+{
+	if (boot) {
+		alloc_bootmem_cpumask_var(&desc->affinity);
+		cpumask_setall(desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+		alloc_bootmem_cpumask_var(&desc->pending_mask);
+		cpumask_clear(desc->pending_mask);
+#endif
+		return true;
+	}
+
+	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+		return false;
+	cpumask_setall(desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+		free_cpumask_var(desc->affinity);
+		return false;
+	}
+	cpumask_clear(desc->pending_mask);
+#endif
+	return true;
+}
+
+/**
+ * init_copy_desc_masks - copy cpumasks for irq_desc
+ * @old_desc:	pointer to old irq_desc struct
+ * @new_desc:	pointer to new irq_desc struct
+ *
+ * Insures affinity and pending_masks are copied to new irq_desc.
+ * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
+ * irq_desc struct so the copy is redundant.
+ */
+
+static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+					struct irq_desc *new_desc)
+{
+#ifdef CONFIG_CPUMASKS_OFFSTACK
+	cpumask_copy(new_desc->affinity, old_desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
+#endif
+#endif
+}
+
+#else /* !CONFIG_SMP */
+
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+								bool boot)
+{
+	return true;
+}
+
+static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+					struct irq_desc *new_desc)
+{
+}
+
+#endif	/* CONFIG_SMP */
+
 #endif /* _LINUX_IRQ_H */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f63c706d25e1..c248eba98b43 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
 	desc->irq_count = 0;
 	desc->irqs_unhandled = 0;
 #ifdef CONFIG_SMP
-	cpumask_setall(&desc->affinity);
+	cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	cpumask_clear(desc->pending_mask);
+#endif
 #endif
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be9173..b8fa1354f01c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -64,9 +64,6 @@ static struct irq_desc irq_desc_init = {
 	.handle_irq = handle_bad_irq,
 	.depth      = 1,
 	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-	.affinity   = CPU_MASK_ALL
-#endif
 };
 
 void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
@@ -88,6 +85,8 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 {
+	int node = cpu_to_node(cpu);
+
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
 	spin_lock_init(&desc->lock);
@@ -101,6 +100,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+		BUG_ON(1);
+	}
 	arch_init_chip_data(desc, cpu);
 }
 
@@ -119,9 +122,6 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
 		.handle_irq = handle_bad_irq,
 		.depth	    = 1,
 		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-		.affinity   = CPU_MASK_ALL
-#endif
 	}
 };
 
@@ -141,7 +141,7 @@ int __init early_irq_init(void)
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy[i];
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-
+		init_alloc_desc_masks(&desc[i], 0, true);
 		irq_desc_ptrs[i] = desc + i;
 	}
 
@@ -188,6 +188,10 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 		printk(KERN_ERR "can not alloc irq_desc\n");
 		BUG_ON(1);
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+		BUG_ON(1);
+	}
 	init_one_irq_desc(irq, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
@@ -207,9 +211,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
 		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
-#ifdef CONFIG_SMP
-		.affinity = CPU_MASK_ALL
-#endif
 	}
 };
 
@@ -222,9 +223,10 @@ int __init early_irq_init(void)
 	desc = irq_desc;
 	count = ARRAY_SIZE(irq_desc);
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
 		desc[i].irq = i;
-
+		init_alloc_desc_masks(&desc[i], 0, true);
+	}
 	return arch_early_irq_init();
 }
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..b98739af4558 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -98,14 +98,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		cpumask_copy(&desc->affinity, cpumask);
+		cpumask_copy(desc->affinity, cpumask);
 		desc->chip->set_affinity(irq, cpumask);
 	} else {
 		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(&desc->pending_mask, cpumask);
+		cpumask_copy(desc->pending_mask, cpumask);
 	}
 #else
-	cpumask_copy(&desc->affinity, cpumask);
+	cpumask_copy(desc->affinity, cpumask);
 	desc->chip->set_affinity(irq, cpumask);
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
@@ -127,16 +127,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 	 * one of the targets is online.
 	 */
 	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-		if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+		if (cpumask_any_and(desc->affinity, cpu_online_mask)
 		    < nr_cpu_ids)
 			goto set_affinity;
 		else
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
 
-	cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
+	cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
 set_affinity:
-	desc->chip->set_affinity(irq, &desc->affinity);
+	desc->chip->set_affinity(irq, desc->affinity);
 
 	return 0;
 }
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630c..e05ad9be43b7 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
 
 	desc->status &= ~IRQ_MOVE_PENDING;
 
-	if (unlikely(cpumask_empty(&desc->pending_mask)))
+	if (unlikely(cpumask_empty(desc->pending_mask)))
 		return;
 
 	if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
 	 * For correct operation this depends on the caller
 	 * masking the irqs.
 	 */
-	if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
 		   < nr_cpu_ids)) {
-		cpumask_and(&desc->affinity,
-			    &desc->pending_mask, cpu_online_mask);
-		desc->chip->set_affinity(irq, &desc->affinity);
+		cpumask_and(desc->affinity,
+			    desc->pending_mask, cpu_online_mask);
+		desc->chip->set_affinity(irq, desc->affinity);
 	}
-	cpumask_clear(&desc->pending_mask);
+	cpumask_clear(desc->pending_mask);
 }
 
 void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ecf765c6a77a..f001a4ea6414 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -46,6 +46,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 	desc->cpu = cpu;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	init_copy_desc_masks(old_desc, desc);
 	arch_init_copy_chip_data(old_desc, desc, cpu);
 }
 
@@ -76,11 +77,20 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 	node = cpu_to_node(cpu);
 	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
 	if (!desc) {
-		printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
+		printk(KERN_ERR "irq %d: can not get new irq_desc "
+				"for migration.\n", irq);
 		/* still use old one */
 		desc = old_desc;
 		goto out_unlock;
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+				"for migration.\n", irq);
+		/* still use old one */
+		kfree(desc);
+		desc = old_desc;
+		goto out_unlock;
+	}
 	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bcec..692363dd591f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
-	const struct cpumask *mask = &desc->affinity;
+	const struct cpumask *mask = desc->affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PENDING)
-		mask = &desc->pending_mask;
+		mask = desc->pending_mask;
 #endif
 	seq_cpumask(m, mask);
 	seq_putc(m, '\n');
-- 
cgit v1.2.3-71-gd317


From 802bf931f2688ad125b73db597ce63cc842fb27a Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 21:58:09 -0800
Subject: cpumask: fix bug in use cpumask_var_t in irq_desc

Impact: fix bug where new irq_desc uses old cpumask pointers which are freed.

As Yinghai pointed out, init_copy_one_irq_desc() copies the old desc to
the new desc overwriting the cpumask pointers.  Since the old_desc and
the cpumask pointers are freed, then memory corruption will occur if
these old pointers are used.

Move the allocation of these pointers to after the copy.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/irq.h       |  9 +++++++--
 kernel/irq/handle.c       |  8 +-------
 kernel/irq/numa_migrate.c | 13 ++++++++-----
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index fa27210f1dfd..27a67536511e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -426,15 +426,18 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 /**
  * init_alloc_desc_masks - allocate cpumasks for irq_desc
  * @desc:	pointer to irq_desc struct
+ * @cpu:	cpu which will be handling the cpumasks
  * @boot:	true if need bootmem
  *
  * Allocates affinity and pending_mask cpumask if required.
  * Returns true if successful (or not required).
  * Side effect: affinity has all bits set, pending_mask has all bits clear.
  */
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
+	int node;
+
 	if (boot) {
 		alloc_bootmem_cpumask_var(&desc->affinity);
 		cpumask_setall(desc->affinity);
@@ -446,6 +449,8 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
 		return true;
 	}
 
+	node = cpu_to_node(cpu);
+
 	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
 		return false;
 	cpumask_setall(desc->affinity);
@@ -484,7 +489,7 @@ static inline void init_copy_desc_masks(struct irq_desc *old_desc,
 
 #else /* !CONFIG_SMP */
 
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
 	return true;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b8fa1354f01c..f01c0a30cb42 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -85,8 +85,6 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 {
-	int node = cpu_to_node(cpu);
-
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
 	spin_lock_init(&desc->lock);
@@ -100,7 +98,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
+	if (!init_alloc_desc_masks(desc, cpu, false)) {
 		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
 		BUG_ON(1);
 	}
@@ -188,10 +186,6 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 		printk(KERN_ERR "can not alloc irq_desc\n");
 		BUG_ON(1);
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
-		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
-		BUG_ON(1);
-	}
 	init_one_irq_desc(irq, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index f001a4ea6414..666260e4c065 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -38,16 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
 	old_desc->kstat_irqs = NULL;
 }
 
-static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 		 struct irq_desc *desc, int cpu)
 {
 	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	if (!init_alloc_desc_masks(desc, cpu, false)) {
+		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+				"for migration.\n", irq);
+		return false;
+	}
 	spin_lock_init(&desc->lock);
 	desc->cpu = cpu;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
 	init_copy_desc_masks(old_desc, desc);
 	arch_init_copy_chip_data(old_desc, desc, cpu);
+	return true;
 }
 
 static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -83,15 +89,12 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 		desc = old_desc;
 		goto out_unlock;
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
-		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
-				"for migration.\n", irq);
+	if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
 		/* still use old one */
 		kfree(desc);
 		desc = old_desc;
 		goto out_unlock;
 	}
-	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
 
-- 
cgit v1.2.3-71-gd317


From d38b223c86db3162dc85b5a1997ac8a210e1660b Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 21:58:11 -0800
Subject: cpumask: reduce stack usage in find_lowest_rq

Impact: reduce stack usage, cleanup

Use a cpumask_var_t in find_lowest_rq() and clean up other old
cpumask_t calls.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/sched_rt.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 954e1a81b796..da932f4c8524 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -960,16 +960,17 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
-static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+static inline int pick_optimal_cpu(int this_cpu,
+				   const struct cpumask *mask)
 {
 	int first;
 
 	/* "this_cpu" is cheaper to preempt than a remote processor */
-	if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+	if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
 		return this_cpu;
 
-	first = first_cpu(*mask);
-	if (first != NR_CPUS)
+	first = cpumask_first(mask);
+	if (first < nr_cpu_ids)
 		return first;
 
 	return -1;
@@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task)
 	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
+	cpumask_var_t domain_mask;
 
 	if (task->rt.nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
@@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task)
 	if (this_cpu == cpu)
 		this_cpu = -1; /* Skip this_cpu opt if the same */
 
-	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_AFFINE) {
-			cpumask_t domain_mask;
-			int       best_cpu;
+	if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
+		for_each_domain(cpu, sd) {
+			if (sd->flags & SD_WAKE_AFFINE) {
+				int best_cpu;
 
-			cpumask_and(&domain_mask, sched_domain_span(sd),
-				    lowest_mask);
+				cpumask_and(domain_mask,
+					    sched_domain_span(sd),
+					    lowest_mask);
 
-			best_cpu = pick_optimal_cpu(this_cpu,
-						    &domain_mask);
-			if (best_cpu != -1)
-				return best_cpu;
+				best_cpu = pick_optimal_cpu(this_cpu,
+							    domain_mask);
+
+				if (best_cpu != -1) {
+					free_cpumask_var(domain_mask);
+					return best_cpu;
+				}
+			}
 		}
+		free_cpumask_var(domain_mask);
 	}
 
 	/*
-- 
cgit v1.2.3-71-gd317


From 9594949b060efe86ecaa1a66839232a3b9800bc9 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:06 -0800
Subject: irq: change references from NR_IRQS to nr_irqs

Impact: preparation, cleanup, add KERN_INFO printk

Modify references from NR_IRQS to nr_irqs as the later will become
variable-sized based on nr_cpu_ids when CONFIG_SPARSE_IRQS=y.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/kernel/io_apic.c |  2 +-
 kernel/irq/handle.c       | 14 +++++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1337eab60ecc..ae80638012de 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -3183,7 +3183,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
-	for (new = irq_want; new < NR_IRQS; new++) {
+	for (new = irq_want; new < nr_irqs; new++) {
 		if (platform_legacy_irq(new))
 			continue;
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index f01c0a30cb42..790c5fa7ea39 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,6 +132,8 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
+	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
+
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
@@ -143,7 +145,7 @@ int __init early_irq_init(void)
 		irq_desc_ptrs[i] = desc + i;
 	}
 
-	for (i = legacy_count; i < NR_IRQS; i++)
+	for (i = legacy_count; i < nr_irqs; i++)
 		irq_desc_ptrs[i] = NULL;
 
 	return arch_early_irq_init();
@@ -151,7 +153,7 @@ int __init early_irq_init(void)
 
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
-	return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+	return (irq < nr_irqs) ? irq_desc_ptrs[irq] : NULL;
 }
 
 struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@ -160,9 +162,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 	unsigned long flags;
 	int node;
 
-	if (irq >= NR_IRQS) {
-		printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
-				irq, NR_IRQS);
+	if (irq >= nr_irqs) {
+		printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n",
+				irq, nr_irqs);
 		WARN_ON(1);
 		return NULL;
 	}
@@ -214,6 +216,8 @@ int __init early_irq_init(void)
 	int count;
 	int i;
 
+	printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
 	desc = irq_desc;
 	count = ARRAY_SIZE(irq_desc);
 
-- 
cgit v1.2.3-71-gd317


From e2f4d06545ec1f29b0e838ee34cbf3500ea5b9a4 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:06 -0800
Subject: irq: use WARN() instead of WARN_ON().

Impact: cleanup WARN msg.

Ingo requested:
> While at it, could you please also convert this to a WARN() construct
> instead? (in a separate commit)

... and it shall be done.  ;-)

Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/irq/handle.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 790c5fa7ea39..fd1ef16252f4 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -163,9 +163,8 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 	int node;
 
 	if (irq >= nr_irqs) {
-		printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n",
-				irq, nr_irqs);
-		WARN_ON(1);
+		WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
+			irq, nr_irqs);
 		return NULL;
 	}
 
-- 
cgit v1.2.3-71-gd317


From 0fa0ebbf15addc1be8f73325d809c8547a9de304 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:06 -0800
Subject: irq: allocate irq_desc_ptrs array based on nr_irqs

Impact: allocate irq_desc_ptrs in preparation for making it variable-sized.

This addresses this memory usage bump when NR_CPUS bumped from 128 to 4096:

    34816   +229376    264192  +658%  irq_desc_ptrs(.data.read_mostly)

The patch is split into two parts, the first simply allocates the
irq_desc_ptrs array.  Then next will deal with making it variable.
This is only when CONFIG_SPARSE_IRQS=y.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/irq/handle.c    | 11 +++++++++--
 kernel/irq/internals.h |  7 +++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index fd1ef16252f4..d0b8f7e72790 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
+#include <linux/bootmem.h>
 
 #include "internals.h"
 
@@ -110,7 +111,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
  */
 DEFINE_SPINLOCK(sparse_irq_lock);
 
-struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+struct irq_desc **irq_desc_ptrs __read_mostly;
 
 static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS_LEGACY-1] = {
@@ -137,6 +138,9 @@ int __init early_irq_init(void)
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
+	/* allocate irq_desc_ptrs array based on nr_irqs */
+	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy[i];
@@ -153,7 +157,10 @@ int __init early_irq_init(void)
 
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_desc_ptrs[irq] : NULL;
+	if (irq_desc_ptrs && irq < nr_irqs)
+		return irq_desc_ptrs[irq];
+
+	return NULL;
 }
 
 struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..40416a81a0f5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 extern struct lock_class_key irq_desc_lock_class;
 extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
 extern spinlock_t sparse_irq_lock;
+
+#ifdef CONFIG_SPARSE_IRQ
+/* irq_desc_ptrs allocated at boot time */
+extern struct irq_desc **irq_desc_ptrs;
+#else
+/* irq_desc_ptrs is a fixed size array */
 extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+#endif
 
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
-- 
cgit v1.2.3-71-gd317


From 9332fccdedf8e09448f3b69b624211ae879f6c45 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:07 -0800
Subject: irq: initialize nr_irqs based on nr_cpu_ids

Impact: Reduce memory usage.

This is the second half of the changes to make the irq_desc_ptrs be
variable sized based on nr_cpu_ids.  This is done by adding a new
"max_nr_irqs" macro to irq_vectors.h (and a dummy in irqnr.h) to
return a max NR_IRQS value based on NR_CPUS or nr_cpu_ids.

This necessitated moving the define of MAX_IO_APICS to a separate
file (asm/apicnum.h) so it could be included without the baggage
of the other asm/apicdef.h declarations.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/include/asm/apicdef.h     |  8 ++------
 arch/x86/include/asm/apicnum.h     | 12 ++++++++++++
 arch/x86/include/asm/irq_vectors.h | 16 +++++++++++-----
 include/linux/irqnr.h              |  7 +++++++
 kernel/irq/handle.c                |  3 +++
 5 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 arch/x86/include/asm/apicnum.h

(limited to 'kernel')

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..1a6454ef7f6c 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -132,12 +132,8 @@
 #define APIC_BASE_MSR	0x800
 #define X2APIC_ENABLE	(1UL << 10)
 
-#ifdef CONFIG_X86_32
-# define MAX_IO_APICS 64
-#else
-# define MAX_IO_APICS 128
-# define MAX_LOCAL_APIC 32768
-#endif
+/* get MAX_IO_APICS */
+#include <asm/apicnum.h>
 
 /*
  * All x86-64 systems are xAPIC compatible.
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
new file mode 100644
index 000000000000..82f613c607ce
--- /dev/null
+++ b/arch/x86/include/asm/apicnum.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_APICNUM_H
+#define _ASM_X86_APICNUM_H
+
+/* define MAX_IO_APICS */
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index f7ff65032b9d..602361ad0e74 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -105,6 +105,8 @@
 
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 
+#include <asm/apicnum.h>	/* need MAX_IO_APICS */
+
 #ifndef CONFIG_SPARSE_IRQ
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
@@ -112,11 +114,15 @@
 #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
 # endif
 #else
-# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
-#  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
-# else
-#  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
-# endif
+
+/* defined as a macro so nr_irqs = max_nr_irqs(nr_cpu_ids) can be used */
+# define max_nr_irqs(nr_cpus)				\
+	((8 * nr_cpus) > (32 * MAX_IO_APICS) ?		\
+		(NR_VECTORS + (8 * NR_CPUS)) :		\
+		(NR_VECTORS + (32 * MAX_IO_APICS)))	\
+
+# define NR_IRQS max_nr_irqs(NR_CPUS)
+
 #endif
 
 #elif defined(CONFIG_X86_VOYAGER)
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 86af92e9e84c..de66e4e10406 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -20,11 +20,18 @@
 
 # define for_each_irq_desc_reverse(irq, desc)                          \
 	for (irq = nr_irqs - 1; irq >= 0; irq--)
+
 #else /* CONFIG_GENERIC_HARDIRQS */
 
+#include <asm/irq_vectors.h>	/* need possible max_nr_irqs() */
+
 extern int nr_irqs;
 extern struct irq_desc *irq_to_desc(unsigned int irq);
 
+# ifndef max_nr_irqs
+#  define max_nr_irqs(nr_cpus)	NR_IRQS
+# endif
+
 # define for_each_irq_desc(irq, desc)					\
 	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
 	     irq++, desc = irq_to_desc(irq))				\
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d0b8f7e72790..ebba7a116f14 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,6 +133,9 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
+	/* initialize nr_irqs based on nr_cpu_ids */
+	nr_irqs = max_nr_irqs(nr_cpu_ids);
+
 	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
 
 	desc = irq_desc_legacy;
-- 
cgit v1.2.3-71-gd317


From 542d865bbed4ce1f050f586e53cf1cfadda93766 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:07 -0800
Subject: kstat: modify kstat_irqs_legacy to be variable sized

Impact: reduce memory usage.

Allocate kstat_irqs_legacy based on nr_cpu_ids to deal with this
memory usage bump when NR_CPUS bumped from 128 to 4096:

     8192   +253952    262144 +3100%  kstat_irqs_legacy(.bss)

This is only when CONFIG_SPARSE_IRQS=y.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/irq/handle.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index ebba7a116f14..b39f32ac8f80 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -124,8 +124,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
 	}
 };
 
-/* FIXME: use bootmem alloc ...*/
-static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+static unsigned int *kstat_irqs_legacy;
 
 int __init early_irq_init(void)
 {
@@ -144,9 +143,14 @@ int __init early_irq_init(void)
 	/* allocate irq_desc_ptrs array based on nr_irqs */
 	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
 
+	/* allocate based on nr_cpu_ids */
+	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
+	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
+					  sizeof(int));
+
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
-		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
 		init_alloc_desc_masks(&desc[i], 0, true);
 		irq_desc_ptrs[i] = desc + i;
-- 
cgit v1.2.3-71-gd317


From 92296c6d6e908c35fca287a21af27be814af9c75 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sun, 11 Jan 2009 09:22:58 -0800
Subject: cpumask, irq: non-x86 build failures

Ingo Molnar wrote:

> All non-x86 architectures fail to build:
>
> In file included from /home/mingo/tip/include/linux/random.h:11,
>                  from /home/mingo/tip/include/linux/stackprotector.h:6,
>                  from /home/mingo/tip/init/main.c:17:
> /home/mingo/tip/include/linux/irqnr.h:26:63: error: asm/irq_vectors.h: No such file or directory

Do not include asm/irq_vectors.h in generic code - it's not available
on all architectures.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apicdef.h | 8 ++++++--
 include/linux/irqnr.h          | 6 ------
 kernel/irq/handle.c            | 5 +++++
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 1a6454ef7f6c..63134e31e8b9 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -132,8 +132,12 @@
 #define APIC_BASE_MSR	0x800
 #define X2APIC_ENABLE	(1UL << 10)
 
-/* get MAX_IO_APICS */
-#include <asm/apicnum.h>
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
 
 /*
  * All x86-64 systems are xAPIC compatible.
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index de66e4e10406..887477bc2ab0 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -23,15 +23,9 @@
 
 #else /* CONFIG_GENERIC_HARDIRQS */
 
-#include <asm/irq_vectors.h>	/* need possible max_nr_irqs() */
-
 extern int nr_irqs;
 extern struct irq_desc *irq_to_desc(unsigned int irq);
 
-# ifndef max_nr_irqs
-#  define max_nr_irqs(nr_cpus)	NR_IRQS
-# endif
-
 # define for_each_irq_desc(irq, desc)					\
 	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
 	     irq++, desc = irq_to_desc(irq))				\
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b39f32ac8f80..04d3e46031e5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -58,6 +58,11 @@ int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
 #ifdef CONFIG_SPARSE_IRQ
+
+#ifndef max_nr_irqs
+#define max_nr_irqs(nr_cpus)	NR_IRQS
+#endif
+
 static struct irq_desc irq_desc_init = {
 	.irq	    = -1,
 	.status	    = IRQ_DISABLED,
-- 
cgit v1.2.3-71-gd317


From 4a046d1754ee6ebb6f399696805ed61ea0444d4c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 12 Jan 2009 17:39:24 -0800
Subject: x86: arch_probe_nr_irqs

Impact: save RAM with large NR_CPUS, get smaller nr_irqs

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/include/asm/irq_vectors.h |  7 ++-----
 arch/x86/kernel/io_apic.c          | 16 ++++++++++++++++
 include/linux/interrupt.h          |  1 +
 kernel/irq/handle.c                |  9 ++-------
 kernel/softirq.c                   |  5 +++++
 5 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 602361ad0e74..a16a2ab2b429 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -115,14 +115,11 @@
 # endif
 #else
 
-/* defined as a macro so nr_irqs = max_nr_irqs(nr_cpu_ids) can be used */
-# define max_nr_irqs(nr_cpus)				\
-	((8 * nr_cpus) > (32 * MAX_IO_APICS) ?		\
+# define NR_IRQS					\
+	((8 * NR_CPUS) > (32 * MAX_IO_APICS) ?		\
 		(NR_VECTORS + (8 * NR_CPUS)) :		\
 		(NR_VECTORS + (32 * MAX_IO_APICS)))	\
 
-# define NR_IRQS max_nr_irqs(NR_CPUS)
-
 #endif
 
 #elif defined(CONFIG_X86_VOYAGER)
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index ae80638012de..157986916cd1 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -3850,6 +3850,22 @@ void __init probe_nr_irqs_gsi(void)
 		nr_irqs_gsi = nr;
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+	int nr;
+
+	nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
+		(NR_VECTORS + (8 * nr_cpu_ids)) :
+		(NR_VECTORS + (32 * nr_ioapics)));
+
+	if (nr < nr_irqs && nr > nr_irqs_gsi)
+		nr_irqs = nr;
+
+	return 0;
+}
+#endif
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b51a39..472f11765f60 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -467,6 +467,7 @@ int show_interrupts(struct seq_file *p, void *v);
 struct irq_desc;
 
 extern int early_irq_init(void);
+extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
 extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 04d3e46031e5..375d68cd5bf0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -59,10 +59,6 @@ EXPORT_SYMBOL_GPL(nr_irqs);
 
 #ifdef CONFIG_SPARSE_IRQ
 
-#ifndef max_nr_irqs
-#define max_nr_irqs(nr_cpus)	NR_IRQS
-#endif
-
 static struct irq_desc irq_desc_init = {
 	.irq	    = -1,
 	.status	    = IRQ_DISABLED,
@@ -137,9 +133,8 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
-	/* initialize nr_irqs based on nr_cpu_ids */
-	nr_irqs = max_nr_irqs(nr_cpu_ids);
-
+	 /* initialize nr_irqs based on nr_cpu_ids */
+	arch_probe_nr_irqs();
 	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
 
 	desc = irq_desc_legacy;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..0365b4899a3d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -795,6 +795,11 @@ int __init __weak early_irq_init(void)
 	return 0;
 }
 
+int __init __weak arch_probe_nr_irqs(void)
+{
+	return 0;
+}
+
 int __init __weak arch_early_irq_init(void)
 {
 	return 0;
-- 
cgit v1.2.3-71-gd317


From e4fa4c97016037620f9dc8bafe03e1086b665b4c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 14 Jan 2009 14:58:15 +0800
Subject: rcu: add __cpuinit to rcu_init_percpu_data()

Impact: reduce memory footprint

add __cpuinit to rcu_init_percpu_data(), and this function's text
will be discarded after boot when !CONFIG_HOTPLUG_CPU.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 2 +-
 kernel/rcutree.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 490934fc7ac3..bd5a9003497c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -716,7 +716,7 @@ void rcu_check_callbacks(int cpu, int user)
 	raise_rcu_softirq();
 }
 
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
+static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
 	unsigned long flags;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f2d8638e6c60..b2fd602a6f6f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1314,7 +1314,7 @@ int rcu_needs_cpu(int cpu)
  * access due to the fact that this CPU cannot possibly have any RCU
  * callbacks in flight yet.
  */
-static void
+static void __cpuinit
 rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-- 
cgit v1.2.3-71-gd317


From baf48f6577e581a9adb8fe849dc80e24b21d171d Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Mon, 12 Jan 2009 21:15:17 -0800
Subject: softlock: fix false panic which can occur if softlockup_thresh is
 reduced

At run-time, if softlockup_thresh is changed to a much lower value,
touch_timestamp is likely to be much older than the new softlock_thresh.

This will cause a false softlockup to be detected. If softlockup_panic
is enabled, the system will panic.

The fix is to touch all watchdogs before changing softlockup_thresh.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 3 +++
 kernel/softlockup.c   | 9 +++++++++
 kernel/sysctl.c       | 2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f8..54cbabf3b871 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -293,6 +293,9 @@ extern void sched_show_task(struct task_struct *p);
 extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
+extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+				    struct file *filp, void __user *buffer,
+				    size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
 extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9188c66278a..85d5a2455103 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
 #include <linux/lockdep.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
+#include <linux/sysctl.h>
 
 #include <asm/irq_regs.h>
 
@@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void)
 }
 EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
 
+int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+			     struct file *filp, void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	touch_all_softlockup_watchdogs();
+	return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+}
+
 /*
  * This callback runs from the timer interrupt, and checks
  * whether the watchdog thread has hung or not:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 89d74436318c..596dc31a7116 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -800,7 +800,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &softlockup_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_dosoftlockup_thresh,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &neg_one,
 		.extra2		= &sixty,
-- 
cgit v1.2.3-71-gd317


From 2ea038917bbdd51a7ae4a898c6a04641324dd033 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 14 Jan 2009 21:38:20 +0100
Subject: Revert "kbuild: strip generated symbols from *.ko"

This reverts commit ad7a953c522ceb496611d127e51e278bfe0ff483.

And commit: ("allow stripping of generated symbols under CONFIG_KALLSYMS_ALL")
            9bb482476c6c9d1ae033306440c51ceac93ea80c

These stripping patches has caused a set of issues:

1) People have reported compatibility issues with binutils due to
   lack of support for `--strip-unneeded-symbols' with objcopy 2.15.92.0.2
   Reported by: Wenji
2) ccache and distcc no longer works as expeced
   Reported by: Ted, Roland, + others
3) The installed modules increased a lot in size
   Reported by: Ted, Davej + others

Reported-by: Wenji Huang <wenji.huang@oracle.com>
Reported-by: "Theodore Ts'o" <tytso@mit.edu>
Reported-by: Dave Jones <davej@redhat.com>
Reported-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 Makefile                            |  59 ++++-------
 arch/x86/scripts/strip-symbols      |   1 -
 init/Kconfig                        |   7 --
 kernel/kallsyms.c                   |  16 +--
 scripts/Makefile.build              |  55 ++++-------
 scripts/Makefile.modinst            |   3 +-
 scripts/genksyms/genksyms.c         |  21 ++--
 scripts/genksyms/keywords.c_shipped | 189 ++++++++++++++++++------------------
 scripts/genksyms/keywords.gperf     |   2 -
 scripts/kallsyms.c                  |  21 ++--
 scripts/mksysmap                    |   7 +-
 scripts/strip-symbols               |  22 -----
 12 files changed, 166 insertions(+), 237 deletions(-)
 delete mode 100644 arch/x86/scripts/strip-symbols
 delete mode 100644 scripts/strip-symbols

(limited to 'kernel')

diff --git a/Makefile b/Makefile
index c06e250eca18..c2c4bbeef59d 100644
--- a/Makefile
+++ b/Makefile
@@ -606,25 +606,20 @@ export	INSTALL_PATH ?= /boot
 MODLIB	= $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE)
 export MODLIB
 
-strip-symbols := $(srctree)/scripts/strip-symbols \
-		 $(wildcard $(srctree)/arch/$(ARCH)/scripts/strip-symbols)
-
 #
-# INSTALL_MOD_STRIP, if defined, will cause modules to be stripped while
-# they get installed.  If INSTALL_MOD_STRIP is '1', then the default
-# options (see below) will be used.  Otherwise, INSTALL_MOD_STRIP will
-# be used as the option(s) to the objcopy command.
+#  INSTALL_MOD_STRIP, if defined, will cause modules to be
+#  stripped after they are installed.  If INSTALL_MOD_STRIP is '1', then
+#  the default option --strip-debug will be used.  Otherwise,
+#  INSTALL_MOD_STRIP will used as the options to the strip command.
+
 ifdef INSTALL_MOD_STRIP
 ifeq ($(INSTALL_MOD_STRIP),1)
-mod_strip_cmd = $(OBJCOPY) --strip-debug
-ifeq ($(CONFIG_KALLSYMS_ALL),$(CONFIG_KALLSYMS_STRIP_GENERATED))
-mod_strip_cmd += --wildcard $(addprefix --strip-symbols ,$(strip-symbols))
-endif
+mod_strip_cmd = $(STRIP) --strip-debug
 else
-mod_strip_cmd = $(OBJCOPY) $(INSTALL_MOD_STRIP)
+mod_strip_cmd = $(STRIP) $(INSTALL_MOD_STRIP)
 endif # INSTALL_MOD_STRIP=1
 else
-mod_strip_cmd = false
+mod_strip_cmd = true
 endif # INSTALL_MOD_STRIP
 export mod_strip_cmd
 
@@ -754,7 +749,6 @@ last_kallsyms := 2
 endif
 
 kallsyms.o := .tmp_kallsyms$(last_kallsyms).o
-kallsyms.h := $(wildcard include/config/kallsyms/*.h) $(wildcard include/config/kallsyms/*/*.h)
 
 define verify_kallsyms
 	$(Q)$(if $($(quiet)cmd_sysmap),                                      \
@@ -779,41 +773,24 @@ endef
 
 # Generate .S file with all kernel symbols
 quiet_cmd_kallsyms = KSYM    $@
-      cmd_kallsyms = { test $* -eq 0 || $(NM) -n $<; } \
-		     | $(KALLSYMS) $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) >$@
-
-quiet_cmd_kstrip = STRIP   $@
-      cmd_kstrip = $(OBJCOPY) --wildcard $(addprefix --strip$(if $(CONFIG_RELOCATABLE),-unneeded)-symbols ,$(filter %/scripts/strip-symbols,$^)) $< $@
+      cmd_kallsyms = $(NM) -n $< | $(KALLSYMS) \
+                     $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) > $@
 
-$(foreach n,0 1 2 3,.tmp_kallsyms$(n).o): KBUILD_AFLAGS += -Wa,--strip-local-absolute
-$(foreach n,0 1 2 3,.tmp_kallsyms$(n).o): %.o: %.S scripts FORCE
+.tmp_kallsyms1.o .tmp_kallsyms2.o .tmp_kallsyms3.o: %.o: %.S scripts FORCE
 	$(call if_changed_dep,as_o_S)
 
-ifeq ($(CONFIG_KALLSYMS_STRIP_GENERATED),y)
-strip-ext := .stripped
-endif
-
-.tmp_kallsyms%.S: .tmp_vmlinux%$(strip-ext) $(KALLSYMS) $(kallsyms.h)
+.tmp_kallsyms%.S: .tmp_vmlinux% $(KALLSYMS)
 	$(call cmd,kallsyms)
 
-# make -jN seems to have problems with intermediate files, see bug #3330.
-.SECONDARY: $(foreach n,1 2 3,.tmp_vmlinux$(n).stripped)
-.tmp_vmlinux%.stripped: .tmp_vmlinux% $(strip-symbols) $(kallsyms.h)
-	$(call cmd,kstrip)
-
-ifneq ($(CONFIG_DEBUG_INFO),y)
-.tmp_vmlinux%: LDFLAGS_vmlinux += -S
-endif
 # .tmp_vmlinux1 must be complete except kallsyms, so update vmlinux version
-.tmp_vmlinux%: $(vmlinux-lds) $(vmlinux-all) FORCE
-	$(if $(filter 1,$*),$(call if_changed_rule,ksym_ld),$(call if_changed,vmlinux__))
+.tmp_vmlinux1: $(vmlinux-lds) $(vmlinux-all) FORCE
+	$(call if_changed_rule,ksym_ld)
 
-.tmp_vmlinux0$(strip-ext):
-	$(Q)echo "placeholder" >$@
+.tmp_vmlinux2: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms1.o FORCE
+	$(call if_changed,vmlinux__)
 
-.tmp_vmlinux1: .tmp_kallsyms0.o
-.tmp_vmlinux2: .tmp_kallsyms1.o
-.tmp_vmlinux3: .tmp_kallsyms2.o
+.tmp_vmlinux3: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms2.o FORCE
+	$(call if_changed,vmlinux__)
 
 # Needs to visit scripts/ before $(KALLSYMS) can be used.
 $(KALLSYMS): scripts ;
diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols
deleted file mode 100644
index a2f1ccb827c7..000000000000
--- a/arch/x86/scripts/strip-symbols
+++ /dev/null
@@ -1 +0,0 @@
-__cpu_vendor_dev_X86_VENDOR_*
diff --git a/init/Kconfig b/init/Kconfig
index a724a149bf3f..0e9924743a17 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -626,13 +626,6 @@ config KALLSYMS_ALL
 
 	   Say N.
 
-config KALLSYMS_STRIP_GENERATED
-	bool "Strip machine generated symbols from kallsyms"
-	depends on KALLSYMS_ALL
-	default y
-	help
-	  Say N if you want kallsyms to retain even machine generated symbols.
-
 config KALLSYMS_EXTRA_PASS
 	bool "Do an extra kallsyms pass"
 	depends on KALLSYMS
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index e694afa0eb8c..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,19 +30,20 @@
 #define all_var 0
 #endif
 
-extern const unsigned long kallsyms_addresses[];
-extern const u8 kallsyms_names[];
+/* These will be re-linked against their real values during the second link stage */
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
 
 /* tell the compiler that the count isn't in the small data section if the arch
  * has one (eg: FRV)
  */
 extern const unsigned long kallsyms_num_syms
-	__attribute__((__section__(".rodata")));
+__attribute__((weak, section(".rodata")));
 
-extern const u8 kallsyms_token_table[];
-extern const u16 kallsyms_token_index[];
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
 
-extern const unsigned long kallsyms_markers[];
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
 
 static inline int is_kernel_inittext(unsigned long addr)
 {
@@ -167,6 +168,9 @@ static unsigned long get_symbol_pos(unsigned long addr,
 	unsigned long symbol_start = 0, symbol_end = 0;
 	unsigned long i, low, high, mid;
 
+	/* This kernel should never had been booted. */
+	BUG_ON(!kallsyms_addresses);
+
 	/* do a binary search on the sorted kallsyms_addresses array */
 	low = 0;
 	high = kallsyms_num_syms;
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 5d900307de3e..c7de8b39fcf1 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -151,16 +151,16 @@ cmd_cc_i_c       = $(CPP) $(c_flags)   -o $@ $<
 $(obj)/%.i: $(src)/%.c FORCE
 	$(call if_changed_dep,cc_i_c)
 
-cmd_genksyms =                                                              \
+cmd_gensymtypes =                                                           \
     $(CPP) -D__GENKSYMS__ $(c_flags) $< |                                   \
-    $(GENKSYMS) -T $@ -A -a $(ARCH)                                         \
+    $(GENKSYMS) -T $@ -a $(ARCH)                                            \
      $(if $(KBUILD_PRESERVE),-p)                                            \
      $(if $(1),-r $(firstword $(wildcard $(@:.symtypes=.symref) /dev/null)))
 
 quiet_cmd_cc_symtypes_c = SYM $(quiet_modtag) $@
 cmd_cc_symtypes_c =                                                         \
     set -e;                                                                 \
-    $(call cmd_genksyms, true) >/dev/null;                                  \
+    $(call cmd_gensymtypes, true) >/dev/null;                               \
     test -s $@ || rm -f $@
 
 $(obj)/%.symtypes : $(src)/%.c FORCE
@@ -177,38 +177,28 @@ cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
 
 else
 # When module versioning is enabled the following steps are executed:
-# o compile a .tmp_<file>.s from <file>.c
-# o if .tmp_<file>.s doesn't contain a __ksymtab version, i.e. does
-#   not export symbols, we just assemble .tmp_<file>.s to <file>.o and
+# o compile a .tmp_<file>.o from <file>.c
+# o if .tmp_<file>.o doesn't contain a __ksymtab version, i.e. does
+#   not export symbols, we just rename .tmp_<file>.o to <file>.o and
 #   are done.
 # o otherwise, we calculate symbol versions using the good old
 #   genksyms on the preprocessed source and postprocess them in a way
-#   that they are usable as assembly source
-# o assemble <file>.o from .tmp_<file>.s forcing inclusion of directives
-#   defining the actual values of __crc_*, followed by objcopy-ing them
-#   to force these symbols to be local to permit stripping them later.
-s_file = $(@D)/.tmp_$(@F:.o=.s)
-v_file = $(@D)/.tmp_$(@F:.o=.v)
-tmp_o_file = $(@D)/.tmp_$(@F)
-no_g_c_flags = $(filter-out -g%,$(c_flags))
-
-cmd_cc_o_c = $(CC) $(c_flags) -S -o $(s_file) $<
+#   that they are usable as a linker script
+# o generate <file>.o from .tmp_<file>.o using the linker to
+#   replace the unresolved symbols __crc_exported_symbol with
+#   the actual value of the checksum generated by genksyms
 
+cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $<
 cmd_modversions =							\
-	if grep -q __ksymtab $(s_file); then				\
-		if $(call cmd_genksyms, $(KBUILD_SYMTYPES)) > $(v_file) \
-		   && $(CC) $(no_g_c_flags) -c -Wa,$(v_file)		\
-			    -o $(tmp_o_file) $(s_file)			\
-		   && $(OBJCOPY) -L '__crc_*' -L '___crc_*' -w		\
-				 $(tmp_o_file) $@;			\
-		then							\
-			: ;						\
-		else							\
-			rm -f $@; exit 1;				\
-		fi;							\
+	if $(OBJDUMP) -h $(@D)/.tmp_$(@F) | grep -q __ksymtab; then	\
+		$(call cmd_gensymtypes, $(KBUILD_SYMTYPES))		\
+		    > $(@D)/.tmp_$(@F:.o=.ver);				\
+									\
+		$(LD) $(LDFLAGS) -r -o $@ $(@D)/.tmp_$(@F) 		\
+			-T $(@D)/.tmp_$(@F:.o=.ver);			\
+		rm -f $(@D)/.tmp_$(@F) $(@D)/.tmp_$(@F:.o=.ver);	\
 	else								\
-		rm -f $(v_file);					\
-		$(CC) $(no_g_c_flags) -c -o $@ $(s_file);		\
+		mv -f $(@D)/.tmp_$(@F) $@;				\
 	fi;
 endif
 
@@ -225,12 +215,7 @@ define rule_cc_o_c
 	$(cmd_record_mcount)						  \
 	scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,cc_o_c)' >    \
 	                                              $(dot-target).tmp;  \
-	if [ -r $(@D)/.tmp_$(@F:.o=.v) ]; then				  \
-		echo >> $(dot-target).tmp;				  \
-		echo '$@: $(GENKSYMS)' >> $(dot-target).tmp;		  \
-		echo '$(GENKSYMS):: ;' >> $(dot-target).tmp;		  \
-	fi;								  \
-	rm -f $(depfile) $(@D)/.tmp_$(@F:.o=.?);			  \
+	rm -f $(depfile);						  \
 	mv -f $(dot-target).tmp $(dot-target).cmd
 endef
 
diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst
index a5122dce1264..efa5d940e632 100644
--- a/scripts/Makefile.modinst
+++ b/scripts/Makefile.modinst
@@ -17,8 +17,7 @@ __modinst: $(modules)
 	@:
 
 quiet_cmd_modules_install = INSTALL $@
-      cmd_modules_install = mkdir -p $(2); \
-			    $(mod_strip_cmd) $@ $(2)/$(notdir $@) || cp $@ $(2)
+      cmd_modules_install = mkdir -p $(2); cp $@ $(2) ; $(mod_strip_cmd) $(2)/$(notdir $@)
 
 # Modules built outside the kernel source tree go into extra by default
 INSTALL_MOD_DIR ?= extra
diff --git a/scripts/genksyms/genksyms.c b/scripts/genksyms/genksyms.c
index f8bb4cabd62d..3a8297b5184c 100644
--- a/scripts/genksyms/genksyms.c
+++ b/scripts/genksyms/genksyms.c
@@ -43,7 +43,7 @@ int cur_line = 1;
 char *cur_filename;
 
 static int flag_debug, flag_dump_defs, flag_reference, flag_dump_types,
-	   flag_preserve, flag_warnings, flag_asm;
+	   flag_preserve, flag_warnings;
 static const char *arch = "";
 static const char *mod_prefix = "";
 
@@ -610,11 +610,8 @@ void export_symbol(const char *name)
 		if (flag_dump_defs)
 			fputs(">\n", debugfile);
 
-		/* Used as assembly source or a linker script. */
-		printf(flag_asm
-		       ? ".equiv %s__crc_%s, %#08lx\n"
-		       : "%s__crc_%s = %#08lx ;\n",
-		       mod_prefix, name, crc);
+		/* Used as a linker script. */
+		printf("%s__crc_%s = 0x%08lx ;\n", mod_prefix, name, crc);
 	}
 }
 
@@ -651,10 +648,9 @@ void error_with_pos(const char *fmt, ...)
 
 static void genksyms_usage(void)
 {
-	fputs("Usage:\n" "genksyms [-aAdDTwqhV] > /path/to/.tmp_obj.ver\n" "\n"
+	fputs("Usage:\n" "genksyms [-adDTwqhV] > /path/to/.tmp_obj.ver\n" "\n"
 #ifdef __GNU_LIBRARY__
 	      "  -a, --arch            Select architecture\n"
-	      "  -A, --asm             Generate assembly rather than linker script\n"
 	      "  -d, --debug           Increment the debug level (repeatable)\n"
 	      "  -D, --dump            Dump expanded symbol defs (for debugging only)\n"
 	      "  -r, --reference file  Read reference symbols from a file\n"
@@ -666,7 +662,6 @@ static void genksyms_usage(void)
 	      "  -V, --version         Print the release version\n"
 #else				/* __GNU_LIBRARY__ */
 	      "  -a                    Select architecture\n"
-	      "  -A                    Generate assembly rather than linker script\n"
 	      "  -d                    Increment the debug level (repeatable)\n"
 	      "  -D                    Dump expanded symbol defs (for debugging only)\n"
 	      "  -r file               Read reference symbols from a file\n"
@@ -688,7 +683,6 @@ int main(int argc, char **argv)
 #ifdef __GNU_LIBRARY__
 	struct option long_opts[] = {
 		{"arch", 1, 0, 'a'},
-		{"asm", 0, 0, 'A'},
 		{"debug", 0, 0, 'd'},
 		{"warnings", 0, 0, 'w'},
 		{"quiet", 0, 0, 'q'},
@@ -701,10 +695,10 @@ int main(int argc, char **argv)
 		{0, 0, 0, 0}
 	};
 
-	while ((o = getopt_long(argc, argv, "a:dwqVADr:T:ph",
+	while ((o = getopt_long(argc, argv, "a:dwqVDr:T:ph",
 				&long_opts[0], NULL)) != EOF)
 #else				/* __GNU_LIBRARY__ */
-	while ((o = getopt(argc, argv, "a:dwqVADr:T:ph")) != EOF)
+	while ((o = getopt(argc, argv, "a:dwqVDr:T:ph")) != EOF)
 #endif				/* __GNU_LIBRARY__ */
 		switch (o) {
 		case 'a':
@@ -722,9 +716,6 @@ int main(int argc, char **argv)
 		case 'V':
 			fputs("genksyms version 2.5.60\n", stderr);
 			break;
-		case 'A':
-			flag_asm = 1;
-			break;
 		case 'D':
 			flag_dump_defs = 1;
 			break;
diff --git a/scripts/genksyms/keywords.c_shipped b/scripts/genksyms/keywords.c_shipped
index 83484fe93ede..971e0113ae7a 100644
--- a/scripts/genksyms/keywords.c_shipped
+++ b/scripts/genksyms/keywords.c_shipped
@@ -1,4 +1,4 @@
-/* ANSI-C code produced by gperf version 3.0.1 */
+/* ANSI-C code produced by gperf version 3.0.2 */
 /* Command-line: gperf -L ANSI-C -a -C -E -g -H is_reserved_hash -k '1,3,$' -N is_reserved_word -p -t scripts/genksyms/keywords.gperf  */
 
 #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
@@ -32,7 +32,7 @@
 
 #line 3 "scripts/genksyms/keywords.gperf"
 struct resword { const char *name; int token; };
-/* maximum key range = 64, duplicates = 0 */
+/* maximum key range = 62, duplicates = 0 */
 
 #ifdef __GNUC__
 __inline
@@ -46,32 +46,32 @@ is_reserved_hash (register const char *str, register unsigned int len)
 {
   static const unsigned char asso_values[] =
     {
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67,  0,
-      67, 67, 67, 67, 67, 67, 15, 67, 67, 67,
-       0, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67,  0, 67,  0, 67,  5,
-      25, 20, 15, 30, 67, 15, 67, 67, 10,  0,
-      10, 40, 20, 67, 10,  5,  0, 10, 15, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
-      67, 67, 67, 67, 67, 67
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65,  5,
+      65, 65, 65, 65, 65, 65, 35, 65, 65, 65,
+       0, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65,  0, 65,  0, 65,  5,
+      20, 15, 10, 30, 65, 15, 65, 65, 20,  0,
+      10, 35, 20, 65, 10,  5,  0, 10,  5, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+      65, 65, 65, 65, 65, 65
     };
   return len + asso_values[(unsigned char)str[2]] + asso_values[(unsigned char)str[0]] + asso_values[(unsigned char)str[len - 1]];
 }
@@ -84,119 +84,116 @@ is_reserved_word (register const char *str, register unsigned int len)
 {
   enum
     {
-      TOTAL_KEYWORDS = 45,
+      TOTAL_KEYWORDS = 43,
       MIN_WORD_LENGTH = 3,
       MAX_WORD_LENGTH = 24,
       MIN_HASH_VALUE = 3,
-      MAX_HASH_VALUE = 66
+      MAX_HASH_VALUE = 64
     };
 
   static const struct resword wordlist[] =
     {
       {""}, {""}, {""},
-#line 28 "scripts/genksyms/keywords.gperf"
+#line 26 "scripts/genksyms/keywords.gperf"
       {"asm", ASM_KEYW},
       {""},
-#line 10 "scripts/genksyms/keywords.gperf"
+#line 8 "scripts/genksyms/keywords.gperf"
       {"__asm", ASM_KEYW},
       {""},
-#line 11 "scripts/genksyms/keywords.gperf"
+#line 9 "scripts/genksyms/keywords.gperf"
       {"__asm__", ASM_KEYW},
       {""}, {""},
-#line 54 "scripts/genksyms/keywords.gperf"
+#line 52 "scripts/genksyms/keywords.gperf"
       {"__typeof__", TYPEOF_KEYW},
       {""},
-#line 14 "scripts/genksyms/keywords.gperf"
+#line 12 "scripts/genksyms/keywords.gperf"
       {"__const", CONST_KEYW},
-#line 13 "scripts/genksyms/keywords.gperf"
+#line 11 "scripts/genksyms/keywords.gperf"
       {"__attribute__", ATTRIBUTE_KEYW},
-#line 15 "scripts/genksyms/keywords.gperf"
+#line 13 "scripts/genksyms/keywords.gperf"
       {"__const__", CONST_KEYW},
-#line 20 "scripts/genksyms/keywords.gperf"
+#line 18 "scripts/genksyms/keywords.gperf"
       {"__signed__", SIGNED_KEYW},
-#line 46 "scripts/genksyms/keywords.gperf"
+#line 44 "scripts/genksyms/keywords.gperf"
       {"static", STATIC_KEYW},
-      {""},
-#line 41 "scripts/genksyms/keywords.gperf"
+#line 20 "scripts/genksyms/keywords.gperf"
+      {"__volatile__", VOLATILE_KEYW},
+#line 39 "scripts/genksyms/keywords.gperf"
       {"int", INT_KEYW},
-#line 34 "scripts/genksyms/keywords.gperf"
+#line 32 "scripts/genksyms/keywords.gperf"
       {"char", CHAR_KEYW},
-#line 35 "scripts/genksyms/keywords.gperf"
+#line 33 "scripts/genksyms/keywords.gperf"
       {"const", CONST_KEYW},
-#line 47 "scripts/genksyms/keywords.gperf"
+#line 45 "scripts/genksyms/keywords.gperf"
       {"struct", STRUCT_KEYW},
-#line 26 "scripts/genksyms/keywords.gperf"
+#line 24 "scripts/genksyms/keywords.gperf"
       {"__restrict__", RESTRICT_KEYW},
-#line 27 "scripts/genksyms/keywords.gperf"
-      {"restrict", RESTRICT_KEYW},
-#line 7 "scripts/genksyms/keywords.gperf"
-      {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW},
-#line 18 "scripts/genksyms/keywords.gperf"
-      {"__inline__", INLINE_KEYW},
-      {""},
-#line 22 "scripts/genksyms/keywords.gperf"
-      {"__volatile__", VOLATILE_KEYW},
-#line 5 "scripts/genksyms/keywords.gperf"
-      {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW},
 #line 25 "scripts/genksyms/keywords.gperf"
+      {"restrict", RESTRICT_KEYW},
+#line 23 "scripts/genksyms/keywords.gperf"
       {"_restrict", RESTRICT_KEYW},
-      {""},
-#line 12 "scripts/genksyms/keywords.gperf"
-      {"__attribute", ATTRIBUTE_KEYW},
-#line 6 "scripts/genksyms/keywords.gperf"
-      {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW},
 #line 16 "scripts/genksyms/keywords.gperf"
+      {"__inline__", INLINE_KEYW},
+#line 10 "scripts/genksyms/keywords.gperf"
+      {"__attribute", ATTRIBUTE_KEYW},
+      {""},
+#line 14 "scripts/genksyms/keywords.gperf"
       {"__extension__", EXTENSION_KEYW},
-#line 37 "scripts/genksyms/keywords.gperf"
+#line 35 "scripts/genksyms/keywords.gperf"
       {"enum", ENUM_KEYW},
-#line 8 "scripts/genksyms/keywords.gperf"
-      {"EXPORT_UNUSED_SYMBOL", EXPORT_SYMBOL_KEYW},
-#line 38 "scripts/genksyms/keywords.gperf"
+#line 19 "scripts/genksyms/keywords.gperf"
+      {"__volatile", VOLATILE_KEYW},
+#line 36 "scripts/genksyms/keywords.gperf"
       {"extern", EXTERN_KEYW},
       {""},
-#line 19 "scripts/genksyms/keywords.gperf"
+#line 17 "scripts/genksyms/keywords.gperf"
       {"__signed", SIGNED_KEYW},
-#line 9 "scripts/genksyms/keywords.gperf"
-      {"EXPORT_UNUSED_SYMBOL_GPL", EXPORT_SYMBOL_KEYW},
-#line 49 "scripts/genksyms/keywords.gperf"
-      {"union", UNION_KEYW},
-#line 53 "scripts/genksyms/keywords.gperf"
+#line 7 "scripts/genksyms/keywords.gperf"
+      {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW},
+      {""},
+#line 51 "scripts/genksyms/keywords.gperf"
       {"typeof", TYPEOF_KEYW},
-#line 48 "scripts/genksyms/keywords.gperf"
+#line 46 "scripts/genksyms/keywords.gperf"
       {"typedef", TYPEDEF_KEYW},
-#line 17 "scripts/genksyms/keywords.gperf"
+#line 15 "scripts/genksyms/keywords.gperf"
       {"__inline", INLINE_KEYW},
-#line 33 "scripts/genksyms/keywords.gperf"
+#line 31 "scripts/genksyms/keywords.gperf"
       {"auto", AUTO_KEYW},
-#line 21 "scripts/genksyms/keywords.gperf"
-      {"__volatile", VOLATILE_KEYW},
+#line 47 "scripts/genksyms/keywords.gperf"
+      {"union", UNION_KEYW},
       {""}, {""},
-#line 50 "scripts/genksyms/keywords.gperf"
+#line 48 "scripts/genksyms/keywords.gperf"
       {"unsigned", UNSIGNED_KEYW},
-      {""},
-#line 44 "scripts/genksyms/keywords.gperf"
+#line 49 "scripts/genksyms/keywords.gperf"
+      {"void", VOID_KEYW},
+#line 42 "scripts/genksyms/keywords.gperf"
       {"short", SHORT_KEYW},
-#line 40 "scripts/genksyms/keywords.gperf"
+      {""}, {""},
+#line 50 "scripts/genksyms/keywords.gperf"
+      {"volatile", VOLATILE_KEYW},
+      {""},
+#line 37 "scripts/genksyms/keywords.gperf"
+      {"float", FLOAT_KEYW},
+#line 34 "scripts/genksyms/keywords.gperf"
+      {"double", DOUBLE_KEYW},
+      {""},
+#line 5 "scripts/genksyms/keywords.gperf"
+      {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW},
+      {""}, {""},
+#line 38 "scripts/genksyms/keywords.gperf"
       {"inline", INLINE_KEYW},
+#line 6 "scripts/genksyms/keywords.gperf"
+      {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW},
+#line 41 "scripts/genksyms/keywords.gperf"
+      {"register", REGISTER_KEYW},
       {""},
-#line 52 "scripts/genksyms/keywords.gperf"
-      {"volatile", VOLATILE_KEYW},
-#line 42 "scripts/genksyms/keywords.gperf"
-      {"long", LONG_KEYW},
-#line 24 "scripts/genksyms/keywords.gperf"
+#line 22 "scripts/genksyms/keywords.gperf"
       {"_Bool", BOOL_KEYW},
-      {""}, {""},
 #line 43 "scripts/genksyms/keywords.gperf"
-      {"register", REGISTER_KEYW},
-#line 51 "scripts/genksyms/keywords.gperf"
-      {"void", VOID_KEYW},
-#line 39 "scripts/genksyms/keywords.gperf"
-      {"float", FLOAT_KEYW},
-#line 36 "scripts/genksyms/keywords.gperf"
-      {"double", DOUBLE_KEYW},
-      {""}, {""}, {""}, {""},
-#line 45 "scripts/genksyms/keywords.gperf"
-      {"signed", SIGNED_KEYW}
+      {"signed", SIGNED_KEYW},
+      {""}, {""},
+#line 40 "scripts/genksyms/keywords.gperf"
+      {"long", LONG_KEYW}
     };
 
   if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
diff --git a/scripts/genksyms/keywords.gperf b/scripts/genksyms/keywords.gperf
index 8abe7ab8d88f..5ef3733225fb 100644
--- a/scripts/genksyms/keywords.gperf
+++ b/scripts/genksyms/keywords.gperf
@@ -5,8 +5,6 @@ struct resword { const char *name; int token; }
 EXPORT_SYMBOL, EXPORT_SYMBOL_KEYW
 EXPORT_SYMBOL_GPL, EXPORT_SYMBOL_KEYW
 EXPORT_SYMBOL_GPL_FUTURE, EXPORT_SYMBOL_KEYW
-EXPORT_UNUSED_SYMBOL, EXPORT_SYMBOL_KEYW
-EXPORT_UNUSED_SYMBOL_GPL, EXPORT_SYMBOL_KEYW
 __asm, ASM_KEYW
 __asm__, ASM_KEYW
 __attribute, ATTRIBUTE_KEYW
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 92758120a767..ad2434b26970 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -130,9 +130,18 @@ static int read_symbol(FILE *in, struct sym_entry *s)
 static int symbol_valid(struct sym_entry *s)
 {
 	/* Symbols which vary between passes.  Passes 1 and 2 must have
-	 * identical symbol lists.
+	 * identical symbol lists.  The kallsyms_* symbols below are only added
+	 * after pass 1, they would be included in pass 2 when --all-symbols is
+	 * specified so exclude them to get a stable symbol list.
 	 */
 	static char *special_symbols[] = {
+		"kallsyms_addresses",
+		"kallsyms_num_syms",
+		"kallsyms_names",
+		"kallsyms_markers",
+		"kallsyms_token_table",
+		"kallsyms_token_index",
+
 	/* Exclude linker generated symbols which vary between passes */
 		"_SDA_BASE_",		/* ppc */
 		"_SDA2_BASE_",		/* ppc */
@@ -164,9 +173,7 @@ static int symbol_valid(struct sym_entry *s)
 	}
 
 	/* Exclude symbols which vary between passes. */
-	if (strstr((char *)s->sym + offset, "_compiled.") ||
-	    strncmp((char*)s->sym + offset, "__compound_literal.", 19) == 0 ||
-	    strncmp((char*)s->sym + offset, "__compound_literal$", 19) == 0)
+	if (strstr((char *)s->sym + offset, "_compiled."))
 		return 0;
 
 	for (i = 0; special_symbols[i]; i++)
@@ -543,10 +550,8 @@ int main(int argc, char **argv)
 		usage();
 
 	read_map(stdin);
-	if (table_cnt) {
-		sort_symbols();
-		optimize_token_table();
-	}
+	sort_symbols();
+	optimize_token_table();
 	write_src();
 
 	return 0;
diff --git a/scripts/mksysmap b/scripts/mksysmap
index 1db316a3712b..6e133a0bae7a 100644
--- a/scripts/mksysmap
+++ b/scripts/mksysmap
@@ -37,6 +37,9 @@
 
 # readprofile starts reading symbols when _stext is found, and
 # continue until it finds a symbol which is not either of 'T', 't',
-# 'W' or 'w'.
+# 'W' or 'w'. __crc_ are 'A' and placed in the middle
+# so we just ignore them to let readprofile continue to work.
+# (At least sparc64 has __crc_ in the middle).
+
+$NM -n $1 | grep -v '\( [aNUw] \)\|\(__crc_\)\|\( \$[adt]\)' > $2
 
-$NM -n $1 | grep -v '\( [aNUw] \)\|\( \$[adt]\)' > $2
diff --git a/scripts/strip-symbols b/scripts/strip-symbols
deleted file mode 100644
index 29ee8c1a014b..000000000000
--- a/scripts/strip-symbols
+++ /dev/null
@@ -1,22 +0,0 @@
-<*>
-*.h
-__compound_literal[$.][0-9]*
-__crc_[a-zA-Z_]*
-__exitcall_[a-zA-Z_]*
-__func__[$.][0-9]*
-__FUNCTION__[$.][0-9]*
-gcc[0-9]_compiled[$.]
-__initcall_[a-zA-Z_]*
-__kcrctab_[a-zA-Z_]*
-__kstrtab_[a-zA-Z_]*
-__ksymtab_[a-zA-Z_]*
-__mod_[a-zA-Z_]*[0-9]
-__module_depends
-__param_[a-zA-Z_]*
-__pci_fixup_*PCI_ANY_IDPCI_ANY_ID*
-__pci_fixup_*PCI_ANY_IDPCI_DEVICE_ID_*
-__pci_fixup_*PCI_VENDOR_ID_*PCI_ANY_ID*
-__pci_fixup_*PCI_VENDOR_ID_*PCI_DEVICE_ID_*
-__PRETTY_FUNCTION__[$.][0-9]*
-__setup_[a-zA-Z_]*
-____versions
-- 
cgit v1.2.3-71-gd317


From 934d96eafadcf3eb3ccd094af9919f020907fc41 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@infradead.org>
Date: Wed, 14 Jan 2009 20:38:17 +0530
Subject: time-sched.c: tick_nohz_update_jiffies should be static

Impact: cleanup, reduce kernel size a bit, avoid sparse warning

Fixes sparse warning:

 kernel/time/tick-sched.c:137:6: warning: symbol 'tick_nohz_update_jiffies' was not declared. Should it be static?

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1b6c05bd0d0a..d3f1ef4d5cbe 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,7 +134,7 @@ __setup("nohz=", setup_tick_nohz);
  * value. We do this unconditionally on any cpu, as we don't know whether the
  * cpu, which has the update task assigned is in a long sleep.
  */
-void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(void)
 {
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-- 
cgit v1.2.3-71-gd317


From 68564a46976017496c2227660930d81240f82355 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 16 Jan 2009 15:31:15 -0800
Subject: work_on_cpu: don't try to get_online_cpus() in work_on_cpu.

Impact: remove potential circular lock dependency with cpu hotplug lock

This has caused more problems than it solved, with a pile of cpu
hotplug locking issues.

Followup patches will get_online_cpus() in callers that need it, but
if they don't do it they're no worse than before when they were using
set_cpus_allowed without locking.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/workqueue.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f445833ae37..a35afdbc0161 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -991,8 +991,8 @@ static void do_work_for_cpu(struct work_struct *w)
  * @fn: the function to run
  * @arg: the function arg
  *
- * This will return -EINVAL in the cpu is not online, or the return value
- * of @fn otherwise.
+ * This will return the value @fn returns.
+ * It is up to the caller to ensure that the cpu doesn't go offline.
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
@@ -1001,14 +1001,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 	INIT_WORK(&wfc.work, do_work_for_cpu);
 	wfc.fn = fn;
 	wfc.arg = arg;
-	get_online_cpus();
-	if (unlikely(!cpu_online(cpu)))
-		wfc.ret = -EINVAL;
-	else {
-		schedule_work_on(cpu, &wfc.work);
-		flush_work(&wfc.work);
-	}
-	put_online_cpus();
+	schedule_work_on(cpu, &wfc.work);
+	flush_work(&wfc.work);
 
 	return wfc.ret;
 }
-- 
cgit v1.2.3-71-gd317


From e1d9ec6246a2668a5d037f529877efb7cf176af8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 16 Jan 2009 15:31:15 -0800
Subject: work_on_cpu: Use our own workqueue.

Impact: remove potential clashes with generic kevent workqueue

Annoyingly, some places we want to use work_on_cpu are already in
workqueues.  As per Ingo's suggestion, we create a different workqueue
for work_on_cpu.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/workqueue.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a35afdbc0161..1f0c509b40d3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -971,6 +971,8 @@ undo:
 }
 
 #ifdef CONFIG_SMP
+static struct workqueue_struct *work_on_cpu_wq __read_mostly;
+
 struct work_for_cpu {
 	struct work_struct work;
 	long (*fn)(void *);
@@ -1001,7 +1003,7 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 	INIT_WORK(&wfc.work, do_work_for_cpu);
 	wfc.fn = fn;
 	wfc.arg = arg;
-	schedule_work_on(cpu, &wfc.work);
+	queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
 	flush_work(&wfc.work);
 
 	return wfc.ret;
@@ -1019,4 +1021,8 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
+#ifdef CONFIG_SMP
+	work_on_cpu_wq = create_workqueue("work_on_cpu");
+	BUG_ON(!work_on_cpu_wq);
+#endif
 }
-- 
cgit v1.2.3-71-gd317


From b786c6a98ef6fa81114ba7b9fbfc0d67060775e3 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Sat, 17 Jan 2009 12:04:36 +0100
Subject: relay: fix lock imbalance in relay_late_setup_files

One fail path in relay_late_setup_files() omits
mutex_unlock(&relay_channels_mutex);
Add it.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/relay.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac2008f77b..9d79b7854fa6 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -663,8 +663,10 @@ int relay_late_setup_files(struct rchan *chan,
 
 	mutex_lock(&relay_channels_mutex);
 	/* Is chan already set up? */
-	if (unlikely(chan->has_base_filename))
+	if (unlikely(chan->has_base_filename)) {
+		mutex_unlock(&relay_channels_mutex);
 		return -EEXIST;
+	}
 	chan->has_base_filename = 1;
 	chan->parent = parent;
 	curr_cpu = get_cpu();
-- 
cgit v1.2.3-71-gd317


From 1d4a7f1c4faf53eb9e822743ec8a70b3019a26d2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 18 Jan 2009 16:39:29 +0100
Subject: hrtimers: fix inconsistent lock state on resume in hres_timers_resume

Andrey Borzenkov reported this lockdep assert:

> [17854.688347] =================================
> [17854.688347] [ INFO: inconsistent lock state ]
> [17854.688347] 2.6.29-rc2-1avb #1
> [17854.688347] ---------------------------------
> [17854.688347] inconsistent {in-hardirq-W} -> {hardirq-on-W} usage.
> [17854.688347] pm-suspend/18240 [HC0[0]:SC0[0]:HE1:SE1] takes:
> [17854.688347]  (&cpu_base->lock){++..}, at: [<c0136fcc>] retrigger_next_event+0x5c/0xa0
> [17854.688347] {in-hardirq-W} state was registered at:
> [17854.688347]   [<c01443cd>] __lock_acquire+0x79d/0x1930
> [17854.688347]   [<c01455bc>] lock_acquire+0x5c/0x80
> [17854.688347]   [<c03092e5>] _spin_lock+0x35/0x70
> [17854.688347]   [<c0136e61>] hrtimer_run_queues+0x31/0x140
> [17854.688347]   [<c0128d98>] run_local_timers+0x8/0x20
> [17854.688347]   [<c0128dd3>] update_process_times+0x23/0x60
> [17854.688347]   [<c013e274>] tick_periodic+0x24/0x80
> [17854.688347]   [<c013e2e2>] tick_handle_periodic+0x12/0x70
> [17854.688347]   [<c0104e24>] timer_interrupt+0x14/0x20
> [17854.688347]   [<c01607b9>] handle_IRQ_event+0x29/0x60
> [17854.688347]   [<c0161c59>] handle_level_irq+0x69/0xe0
> [17854.688347]   [<ffffffff>] 0xffffffff
> [17854.688347] irq event stamp: 55771
> [17854.688347] hardirqs last  enabled at (55771): [<c0309125>] _spin_unlock_irqrestore+0x35/0x60
> [17854.688347] hardirqs last disabled at (55770): [<c0309419>] _spin_lock_irqsave+0x19/0x80
> [17854.688347] softirqs last  enabled at (54836): [<c0124f54>] __do_softirq+0xc4/0x110
> [17854.688347] softirqs last disabled at (54831): [<c01049ae>] do_softirq+0x8e/0xe0
> [17854.688347]
> [17854.688347] other info that might help us debug this:
> [17854.688347] 3 locks held by pm-suspend/18240:
> [17854.688347]  #0:  (&buffer->mutex){--..}, at: [<c01dd4c5>] sysfs_write_file+0x25/0x100
> [17854.688347]  #1:  (pm_mutex){--..}, at: [<c015056f>] enter_state+0x4f/0x140
> [17854.688347]  #2:  (dpm_list_mtx){--..}, at: [<c027880f>] device_pm_lock+0xf/0x20
> [17854.688347]
> [17854.688347] stack backtrace:
> [17854.688347] Pid: 18240, comm: pm-suspend Not tainted 2.6.29-rc2-1avb #1
> [17854.688347] Call Trace:
> [17854.688347]  [<c0306248>] ? printk+0x18/0x20
> [17854.688347]  [<c0141fac>] print_usage_bug+0x16c/0x1d0
> [17854.688347]  [<c0142bcf>] mark_lock+0x8bf/0xc90
> [17854.688347]  [<c0106b8f>] ? pit_next_event+0x2f/0x40
> [17854.688347]  [<c01441b0>] __lock_acquire+0x580/0x1930
> [17854.688347]  [<c030916d>] ? _spin_unlock+0x1d/0x20
> [17854.688347]  [<c0106b8f>] ? pit_next_event+0x2f/0x40
> [17854.688347]  [<c013dd38>] ? clockevents_program_event+0x98/0x160
> [17854.688347]  [<c0142fe8>] ? mark_held_locks+0x48/0x90
> [17854.688347]  [<c0309125>] ? _spin_unlock_irqrestore+0x35/0x60
> [17854.688347]  [<c0143229>] ? trace_hardirqs_on_caller+0x139/0x190
> [17854.688347]  [<c014328b>] ? trace_hardirqs_on+0xb/0x10
> [17854.688347]  [<c01455bc>] lock_acquire+0x5c/0x80
> [17854.688347]  [<c0136fcc>] ? retrigger_next_event+0x5c/0xa0
> [17854.688347]  [<c03092e5>] _spin_lock+0x35/0x70
> [17854.688347]  [<c0136fcc>] ? retrigger_next_event+0x5c/0xa0
> [17854.688347]  [<c0136fcc>] retrigger_next_event+0x5c/0xa0
> [17854.688347]  [<c013711a>] hres_timers_resume+0xa/0x10
> [17854.688347]  [<c013aa8e>] timekeeping_resume+0xee/0x150
> [17854.688347]  [<c0273384>] __sysdev_resume+0x14/0x50
> [17854.688347]  [<c0273407>] sysdev_resume+0x47/0x80
> [17854.688347]  [<c02791ab>] device_power_up+0xb/0x20
> [17854.688347]  [<c015043f>] suspend_devices_and_enter+0xcf/0x150
> [17854.688347]  [<c0150c2f>] ? freeze_processes+0x3f/0x90
> [17854.688347]  [<c0150614>] enter_state+0xf4/0x140
> [17854.688347]  [<c01506dd>] state_store+0x7d/0xc0
> [17854.688347]  [<c0150660>] ? state_store+0x0/0xc0
> [17854.688347]  [<c0202da4>] kobj_attr_store+0x24/0x30
> [17854.688347]  [<c01dd53c>] sysfs_write_file+0x9c/0x100
> [17854.688347]  [<c019916c>] vfs_write+0x9c/0x160
> [17854.688347]  [<c0103494>] ? restore_nocheck_notrace+0x0/0xe
> [17854.688347]  [<c01dd4a0>] ? sysfs_write_file+0x0/0x100
> [17854.688347]  [<c01992ed>] sys_write+0x3d/0x70
> [17854.688347]  [<c0103371>] sysenter_do_call+0x12/0x31

Andrey's analysis:

> timekeeping_resume() is called via class ->resume
> method; and according to comments in sysdev_resume() and
> device_power_up(), they are called with interrupts disabled.
>
> Looking at suspend_enter, irqs *are* disabled at this point.
>
> So it actually looks like something (may be some driver)
> unconditionally enabled irqs in resume path.

Add a debug check to test this theory. If it triggers then it
triggers because the resume code calls it with irqs enabled,
which is a no-no not just for timekeeping_resume(), but also
bad for a number of other resume handlers.

Reported-by: Andrey Borzenkov <arvidjaar@mail.ru>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1455b7651b6b..cb83c6d4c07c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -614,7 +614,9 @@ void clock_was_set(void)
  */
 void hres_timers_resume(void)
 {
-	/* Retrigger the CPU local events: */
+	WARN_ONCE(!irqs_disabled(),
+		  KERN_INFO "hres_timers_resume() called with IRQs enabled!");
+
 	retrigger_next_event(NULL);
 }
 
-- 
cgit v1.2.3-71-gd317


From 31ad9081200c06ccc350625d41d1f8b2d1cef29f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 16 Jan 2009 15:31:15 -0800
Subject: work_on_cpu: don't try to get_online_cpus() in work_on_cpu.

Impact: remove potential circular lock dependency with cpu hotplug lock

This has caused more problems than it solved, with a pile of cpu
hotplug locking issues.

Followup patches will get_online_cpus() in callers that need it, but
if they don't do it they're no worse than before when they were using
set_cpus_allowed without locking.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/workqueue.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f445833ae37..a35afdbc0161 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -991,8 +991,8 @@ static void do_work_for_cpu(struct work_struct *w)
  * @fn: the function to run
  * @arg: the function arg
  *
- * This will return -EINVAL in the cpu is not online, or the return value
- * of @fn otherwise.
+ * This will return the value @fn returns.
+ * It is up to the caller to ensure that the cpu doesn't go offline.
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
@@ -1001,14 +1001,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 	INIT_WORK(&wfc.work, do_work_for_cpu);
 	wfc.fn = fn;
 	wfc.arg = arg;
-	get_online_cpus();
-	if (unlikely(!cpu_online(cpu)))
-		wfc.ret = -EINVAL;
-	else {
-		schedule_work_on(cpu, &wfc.work);
-		flush_work(&wfc.work);
-	}
-	put_online_cpus();
+	schedule_work_on(cpu, &wfc.work);
+	flush_work(&wfc.work);
 
 	return wfc.ret;
 }
-- 
cgit v1.2.3-71-gd317


From 8ccad40df8d314f786fdb06bdbedd4f43f3257cd Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 16 Jan 2009 15:31:15 -0800
Subject: work_on_cpu: Use our own workqueue.

Impact: remove potential clashes with generic kevent workqueue

Annoyingly, some places we want to use work_on_cpu are already in
workqueues.  As per Ingo's suggestion, we create a different workqueue
for work_on_cpu.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/workqueue.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a35afdbc0161..1f0c509b40d3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -971,6 +971,8 @@ undo:
 }
 
 #ifdef CONFIG_SMP
+static struct workqueue_struct *work_on_cpu_wq __read_mostly;
+
 struct work_for_cpu {
 	struct work_struct work;
 	long (*fn)(void *);
@@ -1001,7 +1003,7 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 	INIT_WORK(&wfc.work, do_work_for_cpu);
 	wfc.fn = fn;
 	wfc.arg = arg;
-	schedule_work_on(cpu, &wfc.work);
+	queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
 	flush_work(&wfc.work);
 
 	return wfc.ret;
@@ -1019,4 +1021,8 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
+#ifdef CONFIG_SMP
+	work_on_cpu_wq = create_workqueue("work_on_cpu");
+	BUG_ON(!work_on_cpu_wq);
+#endif
 }
-- 
cgit v1.2.3-71-gd317


From cdf57cab27aef72f13a19c86858c6cac9951dc24 Mon Sep 17 00:00:00 2001
From: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Date: Wed, 21 Jan 2009 18:47:38 +0900
Subject: dma-coherent: per-device coherent area is in pages, not bytes.

Commit 58c6d3dfe436eb8cfb451981d8fdc9044eaf42da ("dma-coherent: catch
oversized requests to dma_alloc_from_coherent()") attempted to add a
sanity check to bail out on allocations larger than the coherent area.

Unfortunately when this was implemented, the fact the coherent area
is tracked in pages rather than bytes was overlooked, which subsequently
broke every single dma_alloc_from_coherent() user, forcing the allocation
silently through generic memory instead.

Signed-off-by: Adrian McMenamin <adrian@mcmen.demon.co.uk >
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 kernel/dma-coherent.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 038707404b76..38fa292c6aa9 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -118,8 +118,8 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 	mem = dev->dma_mem;
 	if (!mem)
 		return 0;
-	if (unlikely(size > mem->size))
- 		return 0;
+	if (unlikely(size > (mem->size << PAGE_SHIFT)))
+		return 0;
 
 	pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
 	if (pageno >= 0) {
-- 
cgit v1.2.3-71-gd317


From 0609697eab9775564845d4c94f9e3780fb791ffd Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 21 Jan 2009 18:51:53 +0900
Subject: dma-coherent: Restore dma_alloc_from_coherent() large alloc fall back
 policy.

When doing large allocations (larger than the per-device coherent area)
the generic memory allocators are silently fallen back on regardless of
consideration for the per-device constraints.

In the DMA_MEMORY_EXCLUSIVE case falling back on generic memory is not
an option, as it tends not to be addressable by the DMA hardware in
question. This issue showed up with the 8139too breakage on the
Dreamcast, where non-addressable buffers were silently allocated due to
the size mismatch calculation -- while it should have simply errored out
upon being unable to satisfy the allocation with the given device
constraints.

This restores fall back behaviour to what it was before the oversized
request change caused multiple regressions.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 kernel/dma-coherent.c | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 38fa292c6aa9..962a3b574f21 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
  * @size:	size of requested memory area
  * @dma_handle:	This will be filled with the correct dma handle
  * @ret:	This pointer will be filled with the virtual address
- * 		to allocated area.
+ *		to allocated area.
  *
  * This function should be only called from per-arch dma_alloc_coherent()
  * to support allocation from per-device coherent memory pools.
@@ -118,31 +118,32 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 	mem = dev->dma_mem;
 	if (!mem)
 		return 0;
+
+	*ret = NULL;
+
 	if (unlikely(size > (mem->size << PAGE_SHIFT)))
-		return 0;
+		goto err;
 
 	pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
-	if (pageno >= 0) {
-		/*
-		 * Memory was found in the per-device arena.
-		 */
-		*dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
-		*ret = mem->virt_base + (pageno << PAGE_SHIFT);
-		memset(*ret, 0, size);
-	} else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
-		/*
-		 * The per-device arena is exhausted and we are not
-		 * permitted to fall back to generic memory.
-		 */
-		*ret = NULL;
-	} else {
-		/*
-		 * The per-device arena is exhausted and we are
-		 * permitted to fall back to generic memory.
-		 */
-		 return 0;
-	}
+	if (unlikely(pageno < 0))
+		goto err;
+
+	/*
+	 * Memory was found in the per-device area.
+	 */
+	*dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
+	*ret = mem->virt_base + (pageno << PAGE_SHIFT);
+	memset(*ret, 0, size);
+
 	return 1;
+
+err:
+	/*
+	 * In the case where the allocation can not be satisfied from the
+	 * per-device area, try to fall back to generic memory if the
+	 * constraints allow it.
+	 */
+	return mem->flags & DMA_MEMORY_EXCLUSIVE;
 }
 EXPORT_SYMBOL(dma_alloc_from_coherent);
 
-- 
cgit v1.2.3-71-gd317


From abfe2d7b915c872f3a1fd203267cedebf90daa45 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 19 Jan 2009 20:54:54 +0100
Subject: Hibernation: Introduce system_entering_hibernation

Introduce boolean function system_entering_hibernation() returning
'true' during the last phase of hibernation, in which devices are
being put into low power states and the sleep state (for example,
ACPI S4) is finally entered.

Some device drivers need such a function to check if the system is
in the final phase of hibernation.  In particular, some SATA drivers
are going to use it for blacklisting systems in which the disks
should not be spun down during the last phase of hibernation (the
BIOS will do that anyway).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 include/linux/suspend.h |  2 ++
 kernel/power/disk.c     | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 2b409c44db83..c7d9bb1832ba 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -237,6 +237,7 @@ extern int hibernate_nvs_alloc(void);
 extern void hibernate_nvs_free(void);
 extern void hibernate_nvs_save(void);
 extern void hibernate_nvs_restore(void);
+extern bool system_entering_hibernation(void);
 #else /* CONFIG_HIBERNATION */
 static inline int swsusp_page_is_forbidden(struct page *p) { return 0; }
 static inline void swsusp_set_page_free(struct page *p) {}
@@ -252,6 +253,7 @@ static inline int hibernate_nvs_alloc(void) { return 0; }
 static inline void hibernate_nvs_free(void) {}
 static inline void hibernate_nvs_save(void) {}
 static inline void hibernate_nvs_restore(void) {}
+static inline bool system_entering_hibernation(void) { return false; }
 #endif /* CONFIG_HIBERNATION */
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 45e8541ab7e3..432ee575c9ee 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -71,6 +71,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
 	mutex_unlock(&pm_mutex);
 }
 
+static bool entering_platform_hibernation;
+
+bool system_entering_hibernation(void)
+{
+	return entering_platform_hibernation;
+}
+EXPORT_SYMBOL(system_entering_hibernation);
+
 #ifdef CONFIG_PM_DEBUG
 static void hibernation_debug_sleep(void)
 {
@@ -411,6 +419,7 @@ int hibernation_platform_enter(void)
 	if (error)
 		goto Close;
 
+	entering_platform_hibernation = true;
 	suspend_console();
 	error = device_suspend(PMSG_HIBERNATE);
 	if (error) {
@@ -445,6 +454,7 @@ int hibernation_platform_enter(void)
  Finish:
 	hibernation_ops->finish();
  Resume_devices:
+	entering_platform_hibernation = false;
 	device_resume(PMSG_RESTORE);
 	resume_console();
  Close:
-- 
cgit v1.2.3-71-gd317


From 1267a8df209c7453d65acbdd56e3588954bf890b Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Tue, 27 Jan 2009 09:53:21 -0800
Subject: Make irq_*_affinity depend on CONFIG_GENERIC_HARDIRQS too.

In interrupt.h these functions are declared only if
CONFIG_GENERIC_HARDIRQS is set.  We should define them under identical
conditions.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..618a64f1915a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -15,7 +15,7 @@
 
 #include "internals.h"
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 cpumask_var_t irq_default_affinity;
 
 static int init_irq_default_affinity(void)
-- 
cgit v1.2.3-71-gd317


From 97179fd46da7ddedd18e95388130ed3e06c5a0c7 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Tue, 27 Jan 2009 09:53:22 -0800
Subject: cpumask fallout: Initialize irq_default_affinity earlier

Move the initialization of irq_default_affinity to early_irq_init as
core_initcall is too late.

irq_default_affinity can be used in init_IRQ and potentially timer and
SMP init as well.  All of these happen before core_initcall.  Moving
the initialization to early_irq_init ensures that it is initialized
before it is used.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Acked-by: Mike Travis <travis@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/handle.c | 16 ++++++++++++++++
 kernel/irq/manage.c |  8 --------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be9173..3aba8d12f328 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -39,6 +39,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 	ack_bad_irq(irq);
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+static void __init init_irq_default_affinity(void)
+{
+	alloc_bootmem_cpumask_var(&irq_default_affinity);
+	cpumask_setall(irq_default_affinity);
+}
+#else
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
+
 /*
  * Linux has a controller-independent interrupt architecture.
  * Every controller has a 'controller-template', that is used
@@ -134,6 +146,8 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
+	init_irq_default_affinity();
+
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
@@ -219,6 +233,8 @@ int __init early_irq_init(void)
 	int count;
 	int i;
 
+	init_irq_default_affinity();
+
 	desc = irq_desc;
 	count = ARRAY_SIZE(irq_desc);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 618a64f1915a..291f03664552 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,14 +18,6 @@
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 cpumask_var_t irq_default_affinity;
 
-static int init_irq_default_affinity(void)
-{
-	alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
-	cpumask_setall(irq_default_affinity);
-	return 0;
-}
-core_initcall(init_irq_default_affinity);
-
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *	@irq: interrupt number to wait for
-- 
cgit v1.2.3-71-gd317


From baef99a08a2e23d9386b47e53fa5f0d44fc98f66 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 29 Jan 2009 14:25:10 -0800
Subject: cgroups: use hierarchy mutex in creation failure path

Now, cgrp->sibling is handled under hierarchy mutex.
error route should do so, too.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c29831076e7a..2ae7cb47dbfa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2434,7 +2434,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
  err_remove:
 
+	cgroup_lock_hierarchy(root);
 	list_del(&cgrp->sibling);
+	cgroup_unlock_hierarchy(root);
 	root->number_of_cgroups--;
 
  err_destroy:
-- 
cgit v1.2.3-71-gd317


From 1404f06565ee89e0ce04d4a5859c00b0e3a0dc8d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 29 Jan 2009 14:25:21 -0800
Subject: cgroups: fix lock inconsistency in cgroup_clone()

I fixed a bug in cgroup_clone() in Linus' tree in commit 7b574b7
("cgroups: fix a race between cgroup_clone and umount") without noticing
there was a cleanup patch in -mm tree that should be rebased (now commit
104cbd5, "cgroups: use task_lock() for access tsk->cgroups safe in
cgroup_clone()"), thus resulted in lock inconsistency.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2ae7cb47dbfa..0066092de19a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2993,20 +2993,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 		mutex_unlock(&cgroup_mutex);
 		return 0;
 	}
-	task_lock(tsk);
-	cg = tsk->cgroups;
-	parent = task_cgroup(tsk, subsys->subsys_id);
 
 	/* Pin the hierarchy */
-	if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
+	if (!atomic_inc_not_zero(&root->sb->s_active)) {
 		/* We race with the final deactivate_super() */
 		mutex_unlock(&cgroup_mutex);
 		return 0;
 	}
 
 	/* Keep the cgroup alive */
+	task_lock(tsk);
+	parent = task_cgroup(tsk, subsys->subsys_id);
+	cg = tsk->cgroups;
 	get_css_set(cg);
 	task_unlock(tsk);
+
 	mutex_unlock(&cgroup_mutex);
 
 	/* Now do the VFS work to create a cgroup */
@@ -3045,7 +3046,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 		mutex_unlock(&inode->i_mutex);
 		put_css_set(cg);
 
-		deactivate_super(parent->root->sb);
+		deactivate_super(root->sb);
 		/* The cgroup is still accessible in the VFS, but
 		 * we're not going to try to rmdir() it at this
 		 * point. */
@@ -3071,7 +3072,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 	mutex_lock(&cgroup_mutex);
 	put_css_set(cg);
 	mutex_unlock(&cgroup_mutex);
-	deactivate_super(parent->root->sb);
+	deactivate_super(root->sb);
 	return ret;
 }
 
-- 
cgit v1.2.3-71-gd317


From 804b3c28a4e4fa1c224571bf76edb534b9c4b1ed Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Thu, 29 Jan 2009 14:25:21 -0800
Subject: cgroups: add cpu_relax() calls in css_tryget() and
 cgroup_clear_css_refs()

css_tryget() and cgroup_clear_css_refs() contain polling loops; these
loops should have cpu_relax calls in them to reduce cross-cache traffic.

Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 1 +
 kernel/cgroup.c        | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e267e62827bb..e4e8e117d27d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -99,6 +99,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
 	while (!atomic_inc_not_zero(&css->refcnt)) {
 		if (test_bit(CSS_REMOVED, &css->flags))
 			return false;
+		cpu_relax();
 	}
 	return true;
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0066092de19a..492215d67fa5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2509,7 +2509,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 		int refcnt;
-		do {
+		while (1) {
 			/* We can only remove a CSS with a refcnt==1 */
 			refcnt = atomic_read(&css->refcnt);
 			if (refcnt > 1) {
@@ -2523,7 +2523,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
 			 * css_tryget() to spin until we set the
 			 * CSS_REMOVED bits or abort
 			 */
-		} while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
+				break;
+			cpu_relax();
+		}
 	}
  done:
 	for_each_subsys(cgrp->root, ss) {
-- 
cgit v1.2.3-71-gd317


From 839ec5452ebfd5905b9c69b20ceb640903a8ea1a Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Thu, 29 Jan 2009 14:25:22 -0800
Subject: cgroup: fix root_count when mount fails due to busy subsystem

root_count was being incremented in cgroup_get_sb() after all error
checking was complete, but decremented in cgroup_kill_sb(), which can be
called on a superblock that we gave up on due to an error.  This patch
changes cgroup_kill_sb() to only decrement root_count if the root was
previously linked into the list of roots.

Signed-off-by: Paul Menage <menage@google.com>
Tested-by: Serge Hallyn <serue@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 492215d67fa5..5a54ff42874e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1115,8 +1115,10 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	}
 	write_unlock(&css_set_lock);
 
-	list_del(&root->root_list);
-	root_count--;
+	if (!list_empty(&root->root_list)) {
+		list_del(&root->root_list);
+		root_count--;
+	}
 
 	mutex_unlock(&cgroup_mutex);
 
-- 
cgit v1.2.3-71-gd317