From 6ce51c431019310ca03371355a4366c4649fa349 Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Wed, 1 Apr 2009 18:06:35 +0100
Subject: genirq: do not execute DEBUG_SHIRQ when irq setup failed

When requesting an IRQ, the DEBUG_SHIRQ code executes a fake IRQ just to make
sure the driver is ready to receive an IRQ immediately.  The problem was that
this fake IRQ was being executed even if interrupt line failed to be allocated
by __setup_irq.

Signed-off-by: Luis Henriques <henrix@sapo.pt>
LKML-Reference: <20090401170635.GA4392@hades.domain.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[ fixed bug pointed out by a warning reported by Stephen Rothwell ]
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1516ab77355c..8c68d5b95d48 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -768,7 +768,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 		kfree(action);
 
 #ifdef CONFIG_DEBUG_SHIRQ
-	if (irqflags & IRQF_SHARED) {
+	if (!retval && (irqflags & IRQF_SHARED)) {
 		/*
 		 * It's a shared IRQ -- the driver ought to be prepared for it
 		 * to happen immediately, so let's make sure....
-- 
cgit v1.2.3-71-gd317


From 62a038d34db26771756cf3689e36de638bedd2c4 Mon Sep 17 00:00:00 2001
From: "K.Prasad" <prasad@linux.vnet.ibm.com>
Date: Mon, 1 Jun 2009 23:43:33 +0530
Subject: hw-breakpoints: introducing generic hardware breakpoint handler
 interfaces

This patch introduces the generic Hardware Breakpoint interfaces for both user
and kernel space requests.
This core Api handles the hardware breakpoints through new helpers. It
handles the user-space breakpoints and kernel breakpoints in front of
arch implementation.

One can choose kernel wide breakpoints using the following helpers
and passing them a generic struct hw_breakpoint:

- register_kernel_hw_breakpoint()
- unregister_kernel_hw_breakpoint()
- modify_kernel_hw_breakpoint()

On the other side, you can choose per task breakpoints.

- register_user_hw_breakpoint()
- unregister_user_hw_breakpoint()
- modify_user_hw_breakpoint()

[ fweisbec@gmail.com: fix conflict against perfcounter ]

Original-patch-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/Kconfig           |   4 +
 kernel/Makefile        |   1 +
 kernel/hw_breakpoint.c | 378 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 383 insertions(+)
 create mode 100644 kernel/hw_breakpoint.c

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 78a35e9dc104..1adf2d0e6356 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -112,3 +112,7 @@ config HAVE_DMA_API_DEBUG
 
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
+
+config HAVE_HW_BREAKPOINT
+	bool
+
diff --git a/kernel/Makefile b/kernel/Makefile
index a35eee3436de..18ad1110b226 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..c1f64e65a9f3
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,378 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_X86
+#include <asm/debugreg.h>
+#endif
+/*
+ * Spinlock that protects all (un)register operations over kernel/user-space
+ * breakpoint requests
+ */
+static DEFINE_SPINLOCK(hw_breakpoint_lock);
+
+/* Array of kernel-space breakpoint structures */
+struct hw_breakpoint *hbp_kernel[HBP_NUM];
+
+/*
+ * Per-processor copy of hbp_kernel[]. Used only when hbp_kernel is being
+ * modified but we need the older copy to handle any hbp exceptions. It will
+ * sync with hbp_kernel[] value after updation is done through IPIs.
+ */
+DEFINE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]);
+
+/*
+ * Kernel breakpoints grow downwards, starting from HBP_NUM
+ * 'hbp_kernel_pos' denotes lowest numbered breakpoint register occupied for
+ * kernel-space request. We will initialise it here and not in an __init
+ * routine because load_debug_registers(), which uses this variable can be
+ * called very early during CPU initialisation.
+ */
+unsigned int hbp_kernel_pos = HBP_NUM;
+
+/*
+ * An array containing refcount of threads using a given bkpt register
+ * Accesses are synchronised by acquiring hw_breakpoint_lock
+ */
+unsigned int hbp_user_refcount[HBP_NUM];
+
+/*
+ * Load the debug registers during startup of a CPU.
+ */
+void load_debug_registers(void)
+{
+	unsigned long flags;
+	struct task_struct *tsk = current;
+
+	spin_lock_bh(&hw_breakpoint_lock);
+
+	/* Prevent IPIs for new kernel breakpoint updates */
+	local_irq_save(flags);
+	arch_update_kernel_hw_breakpoint(NULL);
+	local_irq_restore(flags);
+
+	if (test_tsk_thread_flag(tsk, TIF_DEBUG))
+		arch_install_thread_hw_breakpoint(tsk);
+
+	spin_unlock_bh(&hw_breakpoint_lock);
+}
+
+/*
+ * Erase all the hardware breakpoint info associated with a thread.
+ *
+ * If tsk != current then tsk must not be usable (for example, a
+ * child being cleaned up from a failed fork).
+ */
+void flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	spin_lock_bh(&hw_breakpoint_lock);
+
+	/* The thread no longer has any breakpoints associated with it */
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	for (i = 0; i < HBP_NUM; i++) {
+		if (thread->hbp[i]) {
+			hbp_user_refcount[i]--;
+			kfree(thread->hbp[i]);
+			thread->hbp[i] = NULL;
+		}
+	}
+
+	arch_flush_thread_hw_breakpoint(tsk);
+
+	/* Actually uninstall the breakpoints if necessary */
+	if (tsk == current)
+		arch_uninstall_thread_hw_breakpoint();
+	spin_unlock_bh(&hw_breakpoint_lock);
+}
+
+/*
+ * Copy the hardware breakpoint info from a thread to its cloned child.
+ */
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags)
+{
+	/*
+	 * We will assume that breakpoint settings are not inherited
+	 * and the child starts out with no debug registers set.
+	 * But what about CLONE_PTRACE?
+	 */
+	clear_tsk_thread_flag(child, TIF_DEBUG);
+
+	/* We will call flush routine since the debugregs are not inherited */
+	arch_flush_thread_hw_breakpoint(child);
+
+	return 0;
+}
+
+static int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc;
+
+	/* Do not overcommit. Fail if kernel has used the hbp registers */
+	if (pos >= hbp_kernel_pos)
+		return -ENOSPC;
+
+	rc = arch_validate_hwbkpt_settings(bp, tsk);
+	if (rc)
+		return rc;
+
+	thread->hbp[pos] = bp;
+	hbp_user_refcount[pos]++;
+
+	arch_update_user_hw_breakpoint(pos, tsk);
+	/*
+	 * Does it need to be installed right now?
+	 * Otherwise it will get installed the next time tsk runs
+	 */
+	if (tsk == current)
+		arch_install_thread_hw_breakpoint(tsk);
+
+	return rc;
+}
+
+/*
+ * Modify the address of a hbp register already in use by the task
+ * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint()
+ */
+static int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if ((pos >= hbp_kernel_pos) || (arch_validate_hwbkpt_settings(bp, tsk)))
+		return -EINVAL;
+
+	if (thread->hbp[pos] == NULL)
+		return -EINVAL;
+
+	thread->hbp[pos] = bp;
+	/*
+	 * 'pos' must be that of a hbp register already used by 'tsk'
+	 * Otherwise arch_modify_user_hw_breakpoint() will fail
+	 */
+	arch_update_user_hw_breakpoint(pos, tsk);
+
+	if (tsk == current)
+		arch_install_thread_hw_breakpoint(tsk);
+
+	return 0;
+}
+
+static void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk)
+{
+	hbp_user_refcount[pos]--;
+	tsk->thread.hbp[pos] = NULL;
+
+	arch_update_user_hw_breakpoint(pos, tsk);
+
+	if (tsk == current)
+		arch_install_thread_hw_breakpoint(tsk);
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ * @bp: the breakpoint structure to register
+ *
+ * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
+ * @bp->triggered must be set properly before invocation
+ *
+ */
+int register_user_hw_breakpoint(struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i, rc = -ENOSPC;
+
+	spin_lock_bh(&hw_breakpoint_lock);
+
+	for (i = 0; i < hbp_kernel_pos; i++) {
+		if (!thread->hbp[i]) {
+			rc = __register_user_hw_breakpoint(i, tsk, bp);
+			break;
+		}
+	}
+	if (!rc)
+		set_tsk_thread_flag(tsk, TIF_DEBUG);
+
+	spin_unlock_bh(&hw_breakpoint_lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ * @bp: the breakpoint structure to unregister
+ *
+ */
+int modify_user_hw_breakpoint(struct task_struct *tsk, struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i, ret = -ENOENT;
+
+	spin_lock_bh(&hw_breakpoint_lock);
+	for (i = 0; i < hbp_kernel_pos; i++) {
+		if (bp == thread->hbp[i]) {
+			ret = __modify_user_hw_breakpoint(i, tsk, bp);
+			break;
+		}
+	}
+	spin_unlock_bh(&hw_breakpoint_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+
+/**
+ * unregister_user_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ * @bp: the breakpoint structure to unregister
+ *
+ */
+void unregister_user_hw_breakpoint(struct task_struct *tsk,
+						struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i, pos = -1, hbp_counter = 0;
+
+	spin_lock_bh(&hw_breakpoint_lock);
+	for (i = 0; i < hbp_kernel_pos; i++) {
+		if (thread->hbp[i])
+			hbp_counter++;
+		if (bp == thread->hbp[i])
+			pos = i;
+	}
+	if (pos >= 0) {
+		__unregister_user_hw_breakpoint(pos, tsk);
+		hbp_counter--;
+	}
+	if (!hbp_counter)
+		clear_tsk_thread_flag(tsk, TIF_DEBUG);
+
+	spin_unlock_bh(&hw_breakpoint_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_user_hw_breakpoint);
+
+/**
+ * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to register
+ *
+ * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
+ * @bp->triggered must be set properly before invocation
+ *
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int rc;
+
+	rc = arch_validate_hwbkpt_settings(bp, NULL);
+	if (rc)
+		return rc;
+
+	spin_lock_bh(&hw_breakpoint_lock);
+
+	rc = -ENOSPC;
+	/* Check if we are over-committing */
+	if ((hbp_kernel_pos > 0) && (!hbp_user_refcount[hbp_kernel_pos-1])) {
+		hbp_kernel_pos--;
+		hbp_kernel[hbp_kernel_pos] = bp;
+		on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1);
+		rc = 0;
+	}
+
+	spin_unlock_bh(&hw_breakpoint_lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
+
+/**
+ * unregister_kernel_hw_breakpoint - unregister a HW breakpoint for kernel space
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int i, j;
+
+	spin_lock_bh(&hw_breakpoint_lock);
+
+	/* Find the 'bp' in our list of breakpoints for kernel */
+	for (i = hbp_kernel_pos; i < HBP_NUM; i++)
+		if (bp == hbp_kernel[i])
+			break;
+
+	/* Check if we did not find a match for 'bp'. If so return early */
+	if (i == HBP_NUM) {
+		spin_unlock_bh(&hw_breakpoint_lock);
+		return;
+	}
+
+	/*
+	 * We'll shift the breakpoints one-level above to compact if
+	 * unregistration creates a hole
+	 */
+	for (j = i; j > hbp_kernel_pos; j--)
+		hbp_kernel[j] = hbp_kernel[j-1];
+
+	hbp_kernel[hbp_kernel_pos] = NULL;
+	on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1);
+	hbp_kernel_pos++;
+
+	spin_unlock_bh(&hw_breakpoint_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	/* we need to be notified first */
+	.priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+
+core_initcall(init_hw_breakpoint);
-- 
cgit v1.2.3-71-gd317


From 0722db015c246204044299eae3b02d18d3ca4faf Mon Sep 17 00:00:00 2001
From: "K.Prasad" <prasad@linux.vnet.ibm.com>
Date: Mon, 1 Jun 2009 23:46:40 +0530
Subject: hw-breakpoints: ftrace plugin for kernel symbol tracing using HW
 Breakpoint interfaces

This patch adds an ftrace plugin to detect and profile memory access over kernel
variables. It uses HW Breakpoint interfaces to 'watch memory addresses.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/Kconfig          |  21 ++
 kernel/trace/Makefile         |   1 +
 kernel/trace/trace.h          |  23 ++
 kernel/trace/trace_ksym.c     | 525 ++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_selftest.c |  53 +++++
 5 files changed, 623 insertions(+)
 create mode 100644 kernel/trace/trace_ksym.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a508b9d2adb8..d7f01e6e8ba5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -314,6 +314,27 @@ config POWER_TRACER
 	  power management decisions, specifically the C-state and P-state
 	  behavior.
 
+config KSYM_TRACER
+	bool "Trace read and write access on kernel memory locations"
+	depends on HAVE_HW_BREAKPOINT
+	select TRACING
+	help
+	  This tracer helps find read and write operations on any given kernel
+	  symbol i.e. /proc/kallsyms.
+
+config PROFILE_KSYM_TRACER
+	bool "Profile all kernel memory accesses on 'watched' variables"
+	depends on KSYM_TRACER
+	help
+	  This tracer profiles kernel accesses on variables watched through the
+	  ksym tracer ftrace plugin. Depending upon the hardware, all read
+	  and write operations on kernel variables can be monitored for
+	  accesses.
+
+	  The results will be displayed in:
+	  /debugfs/tracing/profile_ksym
+
+	  Say N if unsure.
 
 config STACK_TRACER
 	bool "Trace max stack"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 06b85850fab4..658aace8c41e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,5 +51,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f8..7d5cc37b8fca 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -15,6 +15,10 @@
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
 
+#ifdef CONFIG_KSYM_TRACER
+#include <asm/hw_breakpoint.h>
+#endif
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -40,6 +44,7 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
+	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
@@ -207,6 +212,21 @@ struct syscall_trace_exit {
 	unsigned long		ret;
 };
 
+#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
+extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
+
+struct trace_ksym {
+	struct trace_entry	ent;
+	struct hw_breakpoint	*ksym_hbp;
+	unsigned long		ksym_addr;
+	unsigned long		ip;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	unsigned long		counter;
+#endif
+	struct hlist_node	ksym_hlist;
+	char			ksym_name[KSYM_NAME_LEN];
+	char			p_name[TASK_COMM_LEN];
+};
 
 /*
  * trace_flag_type is an enumeration that holds different
@@ -323,6 +343,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
+		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM);	\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -540,6 +561,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
 extern int trace_selftest_startup_hw_branches(struct tracer *trace,
 					      struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..11c74f6404cc
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,525 @@
+/*
+ * trace_ksym.c - Kernel Symbol Tracer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+#include "trace_stat.h"
+#include "trace.h"
+
+/* For now, let us restrict the no. of symbols traced simultaneously to number
+ * of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HBP_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
+
+static struct trace_array *ksym_trace_array;
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+
+#define MAX_UL_INT 0xffffffff
+
+static DEFINE_MUTEX(ksym_tracer_mutex);
+
+void ksym_collect_stats(unsigned long hbp_hit_addr)
+{
+	struct hlist_node *node;
+	struct trace_ksym *entry;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
+		if ((entry->ksym_addr == hbp_hit_addr) &&
+		    (entry->counter <= MAX_UL_INT)) {
+			entry->counter++;
+			break;
+		}
+	}
+	rcu_read_unlock();
+}
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
+{
+	struct ring_buffer_event *event;
+	struct trace_array *tr;
+	struct trace_ksym *entry;
+	int pc;
+
+	if (!ksym_tracing_enabled)
+		return;
+
+	tr = ksym_trace_array;
+	pc = preempt_count();
+
+	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
+							sizeof(*entry), 0, pc);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	strlcpy(entry->ksym_name, hbp->info.name, KSYM_SYMBOL_LEN);
+	entry->ksym_hbp = hbp;
+	entry->ip = instruction_pointer(regs);
+	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	ksym_collect_stats(hbp->info.address);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+	trace_buffer_unlock_commit(tr, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Break points (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *access_str)
+{
+	int pos, access = 0;
+
+	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
+		switch (access_str[pos]) {
+		case 'r':
+			access += (pos == 0) ? 4 : -1;
+			break;
+		case 'w':
+			access += (pos == 1) ? 2 : -1;
+			break;
+		case '-':
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (access) {
+	case 6:
+		access = HW_BREAKPOINT_RW;
+		break;
+	case 2:
+		access = HW_BREAKPOINT_WRITE;
+		break;
+	case 0:
+		access = 0;
+	}
+
+	return access;
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
+ *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
+ *    <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+							unsigned long *addr)
+{
+	char *delimiter = ":";
+	int ret;
+
+	ret = -EINVAL;
+	*ksymname = strsep(&input_string, delimiter);
+	*addr = kallsyms_lookup_name(*ksymname);
+
+	/* Check for malformed request: (2), (1) and (5) */
+	if ((!input_string) ||
+		(strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
+			(*addr == 0))
+		goto return_code;
+	ret = ksym_trace_get_access_type(input_string);
+
+return_code:
+	return ret;
+}
+
+int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+	struct trace_ksym *entry;
+	int ret;
+
+	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+		" new requests for tracing can be accepted now.\n",
+			KSYM_TRACER_MAX);
+		return -ENOSPC;
+	}
+
+	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->ksym_hbp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
+	if (!entry->ksym_hbp) {
+		kfree(entry);
+		return -ENOMEM;
+	}
+
+	entry->ksym_hbp->info.name = ksymname;
+	entry->ksym_hbp->info.type = op;
+	entry->ksym_addr = entry->ksym_hbp->info.address = addr;
+#ifdef CONFIG_X86
+	entry->ksym_hbp->info.len = HW_BREAKPOINT_LEN_4;
+#endif
+	entry->ksym_hbp->triggered = (void *)ksym_hbp_handler;
+
+	ret = register_kernel_hw_breakpoint(entry->ksym_hbp);
+	if (ret < 0) {
+		printk(KERN_INFO "ksym_tracer request failed. Try again"
+					" later!!\n");
+		kfree(entry->ksym_hbp);
+		kfree(entry);
+		return -EAGAIN;
+	}
+	hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
+	ksym_filter_entry_count++;
+
+	return 0;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
+	ssize_t ret, cnt = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
+				entry->ksym_hbp->info.name);
+		if (entry->ksym_hbp->info.type == HW_BREAKPOINT_WRITE)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"-w-\n");
+		else if (entry->ksym_hbp->info.type == HW_BREAKPOINT_RW)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"rw-\n");
+	}
+	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
+	mutex_unlock(&ksym_tracer_mutex);
+
+	return ret;
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+					const char __user *buffer,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char *input_string, *ksymname = NULL;
+	unsigned long ksym_addr = 0;
+	int ret, op, changed = 0;
+
+	/* Ignore echo "" > ksym_trace_filter */
+	if (count == 0)
+		return 0;
+
+	input_string = kzalloc(count, GFP_KERNEL);
+	if (!input_string)
+		return -ENOMEM;
+
+	if (copy_from_user(input_string, buffer, count)) {
+		kfree(input_string);
+		return -EFAULT;
+	}
+
+	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+	if (ret < 0) {
+		kfree(input_string);
+		return ret;
+	}
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	ret = -EINVAL;
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if (entry->ksym_addr == ksym_addr) {
+			/* Check for malformed request: (6) */
+			if (entry->ksym_hbp->info.type != op)
+				changed = 1;
+			else
+				goto err_ret;
+			break;
+		}
+	}
+	if (changed) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbp);
+		entry->ksym_hbp->info.type = op;
+		if (op > 0) {
+			ret = register_kernel_hw_breakpoint(entry->ksym_hbp);
+			if (ret == 0) {
+				ret = count;
+				goto unlock_ret_path;
+			}
+		}
+		ksym_filter_entry_count--;
+		hlist_del_rcu(&(entry->ksym_hlist));
+		synchronize_rcu();
+		kfree(entry->ksym_hbp);
+		kfree(entry);
+		ret = count;
+		goto err_ret;
+	} else {
+		/* Check for malformed request: (4) */
+		if (op == 0)
+			goto err_ret;
+		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+		if (ret)
+			goto err_ret;
+	}
+	ret = count;
+	goto unlock_ret_path;
+
+err_ret:
+	kfree(input_string);
+
+unlock_ret_path:
+	mutex_unlock(&ksym_tracer_mutex);
+	return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+	.open		= tracing_open_generic,
+	.read		= ksym_trace_filter_read,
+	.write		= ksym_trace_filter_write,
+};
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node, *node1;
+
+	ksym_tracing_enabled = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+								ksym_hlist) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbp);
+		ksym_filter_entry_count--;
+		hlist_del_rcu(&(entry->ksym_hlist));
+		synchronize_rcu();
+		/* Free the 'input_string' only if reset
+		 * after startup self-test
+		 */
+#ifdef CONFIG_FTRACE_SELFTEST
+		if (strncmp(entry->ksym_hbp->info.name, KSYM_SELFTEST_ENTRY,
+					strlen(KSYM_SELFTEST_ENTRY)) != 0)
+#endif /* CONFIG_FTRACE_SELFTEST*/
+			kfree(entry->ksym_hbp->info.name);
+		kfree(entry->ksym_hbp);
+		kfree(entry);
+	}
+	mutex_unlock(&ksym_tracer_mutex);
+}
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+	int cpu, ret = 0;
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+	ksym_tracing_enabled = 1;
+	ksym_trace_array = tr;
+
+	return ret;
+}
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+
+	seq_puts(m,
+		 "#       TASK-PID      CPU#      Symbol         Type    "
+		 "Function         \n");
+	seq_puts(m,
+		 "#          |           |          |              |         "
+		 "|            \n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct trace_ksym *field;
+	char str[KSYM_SYMBOL_LEN];
+	int ret;
+
+	if (entry->type != TRACE_KSYM)
+		return TRACE_TYPE_UNHANDLED;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
+				entry->pid, iter->cpu, field->ksym_name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	switch (field->ksym_hbp->info.type) {
+	case HW_BREAKPOINT_WRITE:
+		ret = trace_seq_printf(s, " W  ");
+		break;
+	case HW_BREAKPOINT_RW:
+		ret = trace_seq_printf(s, " RW ");
+		break;
+	default:
+		return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	sprint_symbol(str, field->ip);
+	ret = trace_seq_printf(s, "%-20s\n", str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+	.name		= "ksym_tracer",
+	.init		= ksym_trace_init,
+	.reset		= ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_ksym,
+#endif
+	.print_header   = ksym_trace_print_header,
+	.print_line	= ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	ksym_filter_entry_count = 0;
+
+	entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
+				    NULL, &ksym_tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ksym_trace_filter' file\n");
+
+	return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
+
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+static int ksym_tracer_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "   Access type    ");
+	seq_printf(m, "            Symbol                     Counter     \n");
+	return 0;
+}
+
+static int ksym_tracer_stat_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *stat = v;
+	struct trace_ksym *entry;
+	int access_type = 0;
+	char fn_name[KSYM_NAME_LEN];
+
+	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
+
+	if (entry->ksym_hbp)
+		access_type = entry->ksym_hbp->info.type;
+
+	switch (access_type) {
+	case HW_BREAKPOINT_WRITE:
+		seq_printf(m, "     W     ");
+		break;
+	case HW_BREAKPOINT_RW:
+		seq_printf(m, "     RW    ");
+		break;
+	default:
+		seq_printf(m, "     NA    ");
+	}
+
+	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
+		seq_printf(m, "               %s                 ", fn_name);
+	else
+		seq_printf(m, "               <NA>                ");
+
+	seq_printf(m, "%15lu\n", entry->counter);
+	return 0;
+}
+
+static void *ksym_tracer_stat_start(struct tracer_stat *trace)
+{
+	return &(ksym_filter_head.first);
+}
+
+static void *
+ksym_tracer_stat_next(void *v, int idx)
+{
+	struct hlist_node *stat = v;
+
+	return stat->next;
+}
+
+static struct tracer_stat ksym_tracer_stats = {
+	.name = "ksym_tracer",
+	.stat_start = ksym_tracer_stat_start,
+	.stat_next = ksym_tracer_stat_next,
+	.stat_headers = ksym_tracer_stat_headers,
+	.stat_show = ksym_tracer_stat_show
+};
+
+__init static int ksym_tracer_stat_init(void)
+{
+	int ret;
+
+	ret = register_stat_tracer(&ksym_tracer_stats);
+	if (ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "ksym tracer stats\n");
+		return 1;
+	}
+
+	return 0;
+}
+fs_initcall(ksym_tracer_stat_init);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..71f2edb0fd84 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
 	case TRACE_HW_BRANCHES:
+	case TRACE_KSYM:
 		return 1;
 	}
 	return 0;
@@ -807,3 +808,55 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
 	return ret;
 }
 #endif /* CONFIG_HW_BRANCH_TRACER */
+
+#ifdef CONFIG_KSYM_TRACER
+static int ksym_selftest_dummy;
+
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
+	ksym_selftest_dummy = 0;
+	/* Register the read-write tracing request */
+	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
+					(unsigned long)(&ksym_selftest_dummy));
+
+	if (ret < 0) {
+		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+		goto ret_path;
+	}
+	/* Perform a read and a write operation over the dummy variable to
+	 * trigger the tracer
+	 */
+	if (ksym_selftest_dummy == 0)
+		ksym_selftest_dummy++;
+
+	/* stop the tracing. */
+	tracing_stop();
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	/* read & write operations - one each is performed on the dummy variable
+	 * triggering two entries in the trace buffer
+	 */
+	if (!ret && count != 2) {
+		printk(KERN_CONT "Ksym tracer startup test failed");
+		ret = -1;
+	}
+
+ret_path:
+	return ret;
+}
+#endif /* CONFIG_KSYM_TRACER */
+
-- 
cgit v1.2.3-71-gd317


From 73874005cd8800440be4299bd095387fff4b90ac Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 3 Jun 2009 01:43:38 +0200
Subject: hw-breakpoints: fix undeclared ksym_tracer_mutex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ksym_tracer_mutex is declared inside an #ifdef CONFIG_PROFILE_KSYM_TRACER
section. This makes it unavailable for the hardware breakpoint tracer if it is
configured without the breakpoint profiler.

This patch fixes the following build error:

kernel/trace/trace_ksym.c: In function ‘ksym_trace_filter_read’:
kernel/trace/trace_ksym.c:226: erreur: ‘ksym_tracer_mutex’ undeclared (first use in this function)
kernel/trace/trace_ksym.c:226: erreur: (Each undeclared identifier is reported only once
kernel/trace/trace_ksym.c:226: erreur: for each function it appears in.)
kernel/trace/trace_ksym.c: In function ‘ksym_trace_filter_write’:
kernel/trace/trace_ksym.c:273: erreur: ‘ksym_tracer_mutex’ undeclared (first use in this function)
kernel/trace/trace_ksym.c: In function ‘ksym_trace_reset’:
kernel/trace/trace_ksym.c:335: erreur: ‘ksym_tracer_mutex’ undeclared (first use in this function)
make[1]: *** [kernel/trace/trace_ksym.o] Erreur 1

[ Impact: fix a build error ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_ksym.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 11c74f6404cc..eef97e7c8db7 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -44,12 +44,12 @@ static unsigned int ksym_tracing_enabled;
 
 static HLIST_HEAD(ksym_filter_head);
 
+static DEFINE_MUTEX(ksym_tracer_mutex);
+
 #ifdef CONFIG_PROFILE_KSYM_TRACER
 
 #define MAX_UL_INT 0xffffffff
 
-static DEFINE_MUTEX(ksym_tracer_mutex);
-
 void ksym_collect_stats(unsigned long hbp_hit_addr)
 {
 	struct hlist_node *node;
-- 
cgit v1.2.3-71-gd317


From db59504d89db1462a5281fb55b1d962cb74a398f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Jul 2009 13:52:36 +0800
Subject: ksym_tracer: Extract trace entry from struct trace_ksym

struct trace_ksym is used as an entry in hbp list, and is also
used as trace_entry stored in ring buffer.

This is not necessary and is a waste of memory in ring buffer.

There is also a bug that dereferencing field->ksym_hbp in
ksym_trace_output() can be invalid.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A52E2A4.4050007@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h      | 13 ++++---------
 kernel/trace/trace_ksym.c | 26 ++++++++++++++++++--------
 2 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7d5cc37b8fca..ff1ef411a176 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -215,17 +215,12 @@ struct syscall_trace_exit {
 #define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
 extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
 
-struct trace_ksym {
+struct ksym_trace_entry {
 	struct trace_entry	ent;
-	struct hw_breakpoint	*ksym_hbp;
-	unsigned long		ksym_addr;
 	unsigned long		ip;
-#ifdef CONFIG_PROFILE_KSYM_TRACER
-	unsigned long		counter;
-#endif
-	struct hlist_node	ksym_hlist;
+	unsigned char		type;
 	char			ksym_name[KSYM_NAME_LEN];
-	char			p_name[TASK_COMM_LEN];
+	char			cmd[TASK_COMM_LEN];
 };
 
 /*
@@ -343,7 +338,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
-		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM);	\
+		IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index eef97e7c8db7..085ff055fdfa 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -37,6 +37,15 @@
 #define KSYM_TRACER_OP_LEN 3 /* rw- */
 #define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
 
+struct trace_ksym {
+	struct hw_breakpoint	*ksym_hbp;
+	unsigned long		ksym_addr;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	unsigned long		counter;
+#endif
+	struct hlist_node	ksym_hlist;
+};
+
 static struct trace_array *ksym_trace_array;
 
 static unsigned int ksym_filter_entry_count;
@@ -71,7 +80,7 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
 {
 	struct ring_buffer_event *event;
 	struct trace_array *tr;
-	struct trace_ksym *entry;
+	struct ksym_trace_entry *entry;
 	int pc;
 
 	if (!ksym_tracing_enabled)
@@ -85,11 +94,12 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
 	if (!event)
 		return;
 
-	entry = ring_buffer_event_data(event);
+	entry		= ring_buffer_event_data(event);
+	entry->ip	= instruction_pointer(regs);
+	entry->type	= hbp->info.type;
 	strlcpy(entry->ksym_name, hbp->info.name, KSYM_SYMBOL_LEN);
-	entry->ksym_hbp = hbp;
-	entry->ip = instruction_pointer(regs);
-	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
+	strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
+
 #ifdef CONFIG_PROFILE_KSYM_TRACER
 	ksym_collect_stats(hbp->info.address);
 #endif /* CONFIG_PROFILE_KSYM_TRACER */
@@ -380,7 +390,7 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
 {
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *s = &iter->seq;
-	struct trace_ksym *field;
+	struct ksym_trace_entry *field;
 	char str[KSYM_SYMBOL_LEN];
 	int ret;
 
@@ -389,12 +399,12 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
+	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->cmd,
 				entry->pid, iter->cpu, field->ksym_name);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	switch (field->ksym_hbp->info.type) {
+	switch (field->type) {
 	case HW_BREAKPOINT_WRITE:
 		ret = trace_seq_printf(s, " W  ");
 		break;
-- 
cgit v1.2.3-71-gd317


From be9742e6cb107fe1d77db7a081ea4eb25e79e1ad Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Jul 2009 13:52:52 +0800
Subject: ksym_tracer: Rewrite ksym_trace_filter_read()

Reading ksym_trace_filter gave me some arbitrary characters,
when it should show nothing. It's because buf is not initialized
when there's no filter.

Also reduce stack usage by about 512 bytes.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A52E2B4.6030706@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 085ff055fdfa..b6710d31bdf0 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -35,7 +35,6 @@
 #define KSYM_TRACER_MAX HBP_NUM
 
 #define KSYM_TRACER_OP_LEN 3 /* rw- */
-#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
 
 struct trace_ksym {
 	struct hw_breakpoint	*ksym_hbp;
@@ -230,25 +229,33 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
 {
 	struct trace_ksym *entry;
 	struct hlist_node *node;
-	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
-	ssize_t ret, cnt = 0;
+	struct trace_seq *s;
+	ssize_t cnt = 0;
+	int ret;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+	trace_seq_init(s);
 
 	mutex_lock(&ksym_tracer_mutex);
 
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
-				entry->ksym_hbp->info.name);
+		ret = trace_seq_printf(s, "%s:", entry->ksym_hbp->info.name);
 		if (entry->ksym_hbp->info.type == HW_BREAKPOINT_WRITE)
-			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
-								"-w-\n");
+			ret = trace_seq_puts(s, "-w-\n");
 		else if (entry->ksym_hbp->info.type == HW_BREAKPOINT_RW)
-			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
-								"rw-\n");
+			ret = trace_seq_puts(s, "rw-\n");
+		WARN_ON_ONCE(!ret);
 	}
-	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
+
+	cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
 	mutex_unlock(&ksym_tracer_mutex);
 
-	return ret;
+	kfree(s);
+
+	return cnt;
 }
 
 static ssize_t ksym_trace_filter_write(struct file *file,
-- 
cgit v1.2.3-71-gd317


From f088e5471297cc78d7465e1fd997cb1a91a48019 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Jul 2009 13:53:18 +0800
Subject: ksym_tracer: Fix validation of access type

# echo 'pid_max:rw-' > ksym_trace_filter
 # cat ksym_trace_filter
 pid_max:rw-
 # echo 'pid_max:ww-' > ksym_trace_filter
 (should return -EINVAL)
 # cat ksym_trace_filter
 (but it ended up removing filter entry)

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A52E2CE.6080409@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index b6710d31bdf0..955600929907 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -114,24 +114,22 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
  * --x : Set Execution Break points (Not available yet)
  *
  */
-static int ksym_trace_get_access_type(char *access_str)
+static int ksym_trace_get_access_type(char *str)
 {
-	int pos, access = 0;
+	int access = 0;
 
-	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
-		switch (access_str[pos]) {
-		case 'r':
-			access += (pos == 0) ? 4 : -1;
-			break;
-		case 'w':
-			access += (pos == 1) ? 2 : -1;
-			break;
-		case '-':
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
+	if (str[0] == 'r')
+		access += 4;
+	else if (str[0] != '-')
+		return -EINVAL;
+
+	if (str[1] == 'w')
+		access += 2;
+	else if (str[1] != '-')
+		return -EINVAL;
+
+	if (str[2] != '-')
+		return -EINVAL;
 
 	switch (access) {
 	case 6:
@@ -140,8 +138,6 @@ static int ksym_trace_get_access_type(char *access_str)
 	case 2:
 		access = HW_BREAKPOINT_WRITE;
 		break;
-	case 0:
-		access = 0;
 	}
 
 	return access;
-- 
cgit v1.2.3-71-gd317


From 92cf9f8f7e89c6bdbb1a724f879b8b18fc0dfe0f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Jul 2009 13:53:47 +0800
Subject: ksym_tracer: Fix validation of length of access type

Don't take newline into account, otherwise:

 # echo 'pid_max:-w-' > ksym_trace_filter
 # echo -n 'pid_max:rw-' > ksym_trace_filter
 bash: echo: write error: Invalid argument

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A52E2EB.9070503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 955600929907..72fcb46c39c0 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -158,21 +158,21 @@ static int ksym_trace_get_access_type(char *str)
 static int parse_ksym_trace_str(char *input_string, char **ksymname,
 							unsigned long *addr)
 {
-	char *delimiter = ":";
 	int ret;
 
-	ret = -EINVAL;
-	*ksymname = strsep(&input_string, delimiter);
+	strstrip(input_string);
+
+	*ksymname = strsep(&input_string, ":");
 	*addr = kallsyms_lookup_name(*ksymname);
 
 	/* Check for malformed request: (2), (1) and (5) */
 	if ((!input_string) ||
-		(strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
-			(*addr == 0))
-		goto return_code;
+	    (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
+	    (*addr == 0))
+		return -EINVAL;;
+
 	ret = ksym_trace_get_access_type(input_string);
 
-return_code:
 	return ret;
 }
 
-- 
cgit v1.2.3-71-gd317


From 011ed56853e07e30653d6f1bfddc56b396218664 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Jul 2009 13:54:08 +0800
Subject: ksym_tracer: NIL-terminate user input filter

Make sure the user input string is NULL-terminated.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A52E300.7020601@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 72fcb46c39c0..8cbed5a6286f 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -264,11 +264,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 	unsigned long ksym_addr = 0;
 	int ret, op, changed = 0;
 
-	/* Ignore echo "" > ksym_trace_filter */
-	if (count == 0)
-		return 0;
-
-	input_string = kzalloc(count, GFP_KERNEL);
+	input_string = kzalloc(count + 1, GFP_KERNEL);
 	if (!input_string)
 		return -ENOMEM;
 
@@ -276,6 +272,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 		kfree(input_string);
 		return -EFAULT;
 	}
+	input_string[count] = '\0';
 
 	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
 	if (ret < 0) {
-- 
cgit v1.2.3-71-gd317


From 0d109c8f70eab8b9f693bd5caea23012394e4876 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Jul 2009 13:54:28 +0800
Subject: ksym_tracer: Report error when failed to re-register hbp

When access type is changed, the hw break point will be
unregistered and then be registered again with new access
type. But the registration may fail, in this case, -errno
should be returned.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A52E314.7070004@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 8cbed5a6286f..891e3b86b3f6 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -302,13 +302,13 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 				ret = count;
 				goto unlock_ret_path;
 			}
-		}
+		} else
+			ret = count;
 		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
 		kfree(entry->ksym_hbp);
 		kfree(entry);
-		ret = count;
 		goto err_ret;
 	} else {
 		/* Check for malformed request: (4) */
-- 
cgit v1.2.3-71-gd317


From 558df6c8f74ac4a0b9026ef85b0028280f364d96 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Jul 2009 13:54:48 +0800
Subject: ksym_tracer: Fix memory leak

- When remove a filter, we leak entry->ksym_hbp->info.name.

- With CONFIG_FTRAC_SELFTEST enabled, we leak ->info.name:
    # echo ksym_tracer > current_tracer
    # echo 'ksym_selftest_dummy:rw-' > ksym_trace_filter
    # echo nop > current_tracer

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A52E328.8010200@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 61 +++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 891e3b86b3f6..7d349d34a0d1 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -179,7 +179,7 @@ static int parse_ksym_trace_str(char *input_string, char **ksymname,
 int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 {
 	struct trace_ksym *entry;
-	int ret;
+	int ret = -ENOMEM;
 
 	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
 		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
@@ -193,12 +193,13 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 		return -ENOMEM;
 
 	entry->ksym_hbp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
-	if (!entry->ksym_hbp) {
-		kfree(entry);
-		return -ENOMEM;
-	}
+	if (!entry->ksym_hbp)
+		goto err;
+
+	entry->ksym_hbp->info.name = kstrdup(ksymname, GFP_KERNEL);
+	if (!entry->ksym_hbp->info.name)
+		goto err;
 
-	entry->ksym_hbp->info.name = ksymname;
 	entry->ksym_hbp->info.type = op;
 	entry->ksym_addr = entry->ksym_hbp->info.address = addr;
 #ifdef CONFIG_X86
@@ -210,14 +211,18 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 	if (ret < 0) {
 		printk(KERN_INFO "ksym_tracer request failed. Try again"
 					" later!!\n");
-		kfree(entry->ksym_hbp);
-		kfree(entry);
-		return -EAGAIN;
+		ret = -EAGAIN;
+		goto err;
 	}
 	hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
 	ksym_filter_entry_count++;
-
 	return 0;
+err:
+	if (entry->ksym_hbp)
+		kfree(entry->ksym_hbp->info.name);
+	kfree(entry->ksym_hbp);
+	kfree(entry);
+	return ret;
 }
 
 static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
@@ -289,7 +294,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 			if (entry->ksym_hbp->info.type != op)
 				changed = 1;
 			else
-				goto err_ret;
+				goto out;
 			break;
 		}
 	}
@@ -298,34 +303,29 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 		entry->ksym_hbp->info.type = op;
 		if (op > 0) {
 			ret = register_kernel_hw_breakpoint(entry->ksym_hbp);
-			if (ret == 0) {
-				ret = count;
-				goto unlock_ret_path;
-			}
-		} else
-			ret = count;
+			if (ret == 0)
+				goto out;
+		}
 		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
+		kfree(entry->ksym_hbp->info.name);
 		kfree(entry->ksym_hbp);
 		kfree(entry);
-		goto err_ret;
+		goto out;
 	} else {
 		/* Check for malformed request: (4) */
 		if (op == 0)
-			goto err_ret;
+			goto out;
 		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
-		if (ret)
-			goto err_ret;
 	}
-	ret = count;
-	goto unlock_ret_path;
+out:
+	mutex_unlock(&ksym_tracer_mutex);
 
-err_ret:
 	kfree(input_string);
 
-unlock_ret_path:
-	mutex_unlock(&ksym_tracer_mutex);
+	if (!ret)
+		ret = count;
 	return ret;
 }
 
@@ -349,14 +349,7 @@ static void ksym_trace_reset(struct trace_array *tr)
 		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
-		/* Free the 'input_string' only if reset
-		 * after startup self-test
-		 */
-#ifdef CONFIG_FTRACE_SELFTEST
-		if (strncmp(entry->ksym_hbp->info.name, KSYM_SELFTEST_ENTRY,
-					strlen(KSYM_SELFTEST_ENTRY)) != 0)
-#endif /* CONFIG_FTRACE_SELFTEST*/
-			kfree(entry->ksym_hbp->info.name);
+		kfree(entry->ksym_hbp->info.name);
 		kfree(entry->ksym_hbp);
 		kfree(entry);
 	}
-- 
cgit v1.2.3-71-gd317


From 9d7e934408b52cd53dd85270eb36941a6a318cc5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Jul 2009 13:55:18 +0800
Subject: ksym_tracer: Fix the output of stat tracing

- make ksym_tracer_stat_start() return head->first instead of
  &head->first
- make the output properly aligned

Before:

   Access type                Symbol                     Counter
     NA                   <NA>                              0
     RW                   pid_max                               0

After:

  Access Type   Symbol                                       Counter
  -----------   ------                                       -------
  RW            pid_max                                            0

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A52E346.5050608@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 7d349d34a0d1..1256a6e8ee24 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -453,8 +453,10 @@ device_initcall(init_ksym_trace);
 #ifdef CONFIG_PROFILE_KSYM_TRACER
 static int ksym_tracer_stat_headers(struct seq_file *m)
 {
-	seq_printf(m, "   Access type    ");
-	seq_printf(m, "            Symbol                     Counter     \n");
+	seq_puts(m, "  Access Type ");
+	seq_puts(m, "  Symbol                                       Counter\n");
+	seq_puts(m, "  ----------- ");
+	seq_puts(m, "  ------                                       -------\n");
 	return 0;
 }
 
@@ -472,27 +474,27 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v)
 
 	switch (access_type) {
 	case HW_BREAKPOINT_WRITE:
-		seq_printf(m, "     W     ");
+		seq_puts(m, "  W           ");
 		break;
 	case HW_BREAKPOINT_RW:
-		seq_printf(m, "     RW    ");
+		seq_puts(m, "  RW          ");
 		break;
 	default:
-		seq_printf(m, "     NA    ");
+		seq_puts(m, "  NA          ");
 	}
 
 	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
-		seq_printf(m, "               %s                 ", fn_name);
+		seq_printf(m, "  %-36s", fn_name);
 	else
-		seq_printf(m, "               <NA>                ");
+		seq_printf(m, "  %-36s", "<NA>");
+	seq_printf(m, " %15lu\n", entry->counter);
 
-	seq_printf(m, "%15lu\n", entry->counter);
 	return 0;
 }
 
 static void *ksym_tracer_stat_start(struct tracer_stat *trace)
 {
-	return &(ksym_filter_head.first);
+	return ksym_filter_head.first;
 }
 
 static void *
-- 
cgit v1.2.3-71-gd317


From d857ace143df3884954887e1899a65831ca72ece Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 22 Jul 2009 11:21:31 +0800
Subject: tracing/ksym_tracer: fix the output of ksym tracer

Fix the output format of ksym tracer, make it properly aligned

Befor patch:
# tracer: ksym_tracer
#
#       TASK-PID      CPU#      Symbol         Type    Function
#          |           |          |              |         |
bash            1378  1   ksym_tracer_mutex     W  mutex_lock+0x11/0x27
bash            1378  1   ksym_filter_head      W  process_new_ksym_entry+0xd2/0x10c
bash            1378  1   ksym_tracer_mutex     W  mutex_unlock+0x12/0x1b
cat             1429  0   ksym_tracer_mutex     W  mutex_lock+0x11/0x27

After patch:
# tracer: ksym_tracer
#
#       TASK-PID   CPU#      Symbol                    Type    Function
#          |        |          |                        |         |
        cat-1423  [000] ksym_tracer_mutex               RW mutex_lock+0x11/0x27
        cat-1423  [000] ksym_filter_head                RW ksym_trace_filter_read+0x6e/0x10d
        cat-1423  [000] ksym_tracer_mutex               RW mutex_unlock+0x12/0x1b
        cat-1423  [000] ksym_tracer_mutex               RW mutex_lock+0x11/0x27
        cat-1423  [000] ksym_filter_head                RW ksym_trace_filter_read+0x6e/0x10d
        cat-1423  [000] ksym_tracer_mutex               RW mutex_unlock+0x12/0x1b

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
LKML-Reference: <4A6685BB.2090809@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_ksym.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 1256a6e8ee24..fbf3a8e13bc5 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -370,13 +370,12 @@ static int ksym_trace_init(struct trace_array *tr)
 
 static void ksym_trace_print_header(struct seq_file *m)
 {
-
 	seq_puts(m,
-		 "#       TASK-PID      CPU#      Symbol         Type    "
-		 "Function         \n");
+		 "#       TASK-PID   CPU#      Symbol                    "
+		 "Type    Function\n");
 	seq_puts(m,
-		 "#          |           |          |              |         "
-		 "|            \n");
+		 "#          |        |          |                       "
+		 " |         |\n");
 }
 
 static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
@@ -392,7 +391,7 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->cmd,
+	ret = trace_seq_printf(s, "%11s-%-5d [%03d] %-30s ", field->cmd,
 				entry->pid, iter->cpu, field->ksym_name);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -412,7 +411,7 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	sprint_symbol(str, field->ip);
-	ret = trace_seq_printf(s, "%-20s\n", str);
+	ret = trace_seq_printf(s, "%s\n", str);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.2.3-71-gd317


From 8e068542a8d9efec55126284d2f5cb32f003d507 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 22 Jul 2009 11:23:41 +0800
Subject: tracing/ksym_tracer: fix write operation of ksym_trace_filter

This patch fix 2 bugs:
- fix the return value of ksym_trace_filter_write() when we want to
  clear symbol in ksym_trace_filter file
  for example:
  # echo global_trace:rw- > /debug/tracing/ksym_trace_filter
  # echo global_trace:--- > /debug/tracing/ksym_trace_filter
  -bash: echo: write error: Invalid argument
  # cat /debug/tracing/ksym_trace_filter
  #
  We want to clear 'global_trace' in ksym_trace_filter, it complain
  with "Invalid argument", but the operation is successful

- the "r--" access types is not allowed, but ksym_trace_filter file think
  it OK
  for example:
  # echo ksym_tracer_mutex:r-- > ksym_trace_filter
  -bash: echo: write error: Resource temporarily unavailable
  # dmesg
  ksym_tracer request failed. Try again later!!

  The error occur at register_kernel_hw_breakpoint(), but It's should
  at access types parser

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
LKML-Reference: <4A66863D.5090802@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_ksym.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index fbf3a8e13bc5..cd5cb656c3d2 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -135,6 +135,9 @@ static int ksym_trace_get_access_type(char *str)
 	case 6:
 		access = HW_BREAKPOINT_RW;
 		break;
+	case 4:
+		access = -EINVAL;
+		break;
 	case 2:
 		access = HW_BREAKPOINT_WRITE;
 		break;
@@ -312,6 +315,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 		kfree(entry->ksym_hbp->info.name);
 		kfree(entry->ksym_hbp);
 		kfree(entry);
+		ret = 0;
 		goto out;
 	} else {
 		/* Check for malformed request: (4) */
-- 
cgit v1.2.3-71-gd317


From 75e33751ca8bbb72dd6f1a74d2810ddc8cbe4bdf Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 23 Jul 2009 12:01:22 +0800
Subject: tracing/ksym_tracer: support quick clear for ksym_trace_filter -- v2

It's rather boring to clear symbol one by one in ksym_trace_filter
file, so, this patch will let ksym_trace_filter file support quickly
clear all break points. We can write "0" to this file and it will clear
all symbols

for example:
 # cat ksym_trace_filter
 ksym_filter_head:rw-
 global_trace:rw-
 # echo 0 > ksym_trace_filter
 # cat ksym_trace_filter
 #

Changelog v1->v2:
Add other ways to clear all breakpoints by writing NULL or "*:---"
to ksym_trace_filter file base on K.Prasad's suggestion

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
LKML-Reference: <4A67E092.3080202@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_ksym.c | 53 +++++++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index cd5cb656c3d2..2fde875ead4c 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -163,8 +163,6 @@ static int parse_ksym_trace_str(char *input_string, char **ksymname,
 {
 	int ret;
 
-	strstrip(input_string);
-
 	*ksymname = strsep(&input_string, ":");
 	*addr = kallsyms_lookup_name(*ksymname);
 
@@ -262,6 +260,25 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
 	return cnt;
 }
 
+static void __ksym_trace_reset(void)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node, *node1;
+
+	mutex_lock(&ksym_tracer_mutex);
+	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+								ksym_hlist) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbp);
+		ksym_filter_entry_count--;
+		hlist_del_rcu(&(entry->ksym_hlist));
+		synchronize_rcu();
+		kfree(entry->ksym_hbp->info.name);
+		kfree(entry->ksym_hbp);
+		kfree(entry);
+	}
+	mutex_unlock(&ksym_tracer_mutex);
+}
+
 static ssize_t ksym_trace_filter_write(struct file *file,
 					const char __user *buffer,
 						size_t count, loff_t *ppos)
@@ -282,6 +299,21 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 	}
 	input_string[count] = '\0';
 
+	strstrip(input_string);
+
+	/*
+	 * Clear all breakpoints if:
+	 * 1: echo > ksym_trace_filter
+	 * 2: echo 0 > ksym_trace_filter
+	 * 3: echo "*:---" > ksym_trace_filter
+	 */
+	if (!input_string[0] || !strcmp(input_string, "0") ||
+	    !strcmp(input_string, "*:---")) {
+		__ksym_trace_reset();
+		kfree(input_string);
+		return count;
+	}
+
 	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
 	if (ret < 0) {
 		kfree(input_string);
@@ -341,23 +373,8 @@ static const struct file_operations ksym_tracing_fops = {
 
 static void ksym_trace_reset(struct trace_array *tr)
 {
-	struct trace_ksym *entry;
-	struct hlist_node *node, *node1;
-
 	ksym_tracing_enabled = 0;
-
-	mutex_lock(&ksym_tracer_mutex);
-	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
-								ksym_hlist) {
-		unregister_kernel_hw_breakpoint(entry->ksym_hbp);
-		ksym_filter_entry_count--;
-		hlist_del_rcu(&(entry->ksym_hlist));
-		synchronize_rcu();
-		kfree(entry->ksym_hbp->info.name);
-		kfree(entry->ksym_hbp);
-		kfree(entry);
-	}
-	mutex_unlock(&ksym_tracer_mutex);
+	__ksym_trace_reset();
 }
 
 static int ksym_trace_init(struct trace_array *tr)
-- 
cgit v1.2.3-71-gd317


From bd1a5c849bdcc5c89e4a6a18216cd2b9a7a8a78f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 13 Aug 2009 16:34:53 -0400
Subject: tracing: Ftrace dynamic ftrace_event_call support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add dynamic ftrace_event_call support to ftrace. Trace engines can add
new ftrace_event_call to ftrace on the fly. Each operator function of
the call takes an ftrace_event_call data structure as an argument,
because these functions may be shared among several ftrace_event_calls.

Changes from v13:
 - Define remove_subsystem_dir() always (revirt a2ca5e03), because
   trace_remove_event_call() uses it.
 - Modify syscall tracer because of ftrace_event_call change.

[fweisbec@gmail.com: Fixed conflict against latest tracing/core]

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090813203453.31965.71901.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/ftrace_event.h  |  19 +++----
 include/linux/syscalls.h      |   4 +-
 include/trace/ftrace.h        |  16 +++---
 include/trace/syscall.h       |  11 ++--
 kernel/trace/trace_events.c   | 121 +++++++++++++++++++++++++++++-------------
 kernel/trace/trace_export.c   |  18 +++----
 kernel/trace/trace_syscalls.c |  20 +++----
 7 files changed, 130 insertions(+), 79 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index ace2da9e0a0d..1ab3089b5c59 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -112,12 +112,12 @@ struct ftrace_event_call {
 	struct dentry		*dir;
 	struct trace_event	*event;
 	int			enabled;
-	int			(*regfunc)(void *);
-	void			(*unregfunc)(void *);
+	int			(*regfunc)(struct ftrace_event_call *);
+	void			(*unregfunc)(struct ftrace_event_call *);
 	int			id;
-	int			(*raw_init)(void);
-	int			(*show_format)(struct ftrace_event_call *call,
-					       struct trace_seq *s);
+	int			(*raw_init)(struct ftrace_event_call *);
+	int			(*show_format)(struct ftrace_event_call *,
+					       struct trace_seq *);
 	int			(*define_fields)(struct ftrace_event_call *);
 	struct list_head	fields;
 	int			filter_active;
@@ -147,11 +147,12 @@ enum {
 	FILTER_PTR_STRING,
 };
 
-extern int trace_define_field(struct ftrace_event_call *call,
-			      const char *type, const char *name,
-			      int offset, int size, int is_signed,
-			      int filter_type);
 extern int trace_define_common_fields(struct ftrace_event_call *call);
+extern int trace_define_field(struct ftrace_event_call *call, char *type,
+			      char *name, int offset, int size, int is_signed,
+			      int filter_type);
+extern int trace_add_event_call(struct ftrace_event_call *call);
+extern void trace_remove_event_call(struct ftrace_event_call *call);
 
 #define is_signed_type(type)	(((type)(-1)) < 0)
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f124c8995555..646102eeff92 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -165,7 +165,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
 	};								\
-	static int init_enter_##sname(void)				\
+	static int init_enter_##sname(struct ftrace_event_call *call)	\
 	{								\
 		int num, id;						\
 		num = syscall_name_to_nr("sys"#sname);			\
@@ -202,7 +202,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
 	};								\
-	static int init_exit_##sname(void)				\
+	static int init_exit_##sname(struct ftrace_event_call *call)	\
 	{								\
 		int num, id;						\
 		num = syscall_name_to_nr("sys"#sname);			\
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 360a77ad79e1..f2bd7a8f8e8b 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -434,7 +434,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
  *	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
  * }
  *
- * static int ftrace_reg_event_<call>(void)
+ * static int ftrace_reg_event_<call>(struct ftrace_event_call *unused)
  * {
  *	int ret;
  *
@@ -445,7 +445,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
  *	return ret;
  * }
  *
- * static void ftrace_unreg_event_<call>(void)
+ * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
  * {
  *	unregister_trace_<call>(ftrace_event_<call>);
  * }
@@ -478,7 +478,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
  *	trace_current_buffer_unlock_commit(event, irq_flags, pc);
  * }
  *
- * static int ftrace_raw_reg_event_<call>(void)
+ * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
  * {
  *	int ret;
  *
@@ -489,7 +489,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
  *	return ret;
  * }
  *
- * static void ftrace_unreg_event_<call>(void)
+ * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
  * {
  *	unregister_trace_<call>(ftrace_raw_event_<call>);
  * }
@@ -498,7 +498,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
  *	.trace			= ftrace_raw_output_<call>, <-- stage 2
  * };
  *
- * static int ftrace_raw_init_event_<call>(void)
+ * static int ftrace_raw_init_event_<call>(struct ftrace_event_call *unused)
  * {
  *	int id;
  *
@@ -592,7 +592,7 @@ static void ftrace_raw_event_##call(proto)				\
 		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
 }									\
 									\
-static int ftrace_raw_reg_event_##call(void *ptr)			\
+static int ftrace_raw_reg_event_##call(struct ftrace_event_call *unused)\
 {									\
 	int ret;							\
 									\
@@ -603,7 +603,7 @@ static int ftrace_raw_reg_event_##call(void *ptr)			\
 	return ret;							\
 }									\
 									\
-static void ftrace_raw_unreg_event_##call(void *ptr)			\
+static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused)\
 {									\
 	unregister_trace_##call(ftrace_raw_event_##call);		\
 }									\
@@ -612,7 +612,7 @@ static struct trace_event ftrace_event_type_##call = {			\
 	.trace			= ftrace_raw_output_##call,		\
 };									\
 									\
-static int ftrace_raw_init_event_##call(void)				\
+static int ftrace_raw_init_event_##call(struct ftrace_event_call *unused)\
 {									\
 	int id;								\
 									\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 5dc283ba5ae0..e290b86f6167 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -39,16 +39,19 @@ void set_syscall_enter_id(int num, int id);
 void set_syscall_exit_id(int num, int id);
 extern struct trace_event event_syscall_enter;
 extern struct trace_event event_syscall_exit;
-extern int reg_event_syscall_enter(void *ptr);
-extern void unreg_event_syscall_enter(void *ptr);
-extern int reg_event_syscall_exit(void *ptr);
-extern void unreg_event_syscall_exit(void *ptr);
+
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
 extern int syscall_exit_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
 extern int syscall_enter_define_fields(struct ftrace_event_call *call);
 extern int syscall_exit_define_fields(struct ftrace_event_call *call);
+extern int reg_event_syscall_enter(struct ftrace_event_call *call);
+extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
+extern int reg_event_syscall_exit(struct ftrace_event_call *call);
+extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
+extern int
+ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
 enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
 #endif
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d33bcdeffe69..8079bb511c43 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,8 +27,8 @@ DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
 
-int trace_define_field(struct ftrace_event_call *call, const char *type,
-		       const char *name, int offset, int size, int is_signed,
+int trace_define_field(struct ftrace_event_call *call, char *type,
+		       char *name, int offset, int size, int is_signed,
 		       int filter_type)
 {
 	struct ftrace_event_field *field;
@@ -92,9 +92,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(trace_define_common_fields);
 
-#ifdef CONFIG_MODULES
-
-static void trace_destroy_fields(struct ftrace_event_call *call)
+void trace_destroy_fields(struct ftrace_event_call *call)
 {
 	struct ftrace_event_field *field, *next;
 
@@ -106,8 +104,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
 	}
 }
 
-#endif /* CONFIG_MODULES */
-
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 {
@@ -116,14 +112,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 		if (call->enabled) {
 			call->enabled = 0;
 			tracing_stop_cmdline_record();
-			call->unregfunc(call->data);
+			call->unregfunc(call);
 		}
 		break;
 	case 1:
 		if (!call->enabled) {
 			call->enabled = 1;
 			tracing_start_cmdline_record();
-			call->regfunc(call->data);
+			call->regfunc(call);
 		}
 		break;
 	}
@@ -991,27 +987,43 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 	return 0;
 }
 
-#define for_each_event(event, start, end)			\
-	for (event = start;					\
-	     (unsigned long)event < (unsigned long)end;		\
-	     event++)
+static int __trace_add_event_call(struct ftrace_event_call *call)
+{
+	struct dentry *d_events;
+	int ret;
 
-#ifdef CONFIG_MODULES
+	if (!call->name)
+		return -EINVAL;
 
-static LIST_HEAD(ftrace_module_file_list);
+	if (call->raw_init) {
+		ret = call->raw_init(call);
+		if (ret < 0) {
+			if (ret != -ENOSYS)
+				pr_warning("Could not initialize trace "
+				"events/%s\n", call->name);
+			return ret;
+		}
+	}
 
-/*
- * Modules must own their file_operations to keep up with
- * reference counting.
- */
-struct ftrace_module_file_ops {
-	struct list_head		list;
-	struct module			*mod;
-	struct file_operations		id;
-	struct file_operations		enable;
-	struct file_operations		format;
-	struct file_operations		filter;
-};
+	d_events = event_trace_events_dir();
+	if (!d_events)
+		return -ENOENT;
+
+	list_add(&call->list, &ftrace_events);
+	return event_create_dir(call, d_events, &ftrace_event_id_fops,
+				&ftrace_enable_fops, &ftrace_event_filter_fops,
+				&ftrace_event_format_fops);
+}
+
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct ftrace_event_call *call)
+{
+	int ret;
+	mutex_lock(&event_mutex);
+	ret = __trace_add_event_call(call);
+	mutex_unlock(&event_mutex);
+	return ret;
+}
 
 static void remove_subsystem_dir(const char *name)
 {
@@ -1039,6 +1051,48 @@ static void remove_subsystem_dir(const char *name)
 	}
 }
 
+static void __trace_remove_event_call(struct ftrace_event_call *call)
+{
+	ftrace_event_enable_disable(call, 0);
+	if (call->event)
+		__unregister_ftrace_event(call->event);
+	debugfs_remove_recursive(call->dir);
+	list_del(&call->list);
+	trace_destroy_fields(call);
+	destroy_preds(call);
+	remove_subsystem_dir(call->system);
+}
+
+/* Remove an event_call */
+void trace_remove_event_call(struct ftrace_event_call *call)
+{
+	mutex_lock(&event_mutex);
+	__trace_remove_event_call(call);
+	mutex_unlock(&event_mutex);
+}
+
+#define for_each_event(event, start, end)			\
+	for (event = start;					\
+	     (unsigned long)event < (unsigned long)end;		\
+	     event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+	struct list_head		list;
+	struct module			*mod;
+	struct file_operations		id;
+	struct file_operations		enable;
+	struct file_operations		format;
+	struct file_operations		filter;
+};
+
 static struct ftrace_module_file_ops *
 trace_create_file_ops(struct module *mod)
 {
@@ -1096,7 +1150,7 @@ static void trace_module_add_events(struct module *mod)
 		if (!call->name)
 			continue;
 		if (call->raw_init) {
-			ret = call->raw_init();
+			ret = call->raw_init(call);
 			if (ret < 0) {
 				if (ret != -ENOSYS)
 					pr_warning("Could not initialize trace "
@@ -1131,14 +1185,7 @@ static void trace_module_remove_events(struct module *mod)
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			found = true;
-			ftrace_event_enable_disable(call, 0);
-			if (call->event)
-				__unregister_ftrace_event(call->event);
-			debugfs_remove_recursive(call->dir);
-			list_del(&call->list);
-			trace_destroy_fields(call);
-			destroy_preds(call);
-			remove_subsystem_dir(call->system);
+			__trace_remove_event_call(call);
 		}
 	}
 
@@ -1256,7 +1303,7 @@ static __init int event_trace_init(void)
 		if (!call->name)
 			continue;
 		if (call->raw_init) {
-			ret = call->raw_init();
+			ret = call->raw_init(call);
 			if (ret < 0) {
 				if (ret != -ENOSYS)
 					pr_warning("Could not initialize trace "
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 029a91f42287..9cbe7f1930ea 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -117,10 +117,16 @@ ftrace_format_##call(struct ftrace_event_call *unused,			\
 #define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\
 	cmd;
 
+static int ftrace_raw_init_event(struct ftrace_event_call *event_call)
+{
+	INIT_LIST_HEAD(&event_call->fields);
+	init_preds(event_call);
+	return 0;
+}
+
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 int ftrace_define_fields_##call(struct ftrace_event_call *event_call);	\
-static int ftrace_raw_init_event_##call(void);				\
 									\
 struct ftrace_event_call __used						\
 __attribute__((__aligned__(4)))						\
@@ -128,16 +134,10 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.id			= proto,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_raw_init_event_##call,		\
+	.raw_init		= ftrace_raw_init_event,		\
 	.show_format		= ftrace_format_##call,			\
 	.define_fields		= ftrace_define_fields_##call,		\
-};									\
-static int ftrace_raw_init_event_##call(void)				\
-{									\
-	INIT_LIST_HEAD(&event_##call.fields);				\
-	init_preds(&event_##call);					\
-	return 0;							\
-}									\
+};
 
 #undef TRACE_EVENT_FORMAT_NOFILTER
 #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 85291c4de406..5931933587e9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -193,8 +193,8 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
 		return ret;
 
 	for (i = 0; i < meta->nb_args; i++) {
-		ret = trace_define_field(call, meta->types[i],
-					 meta->args[i], offset,
+		ret = trace_define_field(call, (char *)meta->types[i],
+					 (char *)meta->args[i], offset,
 					 sizeof(unsigned long), 0,
 					 FILTER_OTHER);
 		offset += sizeof(unsigned long);
@@ -277,13 +277,13 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
 		trace_current_buffer_unlock_commit(event, 0, 0);
 }
 
-int reg_event_syscall_enter(void *ptr)
+int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
 	char *name;
 
-	name = (char *)ptr;
+	name = (char *)call->data;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
 		return -ENOSYS;
@@ -301,12 +301,12 @@ int reg_event_syscall_enter(void *ptr)
 	return ret;
 }
 
-void unreg_event_syscall_enter(void *ptr)
+void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int num;
 	char *name;
 
-	name = (char *)ptr;
+	name = (char *)call->data;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
 		return;
@@ -318,13 +318,13 @@ void unreg_event_syscall_enter(void *ptr)
 	mutex_unlock(&syscall_trace_lock);
 }
 
-int reg_event_syscall_exit(void *ptr)
+int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
 	char *name;
 
-	name = (char *)ptr;
+	name = (char *)call->data;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
 		return -ENOSYS;
@@ -342,12 +342,12 @@ int reg_event_syscall_exit(void *ptr)
 	return ret;
 }
 
-void unreg_event_syscall_exit(void *ptr)
+void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int num;
 	char *name;
 
-	name = (char *)ptr;
+	name = (char *)call->data;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
 		return;
-- 
cgit v1.2.3-71-gd317


From d93f12f3f417e49a175800da85c6fcb2a5096e03 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 13 Aug 2009 16:35:01 -0400
Subject: tracing: Introduce TRACE_FIELD_ZERO() macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use TRACE_FIELD_ZERO(type, item) instead of TRACE_FIELD_ZERO_CHAR(item).
This also includes a typo fix of TRACE_ZERO_CHAR() macro.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090813203501.31965.30172.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_event_types.h |  4 ++--
 kernel/trace/trace_export.c      | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 6db005e12487..e74f0906ab1a 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -109,7 +109,7 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(char *, fmt, fmt)
-		TRACE_FIELD_ZERO_CHAR(buf)
+		TRACE_FIELD_ZERO(char, buf)
 	),
 	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
 );
@@ -117,7 +117,7 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
 TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD_ZERO_CHAR(buf)
+		TRACE_FIELD_ZERO(char, buf)
 	),
 	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
 );
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9cbe7f1930ea..f75faeccf68e 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -42,9 +42,9 @@ extern void __bad_type_size(void);
 	if (!ret)							\
 		return 0;
 
-#undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)					\
-	ret = trace_seq_printf(s, "\tfield:char " #item ";\t"		\
+#undef TRACE_FIELD_ZERO
+#define TRACE_FIELD_ZERO(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
 			       "offset:%u;\tsize:0;\n",			\
 			       (unsigned int)offsetof(typeof(field), item)); \
 	if (!ret)							\
@@ -92,9 +92,6 @@ ftrace_format_##call(struct ftrace_event_call *unused,			\
 
 #include "trace_event_types.h"
 
-#undef TRACE_ZERO_CHAR
-#define TRACE_ZERO_CHAR(arg)
-
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
@@ -107,6 +104,9 @@ ftrace_format_##call(struct ftrace_event_call *unused,			\
 #define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
 	TRACE_FIELD(type, item, assign)
 
+#undef TRACE_FIELD_ZERO
+#define TRACE_FIELD_ZERO(type, item)
+
 #undef TP_CMD
 #define TP_CMD(cmd...)	cmd
 
@@ -180,8 +180,8 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	if (ret)							\
 		return ret;
 
-#undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)
+#undef TRACE_FIELD_ZERO
+#define TRACE_FIELD_ZERO(type, item)
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-- 
cgit v1.2.3-71-gd317


From 413d37d1eb69c1765b9ace0a612dac9b6c990e66 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 13 Aug 2009 16:35:11 -0400
Subject: tracing: Add kprobe-based event tracer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add kprobes-based event tracer on ftrace.

This tracer is similar to the events tracer which is based on Tracepoint
infrastructure. Instead of Tracepoint, this tracer is based on kprobes
(kprobe and kretprobe). It probes anywhere where kprobes can probe(this
 means, all functions body except for __kprobes functions).

Similar to the events tracer, this tracer doesn't need to be activated
via current_tracer, instead of that, just set probe points via
/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter.

This tracer supports following probe arguments for each probe.

  %REG  : Fetch register REG
  sN    : Fetch Nth entry of stack (N >= 0)
  sa    : Fetch stack address.
  @ADDR : Fetch memory at ADDR (ADDR should be in kernel)
  @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
  aN    : Fetch function argument. (N >= 0)
  rv    : Fetch return value.
  ra    : Fetch return address.
  +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.

See Documentation/trace/kprobetrace.txt in the next patch for details.

Changes from v13:
 - Support 'sa' for stack address.
 - Use call->data instead of container_of() macro.

[fweisbec@gmail.com: Fixed conflict against latest tracing/core]

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090813203510.31965.29123.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/Kconfig             |   12 +
 kernel/trace/Makefile            |    1 +
 kernel/trace/trace.h             |   30 +
 kernel/trace/trace_event_types.h |   18 +
 kernel/trace/trace_kprobe.c      | 1202 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 1263 insertions(+)
 create mode 100644 kernel/trace/trace_kprobe.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 06be85a7ef8c..fb5fbf75f279 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -411,6 +411,18 @@ config BLK_DEV_IO_TRACE
 
 	  If unsure, say N.
 
+config KPROBE_TRACER
+	depends on KPROBES
+	depends on X86
+	bool "Trace kprobes"
+	select TRACING
+	select GENERIC_TRACER
+	help
+	  This tracer probes everywhere where kprobes can probe it, and
+	  records various registers and memories specified by user.
+	  This also allows you to trace kprobe probe points as a dynamic
+	  defined events. It provides per-probe event filtering interface.
+
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..7c00a1ec1496 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -54,5 +54,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 654fd657bd03..667f832d16b7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,6 +38,8 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
+	TRACE_KPROBE,
+	TRACE_KRETPROBE,
 
 	__TRACE_LAST_TYPE,
 };
@@ -205,6 +207,30 @@ struct syscall_trace_exit {
 	unsigned long		ret;
 };
 
+struct kprobe_trace_entry {
+	struct trace_entry	ent;
+	unsigned long		ip;
+	int			nargs;
+	unsigned long		args[];
+};
+
+#define SIZEOF_KPROBE_TRACE_ENTRY(n)			\
+	(offsetof(struct kprobe_trace_entry, args) +	\
+	(sizeof(unsigned long) * (n)))
+
+struct kretprobe_trace_entry {
+	struct trace_entry	ent;
+	unsigned long		func;
+	unsigned long		ret_ip;
+	int			nargs;
+	unsigned long		args[];
+};
+
+#define SIZEOF_KRETPROBE_TRACE_ENTRY(n)			\
+	(offsetof(struct kretprobe_trace_entry, args) +	\
+	(sizeof(unsigned long) * (n)))
+
+
 
 /*
  * trace_flag_type is an enumeration that holds different
@@ -317,6 +343,10 @@ extern void __ftrace_bad_type(void);
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
 			  TRACE_KMEM_FREE);	\
+		IF_ASSIGN(var, ent, struct kprobe_trace_entry,		\
+			  TRACE_KPROBE);				\
+		IF_ASSIGN(var, ent, struct kretprobe_trace_entry,	\
+			  TRACE_KRETPROBE);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index e74f0906ab1a..186b598a1f11 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -175,4 +175,22 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
 	TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
 );
 
+TRACE_EVENT_FORMAT(kprobe, TRACE_KPROBE, kprobe_trace_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ip, ip)
+		TRACE_FIELD(int, nargs, nargs)
+		TRACE_FIELD_ZERO(unsigned long, args)
+	),
+	TP_RAW_FMT("%08lx: args:0x%lx ...")
+);
+
+TRACE_EVENT_FORMAT(kretprobe, TRACE_KRETPROBE, kretprobe_trace_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, func, func)
+		TRACE_FIELD(unsigned long, ret_ip, ret_ip)
+		TRACE_FIELD(int, nargs, nargs)
+		TRACE_FIELD_ZERO(unsigned long, args)
+	),
+	TP_RAW_FMT("%08lx <- %08lx: args:0x%lx ...")
+);
 #undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..0c4f00aafb92
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1202 @@
+/*
+ * kprobe based kernel tracer
+ *
+ * Created by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define TRACE_KPROBE_ARGS 6
+#define MAX_ARGSTR_LEN 63
+
+/* currently, trace_kprobe only supports X86. */
+
+struct fetch_func {
+	unsigned long (*func)(struct pt_regs *, void *);
+	void *data;
+};
+
+static __kprobes unsigned long call_fetch(struct fetch_func *f,
+					  struct pt_regs *regs)
+{
+	return f->func(regs, f->data);
+}
+
+/* fetch handlers */
+static __kprobes unsigned long fetch_register(struct pt_regs *regs,
+					      void *offset)
+{
+	return regs_get_register(regs, (unsigned int)((unsigned long)offset));
+}
+
+static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
+					   void *num)
+{
+	return regs_get_kernel_stack_nth(regs,
+					 (unsigned int)((unsigned long)num));
+}
+
+static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
+{
+	unsigned long retval;
+
+	if (probe_kernel_address(addr, retval))
+		return 0;
+	return retval;
+}
+
+static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
+{
+	return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
+}
+
+static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
+					      void *dummy)
+{
+	return regs_return_value(regs);
+}
+
+static __kprobes unsigned long fetch_ip(struct pt_regs *regs, void *dummy)
+{
+	return instruction_pointer(regs);
+}
+
+static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
+						   void *dummy)
+{
+	return kernel_stack_pointer(regs);
+}
+
+/* Memory fetching by symbol */
+struct symbol_cache {
+	char *symbol;
+	long offset;
+	unsigned long addr;
+};
+
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+	sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+	if (sc->addr)
+		sc->addr += sc->offset;
+	return sc->addr;
+}
+
+static void free_symbol_cache(struct symbol_cache *sc)
+{
+	kfree(sc->symbol);
+	kfree(sc);
+}
+
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+	struct symbol_cache *sc;
+
+	if (!sym || strlen(sym) == 0)
+		return NULL;
+	sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+	if (!sc)
+		return NULL;
+
+	sc->symbol = kstrdup(sym, GFP_KERNEL);
+	if (!sc->symbol) {
+		kfree(sc);
+		return NULL;
+	}
+	sc->offset = offset;
+
+	update_symbol_cache(sc);
+	return sc;
+}
+
+static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
+{
+	struct symbol_cache *sc = data;
+
+	if (sc->addr)
+		return fetch_memory(regs, (void *)sc->addr);
+	else
+		return 0;
+}
+
+/* Special indirect memory access interface */
+struct indirect_fetch_data {
+	struct fetch_func orig;
+	long offset;
+};
+
+static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
+{
+	struct indirect_fetch_data *ind = data;
+	unsigned long addr;
+
+	addr = call_fetch(&ind->orig, regs);
+	if (addr) {
+		addr += ind->offset;
+		return fetch_memory(regs, (void *)addr);
+	} else
+		return 0;
+}
+
+static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
+{
+	if (data->orig.func == fetch_indirect)
+		free_indirect_fetch_data(data->orig.data);
+	else if (data->orig.func == fetch_symbol)
+		free_symbol_cache(data->orig.data);
+	kfree(data);
+}
+
+/**
+ * kprobe_trace_core
+ */
+
+struct trace_probe {
+	struct list_head	list;
+	union {
+		struct kprobe		kp;
+		struct kretprobe	rp;
+	};
+	const char		*symbol;	/* symbol name */
+	unsigned int		nr_args;
+	struct fetch_func	args[TRACE_KPROBE_ARGS];
+	struct ftrace_event_call	call;
+};
+
+static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_trace_func(struct kretprobe_instance *ri,
+				struct pt_regs *regs);
+
+static __kprobes int probe_is_return(struct trace_probe *tp)
+{
+	return (tp->rp.handler == kretprobe_trace_func);
+}
+
+static __kprobes const char *probe_symbol(struct trace_probe *tp)
+{
+	return tp->symbol ? tp->symbol : "unknown";
+}
+
+static __kprobes long probe_offset(struct trace_probe *tp)
+{
+	return (probe_is_return(tp)) ? tp->rp.kp.offset : tp->kp.offset;
+}
+
+static __kprobes void *probe_address(struct trace_probe *tp)
+{
+	return (probe_is_return(tp)) ? tp->rp.kp.addr : tp->kp.addr;
+}
+
+static int trace_arg_string(char *buf, size_t n, struct fetch_func *ff)
+{
+	int ret = -EINVAL;
+
+	if (ff->func == fetch_argument)
+		ret = snprintf(buf, n, "a%lu", (unsigned long)ff->data);
+	else if (ff->func == fetch_register) {
+		const char *name;
+		name = regs_query_register_name((unsigned int)((long)ff->data));
+		ret = snprintf(buf, n, "%%%s", name);
+	} else if (ff->func == fetch_stack)
+		ret = snprintf(buf, n, "s%lu", (unsigned long)ff->data);
+	else if (ff->func == fetch_memory)
+		ret = snprintf(buf, n, "@0x%p", ff->data);
+	else if (ff->func == fetch_symbol) {
+		struct symbol_cache *sc = ff->data;
+		ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
+	} else if (ff->func == fetch_retvalue)
+		ret = snprintf(buf, n, "rv");
+	else if (ff->func == fetch_ip)
+		ret = snprintf(buf, n, "ra");
+	else if (ff->func == fetch_stack_address)
+		ret = snprintf(buf, n, "sa");
+	else if (ff->func == fetch_indirect) {
+		struct indirect_fetch_data *id = ff->data;
+		size_t l = 0;
+		ret = snprintf(buf, n, "%+ld(", id->offset);
+		if (ret >= n)
+			goto end;
+		l += ret;
+		ret = trace_arg_string(buf + l, n - l, &id->orig);
+		if (ret < 0)
+			goto end;
+		l += ret;
+		ret = snprintf(buf + l, n - l, ")");
+		ret += l;
+	}
+end:
+	if (ret >= n)
+		return -ENOSPC;
+	return ret;
+}
+
+static int register_probe_event(struct trace_probe *tp);
+static void unregister_probe_event(struct trace_probe *tp);
+
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+static struct trace_probe *alloc_trace_probe(const char *symbol,
+					     const char *event)
+{
+	struct trace_probe *tp;
+
+	tp = kzalloc(sizeof(struct trace_probe), GFP_KERNEL);
+	if (!tp)
+		return ERR_PTR(-ENOMEM);
+
+	if (symbol) {
+		tp->symbol = kstrdup(symbol, GFP_KERNEL);
+		if (!tp->symbol)
+			goto error;
+	}
+	if (event) {
+		tp->call.name = kstrdup(event, GFP_KERNEL);
+		if (!tp->call.name)
+			goto error;
+	}
+
+	INIT_LIST_HEAD(&tp->list);
+	return tp;
+error:
+	kfree(tp->symbol);
+	kfree(tp);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void free_trace_probe(struct trace_probe *tp)
+{
+	int i;
+
+	for (i = 0; i < tp->nr_args; i++)
+		if (tp->args[i].func == fetch_symbol)
+			free_symbol_cache(tp->args[i].data);
+		else if (tp->args[i].func == fetch_indirect)
+			free_indirect_fetch_data(tp->args[i].data);
+
+	kfree(tp->call.name);
+	kfree(tp->symbol);
+	kfree(tp);
+}
+
+static struct trace_probe *find_probe_event(const char *event)
+{
+	struct trace_probe *tp;
+
+	list_for_each_entry(tp, &probe_list, list)
+		if (tp->call.name && !strcmp(tp->call.name, event))
+			return tp;
+	return NULL;
+}
+
+static void __unregister_trace_probe(struct trace_probe *tp)
+{
+	if (probe_is_return(tp))
+		unregister_kretprobe(&tp->rp);
+	else
+		unregister_kprobe(&tp->kp);
+}
+
+/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+static void unregister_trace_probe(struct trace_probe *tp)
+{
+	if (tp->call.name)
+		unregister_probe_event(tp);
+	__unregister_trace_probe(tp);
+	list_del(&tp->list);
+}
+
+/* Register a trace_probe and probe_event */
+static int register_trace_probe(struct trace_probe *tp)
+{
+	struct trace_probe *old_tp;
+	int ret;
+
+	mutex_lock(&probe_lock);
+
+	if (probe_is_return(tp))
+		ret = register_kretprobe(&tp->rp);
+	else
+		ret = register_kprobe(&tp->kp);
+
+	if (ret) {
+		pr_warning("Could not insert probe(%d)\n", ret);
+		if (ret == -EILSEQ) {
+			pr_warning("Probing address(0x%p) is not an "
+				   "instruction boundary.\n",
+				   probe_address(tp));
+			ret = -EINVAL;
+		}
+		goto end;
+	}
+	/* register as an event */
+	if (tp->call.name) {
+		old_tp = find_probe_event(tp->call.name);
+		if (old_tp) {
+			/* delete old event */
+			unregister_trace_probe(old_tp);
+			free_trace_probe(old_tp);
+		}
+		ret = register_probe_event(tp);
+		if (ret) {
+			pr_warning("Faild to register probe event(%d)\n", ret);
+			__unregister_trace_probe(tp);
+		}
+	}
+	list_add_tail(&tp->list, &probe_list);
+end:
+	mutex_unlock(&probe_lock);
+	return ret;
+}
+
+/* Split symbol and offset. */
+static int split_symbol_offset(char *symbol, long *offset)
+{
+	char *tmp;
+	int ret;
+
+	if (!offset)
+		return -EINVAL;
+
+	tmp = strchr(symbol, '+');
+	if (!tmp)
+		tmp = strchr(symbol, '-');
+
+	if (tmp) {
+		/* skip sign because strict_strtol doesn't accept '+' */
+		ret = strict_strtol(tmp + 1, 0, offset);
+		if (ret)
+			return ret;
+		if (*tmp == '-')
+			*offset = -(*offset);
+		*tmp = '\0';
+	} else
+		*offset = 0;
+	return 0;
+}
+
+#define PARAM_MAX_ARGS 16
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+static int parse_trace_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+	int ret = 0;
+	unsigned long param;
+	long offset;
+	char *tmp;
+
+	switch (arg[0]) {
+	case 'a':	/* argument */
+		ret = strict_strtoul(arg + 1, 10, &param);
+		if (ret || param > PARAM_MAX_ARGS)
+			ret = -EINVAL;
+		else {
+			ff->func = fetch_argument;
+			ff->data = (void *)param;
+		}
+		break;
+	case 'r':	/* retval or retaddr */
+		if (is_return && arg[1] == 'v') {
+			ff->func = fetch_retvalue;
+			ff->data = NULL;
+		} else if (is_return && arg[1] == 'a') {
+			ff->func = fetch_ip;
+			ff->data = NULL;
+		} else
+			ret = -EINVAL;
+		break;
+	case '%':	/* named register */
+		ret = regs_query_register_offset(arg + 1);
+		if (ret >= 0) {
+			ff->func = fetch_register;
+			ff->data = (void *)(unsigned long)ret;
+			ret = 0;
+		}
+		break;
+	case 's':	/* stack */
+		if (arg[1] == 'a') {
+			ff->func = fetch_stack_address;
+			ff->data = NULL;
+		} else {
+			ret = strict_strtoul(arg + 1, 10, &param);
+			if (ret || param > PARAM_MAX_STACK)
+				ret = -EINVAL;
+			else {
+				ff->func = fetch_stack;
+				ff->data = (void *)param;
+			}
+		}
+		break;
+	case '@':	/* memory or symbol */
+		if (isdigit(arg[1])) {
+			ret = strict_strtoul(arg + 1, 0, &param);
+			if (ret)
+				break;
+			ff->func = fetch_memory;
+			ff->data = (void *)param;
+		} else {
+			ret = split_symbol_offset(arg + 1, &offset);
+			if (ret)
+				break;
+			ff->data = alloc_symbol_cache(arg + 1,
+							      offset);
+			if (ff->data)
+				ff->func = fetch_symbol;
+			else
+				ret = -EINVAL;
+		}
+		break;
+	case '+':	/* indirect memory */
+	case '-':
+		tmp = strchr(arg, '(');
+		if (!tmp) {
+			ret = -EINVAL;
+			break;
+		}
+		*tmp = '\0';
+		ret = strict_strtol(arg + 1, 0, &offset);
+		if (ret)
+			break;
+		if (arg[0] == '-')
+			offset = -offset;
+		arg = tmp + 1;
+		tmp = strrchr(arg, ')');
+		if (tmp) {
+			struct indirect_fetch_data *id;
+			*tmp = '\0';
+			id = kzalloc(sizeof(struct indirect_fetch_data),
+				     GFP_KERNEL);
+			if (!id)
+				return -ENOMEM;
+			id->offset = offset;
+			ret = parse_trace_arg(arg, &id->orig, is_return);
+			if (ret)
+				kfree(id);
+			else {
+				ff->func = fetch_indirect;
+				ff->data = (void *)id;
+			}
+		} else
+			ret = -EINVAL;
+		break;
+	default:
+		/* TODO: support custom handler */
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static int create_trace_probe(int argc, char **argv)
+{
+	/*
+	 * Argument syntax:
+	 *  - Add kprobe: p[:EVENT] SYMBOL[+OFFS|-OFFS]|ADDRESS [FETCHARGS]
+	 *  - Add kretprobe: r[:EVENT] SYMBOL[+0] [FETCHARGS]
+	 * Fetch args:
+	 *  aN	: fetch Nth of function argument. (N:0-)
+	 *  rv	: fetch return value
+	 *  ra	: fetch return address
+	 *  sa	: fetch stack address
+	 *  sN	: fetch Nth of stack (N:0-)
+	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
+	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+	 *  %REG	: fetch register REG
+	 * Indirect memory fetch:
+	 *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
+	 */
+	struct trace_probe *tp;
+	struct kprobe *kp;
+	int i, ret = 0;
+	int is_return = 0;
+	char *symbol = NULL, *event = NULL;
+	long offset = 0;
+	void *addr = NULL;
+
+	if (argc < 2)
+		return -EINVAL;
+
+	if (argv[0][0] == 'p')
+		is_return = 0;
+	else if (argv[0][0] == 'r')
+		is_return = 1;
+	else
+		return -EINVAL;
+
+	if (argv[0][1] == ':') {
+		event = &argv[0][2];
+		if (strlen(event) == 0) {
+			pr_info("Event name is not specifiled\n");
+			return -EINVAL;
+		}
+	}
+
+	if (isdigit(argv[1][0])) {
+		if (is_return)
+			return -EINVAL;
+		/* an address specified */
+		ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
+		if (ret)
+			return ret;
+	} else {
+		/* a symbol specified */
+		symbol = argv[1];
+		/* TODO: support .init module functions */
+		ret = split_symbol_offset(symbol, &offset);
+		if (ret)
+			return ret;
+		if (offset && is_return)
+			return -EINVAL;
+	}
+
+	/* setup a probe */
+	tp = alloc_trace_probe(symbol, event);
+	if (IS_ERR(tp))
+		return PTR_ERR(tp);
+
+	if (is_return) {
+		kp = &tp->rp.kp;
+		tp->rp.handler = kretprobe_trace_func;
+	} else {
+		kp = &tp->kp;
+		tp->kp.pre_handler = kprobe_trace_func;
+	}
+
+	if (tp->symbol) {
+		kp->symbol_name = tp->symbol;
+		kp->offset = offset;
+	} else
+		kp->addr = addr;
+
+	/* parse arguments */
+	argc -= 2; argv += 2; ret = 0;
+	for (i = 0; i < argc && i < TRACE_KPROBE_ARGS; i++) {
+		if (strlen(argv[i]) > MAX_ARGSTR_LEN) {
+			pr_info("Argument%d(%s) is too long.\n", i, argv[i]);
+			ret = -ENOSPC;
+			goto error;
+		}
+		ret = parse_trace_arg(argv[i], &tp->args[i], is_return);
+		if (ret)
+			goto error;
+	}
+	tp->nr_args = i;
+
+	ret = register_trace_probe(tp);
+	if (ret)
+		goto error;
+	return 0;
+
+error:
+	free_trace_probe(tp);
+	return ret;
+}
+
+static void cleanup_all_probes(void)
+{
+	struct trace_probe *tp;
+
+	mutex_lock(&probe_lock);
+	/* TODO: Use batch unregistration */
+	while (!list_empty(&probe_list)) {
+		tp = list_entry(probe_list.next, struct trace_probe, list);
+		unregister_trace_probe(tp);
+		free_trace_probe(tp);
+	}
+	mutex_unlock(&probe_lock);
+}
+
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&probe_lock);
+	return seq_list_start(&probe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &probe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&probe_lock);
+}
+
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+	struct trace_probe *tp = v;
+	int i, ret;
+	char buf[MAX_ARGSTR_LEN + 1];
+
+	seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
+	if (tp->call.name)
+		seq_printf(m, ":%s", tp->call.name);
+
+	if (tp->symbol)
+		seq_printf(m, " %s%+ld", probe_symbol(tp), probe_offset(tp));
+	else
+		seq_printf(m, " 0x%p", probe_address(tp));
+
+	for (i = 0; i < tp->nr_args; i++) {
+		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		if (ret < 0) {
+			pr_warning("Argument%d decoding error(%d).\n", i, ret);
+			return ret;
+		}
+		seq_printf(m, " %s", buf);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+	.start  = probes_seq_start,
+	.next   = probes_seq_next,
+	.stop   = probes_seq_stop,
+	.show   = probes_seq_show
+};
+
+static int probes_open(struct inode *inode, struct file *file)
+{
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (file->f_flags & O_TRUNC))
+		cleanup_all_probes();
+
+	return seq_open(file, &probes_seq_op);
+}
+
+static int command_trace_probe(const char *buf)
+{
+	char **argv;
+	int argc = 0, ret = 0;
+
+	argv = argv_split(GFP_KERNEL, buf, &argc);
+	if (!argv)
+		return -ENOMEM;
+
+	if (argc)
+		ret = create_trace_probe(argc, argv);
+
+	argv_free(argv);
+	return ret;
+}
+
+#define WRITE_BUFSIZE 128
+
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	char *kbuf, *tmp;
+	int ret;
+	size_t done;
+	size_t size;
+
+	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	ret = done = 0;
+	while (done < count) {
+		size = count - done;
+		if (size >= WRITE_BUFSIZE)
+			size = WRITE_BUFSIZE - 1;
+		if (copy_from_user(kbuf, buffer + done, size)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		kbuf[size] = '\0';
+		tmp = strchr(kbuf, '\n');
+		if (tmp) {
+			*tmp = '\0';
+			size = tmp - kbuf + 1;
+		} else if (done + size < count) {
+			pr_warning("Line length is too long: "
+				   "Should be less than %d.", WRITE_BUFSIZE);
+			ret = -EINVAL;
+			goto out;
+		}
+		done += size;
+		/* Remove comments */
+		tmp = strchr(kbuf, '#');
+		if (tmp)
+			*tmp = '\0';
+
+		ret = command_trace_probe(kbuf);
+		if (ret)
+			goto out;
+	}
+	ret = done;
+out:
+	kfree(kbuf);
+	return ret;
+}
+
+static const struct file_operations kprobe_events_ops = {
+	.owner          = THIS_MODULE,
+	.open           = probes_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+	.write		= probes_write,
+};
+
+/* Kprobe handler */
+static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(kp, struct trace_probe, kp);
+	struct kprobe_trace_entry *entry;
+	struct ring_buffer_event *event;
+	int size, i, pc;
+	unsigned long irq_flags;
+	struct ftrace_event_call *call = &event_kprobe;
+
+	if (&tp->call.name)
+		call = &tp->call;
+
+	local_save_flags(irq_flags);
+	pc = preempt_count();
+
+	size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+
+	event = trace_current_buffer_lock_reserve(TRACE_KPROBE, size,
+						  irq_flags, pc);
+	if (!event)
+		return 0;
+
+	entry = ring_buffer_event_data(event);
+	entry->nargs = tp->nr_args;
+	entry->ip = (unsigned long)kp->addr;
+	for (i = 0; i < tp->nr_args; i++)
+		entry->args[i] = call_fetch(&tp->args[i], regs);
+
+	if (!filter_current_check_discard(call, entry, event))
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
+	return 0;
+}
+
+/* Kretprobe handler */
+static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+					  struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+	struct kretprobe_trace_entry *entry;
+	struct ring_buffer_event *event;
+	int size, i, pc;
+	unsigned long irq_flags;
+	struct ftrace_event_call *call = &event_kretprobe;
+
+	if (&tp->call.name)
+		call = &tp->call;
+
+	local_save_flags(irq_flags);
+	pc = preempt_count();
+
+	size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+
+	event = trace_current_buffer_lock_reserve(TRACE_KRETPROBE, size,
+						  irq_flags, pc);
+	if (!event)
+		return 0;
+
+	entry = ring_buffer_event_data(event);
+	entry->nargs = tp->nr_args;
+	entry->func = (unsigned long)probe_address(tp);
+	entry->ret_ip = (unsigned long)ri->ret_addr;
+	for (i = 0; i < tp->nr_args; i++)
+		entry->args[i] = call_fetch(&tp->args[i], regs);
+
+	if (!filter_current_check_discard(call, entry, event))
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
+
+	return 0;
+}
+
+/* Event entry printers */
+enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags)
+{
+	struct kprobe_trace_entry *field;
+	struct trace_seq *s = &iter->seq;
+	int i;
+
+	trace_assign_type(field, iter->ent);
+
+	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+		goto partial;
+
+	if (!trace_seq_puts(s, ":"))
+		goto partial;
+
+	for (i = 0; i < field->nargs; i++)
+		if (!trace_seq_printf(s, " 0x%lx", field->args[i]))
+			goto partial;
+
+	if (!trace_seq_puts(s, "\n"))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags)
+{
+	struct kretprobe_trace_entry *field;
+	struct trace_seq *s = &iter->seq;
+	int i;
+
+	trace_assign_type(field, iter->ent);
+
+	if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+		goto partial;
+
+	if (!trace_seq_puts(s, " <- "))
+		goto partial;
+
+	if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+		goto partial;
+
+	if (!trace_seq_puts(s, ":"))
+		goto partial;
+
+	for (i = 0; i < field->nargs; i++)
+		if (!trace_seq_printf(s, " 0x%lx", field->args[i]))
+			goto partial;
+
+	if (!trace_seq_puts(s, "\n"))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event kprobe_trace_event = {
+	.type	 	= TRACE_KPROBE,
+	.trace		= print_kprobe_event,
+};
+
+static struct trace_event kretprobe_trace_event = {
+	.type	 	= TRACE_KRETPROBE,
+	.trace		= print_kretprobe_event,
+};
+
+static int probe_event_enable(struct ftrace_event_call *call)
+{
+	struct trace_probe *tp = (struct trace_probe *)call->data;
+
+	if (probe_is_return(tp))
+		return enable_kretprobe(&tp->rp);
+	else
+		return enable_kprobe(&tp->kp);
+}
+
+static void probe_event_disable(struct ftrace_event_call *call)
+{
+	struct trace_probe *tp = (struct trace_probe *)call->data;
+
+	if (probe_is_return(tp))
+		disable_kretprobe(&tp->rp);
+	else
+		disable_kprobe(&tp->kp);
+}
+
+static int probe_event_raw_init(struct ftrace_event_call *event_call)
+{
+	INIT_LIST_HEAD(&event_call->fields);
+	init_preds(event_call);
+	return 0;
+}
+
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed)			\
+	do {								\
+		ret = trace_define_field(event_call, #type, name,	\
+					 offsetof(typeof(field), item),	\
+					 sizeof(field.item), is_signed, \
+					 FILTER_OTHER);			\
+		if (ret)						\
+			return ret;					\
+	} while (0)
+
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+	int ret, i;
+	struct kprobe_trace_entry field;
+	char buf[MAX_ARGSTR_LEN + 1];
+	struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+	ret = trace_define_common_fields(event_call);
+	if (!ret)
+		return ret;
+
+	DEFINE_FIELD(unsigned long, ip, "ip", 0);
+	DEFINE_FIELD(int, nargs, "nargs", 1);
+	for (i = 0; i < tp->nr_args; i++) {
+		/* Set argN as a field */
+		sprintf(buf, "arg%d", i);
+		DEFINE_FIELD(unsigned long, args[i], buf, 0);
+		/* Set argument string as an alias field */
+		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		if (ret < 0)
+			return ret;
+		DEFINE_FIELD(unsigned long, args[i], buf, 0);
+	}
+	return 0;
+}
+
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+	int ret, i;
+	struct kretprobe_trace_entry field;
+	char buf[MAX_ARGSTR_LEN + 1];
+	struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+	ret = trace_define_common_fields(event_call);
+	if (!ret)
+		return ret;
+
+	DEFINE_FIELD(unsigned long, func, "func", 0);
+	DEFINE_FIELD(unsigned long, ret_ip, "ret_ip", 0);
+	DEFINE_FIELD(int, nargs, "nargs", 1);
+	for (i = 0; i < tp->nr_args; i++) {
+		/* Set argN as a field */
+		sprintf(buf, "arg%d", i);
+		DEFINE_FIELD(unsigned long, args[i], buf, 0);
+		/* Set argument string as an alias field */
+		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		if (ret < 0)
+			return ret;
+		DEFINE_FIELD(unsigned long, args[i], buf, 0);
+	}
+	return 0;
+}
+
+static int __probe_event_show_format(struct trace_seq *s,
+				     struct trace_probe *tp, const char *fmt,
+				     const char *arg)
+{
+	int i, ret;
+	char buf[MAX_ARGSTR_LEN + 1];
+
+	/* Show aliases */
+	for (i = 0; i < tp->nr_args; i++) {
+		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		if (ret < 0)
+			return ret;
+		if (!trace_seq_printf(s, "\talias: %s;\toriginal: arg%d;\n",
+				      buf, i))
+			return 0;
+	}
+	/* Show format */
+	if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
+		return 0;
+
+	for (i = 0; i < tp->nr_args; i++)
+		if (!trace_seq_puts(s, " 0x%lx"))
+			return 0;
+
+	if (!trace_seq_printf(s, "\", %s", arg))
+		return 0;
+
+	for (i = 0; i < tp->nr_args; i++)
+		if (!trace_seq_printf(s, ", arg%d", i))
+			return 0;
+
+	return trace_seq_puts(s, "\n");
+}
+
+#undef SHOW_FIELD
+#define SHOW_FIELD(type, item, name)					\
+	do {								\
+		ret = trace_seq_printf(s, "\tfield: " #type " %s;\t"	\
+				"offset:%u;tsize:%u;\n", name,		\
+				(unsigned int)offsetof(typeof(field), item),\
+				(unsigned int)sizeof(type));		\
+		if (!ret)						\
+			return 0;					\
+	} while (0)
+
+static int kprobe_event_show_format(struct ftrace_event_call *call,
+				    struct trace_seq *s)
+{
+	struct kprobe_trace_entry field __attribute__((unused));
+	int ret, i;
+	char buf[8];
+	struct trace_probe *tp = (struct trace_probe *)call->data;
+
+	SHOW_FIELD(unsigned long, ip, "ip");
+	SHOW_FIELD(int, nargs, "nargs");
+
+	/* Show fields */
+	for (i = 0; i < tp->nr_args; i++) {
+		sprintf(buf, "arg%d", i);
+		SHOW_FIELD(unsigned long, args[i], buf);
+	}
+	trace_seq_puts(s, "\n");
+
+	return __probe_event_show_format(s, tp, "%lx:", "ip");
+}
+
+static int kretprobe_event_show_format(struct ftrace_event_call *call,
+				       struct trace_seq *s)
+{
+	struct kretprobe_trace_entry field __attribute__((unused));
+	int ret, i;
+	char buf[8];
+	struct trace_probe *tp = (struct trace_probe *)call->data;
+
+	SHOW_FIELD(unsigned long, func, "func");
+	SHOW_FIELD(unsigned long, ret_ip, "ret_ip");
+	SHOW_FIELD(int, nargs, "nargs");
+
+	/* Show fields */
+	for (i = 0; i < tp->nr_args; i++) {
+		sprintf(buf, "arg%d", i);
+		SHOW_FIELD(unsigned long, args[i], buf);
+	}
+	trace_seq_puts(s, "\n");
+
+	return __probe_event_show_format(s, tp, "%lx <- %lx:",
+					  "func, ret_ip");
+}
+
+static int register_probe_event(struct trace_probe *tp)
+{
+	struct ftrace_event_call *call = &tp->call;
+	int ret;
+
+	/* Initialize ftrace_event_call */
+	call->system = "kprobes";
+	if (probe_is_return(tp)) {
+		call->event = &kretprobe_trace_event;
+		call->id = TRACE_KRETPROBE;
+		call->raw_init = probe_event_raw_init;
+		call->show_format = kretprobe_event_show_format;
+		call->define_fields = kretprobe_event_define_fields;
+	} else {
+		call->event = &kprobe_trace_event;
+		call->id = TRACE_KPROBE;
+		call->raw_init = probe_event_raw_init;
+		call->show_format = kprobe_event_show_format;
+		call->define_fields = kprobe_event_define_fields;
+	}
+	call->enabled = 1;
+	call->regfunc = probe_event_enable;
+	call->unregfunc = probe_event_disable;
+	call->data = tp;
+	ret = trace_add_event_call(call);
+	if (ret)
+		pr_info("Failed to register kprobe event: %s\n", call->name);
+	return ret;
+}
+
+static void unregister_probe_event(struct trace_probe *tp)
+{
+	/*
+	 * Prevent to unregister event itself because the event is shared
+	 * among other probes.
+	 */
+	tp->call.event = NULL;
+	trace_remove_event_call(&tp->call);
+}
+
+/* Make a debugfs interface for controling probe points */
+static __init int init_kprobe_trace(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+	int ret;
+
+	ret = register_ftrace_event(&kprobe_trace_event);
+	if (!ret) {
+		pr_warning("Could not register kprobe_trace_event type.\n");
+		return 0;
+	}
+	ret = register_ftrace_event(&kretprobe_trace_event);
+	if (!ret) {
+		pr_warning("Could not register kretprobe_trace_event type.\n");
+		return 0;
+	}
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
+
+	entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+				    NULL, &kprobe_events_ops);
+
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'kprobe_events' entry\n");
+	return 0;
+}
+fs_initcall(init_kprobe_trace);
+
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static int kprobe_trace_selftest_target(int a1, int a2, int a3,
+					int a4, int a5, int a6)
+{
+	return a1 + a2 + a3 + a4 + a5 + a6;
+}
+
+static __init int kprobe_trace_self_tests_init(void)
+{
+	int ret;
+	int (*target)(int, int, int, int, int, int);
+
+	target = kprobe_trace_selftest_target;
+
+	pr_info("Testing kprobe tracing: ");
+
+	ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
+				  "a1 a2 a3 a4 a5 a6");
+	if (WARN_ON_ONCE(ret))
+		pr_warning("error enabling function entry\n");
+
+	ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
+				  "ra rv");
+	if (WARN_ON_ONCE(ret))
+		pr_warning("error enabling function return\n");
+
+	ret = target(1, 2, 3, 4, 5, 6);
+
+	cleanup_all_probes();
+
+	pr_cont("OK\n");
+	return 0;
+}
+
+late_initcall(kprobe_trace_self_tests_init);
+
+#endif
-- 
cgit v1.2.3-71-gd317


From a82378d8802717b9776a7d9b54422f65c414d6cc Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 13 Aug 2009 16:35:18 -0400
Subject: tracing: Kprobe-tracer supports more than 6 arguments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support up to 128 arguments to fetch for each kprobes event.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090813203518.31965.96979.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt |  2 +-
 kernel/trace/trace_kprobe.c         | 21 +++++++++++++--------
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index efff6eb1b3db..c9c09b45038d 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -32,7 +32,7 @@ Synopsis of kprobe_events
  SYMBOL[+offs|-offs]	: Symbol+offset where the probe is inserted.
  MEMADDR		: Address where the probe is inserted.
 
- FETCHARGS		: Arguments.
+ FETCHARGS		: Arguments. Each probe can have up to 128 args.
   %REG	: Fetch register REG
   sN	: Fetch Nth entry of stack (N >= 0)
   sa	: Fetch stack address.
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 0c4f00aafb92..6d488efd16b2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -32,7 +32,7 @@
 #include "trace.h"
 #include "trace_output.h"
 
-#define TRACE_KPROBE_ARGS 6
+#define MAX_TRACE_ARGS 128
 #define MAX_ARGSTR_LEN 63
 
 /* currently, trace_kprobe only supports X86. */
@@ -184,11 +184,15 @@ struct trace_probe {
 		struct kretprobe	rp;
 	};
 	const char		*symbol;	/* symbol name */
-	unsigned int		nr_args;
-	struct fetch_func	args[TRACE_KPROBE_ARGS];
 	struct ftrace_event_call	call;
+	unsigned int		nr_args;
+	struct fetch_func	args[];
 };
 
+#define SIZEOF_TRACE_PROBE(n)			\
+	(offsetof(struct trace_probe, args) +	\
+	(sizeof(struct fetch_func) * (n)))
+
 static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs);
 static int kretprobe_trace_func(struct kretprobe_instance *ri,
 				struct pt_regs *regs);
@@ -263,11 +267,11 @@ static DEFINE_MUTEX(probe_lock);
 static LIST_HEAD(probe_list);
 
 static struct trace_probe *alloc_trace_probe(const char *symbol,
-					     const char *event)
+					     const char *event, int nargs)
 {
 	struct trace_probe *tp;
 
-	tp = kzalloc(sizeof(struct trace_probe), GFP_KERNEL);
+	tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
 	if (!tp)
 		return ERR_PTR(-ENOMEM);
 
@@ -573,9 +577,10 @@ static int create_trace_probe(int argc, char **argv)
 		if (offset && is_return)
 			return -EINVAL;
 	}
+	argc -= 2; argv += 2;
 
 	/* setup a probe */
-	tp = alloc_trace_probe(symbol, event);
+	tp = alloc_trace_probe(symbol, event, argc);
 	if (IS_ERR(tp))
 		return PTR_ERR(tp);
 
@@ -594,8 +599,8 @@ static int create_trace_probe(int argc, char **argv)
 		kp->addr = addr;
 
 	/* parse arguments */
-	argc -= 2; argv += 2; ret = 0;
-	for (i = 0; i < argc && i < TRACE_KPROBE_ARGS; i++) {
+	ret = 0;
+	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
 		if (strlen(argv[i]) > MAX_ARGSTR_LEN) {
 			pr_info("Argument%d(%s) is too long.\n", i, argv[i]);
 			ret = -ENOSPC;
-- 
cgit v1.2.3-71-gd317


From 4263565d491145b57621a761714f2ca6f1293a45 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 13 Aug 2009 16:35:26 -0400
Subject: tracing: Generate names for each kprobe event automatically
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generate names for each kprobe event based on the probe point.
(SYMBOL+offs or MEMADDR).

Also remove generic k*probe event types because there is no user
of those types.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090813203526.31965.56672.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt |  3 +-
 kernel/trace/trace_event_types.h    | 18 -----------
 kernel/trace/trace_kprobe.c         | 64 +++++++++++++++++++------------------
 3 files changed, 35 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index c9c09b45038d..5e59e854e71b 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -28,7 +28,8 @@ Synopsis of kprobe_events
   p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS]	: Set a probe
   r[:EVENT] SYMBOL[+0] [FETCHARGS]			: Set a return probe
 
- EVENT			: Event name.
+ EVENT			: Event name. If omitted, the event name is generated
+			  based on SYMBOL+offs or MEMADDR.
  SYMBOL[+offs|-offs]	: Symbol+offset where the probe is inserted.
  MEMADDR		: Address where the probe is inserted.
 
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 186b598a1f11..e74f0906ab1a 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -175,22 +175,4 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
 	TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
 );
 
-TRACE_EVENT_FORMAT(kprobe, TRACE_KPROBE, kprobe_trace_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD(int, nargs, nargs)
-		TRACE_FIELD_ZERO(unsigned long, args)
-	),
-	TP_RAW_FMT("%08lx: args:0x%lx ...")
-);
-
-TRACE_EVENT_FORMAT(kretprobe, TRACE_KRETPROBE, kretprobe_trace_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, func, func)
-		TRACE_FIELD(unsigned long, ret_ip, ret_ip)
-		TRACE_FIELD(int, nargs, nargs)
-		TRACE_FIELD_ZERO(unsigned long, args)
-	),
-	TP_RAW_FMT("%08lx <- %08lx: args:0x%lx ...")
-);
 #undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6d488efd16b2..8aeb24cc295f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -34,6 +34,7 @@
 
 #define MAX_TRACE_ARGS 128
 #define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
 
 /* currently, trace_kprobe only supports X86. */
 
@@ -280,11 +281,11 @@ static struct trace_probe *alloc_trace_probe(const char *symbol,
 		if (!tp->symbol)
 			goto error;
 	}
-	if (event) {
-		tp->call.name = kstrdup(event, GFP_KERNEL);
-		if (!tp->call.name)
-			goto error;
-	}
+	if (!event)
+		goto error;
+	tp->call.name = kstrdup(event, GFP_KERNEL);
+	if (!tp->call.name)
+		goto error;
 
 	INIT_LIST_HEAD(&tp->list);
 	return tp;
@@ -314,7 +315,7 @@ static struct trace_probe *find_probe_event(const char *event)
 	struct trace_probe *tp;
 
 	list_for_each_entry(tp, &probe_list, list)
-		if (tp->call.name && !strcmp(tp->call.name, event))
+		if (!strcmp(tp->call.name, event))
 			return tp;
 	return NULL;
 }
@@ -330,8 +331,7 @@ static void __unregister_trace_probe(struct trace_probe *tp)
 /* Unregister a trace_probe and probe_event: call with locking probe_lock */
 static void unregister_trace_probe(struct trace_probe *tp)
 {
-	if (tp->call.name)
-		unregister_probe_event(tp);
+	unregister_probe_event(tp);
 	__unregister_trace_probe(tp);
 	list_del(&tp->list);
 }
@@ -360,18 +360,16 @@ static int register_trace_probe(struct trace_probe *tp)
 		goto end;
 	}
 	/* register as an event */
-	if (tp->call.name) {
-		old_tp = find_probe_event(tp->call.name);
-		if (old_tp) {
-			/* delete old event */
-			unregister_trace_probe(old_tp);
-			free_trace_probe(old_tp);
-		}
-		ret = register_probe_event(tp);
-		if (ret) {
-			pr_warning("Faild to register probe event(%d)\n", ret);
-			__unregister_trace_probe(tp);
-		}
+	old_tp = find_probe_event(tp->call.name);
+	if (old_tp) {
+		/* delete old event */
+		unregister_trace_probe(old_tp);
+		free_trace_probe(old_tp);
+	}
+	ret = register_probe_event(tp);
+	if (ret) {
+		pr_warning("Faild to register probe event(%d)\n", ret);
+		__unregister_trace_probe(tp);
 	}
 	list_add_tail(&tp->list, &probe_list);
 end:
@@ -580,7 +578,18 @@ static int create_trace_probe(int argc, char **argv)
 	argc -= 2; argv += 2;
 
 	/* setup a probe */
-	tp = alloc_trace_probe(symbol, event, argc);
+	if (!event) {
+		/* Make a new event name */
+		char buf[MAX_EVENT_NAME_LEN];
+		if (symbol)
+			snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
+				 is_return ? 'r' : 'p', symbol, offset);
+		else
+			snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
+				 is_return ? 'r' : 'p', addr);
+		tp = alloc_trace_probe(symbol, buf, argc);
+	} else
+		tp = alloc_trace_probe(symbol, event, argc);
 	if (IS_ERR(tp))
 		return PTR_ERR(tp);
 
@@ -661,8 +670,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	char buf[MAX_ARGSTR_LEN + 1];
 
 	seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
-	if (tp->call.name)
-		seq_printf(m, ":%s", tp->call.name);
+	seq_printf(m, ":%s", tp->call.name);
 
 	if (tp->symbol)
 		seq_printf(m, " %s%+ld", probe_symbol(tp), probe_offset(tp));
@@ -780,10 +788,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	struct ring_buffer_event *event;
 	int size, i, pc;
 	unsigned long irq_flags;
-	struct ftrace_event_call *call = &event_kprobe;
-
-	if (&tp->call.name)
-		call = &tp->call;
+	struct ftrace_event_call *call = &tp->call;
 
 	local_save_flags(irq_flags);
 	pc = preempt_count();
@@ -815,10 +820,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
 	struct ring_buffer_event *event;
 	int size, i, pc;
 	unsigned long irq_flags;
-	struct ftrace_event_call *call = &event_kretprobe;
-
-	if (&tp->call.name)
-		call = &tp->call;
+	struct ftrace_event_call *call = &tp->call;
 
 	local_save_flags(irq_flags);
 	pc = preempt_count();
-- 
cgit v1.2.3-71-gd317


From ff50d99136c3315513ef3b2921e77f35ab04d081 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 13 Aug 2009 16:35:34 -0400
Subject: tracing: Kprobe tracer assigns new event ids for each event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Assign new event ids for each kprobes event. This doesn't clear
ring_buffer when unregistering each kprobe event. Thus, if you mind
'Unknown event' messages, clear the buffer manually after changing
kprobe events.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090813203534.31965.49105.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace.h        |  6 ------
 kernel/trace/trace_kprobe.c | 51 +++++++++++++--------------------------------
 2 files changed, 15 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 667f832d16b7..f5362a0529eb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,8 +38,6 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
-	TRACE_KPROBE,
-	TRACE_KRETPROBE,
 
 	__TRACE_LAST_TYPE,
 };
@@ -343,10 +341,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
 			  TRACE_KMEM_FREE);	\
-		IF_ASSIGN(var, ent, struct kprobe_trace_entry,		\
-			  TRACE_KPROBE);				\
-		IF_ASSIGN(var, ent, struct kretprobe_trace_entry,	\
-			  TRACE_KRETPROBE);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8aeb24cc295f..9c067bf47d50 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -186,6 +186,7 @@ struct trace_probe {
 	};
 	const char		*symbol;	/* symbol name */
 	struct ftrace_event_call	call;
+	struct trace_event		event;
 	unsigned int		nr_args;
 	struct fetch_func	args[];
 };
@@ -795,7 +796,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 
 	size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
 
-	event = trace_current_buffer_lock_reserve(TRACE_KPROBE, size,
+	event = trace_current_buffer_lock_reserve(call->id, size,
 						  irq_flags, pc);
 	if (!event)
 		return 0;
@@ -827,7 +828,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
 
 	size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
 
-	event = trace_current_buffer_lock_reserve(TRACE_KRETPROBE, size,
+	event = trace_current_buffer_lock_reserve(call->id, size,
 						  irq_flags, pc);
 	if (!event)
 		return 0;
@@ -853,7 +854,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
 	struct trace_seq *s = &iter->seq;
 	int i;
 
-	trace_assign_type(field, iter->ent);
+	field = (struct kprobe_trace_entry *)iter->ent;
 
 	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto partial;
@@ -880,7 +881,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
 	struct trace_seq *s = &iter->seq;
 	int i;
 
-	trace_assign_type(field, iter->ent);
+	field = (struct kretprobe_trace_entry *)iter->ent;
 
 	if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto partial;
@@ -906,16 +907,6 @@ partial:
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static struct trace_event kprobe_trace_event = {
-	.type	 	= TRACE_KPROBE,
-	.trace		= print_kprobe_event,
-};
-
-static struct trace_event kretprobe_trace_event = {
-	.type	 	= TRACE_KRETPROBE,
-	.trace		= print_kretprobe_event,
-};
-
 static int probe_event_enable(struct ftrace_event_call *call)
 {
 	struct trace_probe *tp = (struct trace_probe *)call->data;
@@ -1104,35 +1095,35 @@ static int register_probe_event(struct trace_probe *tp)
 	/* Initialize ftrace_event_call */
 	call->system = "kprobes";
 	if (probe_is_return(tp)) {
-		call->event = &kretprobe_trace_event;
-		call->id = TRACE_KRETPROBE;
+		tp->event.trace = print_kretprobe_event;
 		call->raw_init = probe_event_raw_init;
 		call->show_format = kretprobe_event_show_format;
 		call->define_fields = kretprobe_event_define_fields;
 	} else {
-		call->event = &kprobe_trace_event;
-		call->id = TRACE_KPROBE;
+		tp->event.trace = print_kprobe_event;
 		call->raw_init = probe_event_raw_init;
 		call->show_format = kprobe_event_show_format;
 		call->define_fields = kprobe_event_define_fields;
 	}
+	call->event = &tp->event;
+	call->id = register_ftrace_event(&tp->event);
+	if (!call->id)
+		return -ENODEV;
 	call->enabled = 1;
 	call->regfunc = probe_event_enable;
 	call->unregfunc = probe_event_disable;
 	call->data = tp;
 	ret = trace_add_event_call(call);
-	if (ret)
+	if (ret) {
 		pr_info("Failed to register kprobe event: %s\n", call->name);
+		unregister_ftrace_event(&tp->event);
+	}
 	return ret;
 }
 
 static void unregister_probe_event(struct trace_probe *tp)
 {
-	/*
-	 * Prevent to unregister event itself because the event is shared
-	 * among other probes.
-	 */
-	tp->call.event = NULL;
+	/* tp->event is unregistered in trace_remove_event_call() */
 	trace_remove_event_call(&tp->call);
 }
 
@@ -1141,18 +1132,6 @@ static __init int init_kprobe_trace(void)
 {
 	struct dentry *d_tracer;
 	struct dentry *entry;
-	int ret;
-
-	ret = register_ftrace_event(&kprobe_trace_event);
-	if (!ret) {
-		pr_warning("Could not register kprobe_trace_event type.\n");
-		return 0;
-	}
-	ret = register_ftrace_event(&kretprobe_trace_event);
-	if (!ret) {
-		pr_warning("Could not register kretprobe_trace_event type.\n");
-		return 0;
-	}
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
-- 
cgit v1.2.3-71-gd317


From cd7e7bd5e44718c7625ce1e1f0fda53d77cd3797 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 13 Aug 2009 16:35:42 -0400
Subject: tracing: Add kprobes event profiling interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add profiling interfaces for each kprobes event. This interface provides
how many times each probe hit or missed.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090813203541.31965.8452.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt |  8 +++++++
 kernel/trace/trace_kprobe.c         | 43 +++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 5e59e854e71b..3de751747164 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -70,6 +70,14 @@ filter:
  names and field names for describing filters.
 
 
+Event Profiling
+---------------
+ You can check the total number of probe hits and probe miss-hits via
+/sys/kernel/debug/tracing/kprobe_profile.
+ The first column is event name, the second is the number of probe hits,
+the third is the number of probe miss-hits.
+
+
 Usage examples
 --------------
 To add a probe as a new event, write a new definition to kprobe_events
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9c067bf47d50..ce68197767de 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -184,6 +184,7 @@ struct trace_probe {
 		struct kprobe		kp;
 		struct kretprobe	rp;
 	};
+	unsigned long 		nhit;
 	const char		*symbol;	/* symbol name */
 	struct ftrace_event_call	call;
 	struct trace_event		event;
@@ -781,6 +782,37 @@ static const struct file_operations kprobe_events_ops = {
 	.write		= probes_write,
 };
 
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+	struct trace_probe *tp = v;
+
+	seq_printf(m, "  %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
+		   probe_is_return(tp) ? tp->rp.kp.nmissed : tp->kp.nmissed);
+
+	return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+	.start  = probes_seq_start,
+	.next   = probes_seq_next,
+	.stop   = probes_seq_stop,
+	.show   = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations kprobe_profile_ops = {
+	.owner          = THIS_MODULE,
+	.open           = profile_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
 /* Kprobe handler */
 static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 {
@@ -791,6 +823,8 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
 
+	tp->nhit++;
+
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
@@ -1140,9 +1174,18 @@ static __init int init_kprobe_trace(void)
 	entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
 				    NULL, &kprobe_events_ops);
 
+	/* Event list interface */
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'kprobe_events' entry\n");
+
+	/* Profile interface */
+	entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
+				    NULL, &kprobe_profile_ops);
+
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'kprobe_profile' entry\n");
 	return 0;
 }
 fs_initcall(init_kprobe_trace);
-- 
cgit v1.2.3-71-gd317


From 38a47497d9e34632abbeb484603cedf10c4b05e4 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 21 Aug 2009 15:43:43 -0400
Subject: tracing/kprobes: Fix format typo in trace_kprobes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a format typo in kprobe-tracer.

Currently, it shows 'tsize' in format;

$ cat /debug/tracing/events/kprobes/event/format
...
        field: unsigned long ip;        offset:16;tsize:8;
        field: int nargs;       offset:24;tsize:4;
...

This should be '\tsize';

$ cat /debug/tracing/events/kprobes/event/format
...
        field: unsigned long ip;        offset:16;	size:8;
        field: int nargs;       offset:24;	size:4;
...

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090821194343.12478.37618.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ce68197767de..1a9ca79fe645 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1070,7 +1070,7 @@ static int __probe_event_show_format(struct trace_seq *s,
 #define SHOW_FIELD(type, item, name)					\
 	do {								\
 		ret = trace_seq_printf(s, "\tfield: " #type " %s;\t"	\
-				"offset:%u;tsize:%u;\n", name,		\
+				"offset:%u;\tsize:%u;\n", name,		\
 				(unsigned int)offsetof(typeof(field), item),\
 				(unsigned int)sizeof(type));		\
 		if (!ret)						\
-- 
cgit v1.2.3-71-gd317


From 30a7e073b590ebd1829a906164b0a637e77cc967 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 21 Aug 2009 15:43:51 -0400
Subject: tracing/kprobes: Change trace_arg to probe_arg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Change trace_arg_string() and parse_trace_arg() to probe_arg_string()
and parse_probe_arg(), since those are kprobe-tracer local functions.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090821194351.12478.15247.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1a9ca79fe645..f4ec3fc87b2d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -220,7 +220,7 @@ static __kprobes void *probe_address(struct trace_probe *tp)
 	return (probe_is_return(tp)) ? tp->rp.kp.addr : tp->kp.addr;
 }
 
-static int trace_arg_string(char *buf, size_t n, struct fetch_func *ff)
+static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
 {
 	int ret = -EINVAL;
 
@@ -250,7 +250,7 @@ static int trace_arg_string(char *buf, size_t n, struct fetch_func *ff)
 		if (ret >= n)
 			goto end;
 		l += ret;
-		ret = trace_arg_string(buf + l, n - l, &id->orig);
+		ret = probe_arg_string(buf + l, n - l, &id->orig);
 		if (ret < 0)
 			goto end;
 		l += ret;
@@ -408,7 +408,7 @@ static int split_symbol_offset(char *symbol, long *offset)
 #define PARAM_MAX_ARGS 16
 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
 
-static int parse_trace_arg(char *arg, struct fetch_func *ff, int is_return)
+static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 {
 	int ret = 0;
 	unsigned long param;
@@ -499,7 +499,7 @@ static int parse_trace_arg(char *arg, struct fetch_func *ff, int is_return)
 			if (!id)
 				return -ENOMEM;
 			id->offset = offset;
-			ret = parse_trace_arg(arg, &id->orig, is_return);
+			ret = parse_probe_arg(arg, &id->orig, is_return);
 			if (ret)
 				kfree(id);
 			else {
@@ -617,7 +617,7 @@ static int create_trace_probe(int argc, char **argv)
 			ret = -ENOSPC;
 			goto error;
 		}
-		ret = parse_trace_arg(argv[i], &tp->args[i], is_return);
+		ret = parse_probe_arg(argv[i], &tp->args[i], is_return);
 		if (ret)
 			goto error;
 	}
@@ -680,7 +680,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
 		seq_printf(m, " 0x%p", probe_address(tp));
 
 	for (i = 0; i < tp->nr_args; i++) {
-		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
 		if (ret < 0) {
 			pr_warning("Argument%d decoding error(%d).\n", i, ret);
 			return ret;
@@ -997,7 +997,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
 		sprintf(buf, "arg%d", i);
 		DEFINE_FIELD(unsigned long, args[i], buf, 0);
 		/* Set argument string as an alias field */
-		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
 		if (ret < 0)
 			return ret;
 		DEFINE_FIELD(unsigned long, args[i], buf, 0);
@@ -1024,7 +1024,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
 		sprintf(buf, "arg%d", i);
 		DEFINE_FIELD(unsigned long, args[i], buf, 0);
 		/* Set argument string as an alias field */
-		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
 		if (ret < 0)
 			return ret;
 		DEFINE_FIELD(unsigned long, args[i], buf, 0);
@@ -1041,7 +1041,7 @@ static int __probe_event_show_format(struct trace_seq *s,
 
 	/* Show aliases */
 	for (i = 0; i < tp->nr_args; i++) {
-		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
 		if (ret < 0)
 			return ret;
 		if (!trace_seq_printf(s, "\talias: %s;\toriginal: arg%d;\n",
-- 
cgit v1.2.3-71-gd317


From 24851d2447830e6cba4c4b641cb73e713f312373 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 26 Aug 2009 23:38:30 +0200
Subject: tracing/kprobes: Dump the culprit kprobe in case of kprobe recursion

Kprobes can enter into a probing recursion, ie: a kprobe that does an
endless loop because one of its core mechanism function used during
probing is also probed itself.

This patch helps pinpointing the kprobe that raised such recursion
by dumping it and raising a BUG instead of a warning (we also disarm
the kprobe to try avoiding recursion in BUG itself). Having a BUG
instead of a warning stops the stacktrace in the right place and
doesn't pollute the logs with hundreds of traces that eventually end
up in a stack overflow.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
---
 arch/x86/kernel/kprobes.c | 8 ++++++--
 include/linux/kprobes.h   | 2 ++
 kernel/kprobes.c          | 7 +++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 16ae9610f6ff..ecee3d23fef8 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -490,9 +490,13 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 			/* A probe has been hit in the codepath leading up
 			 * to, or just after, single-stepping of a probed
 			 * instruction. This entire codepath should strictly
-			 * reside in .kprobes.text section. Raise a warning
-			 * to highlight this peculiar case.
+			 * reside in .kprobes.text section.
+			 * Raise a BUG or we'll continue in an endless
+			 * reentering loop and eventually a stack overflow.
 			 */
+			arch_disarm_kprobe(p);
+			dump_kprobe(p);
+			BUG();
 		}
 	default:
 		/* impossible cases */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bcd9c07848be..87eb79c9dd60 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -296,6 +296,8 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 int disable_kprobe(struct kprobe *kp);
 int enable_kprobe(struct kprobe *kp);
 
+void dump_kprobe(struct kprobe *kp);
+
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobes_built_in(void)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ef177d653b2c..f72e96c25a38 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1141,6 +1141,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
 	arch_remove_kprobe(p);
 }
 
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+	printk(KERN_WARNING "Dumping kprobe:\n");
+	printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+	       kp->symbol_name, kp->addr, kp->offset);
+}
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
 					     unsigned long val, void *data)
-- 
cgit v1.2.3-71-gd317


From aeaeae1187d7520f1c5559623f0a149da6a1c96e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 27 Aug 2009 05:09:51 +0200
Subject: tracing: Restore the const qualifier for field names and types
 definition

Restore the const qualifier in field's name and type parameters of
trace_define_field that was lost while solving a conflict.

Fields names and types are defined as builtin constant strings in
static TRACE_EVENTs. But kprobes allocates these dynamically.

That said, we still want to always pass these strings as const char *
in trace_define_fields() to avoid any further accidental writes on
the pointed strings.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h  | 6 +++---
 kernel/trace/trace_events.c   | 4 ++--
 kernel/trace/trace_syscalls.c | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 1ab3089b5c59..73edf5a52e31 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -148,9 +148,9 @@ enum {
 };
 
 extern int trace_define_common_fields(struct ftrace_event_call *call);
-extern int trace_define_field(struct ftrace_event_call *call, char *type,
-			      char *name, int offset, int size, int is_signed,
-			      int filter_type);
+extern int trace_define_field(struct ftrace_event_call *call, const char *type,
+			      const char *name, int offset, int size,
+			      int is_signed, int filter_type);
 extern int trace_add_event_call(struct ftrace_event_call *call);
 extern void trace_remove_event_call(struct ftrace_event_call *call);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8079bb511c43..197cdaa96c43 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,8 +27,8 @@ DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
 
-int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size, int is_signed,
+int trace_define_field(struct ftrace_event_call *call, const char *type,
+		       const char *name, int offset, int size, int is_signed,
 		       int filter_type)
 {
 	struct ftrace_event_field *field;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5931933587e9..a928dd004535 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -193,8 +193,8 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
 		return ret;
 
 	for (i = 0; i < meta->nb_args; i++) {
-		ret = trace_define_field(call, (char *)meta->types[i],
-					 (char *)meta->args[i], offset,
+		ret = trace_define_field(call, meta->types[i],
+					 meta->args[i], offset,
 					 sizeof(unsigned long), 0,
 					 FILTER_OTHER);
 		offset += sizeof(unsigned long);
-- 
cgit v1.2.3-71-gd317


From f8468f3695209735c1595342f6bd95f7bdab66e1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 27 Aug 2009 05:23:29 +0200
Subject: tracing: Remove unneeded pointer casts

Cleaup uneeded casts from void * to char * in syscalls tracing file.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a928dd004535..e7c676e50a7f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -324,7 +324,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 	int num;
 	char *name;
 
-	name = (char *)call->data;
+	name = call->data;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
 		return -ENOSYS;
@@ -347,7 +347,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 	int num;
 	char *name;
 
-	name = (char *)call->data;
+	name = call->data;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
 		return;
-- 
cgit v1.2.3-71-gd317


From 8f270083587a4cb70fa14f0e2fd698eb08a4dd07 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 27 Aug 2009 13:23:18 -0400
Subject: kprobes: Fix to add __kprobes to notify_die

Add __kprobes to notify_die() because do_int3() calls notify_die()
instead of atomic_notify_call_chain() which is already marked as
__kprobes.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20090827172318.8246.53702.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/notifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
 	       struct pt_regs *regs, long err, int trap, int sig)
 {
 	struct die_args args = {
-- 
cgit v1.2.3-71-gd317


From 65e234ec2c4a0659ca22531dc1372a185f088517 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 27 Aug 2009 13:23:32 -0400
Subject: kprobes: Prohibit to probe native_get_debugreg

Since do_debug() calls get_debugreg(), native_get_debugreg() will be
called from singlestepping. This can cause an int3 infinite loop.

We can't put it in the .text.kprobes section because it is inlined,
then we blacklist its name.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20090827172332.8246.34194.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/kprobes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f72e96c25a38..3267d90bc9d6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,7 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
  */
 static struct kprobe_blackpoint kprobe_blacklist[] = {
 	{"preempt_schedule",},
+	{"native_get_debugreg",},
 	{NULL}    /* Terminator */
 };
 
-- 
cgit v1.2.3-71-gd317


From d6a65dffb30d8636b1e5d4c201564ef401a246cf Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 7 Sep 2009 03:23:20 +0200
Subject: tracing: Fix ring-buffer and ksym tracer merge interaction

The compiler warns us about:

  kernel/trace/trace_ksym.c: In function ksym_hbp_handler:
  kernel/trace/trace_ksym.c:92: attention : passing argument 1 of trace_buffer_lock_reserve from incompatible pointer type
  kernel/trace/trace_ksym.c:106: attention : passing argument 1 of trace_buffer_unlock_commit from incompatible pointer type

Commit "e77405ad" (tracing: pass around ring buffer
instead of tracer) has changed the central tracing
APIs. And this change has updated every callsites of
these APIs except those that aren't in tracing/core,
such as the ksym tracer.

Cc: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 2fde875ead4c..6d5609c67378 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -78,17 +78,18 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
 void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
 {
 	struct ring_buffer_event *event;
-	struct trace_array *tr;
 	struct ksym_trace_entry *entry;
+	struct ring_buffer *buffer;
 	int pc;
 
 	if (!ksym_tracing_enabled)
 		return;
 
-	tr = ksym_trace_array;
+	buffer = ksym_trace_array->buffer;
+
 	pc = preempt_count();
 
-	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
+	event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
 							sizeof(*entry), 0, pc);
 	if (!event)
 		return;
@@ -103,7 +104,7 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
 	ksym_collect_stats(hbp->info.address);
 #endif /* CONFIG_PROFILE_KSYM_TRACER */
 
-	trace_buffer_unlock_commit(tr, event, 0, pc);
+	trace_buffer_unlock_commit(buffer, event, 0, pc);
 }
 
 /* Valid access types are represented as
-- 
cgit v1.2.3-71-gd317


From a00e817f42663941ea0aa5f85a9d1c4f8b212839 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 8 Sep 2009 12:47:55 -0400
Subject: kprobes/x86-32: Move irq-exit functions to kprobes section

Move irq-exit functions to .kprobes.text section to protect against
kprobes recursion.

When I ran kprobe stress test on x86-32, I found below symbols
cause unrecoverable recursive probing:

	ret_from_exception
	ret_from_intr
	check_userspace
	restore_all
	restore_all_notrace
	restore_nocheck
	irq_return

And also, I found some interrupt/exception entry points that
cause similar problems.

This patch moves those symbols (including their container functions)
to .kprobes.text section to prevent any kprobes probing.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20090908164755.24050.81182.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/entry_32.S | 24 ++++++++++++++++++++++++
 kernel/kprobes.c           |  2 ++
 2 files changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..beb30da203d6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -333,6 +333,10 @@ ENTRY(ret_from_fork)
 	CFI_ENDPROC
 END(ret_from_fork)
 
+/*
+ * Interrupt exit functions should be protected against kprobes
+ */
+	.pushsection .kprobes.text, "ax"
 /*
  * Return to user mode is not as complex as all this looks,
  * but we want the default path for a system call return to
@@ -383,6 +387,10 @@ need_resched:
 END(resume_kernel)
 #endif
 	CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+	.popsection
 
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
@@ -513,6 +521,10 @@ sysexit_audit:
 	PTGS_TO_GS_EX
 ENDPROC(ia32_sysenter_target)
 
+/*
+ * syscall stub including irq exit should be protected against kprobes
+ */
+	.pushsection .kprobes.text, "ax"
 	# system call handler stub
 ENTRY(system_call)
 	RING0_INT_FRAME			# can't unwind into user space anyway
@@ -705,6 +717,10 @@ syscall_badsys:
 	jmp resume_userspace
 END(syscall_badsys)
 	CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+	.popsection
 
 /*
  * System calls that need a pt_regs pointer.
@@ -814,6 +830,10 @@ common_interrupt:
 ENDPROC(common_interrupt)
 	CFI_ENDPROC
 
+/*
+ *  Irq entries should be protected against kprobes
+ */
+	.pushsection .kprobes.text, "ax"
 #define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
 	RING0_INT_FRAME;		\
@@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug)
 	jmp error_code
 	CFI_ENDPROC
 END(spurious_interrupt_bug)
+/*
+ * End of kprobes section
+ */
+	.popsection
 
 ENTRY(kernel_thread_helper)
 	pushl $0		# fake return address for unwinder
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3267d90bc9d6..00d01b0f9fee 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -91,6 +91,8 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
 static struct kprobe_blackpoint kprobe_blacklist[] = {
 	{"preempt_schedule",},
 	{"native_get_debugreg",},
+	{"irq_entries_start",},
+	{"common_interrupt",},
 	{NULL}    /* Terminator */
 };
 
-- 
cgit v1.2.3-71-gd317


From 2fba0c8867af47f6455490e7b59e512dd180c027 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 10 Sep 2009 19:53:14 -0400
Subject: tracing/kprobes: Fix probe offset to be unsigned

Prohibit user to specify negative offset from symbols.
Since kprobe.offset is unsigned int, the offset must be always positive
value.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090910235314.22412.64631.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt | 14 +++++++-------
 kernel/trace/trace_kprobe.c         | 19 +++++++------------
 2 files changed, 14 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 3de751747164..db5531865648 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -25,15 +25,15 @@ probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter.
 
 Synopsis of kprobe_events
 -------------------------
-  p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS]	: Set a probe
-  r[:EVENT] SYMBOL[+0] [FETCHARGS]			: Set a return probe
+  p[:EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS]	: Set a probe
+  r[:EVENT] SYMBOL[+0] [FETCHARGS]		: Set a return probe
 
- EVENT			: Event name. If omitted, the event name is generated
-			  based on SYMBOL+offs or MEMADDR.
- SYMBOL[+offs|-offs]	: Symbol+offset where the probe is inserted.
- MEMADDR		: Address where the probe is inserted.
+ EVENT		: Event name. If omitted, the event name is generated
+		  based on SYMBOL+offs or MEMADDR.
+ SYMBOL[+offs]	: Symbol+offset where the probe is inserted.
+ MEMADDR	: Address where the probe is inserted.
 
- FETCHARGS		: Arguments. Each probe can have up to 128 args.
+ FETCHARGS	: Arguments. Each probe can have up to 128 args.
   %REG	: Fetch register REG
   sN	: Fetch Nth entry of stack (N >= 0)
   sa	: Fetch stack address.
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 19a6de63b44b..c24b7e9d97c4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -210,7 +210,7 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
 	return tp->symbol ? tp->symbol : "unknown";
 }
 
-static __kprobes long probe_offset(struct trace_probe *tp)
+static __kprobes unsigned int probe_offset(struct trace_probe *tp)
 {
 	return (probe_is_return(tp)) ? tp->rp.kp.offset : tp->kp.offset;
 }
@@ -380,7 +380,7 @@ end:
 }
 
 /* Split symbol and offset. */
-static int split_symbol_offset(char *symbol, long *offset)
+static int split_symbol_offset(char *symbol, unsigned long *offset)
 {
 	char *tmp;
 	int ret;
@@ -389,16 +389,11 @@ static int split_symbol_offset(char *symbol, long *offset)
 		return -EINVAL;
 
 	tmp = strchr(symbol, '+');
-	if (!tmp)
-		tmp = strchr(symbol, '-');
-
 	if (tmp) {
 		/* skip sign because strict_strtol doesn't accept '+' */
-		ret = strict_strtol(tmp + 1, 0, offset);
+		ret = strict_strtoul(tmp + 1, 0, offset);
 		if (ret)
 			return ret;
-		if (*tmp == '-')
-			*offset = -(*offset);
 		*tmp = '\0';
 	} else
 		*offset = 0;
@@ -520,7 +515,7 @@ static int create_trace_probe(int argc, char **argv)
 {
 	/*
 	 * Argument syntax:
-	 *  - Add kprobe: p[:EVENT] SYMBOL[+OFFS|-OFFS]|ADDRESS [FETCHARGS]
+	 *  - Add kprobe: p[:EVENT] SYMBOL[+OFFS]|ADDRESS [FETCHARGS]
 	 *  - Add kretprobe: r[:EVENT] SYMBOL[+0] [FETCHARGS]
 	 * Fetch args:
 	 *  aN	: fetch Nth of function argument. (N:0-)
@@ -539,7 +534,7 @@ static int create_trace_probe(int argc, char **argv)
 	int i, ret = 0;
 	int is_return = 0;
 	char *symbol = NULL, *event = NULL;
-	long offset = 0;
+	unsigned long offset = 0;
 	void *addr = NULL;
 
 	if (argc < 2)
@@ -605,7 +600,7 @@ static int create_trace_probe(int argc, char **argv)
 
 	if (tp->symbol) {
 		kp->symbol_name = tp->symbol;
-		kp->offset = offset;
+		kp->offset = (unsigned int)offset;
 	} else
 		kp->addr = addr;
 
@@ -675,7 +670,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	seq_printf(m, ":%s", tp->call.name);
 
 	if (tp->symbol)
-		seq_printf(m, " %s%+ld", probe_symbol(tp), probe_offset(tp));
+		seq_printf(m, " %s+%u", probe_symbol(tp), probe_offset(tp));
 	else
 		seq_printf(m, " 0x%p", probe_address(tp));
 
-- 
cgit v1.2.3-71-gd317


From 4a846b443b4e8633057946a2234e23559a67ce42 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 11 Sep 2009 05:31:21 +0200
Subject: tracing/kprobes: Cleanup kprobe tracer code.

Simplify trace_probe to remove a union, and remove some redundant
wrappers.
And also, cleanup create_trace_probe() function.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090910235322.22412.52525.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 81 +++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c24b7e9d97c4..4ce728ca1b18 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -180,10 +180,7 @@ static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
 
 struct trace_probe {
 	struct list_head	list;
-	union {
-		struct kprobe		kp;
-		struct kretprobe	rp;
-	};
+	struct kretprobe	rp;	/* Use rp.kp for kprobe use */
 	unsigned long 		nhit;
 	const char		*symbol;	/* symbol name */
 	struct ftrace_event_call	call;
@@ -202,7 +199,7 @@ static int kretprobe_trace_func(struct kretprobe_instance *ri,
 
 static __kprobes int probe_is_return(struct trace_probe *tp)
 {
-	return (tp->rp.handler == kretprobe_trace_func);
+	return tp->rp.handler != NULL;
 }
 
 static __kprobes const char *probe_symbol(struct trace_probe *tp)
@@ -210,16 +207,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
 	return tp->symbol ? tp->symbol : "unknown";
 }
 
-static __kprobes unsigned int probe_offset(struct trace_probe *tp)
-{
-	return (probe_is_return(tp)) ? tp->rp.kp.offset : tp->kp.offset;
-}
-
-static __kprobes void *probe_address(struct trace_probe *tp)
-{
-	return (probe_is_return(tp)) ? tp->rp.kp.addr : tp->kp.addr;
-}
-
 static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
 {
 	int ret = -EINVAL;
@@ -269,8 +256,14 @@ static void unregister_probe_event(struct trace_probe *tp);
 static DEFINE_MUTEX(probe_lock);
 static LIST_HEAD(probe_list);
 
-static struct trace_probe *alloc_trace_probe(const char *symbol,
-					     const char *event, int nargs)
+/*
+ * Allocate new trace_probe and initialize it (including kprobes).
+ */
+static struct trace_probe *alloc_trace_probe(const char *event,
+					     void *addr,
+					     const char *symbol,
+					     unsigned long offs,
+					     int nargs, int is_return)
 {
 	struct trace_probe *tp;
 
@@ -282,7 +275,16 @@ static struct trace_probe *alloc_trace_probe(const char *symbol,
 		tp->symbol = kstrdup(symbol, GFP_KERNEL);
 		if (!tp->symbol)
 			goto error;
-	}
+		tp->rp.kp.symbol_name = tp->symbol;
+		tp->rp.kp.offset = offs;
+	} else
+		tp->rp.kp.addr = addr;
+
+	if (is_return)
+		tp->rp.handler = kretprobe_trace_func;
+	else
+		tp->rp.kp.pre_handler = kprobe_trace_func;
+
 	if (!event)
 		goto error;
 	tp->call.name = kstrdup(event, GFP_KERNEL);
@@ -327,7 +329,7 @@ static void __unregister_trace_probe(struct trace_probe *tp)
 	if (probe_is_return(tp))
 		unregister_kretprobe(&tp->rp);
 	else
-		unregister_kprobe(&tp->kp);
+		unregister_kprobe(&tp->rp.kp);
 }
 
 /* Unregister a trace_probe and probe_event: call with locking probe_lock */
@@ -349,14 +351,14 @@ static int register_trace_probe(struct trace_probe *tp)
 	if (probe_is_return(tp))
 		ret = register_kretprobe(&tp->rp);
 	else
-		ret = register_kprobe(&tp->kp);
+		ret = register_kprobe(&tp->rp.kp);
 
 	if (ret) {
 		pr_warning("Could not insert probe(%d)\n", ret);
 		if (ret == -EILSEQ) {
 			pr_warning("Probing address(0x%p) is not an "
 				   "instruction boundary.\n",
-				   probe_address(tp));
+				   tp->rp.kp.addr);
 			ret = -EINVAL;
 		}
 		goto end;
@@ -530,12 +532,12 @@ static int create_trace_probe(int argc, char **argv)
 	 *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
 	 */
 	struct trace_probe *tp;
-	struct kprobe *kp;
 	int i, ret = 0;
 	int is_return = 0;
 	char *symbol = NULL, *event = NULL;
 	unsigned long offset = 0;
 	void *addr = NULL;
+	char buf[MAX_EVENT_NAME_LEN];
 
 	if (argc < 2)
 		return -EINVAL;
@@ -577,33 +579,18 @@ static int create_trace_probe(int argc, char **argv)
 	/* setup a probe */
 	if (!event) {
 		/* Make a new event name */
-		char buf[MAX_EVENT_NAME_LEN];
 		if (symbol)
 			snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
 				 is_return ? 'r' : 'p', symbol, offset);
 		else
 			snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
 				 is_return ? 'r' : 'p', addr);
-		tp = alloc_trace_probe(symbol, buf, argc);
-	} else
-		tp = alloc_trace_probe(symbol, event, argc);
+		event = buf;
+	}
+	tp = alloc_trace_probe(event, addr, symbol, offset, argc, is_return);
 	if (IS_ERR(tp))
 		return PTR_ERR(tp);
 
-	if (is_return) {
-		kp = &tp->rp.kp;
-		tp->rp.handler = kretprobe_trace_func;
-	} else {
-		kp = &tp->kp;
-		tp->kp.pre_handler = kprobe_trace_func;
-	}
-
-	if (tp->symbol) {
-		kp->symbol_name = tp->symbol;
-		kp->offset = (unsigned int)offset;
-	} else
-		kp->addr = addr;
-
 	/* parse arguments */
 	ret = 0;
 	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
@@ -670,9 +657,9 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	seq_printf(m, ":%s", tp->call.name);
 
 	if (tp->symbol)
-		seq_printf(m, " %s+%u", probe_symbol(tp), probe_offset(tp));
+		seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
 	else
-		seq_printf(m, " 0x%p", probe_address(tp));
+		seq_printf(m, " 0x%p", tp->rp.kp.addr);
 
 	for (i = 0; i < tp->nr_args; i++) {
 		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
@@ -783,7 +770,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 	struct trace_probe *tp = v;
 
 	seq_printf(m, "  %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
-		   probe_is_return(tp) ? tp->rp.kp.nmissed : tp->kp.nmissed);
+		   tp->rp.kp.nmissed);
 
 	return 0;
 }
@@ -811,7 +798,7 @@ static const struct file_operations kprobe_profile_ops = {
 /* Kprobe handler */
 static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 {
-	struct trace_probe *tp = container_of(kp, struct trace_probe, kp);
+	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct kprobe_trace_entry *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
@@ -866,7 +853,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
 
 	entry = ring_buffer_event_data(event);
 	entry->nargs = tp->nr_args;
-	entry->func = (unsigned long)probe_address(tp);
+	entry->func = (unsigned long)tp->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i], regs);
@@ -945,7 +932,7 @@ static int probe_event_enable(struct ftrace_event_call *call)
 	if (probe_is_return(tp))
 		return enable_kretprobe(&tp->rp);
 	else
-		return enable_kprobe(&tp->kp);
+		return enable_kprobe(&tp->rp.kp);
 }
 
 static void probe_event_disable(struct ftrace_event_call *call)
@@ -955,7 +942,7 @@ static void probe_event_disable(struct ftrace_event_call *call)
 	if (probe_is_return(tp))
 		disable_kretprobe(&tp->rp);
 	else
-		disable_kprobe(&tp->kp);
+		disable_kprobe(&tp->rp.kp);
 }
 
 static int probe_event_raw_init(struct ftrace_event_call *event_call)
-- 
cgit v1.2.3-71-gd317


From e08d1c657f70bcaca11401cd6ac5c8fe59bd2bb7 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 10 Sep 2009 19:53:30 -0400
Subject: tracing/kprobes: Add event profiling support

Add *probe_profile_enable/disable to support kprobes raw events
sampling from perf counters, like other ftrace events, when
CONFIG_PROFILE_EVENT=y.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090910235329.22412.94731.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt |   4 +-
 kernel/trace/trace_kprobe.c         | 110 +++++++++++++++++++++++++++++++++++-
 2 files changed, 111 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index db5531865648..8f882ebd1368 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -62,13 +62,15 @@ enabled:
   You can enable/disable the probe by writing 1 or 0 on it.
 
 format:
-  It shows the format of this probe event. It also shows aliases of arguments
+  This shows the format of this probe event. It also shows aliases of arguments
  which you specified to kprobe_events.
 
 filter:
   You can write filtering rules of this event. And you can use both of aliase
  names and field names for describing filters.
 
+id:
+  This shows the id of this probe event.
 
 Event Profiling
 ---------------
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 4ce728ca1b18..730e992d28da 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -28,6 +28,7 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/ptrace.h>
+#include <linux/perf_counter.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -280,6 +281,7 @@ static struct trace_probe *alloc_trace_probe(const char *event,
 	} else
 		tp->rp.kp.addr = addr;
 
+	/* Set handler here for checking whether this probe is return or not. */
 	if (is_return)
 		tp->rp.handler = kretprobe_trace_func;
 	else
@@ -929,10 +931,13 @@ static int probe_event_enable(struct ftrace_event_call *call)
 {
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
-	if (probe_is_return(tp))
+	if (probe_is_return(tp)) {
+		tp->rp.handler = kretprobe_trace_func;
 		return enable_kretprobe(&tp->rp);
-	else
+	} else {
+		tp->rp.kp.pre_handler = kprobe_trace_func;
 		return enable_kprobe(&tp->rp.kp);
+	}
 }
 
 static void probe_event_disable(struct ftrace_event_call *call)
@@ -1105,6 +1110,101 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 					  "func, ret_ip");
 }
 
+#ifdef CONFIG_EVENT_PROFILE
+
+/* Kprobe profile handler */
+static __kprobes int kprobe_profile_func(struct kprobe *kp,
+					 struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+	struct ftrace_event_call *call = &tp->call;
+	struct kprobe_trace_entry *entry;
+	int size, i, pc;
+	unsigned long irq_flags;
+
+	local_save_flags(irq_flags);
+	pc = preempt_count();
+
+	size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+
+	do {
+		char raw_data[size];
+		struct trace_entry *ent;
+
+		*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+		entry = (struct kprobe_trace_entry *)raw_data;
+		ent = &entry->ent;
+
+		tracing_generic_entry_update(ent, irq_flags, pc);
+		ent->type = call->id;
+		entry->nargs = tp->nr_args;
+		entry->ip = (unsigned long)kp->addr;
+		for (i = 0; i < tp->nr_args; i++)
+			entry->args[i] = call_fetch(&tp->args[i], regs);
+		perf_tpcounter_event(call->id, entry->ip, 1, entry, size);
+	} while (0);
+	return 0;
+}
+
+/* Kretprobe profile handler */
+static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+					    struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+	struct ftrace_event_call *call = &tp->call;
+	struct kretprobe_trace_entry *entry;
+	int size, i, pc;
+	unsigned long irq_flags;
+
+	local_save_flags(irq_flags);
+	pc = preempt_count();
+
+	size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+
+	do {
+		char raw_data[size];
+		struct trace_entry *ent;
+
+		*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+		entry = (struct kretprobe_trace_entry *)raw_data;
+		ent = &entry->ent;
+
+		tracing_generic_entry_update(ent, irq_flags, pc);
+		ent->type = call->id;
+		entry->nargs = tp->nr_args;
+		entry->func = (unsigned long)tp->rp.kp.addr;
+		entry->ret_ip = (unsigned long)ri->ret_addr;
+		for (i = 0; i < tp->nr_args; i++)
+			entry->args[i] = call_fetch(&tp->args[i], regs);
+		perf_tpcounter_event(call->id, entry->ret_ip, 1, entry, size);
+	} while (0);
+	return 0;
+}
+
+static int probe_profile_enable(struct ftrace_event_call *call)
+{
+	struct trace_probe *tp = (struct trace_probe *)call->data;
+
+	if (atomic_inc_return(&call->profile_count))
+		return 0;
+
+	if (probe_is_return(tp)) {
+		tp->rp.handler = kretprobe_profile_func;
+		return enable_kretprobe(&tp->rp);
+	} else {
+		tp->rp.kp.pre_handler = kprobe_profile_func;
+		return enable_kprobe(&tp->rp.kp);
+	}
+}
+
+static void probe_profile_disable(struct ftrace_event_call *call)
+{
+	if (atomic_add_negative(-1, &call->profile_count))
+		probe_event_disable(call);
+}
+
+#endif	/* CONFIG_EVENT_PROFILE */
+
 static int register_probe_event(struct trace_probe *tp)
 {
 	struct ftrace_event_call *call = &tp->call;
@@ -1130,6 +1230,12 @@ static int register_probe_event(struct trace_probe *tp)
 	call->enabled = 1;
 	call->regfunc = probe_event_enable;
 	call->unregfunc = probe_event_disable;
+
+#ifdef CONFIG_EVENT_PROFILE
+	atomic_set(&call->profile_count, -1);
+	call->profile_enable = probe_profile_enable;
+	call->profile_disable = probe_profile_disable;
+#endif
 	call->data = tp;
 	ret = trace_add_event_call(call);
 	if (ret) {
-- 
cgit v1.2.3-71-gd317


From eca0d916f6429785bbc88db3ff66631cde62b432 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 10 Sep 2009 19:53:38 -0400
Subject: tracing/kprobes: Add argument name support

Add argument name assignment support and remove "alias" lines from format.
This allows user to assign unique name to each argument. For example,

$ echo p do_sys_open dfd=a0 filename=a1 flags=a2 mode=a3 > kprobe_events

This assigns dfd, filename, flags, and mode to 1st - 4th arguments
respectively. Trace buffer shows those names too.

	<...>-1439  [000] 1200885.933147: do_sys_open+0x0/0xdf: dfd=ffffff9c filename=bfa898ac flags=8000 mode=0

This helps users to know what each value means.

Users can filter each events by these names too. Note that you can not
filter by argN anymore.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090910235337.22412.77383.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt |  46 ++++++-------
 kernel/trace/trace_kprobe.c         | 128 ++++++++++++++++++------------------
 2 files changed, 84 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 8f882ebd1368..aaa6c1067c78 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -42,7 +42,8 @@ Synopsis of kprobe_events
   aN	: Fetch function argument. (N >= 0)(*)
   rv	: Fetch return value.(**)
   ra	: Fetch return address.(**)
-  +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***)
+  +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
+  NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
 
   (*) aN may not correct on asmlinkaged functions and at the middle of
       function body.
@@ -62,12 +63,10 @@ enabled:
   You can enable/disable the probe by writing 1 or 0 on it.
 
 format:
-  This shows the format of this probe event. It also shows aliases of arguments
- which you specified to kprobe_events.
+  This shows the format of this probe event.
 
 filter:
-  You can write filtering rules of this event. And you can use both of aliase
- names and field names for describing filters.
+  You can write filtering rules of this event.
 
 id:
   This shows the id of this probe event.
@@ -85,10 +84,11 @@ Usage examples
 To add a probe as a new event, write a new definition to kprobe_events
 as below.
 
-  echo p:myprobe do_sys_open a0 a1 a2 a3 > /sys/kernel/debug/tracing/kprobe_events
+  echo p:myprobe do_sys_open dfd=a0 filename=a1 flags=a2 mode=a3 > /sys/kernel/debug/tracing/kprobe_events
 
  This sets a kprobe on the top of do_sys_open() function with recording
-1st to 4th arguments as "myprobe" event.
+1st to 4th arguments as "myprobe" event. As this example shows, users can
+choose more familiar names for each arguments.
 
   echo r:myretprobe do_sys_open rv ra >> /sys/kernel/debug/tracing/kprobe_events
 
@@ -99,7 +99,7 @@ recording return value and return address as "myretprobe" event.
 
   cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
 name: myprobe
-ID: 23
+ID: 75
 format:
 	field:unsigned short common_type;	offset:0;	size:2;
 	field:unsigned char common_flags;	offset:2;	size:1;
@@ -109,21 +109,15 @@ format:
 
 	field: unsigned long ip;	offset:16;tsize:8;
 	field: int nargs;	offset:24;tsize:4;
-	field: unsigned long arg0;	offset:32;tsize:8;
-	field: unsigned long arg1;	offset:40;tsize:8;
-	field: unsigned long arg2;	offset:48;tsize:8;
-	field: unsigned long arg3;	offset:56;tsize:8;
+	field: unsigned long dfd;	offset:32;tsize:8;
+	field: unsigned long filename;	offset:40;tsize:8;
+	field: unsigned long flags;	offset:48;tsize:8;
+	field: unsigned long mode;	offset:56;tsize:8;
 
-	alias: a0;	original: arg0;
-	alias: a1;	original: arg1;
-	alias: a2;	original: arg2;
-	alias: a3;	original: arg3;
+print fmt: "%lx: dfd=%lx filename=%lx flags=%lx mode=%lx", ip, REC->dfd, REC->filename, REC->flags, REC->mode
 
-print fmt: "%lx: 0x%lx 0x%lx 0x%lx 0x%lx", ip, arg0, arg1, arg2, arg3
 
-
- You can see that the event has 4 arguments and alias expressions
-corresponding to it.
+ You can see that the event has 4 arguments as in the expressions you specified.
 
   echo > /sys/kernel/debug/tracing/kprobe_events
 
@@ -135,12 +129,12 @@ corresponding to it.
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
-           <...>-1447  [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0
-           <...>-1447  [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a
-           <...>-1447  [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6
-           <...>-1447  [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a
-           <...>-1447  [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10
-           <...>-1447  [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a
+           <...>-1447  [001] 1038282.286875: do_sys_open+0x0/0xd6: dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
+           <...>-1447  [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: rv=fffffffffffffffe ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286885: do_sys_open+0x0/0xd6: dfd=ffffff9c filename=40413c flags=8000 mode=1b6
+           <...>-1447  [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: rv=3 ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286969: do_sys_open+0x0/0xd6: dfd=ffffff9c filename=4041c6 flags=98800 mode=10
+           <...>-1447  [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: rv=3 ra=ffffffff81367a3a
 
 
  Each line shows when the kernel hits a probe, and <- SYMBOL means kernel
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 730e992d28da..44dad1aa95d3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -176,9 +176,14 @@ static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
 }
 
 /**
- * kprobe_trace_core
+ * Kprobe tracer core functions
  */
 
+struct probe_arg {
+	struct fetch_func	fetch;
+	const char		*name;
+};
+
 struct trace_probe {
 	struct list_head	list;
 	struct kretprobe	rp;	/* Use rp.kp for kprobe use */
@@ -187,12 +192,12 @@ struct trace_probe {
 	struct ftrace_event_call	call;
 	struct trace_event		event;
 	unsigned int		nr_args;
-	struct fetch_func	args[];
+	struct probe_arg	args[];
 };
 
 #define SIZEOF_TRACE_PROBE(n)			\
 	(offsetof(struct trace_probe, args) +	\
-	(sizeof(struct fetch_func) * (n)))
+	(sizeof(struct probe_arg) * (n)))
 
 static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs);
 static int kretprobe_trace_func(struct kretprobe_instance *ri,
@@ -301,15 +306,21 @@ error:
 	return ERR_PTR(-ENOMEM);
 }
 
+static void free_probe_arg(struct probe_arg *arg)
+{
+	if (arg->fetch.func == fetch_symbol)
+		free_symbol_cache(arg->fetch.data);
+	else if (arg->fetch.func == fetch_indirect)
+		free_indirect_fetch_data(arg->fetch.data);
+	kfree(arg->name);
+}
+
 static void free_trace_probe(struct trace_probe *tp)
 {
 	int i;
 
 	for (i = 0; i < tp->nr_args; i++)
-		if (tp->args[i].func == fetch_symbol)
-			free_symbol_cache(tp->args[i].data);
-		else if (tp->args[i].func == fetch_indirect)
-			free_indirect_fetch_data(tp->args[i].data);
+		free_probe_arg(&tp->args[i]);
 
 	kfree(tp->call.name);
 	kfree(tp->symbol);
@@ -532,11 +543,13 @@ static int create_trace_probe(int argc, char **argv)
 	 *  %REG	: fetch register REG
 	 * Indirect memory fetch:
 	 *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
+	 * Alias name of args:
+	 *  NAME=FETCHARG : set NAME as alias of FETCHARG.
 	 */
 	struct trace_probe *tp;
 	int i, ret = 0;
 	int is_return = 0;
-	char *symbol = NULL, *event = NULL;
+	char *symbol = NULL, *event = NULL, *arg = NULL;
 	unsigned long offset = 0;
 	void *addr = NULL;
 	char buf[MAX_EVENT_NAME_LEN];
@@ -596,12 +609,21 @@ static int create_trace_probe(int argc, char **argv)
 	/* parse arguments */
 	ret = 0;
 	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
-		if (strlen(argv[i]) > MAX_ARGSTR_LEN) {
-			pr_info("Argument%d(%s) is too long.\n", i, argv[i]);
+		/* Parse argument name */
+		arg = strchr(argv[i], '=');
+		if (arg)
+			*arg++ = '\0';
+		else
+			arg = argv[i];
+		tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+
+		/* Parse fetch argument */
+		if (strlen(arg) > MAX_ARGSTR_LEN) {
+			pr_info("Argument%d(%s) is too long.\n", i, arg);
 			ret = -ENOSPC;
 			goto error;
 		}
-		ret = parse_probe_arg(argv[i], &tp->args[i], is_return);
+		ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
 		if (ret)
 			goto error;
 	}
@@ -664,12 +686,12 @@ static int probes_seq_show(struct seq_file *m, void *v)
 		seq_printf(m, " 0x%p", tp->rp.kp.addr);
 
 	for (i = 0; i < tp->nr_args; i++) {
-		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
 		if (ret < 0) {
 			pr_warning("Argument%d decoding error(%d).\n", i, ret);
 			return ret;
 		}
-		seq_printf(m, " %s", buf);
+		seq_printf(m, " %s=%s", tp->args[i].name, buf);
 	}
 	seq_printf(m, "\n");
 	return 0;
@@ -824,7 +846,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	entry->nargs = tp->nr_args;
 	entry->ip = (unsigned long)kp->addr;
 	for (i = 0; i < tp->nr_args; i++)
-		entry->args[i] = call_fetch(&tp->args[i], regs);
+		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -858,7 +880,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
 	entry->func = (unsigned long)tp->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	for (i = 0; i < tp->nr_args; i++)
-		entry->args[i] = call_fetch(&tp->args[i], regs);
+		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -872,9 +894,13 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
 {
 	struct kprobe_trace_entry *field;
 	struct trace_seq *s = &iter->seq;
+	struct trace_event *event;
+	struct trace_probe *tp;
 	int i;
 
 	field = (struct kprobe_trace_entry *)iter->ent;
+	event = ftrace_find_event(field->ent.type);
+	tp = container_of(event, struct trace_probe, event);
 
 	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto partial;
@@ -883,7 +909,8 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
 		goto partial;
 
 	for (i = 0; i < field->nargs; i++)
-		if (!trace_seq_printf(s, " 0x%lx", field->args[i]))
+		if (!trace_seq_printf(s, " %s=%lx",
+				      tp->args[i].name, field->args[i]))
 			goto partial;
 
 	if (!trace_seq_puts(s, "\n"))
@@ -899,9 +926,13 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
 {
 	struct kretprobe_trace_entry *field;
 	struct trace_seq *s = &iter->seq;
+	struct trace_event *event;
+	struct trace_probe *tp;
 	int i;
 
 	field = (struct kretprobe_trace_entry *)iter->ent;
+	event = ftrace_find_event(field->ent.type);
+	tp = container_of(event, struct trace_probe, event);
 
 	if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto partial;
@@ -916,7 +947,8 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
 		goto partial;
 
 	for (i = 0; i < field->nargs; i++)
-		if (!trace_seq_printf(s, " 0x%lx", field->args[i]))
+		if (!trace_seq_printf(s, " %s=%lx",
+				      tp->args[i].name, field->args[i]))
 			goto partial;
 
 	if (!trace_seq_puts(s, "\n"))
@@ -972,7 +1004,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
 {
 	int ret, i;
 	struct kprobe_trace_entry field;
-	char buf[MAX_ARGSTR_LEN + 1];
 	struct trace_probe *tp = (struct trace_probe *)event_call->data;
 
 	ret = trace_define_common_fields(event_call);
@@ -981,16 +1012,9 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
 
 	DEFINE_FIELD(unsigned long, ip, "ip", 0);
 	DEFINE_FIELD(int, nargs, "nargs", 1);
-	for (i = 0; i < tp->nr_args; i++) {
-		/* Set argN as a field */
-		sprintf(buf, "arg%d", i);
-		DEFINE_FIELD(unsigned long, args[i], buf, 0);
-		/* Set argument string as an alias field */
-		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
-		if (ret < 0)
-			return ret;
-		DEFINE_FIELD(unsigned long, args[i], buf, 0);
-	}
+	/* Set argument names as fields */
+	for (i = 0; i < tp->nr_args; i++)
+		DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
 	return 0;
 }
 
@@ -998,7 +1022,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
 {
 	int ret, i;
 	struct kretprobe_trace_entry field;
-	char buf[MAX_ARGSTR_LEN + 1];
 	struct trace_probe *tp = (struct trace_probe *)event_call->data;
 
 	ret = trace_define_common_fields(event_call);
@@ -1008,16 +1031,9 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
 	DEFINE_FIELD(unsigned long, func, "func", 0);
 	DEFINE_FIELD(unsigned long, ret_ip, "ret_ip", 0);
 	DEFINE_FIELD(int, nargs, "nargs", 1);
-	for (i = 0; i < tp->nr_args; i++) {
-		/* Set argN as a field */
-		sprintf(buf, "arg%d", i);
-		DEFINE_FIELD(unsigned long, args[i], buf, 0);
-		/* Set argument string as an alias field */
-		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
-		if (ret < 0)
-			return ret;
-		DEFINE_FIELD(unsigned long, args[i], buf, 0);
-	}
+	/* Set argument names as fields */
+	for (i = 0; i < tp->nr_args; i++)
+		DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
 	return 0;
 }
 
@@ -1025,31 +1041,21 @@ static int __probe_event_show_format(struct trace_seq *s,
 				     struct trace_probe *tp, const char *fmt,
 				     const char *arg)
 {
-	int i, ret;
-	char buf[MAX_ARGSTR_LEN + 1];
+	int i;
 
-	/* Show aliases */
-	for (i = 0; i < tp->nr_args; i++) {
-		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
-		if (ret < 0)
-			return ret;
-		if (!trace_seq_printf(s, "\talias: %s;\toriginal: arg%d;\n",
-				      buf, i))
-			return 0;
-	}
 	/* Show format */
 	if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
 		return 0;
 
 	for (i = 0; i < tp->nr_args; i++)
-		if (!trace_seq_puts(s, " 0x%lx"))
+		if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
 			return 0;
 
 	if (!trace_seq_printf(s, "\", %s", arg))
 		return 0;
 
 	for (i = 0; i < tp->nr_args; i++)
-		if (!trace_seq_printf(s, ", arg%d", i))
+		if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
 			return 0;
 
 	return trace_seq_puts(s, "\n");
@@ -1071,17 +1077,14 @@ static int kprobe_event_show_format(struct ftrace_event_call *call,
 {
 	struct kprobe_trace_entry field __attribute__((unused));
 	int ret, i;
-	char buf[8];
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
 	SHOW_FIELD(unsigned long, ip, "ip");
 	SHOW_FIELD(int, nargs, "nargs");
 
 	/* Show fields */
-	for (i = 0; i < tp->nr_args; i++) {
-		sprintf(buf, "arg%d", i);
-		SHOW_FIELD(unsigned long, args[i], buf);
-	}
+	for (i = 0; i < tp->nr_args; i++)
+		SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
 	trace_seq_puts(s, "\n");
 
 	return __probe_event_show_format(s, tp, "%lx:", "ip");
@@ -1092,7 +1095,6 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 {
 	struct kretprobe_trace_entry field __attribute__((unused));
 	int ret, i;
-	char buf[8];
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
 	SHOW_FIELD(unsigned long, func, "func");
@@ -1100,10 +1102,8 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 	SHOW_FIELD(int, nargs, "nargs");
 
 	/* Show fields */
-	for (i = 0; i < tp->nr_args; i++) {
-		sprintf(buf, "arg%d", i);
-		SHOW_FIELD(unsigned long, args[i], buf);
-	}
+	for (i = 0; i < tp->nr_args; i++)
+		SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
 	trace_seq_puts(s, "\n");
 
 	return __probe_event_show_format(s, tp, "%lx <- %lx:",
@@ -1140,7 +1140,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 		entry->nargs = tp->nr_args;
 		entry->ip = (unsigned long)kp->addr;
 		for (i = 0; i < tp->nr_args; i++)
-			entry->args[i] = call_fetch(&tp->args[i], regs);
+			entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 		perf_tpcounter_event(call->id, entry->ip, 1, entry, size);
 	} while (0);
 	return 0;
@@ -1175,7 +1175,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 		entry->func = (unsigned long)tp->rp.kp.addr;
 		entry->ret_ip = (unsigned long)ri->ret_addr;
 		for (i = 0; i < tp->nr_args; i++)
-			entry->args[i] = call_fetch(&tp->args[i], regs);
+			entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 		perf_tpcounter_event(call->id, entry->ret_ip, 1, entry, size);
 	} while (0);
 	return 0;
-- 
cgit v1.2.3-71-gd317


From 6e9f23d1619f7badaf9090dac09e86a22d6061d8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 10 Sep 2009 19:53:45 -0400
Subject: tracing/kprobes: Show event name in trace output

Show event name in tracing/trace output. This also fixes kprobes events
format to comply with other tracepoint events formats.

Before patching:
<...>-1447  [001] 1038282.286875: do_sys_open+0x0/0xd6: ...
<...>-1447  [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: ...

After patching:
<...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) ...
<...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) ...

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090910235345.22412.76527.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt | 16 ++++++++--------
 kernel/trace/trace_kprobe.c         | 16 +++++++++++-----
 2 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index aaa6c1067c78..a849889e6092 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -114,7 +114,7 @@ format:
 	field: unsigned long flags;	offset:48;tsize:8;
 	field: unsigned long mode;	offset:56;tsize:8;
 
-print fmt: "%lx: dfd=%lx filename=%lx flags=%lx mode=%lx", ip, REC->dfd, REC->filename, REC->flags, REC->mode
+print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode
 
 
  You can see that the event has 4 arguments as in the expressions you specified.
@@ -129,15 +129,15 @@ print fmt: "%lx: dfd=%lx filename=%lx flags=%lx mode=%lx", ip, REC->dfd, REC->fi
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
-           <...>-1447  [001] 1038282.286875: do_sys_open+0x0/0xd6: dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
-           <...>-1447  [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: rv=fffffffffffffffe ra=ffffffff81367a3a
-           <...>-1447  [001] 1038282.286885: do_sys_open+0x0/0xd6: dfd=ffffff9c filename=40413c flags=8000 mode=1b6
-           <...>-1447  [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: rv=3 ra=ffffffff81367a3a
-           <...>-1447  [001] 1038282.286969: do_sys_open+0x0/0xd6: dfd=ffffff9c filename=4041c6 flags=98800 mode=10
-           <...>-1447  [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: rv=3 ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
+           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) rv=fffffffffffffffe ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6
+           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) rv=3 ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10
+           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) rv=3 ra=ffffffff81367a3a
 
 
- Each line shows when the kernel hits a probe, and <- SYMBOL means kernel
+ Each line shows when the kernel hits an event, and <- SYMBOL means kernel
 returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel
 returns from do_sys_open to sys_open+0x1b).
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 44dad1aa95d3..1746afeaabf9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -902,10 +902,13 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
 	event = ftrace_find_event(field->ent.type);
 	tp = container_of(event, struct trace_probe, event);
 
+	if (!trace_seq_printf(s, "%s: (", tp->call.name))
+		goto partial;
+
 	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto partial;
 
-	if (!trace_seq_puts(s, ":"))
+	if (!trace_seq_puts(s, ")"))
 		goto partial;
 
 	for (i = 0; i < field->nargs; i++)
@@ -934,6 +937,9 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
 	event = ftrace_find_event(field->ent.type);
 	tp = container_of(event, struct trace_probe, event);
 
+	if (!trace_seq_printf(s, "%s: (", tp->call.name))
+		goto partial;
+
 	if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto partial;
 
@@ -943,7 +949,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
 	if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
 		goto partial;
 
-	if (!trace_seq_puts(s, ":"))
+	if (!trace_seq_puts(s, ")"))
 		goto partial;
 
 	for (i = 0; i < field->nargs; i++)
@@ -1087,7 +1093,7 @@ static int kprobe_event_show_format(struct ftrace_event_call *call,
 		SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
 	trace_seq_puts(s, "\n");
 
-	return __probe_event_show_format(s, tp, "%lx:", "ip");
+	return __probe_event_show_format(s, tp, "(%lx)", "REC->ip");
 }
 
 static int kretprobe_event_show_format(struct ftrace_event_call *call,
@@ -1106,8 +1112,8 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 		SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
 	trace_seq_puts(s, "\n");
 
-	return __probe_event_show_format(s, tp, "%lx <- %lx:",
-					  "func, ret_ip");
+	return __probe_event_show_format(s, tp, "(%lx <- %lx)",
+					  "REC->func, REC->ret_ip");
 }
 
 #ifdef CONFIG_EVENT_PROFILE
-- 
cgit v1.2.3-71-gd317


From f52487e9c0041842eeb77c6c48774414b1cede08 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 10 Sep 2009 19:53:53 -0400
Subject: tracing/kprobes: Support custom subsystem for each kprobe event

Support specifying a custom subsystem(group) for each kprobe event.
This allows users to create new group to control several probes
at once, or add events to existing groups as additional tracepoints.

New synopsis:
 p[:[subsys/]event-name] KADDR|KSYM[+offs] [ARGS]

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090910235353.22412.15149.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt |  5 +++--
 kernel/trace/trace_kprobe.c         | 33 +++++++++++++++++++++++++++------
 2 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index a849889e6092..6521681e7838 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -25,9 +25,10 @@ probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter.
 
 Synopsis of kprobe_events
 -------------------------
-  p[:EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS]	: Set a probe
-  r[:EVENT] SYMBOL[+0] [FETCHARGS]		: Set a return probe
+  p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS]	: Set a probe
+  r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS]		: Set a return probe
 
+ GRP		: Group name. If omitted, use "kprobes" for it.
  EVENT		: Event name. If omitted, the event name is generated
 		  based on SYMBOL+offs or MEMADDR.
  SYMBOL[+offs]	: Symbol+offset where the probe is inserted.
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1746afeaabf9..cbc0870dcf5d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -36,6 +36,7 @@
 #define MAX_TRACE_ARGS 128
 #define MAX_ARGSTR_LEN 63
 #define MAX_EVENT_NAME_LEN 64
+#define KPROBE_EVENT_SYSTEM "kprobes"
 
 /* currently, trace_kprobe only supports X86. */
 
@@ -265,7 +266,8 @@ static LIST_HEAD(probe_list);
 /*
  * Allocate new trace_probe and initialize it (including kprobes).
  */
-static struct trace_probe *alloc_trace_probe(const char *event,
+static struct trace_probe *alloc_trace_probe(const char *group,
+					     const char *event,
 					     void *addr,
 					     const char *symbol,
 					     unsigned long offs,
@@ -298,9 +300,16 @@ static struct trace_probe *alloc_trace_probe(const char *event,
 	if (!tp->call.name)
 		goto error;
 
+	if (!group)
+		goto error;
+	tp->call.system = kstrdup(group, GFP_KERNEL);
+	if (!tp->call.system)
+		goto error;
+
 	INIT_LIST_HEAD(&tp->list);
 	return tp;
 error:
+	kfree(tp->call.name);
 	kfree(tp->symbol);
 	kfree(tp);
 	return ERR_PTR(-ENOMEM);
@@ -322,6 +331,7 @@ static void free_trace_probe(struct trace_probe *tp)
 	for (i = 0; i < tp->nr_args; i++)
 		free_probe_arg(&tp->args[i]);
 
+	kfree(tp->call.system);
 	kfree(tp->call.name);
 	kfree(tp->symbol);
 	kfree(tp);
@@ -530,8 +540,8 @@ static int create_trace_probe(int argc, char **argv)
 {
 	/*
 	 * Argument syntax:
-	 *  - Add kprobe: p[:EVENT] SYMBOL[+OFFS]|ADDRESS [FETCHARGS]
-	 *  - Add kretprobe: r[:EVENT] SYMBOL[+0] [FETCHARGS]
+	 *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
+	 *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
 	 * Fetch args:
 	 *  aN	: fetch Nth of function argument. (N:0-)
 	 *  rv	: fetch return value
@@ -549,7 +559,7 @@ static int create_trace_probe(int argc, char **argv)
 	struct trace_probe *tp;
 	int i, ret = 0;
 	int is_return = 0;
-	char *symbol = NULL, *event = NULL, *arg = NULL;
+	char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
 	unsigned long offset = 0;
 	void *addr = NULL;
 	char buf[MAX_EVENT_NAME_LEN];
@@ -566,6 +576,15 @@ static int create_trace_probe(int argc, char **argv)
 
 	if (argv[0][1] == ':') {
 		event = &argv[0][2];
+		if (strchr(event, '/')) {
+			group = event;
+			event = strchr(group, '/') + 1;
+			event[-1] = '\0';
+			if (strlen(group) == 0) {
+				pr_info("Group name is not specifiled\n");
+				return -EINVAL;
+			}
+		}
 		if (strlen(event) == 0) {
 			pr_info("Event name is not specifiled\n");
 			return -EINVAL;
@@ -592,6 +611,8 @@ static int create_trace_probe(int argc, char **argv)
 	argc -= 2; argv += 2;
 
 	/* setup a probe */
+	if (!group)
+		group = KPROBE_EVENT_SYSTEM;
 	if (!event) {
 		/* Make a new event name */
 		if (symbol)
@@ -602,7 +623,8 @@ static int create_trace_probe(int argc, char **argv)
 				 is_return ? 'r' : 'p', addr);
 		event = buf;
 	}
-	tp = alloc_trace_probe(event, addr, symbol, offset, argc, is_return);
+	tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
+			       is_return);
 	if (IS_ERR(tp))
 		return PTR_ERR(tp);
 
@@ -1217,7 +1239,6 @@ static int register_probe_event(struct trace_probe *tp)
 	int ret;
 
 	/* Initialize ftrace_event_call */
-	call->system = "kprobes";
 	if (probe_is_return(tp)) {
 		tp->event.trace = print_kretprobe_event;
 		call->raw_init = probe_event_raw_init;
-- 
cgit v1.2.3-71-gd317


From 2d5e067edc4635ff7515bfa9ab3edb38bc344cab Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 14 Sep 2009 16:48:56 -0400
Subject: tracing/kprobes: Fix trace_probe registration order

Fix trace_probe registration order. ftrace_event_call and ftrace_event
must be registered before kprobe/kretprobe, because tracing/profiling
handlers dereference the event-id.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090914204856.18779.52961.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index cbc0870dcf5d..ea0db8eee570 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -347,20 +347,15 @@ static struct trace_probe *find_probe_event(const char *event)
 	return NULL;
 }
 
-static void __unregister_trace_probe(struct trace_probe *tp)
+/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+static void unregister_trace_probe(struct trace_probe *tp)
 {
 	if (probe_is_return(tp))
 		unregister_kretprobe(&tp->rp);
 	else
 		unregister_kprobe(&tp->rp.kp);
-}
-
-/* Unregister a trace_probe and probe_event: call with locking probe_lock */
-static void unregister_trace_probe(struct trace_probe *tp)
-{
-	unregister_probe_event(tp);
-	__unregister_trace_probe(tp);
 	list_del(&tp->list);
+	unregister_probe_event(tp);
 }
 
 /* Register a trace_probe and probe_event */
@@ -371,6 +366,19 @@ static int register_trace_probe(struct trace_probe *tp)
 
 	mutex_lock(&probe_lock);
 
+	/* register as an event */
+	old_tp = find_probe_event(tp->call.name);
+	if (old_tp) {
+		/* delete old event */
+		unregister_trace_probe(old_tp);
+		free_trace_probe(old_tp);
+	}
+	ret = register_probe_event(tp);
+	if (ret) {
+		pr_warning("Faild to register probe event(%d)\n", ret);
+		goto end;
+	}
+
 	if (probe_is_return(tp))
 		ret = register_kretprobe(&tp->rp);
 	else
@@ -384,21 +392,9 @@ static int register_trace_probe(struct trace_probe *tp)
 				   tp->rp.kp.addr);
 			ret = -EINVAL;
 		}
-		goto end;
-	}
-	/* register as an event */
-	old_tp = find_probe_event(tp->call.name);
-	if (old_tp) {
-		/* delete old event */
-		unregister_trace_probe(old_tp);
-		free_trace_probe(old_tp);
-	}
-	ret = register_probe_event(tp);
-	if (ret) {
-		pr_warning("Faild to register probe event(%d)\n", ret);
-		__unregister_trace_probe(tp);
-	}
-	list_add_tail(&tp->list, &probe_list);
+		unregister_probe_event(tp);
+	} else
+		list_add_tail(&tp->list, &probe_list);
 end:
 	mutex_unlock(&probe_lock);
 	return ret;
-- 
cgit v1.2.3-71-gd317


From 588bebb74fe87270f94c2810652bd683d63c4b54 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 16 Sep 2009 11:42:55 -0400
Subject: ftrace: Fix trace_add_event_call() to initialize list

Handle failure path in trace_add_event_call() to fix the below bug
which occurred when I tried to add invalid event twice.

Could not create debugfs 'kmalloc' directory
Failed to register kprobe event: kmalloc
Faild to register probe event(-1)
------------[ cut here ]------------
WARNING: at /home/mhiramat/ksrc/random-tracing/lib/list_debug.c:26
__list_add+0x27/0x5c()
Hardware name:
list_add corruption. next->prev should be prev (c07d78cc), but was
00001000. (next=d854236c).
Modules linked in: sunrpc uinput virtio_net virtio_balloon i2c_piix4 pcspkr
i2c_core virtio_blk virtio_pci virtio_ring virtio [last unloaded:
scsi_wait_scan]
Pid: 1394, comm: tee Not tainted 2.6.31-rc9 #51
Call Trace:
 [<c0438424>] warn_slowpath_common+0x65/0x7c
 [<c05371b3>] ? __list_add+0x27/0x5c
 [<c043846f>] warn_slowpath_fmt+0x24/0x27
 [<c05371b3>] __list_add+0x27/0x5c
 [<c047f050>] list_add+0xa/0xc
 [<c047f8f5>] trace_add_event_call+0x60/0x97
 [<c0483133>] command_trace_probe+0x42c/0x51b
 [<c044a1b3>] ? remove_wait_queue+0x22/0x27
 [<c042a9c0>] ? __wake_up+0x32/0x3b
 [<c04832f6>] probes_write+0xd4/0x10a
 [<c0483222>] ? probes_write+0x0/0x10a
 [<c04b27a9>] vfs_write+0x80/0xdf
 [<c04b289c>] sys_write+0x3b/0x5d
 [<c0670d41>] syscall_call+0x7/0xb
---[ end trace 2b962b5dc1fdc07d ]---

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4AB1077F.6020107@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ba3492076ab2..83cc2c01195d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1010,9 +1010,12 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
 		return -ENOENT;
 
 	list_add(&call->list, &ftrace_events);
-	return event_create_dir(call, d_events, &ftrace_event_id_fops,
+	ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
 				&ftrace_enable_fops, &ftrace_event_filter_fops,
 				&ftrace_event_format_fops);
+	if (ret < 0)
+		list_del(&call->list);
+	return ret;
 }
 
 /* Add an additional event_call dynamically */
-- 
cgit v1.2.3-71-gd317


From 4fead8e46fded93cc0d432ced774d9a3a8d21bad Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 14 Sep 2009 16:49:12 -0400
Subject: ftrace: Fix trace_remove_event_call() to lock trace_event_mutex

Lock not only event_mutex but also trace_event_mutex in
trace_remove_event_call() to protect __unregister_ftrace_event().

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090914204912.18779.68734.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 83cc2c01195d..f85b0f1cb942 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1054,6 +1054,9 @@ static void remove_subsystem_dir(const char *name)
 	}
 }
 
+/*
+ * Must be called under locking both of event_mutex and trace_event_mutex.
+ */
 static void __trace_remove_event_call(struct ftrace_event_call *call)
 {
 	ftrace_event_enable_disable(call, 0);
@@ -1070,7 +1073,9 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
 void trace_remove_event_call(struct ftrace_event_call *call)
 {
 	mutex_lock(&event_mutex);
+	down_write(&trace_event_mutex);
 	__trace_remove_event_call(call);
+	up_write(&trace_event_mutex);
 	mutex_unlock(&event_mutex);
 }
 
-- 
cgit v1.2.3-71-gd317


From 50d780560785b068c358675c5f0bf6c83b5c373e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 14 Sep 2009 16:49:20 -0400
Subject: tracing/kprobes: Add probe handler dispatcher to support perf and
 ftrace concurrent use

Add kprobe_dispatcher and kretprobe_dispatcher to dispatch event
in both profile and tracing handlers.

This allows simultaneous kprobe uses by ftrace and perf.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090914204920.18779.57555.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 85 +++++++++++++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ea0db8eee570..70b632c3bd08 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -185,10 +185,15 @@ struct probe_arg {
 	const char		*name;
 };
 
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE	1
+#define TP_FLAG_PROFILE	2
+
 struct trace_probe {
 	struct list_head	list;
 	struct kretprobe	rp;	/* Use rp.kp for kprobe use */
 	unsigned long 		nhit;
+	unsigned int		flags;	/* For TP_FLAG_* */
 	const char		*symbol;	/* symbol name */
 	struct ftrace_event_call	call;
 	struct trace_event		event;
@@ -200,10 +205,6 @@ struct trace_probe {
 	(offsetof(struct trace_probe, args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
-static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs);
-static int kretprobe_trace_func(struct kretprobe_instance *ri,
-				struct pt_regs *regs);
-
 static __kprobes int probe_is_return(struct trace_probe *tp)
 {
 	return tp->rp.handler != NULL;
@@ -263,6 +264,10 @@ static void unregister_probe_event(struct trace_probe *tp);
 static DEFINE_MUTEX(probe_lock);
 static LIST_HEAD(probe_list);
 
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+				struct pt_regs *regs);
+
 /*
  * Allocate new trace_probe and initialize it (including kprobes).
  */
@@ -288,11 +293,10 @@ static struct trace_probe *alloc_trace_probe(const char *group,
 	} else
 		tp->rp.kp.addr = addr;
 
-	/* Set handler here for checking whether this probe is return or not. */
 	if (is_return)
-		tp->rp.handler = kretprobe_trace_func;
+		tp->rp.handler = kretprobe_dispatcher;
 	else
-		tp->rp.kp.pre_handler = kprobe_trace_func;
+		tp->rp.kp.pre_handler = kprobe_dispatcher;
 
 	if (!event)
 		goto error;
@@ -379,6 +383,7 @@ static int register_trace_probe(struct trace_probe *tp)
 		goto end;
 	}
 
+	tp->flags = TP_FLAG_TRACE;
 	if (probe_is_return(tp))
 		ret = register_kretprobe(&tp->rp);
 	else
@@ -987,23 +992,24 @@ static int probe_event_enable(struct ftrace_event_call *call)
 {
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
-	if (probe_is_return(tp)) {
-		tp->rp.handler = kretprobe_trace_func;
+	tp->flags |= TP_FLAG_TRACE;
+	if (probe_is_return(tp))
 		return enable_kretprobe(&tp->rp);
-	} else {
-		tp->rp.kp.pre_handler = kprobe_trace_func;
+	else
 		return enable_kprobe(&tp->rp.kp);
-	}
 }
 
 static void probe_event_disable(struct ftrace_event_call *call)
 {
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
-	if (probe_is_return(tp))
-		disable_kretprobe(&tp->rp);
-	else
-		disable_kprobe(&tp->rp.kp);
+	tp->flags &= ~TP_FLAG_TRACE;
+	if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
+		if (probe_is_return(tp))
+			disable_kretprobe(&tp->rp);
+		else
+			disable_kprobe(&tp->rp.kp);
+	}
 }
 
 static int probe_event_raw_init(struct ftrace_event_call *event_call)
@@ -1212,22 +1218,57 @@ static int probe_profile_enable(struct ftrace_event_call *call)
 	if (atomic_inc_return(&call->profile_count))
 		return 0;
 
-	if (probe_is_return(tp)) {
-		tp->rp.handler = kretprobe_profile_func;
+	tp->flags |= TP_FLAG_PROFILE;
+	if (probe_is_return(tp))
 		return enable_kretprobe(&tp->rp);
-	} else {
-		tp->rp.kp.pre_handler = kprobe_profile_func;
+	else
 		return enable_kprobe(&tp->rp.kp);
-	}
 }
 
 static void probe_profile_disable(struct ftrace_event_call *call)
 {
+	struct trace_probe *tp = (struct trace_probe *)call->data;
+
 	if (atomic_add_negative(-1, &call->profile_count))
-		probe_event_disable(call);
+		tp->flags &= ~TP_FLAG_PROFILE;
+
+	if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
+		if (probe_is_return(tp))
+			disable_kretprobe(&tp->rp);
+		else
+			disable_kprobe(&tp->rp.kp);
+	}
 }
+#endif	/* CONFIG_EVENT_PROFILE */
+
+
+static __kprobes
+int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 
+	if (tp->flags & TP_FLAG_TRACE)
+		kprobe_trace_func(kp, regs);
+#ifdef CONFIG_EVENT_PROFILE
+	if (tp->flags & TP_FLAG_PROFILE)
+		kprobe_profile_func(kp, regs);
 #endif	/* CONFIG_EVENT_PROFILE */
+	return 0;	/* We don't tweek kernel, so just return 0 */
+}
+
+static __kprobes
+int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+
+	if (tp->flags & TP_FLAG_TRACE)
+		kretprobe_trace_func(ri, regs);
+#ifdef CONFIG_EVENT_PROFILE
+	if (tp->flags & TP_FLAG_PROFILE)
+		kretprobe_profile_func(ri, regs);
+#endif	/* CONFIG_EVENT_PROFILE */
+	return 0;	/* We don't tweek kernel, so just return 0 */
+}
 
 static int register_probe_event(struct trace_probe *tp)
 {
-- 
cgit v1.2.3-71-gd317


From 74ebb63e7cd25f6fb02a45fc2ea7735bce1217c9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 14 Sep 2009 16:49:28 -0400
Subject: tracing/kprobes: Fix profiling alignment for perf_counter buffer

Fix *probe_profile_func() to align buffer size, since perf_counter
requires its buffer entries to be 8 bytes aligned.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090914204928.18779.60029.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 70b632c3bd08..d8db9357489b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1149,18 +1149,23 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry *entry;
-	int size, i, pc;
+	int size, __size, i, pc;
 	unsigned long irq_flags;
 
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-	size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+	size = ALIGN(__size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
 
 	do {
 		char raw_data[size];
 		struct trace_entry *ent;
-
+		/*
+		 * Zero dead bytes from alignment to avoid stack leak
+		 * to userspace
+		 */
 		*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
 		entry = (struct kprobe_trace_entry *)raw_data;
 		ent = &entry->ent;
@@ -1183,13 +1188,15 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry *entry;
-	int size, i, pc;
+	int size, __size, i, pc;
 	unsigned long irq_flags;
 
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-	size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+	size = ALIGN(__size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
 
 	do {
 		char raw_data[size];
-- 
cgit v1.2.3-71-gd317


From 5a0d9050db4d1147722b42afef9011251b2651ee Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 14 Sep 2009 16:49:37 -0400
Subject: tracing/kprobes: Disable kprobe events by default after creation

Disable newly created kprobe events by default, not to disturb
another user using ftrace. "Disturb" means when someone is using
ftrace and another user tries to use perf-tools, (in near
future) if he defines new kprobe event via perf-tools, then new
events will mess up the frace buffer. Fix this to allow proper
and transparent kprobes events concurrent usage between ftrace
users and perf users.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090914204937.18779.59422.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt | 11 +++++++++--
 kernel/trace/trace_kprobe.c         |  4 ++--
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 6521681e7838..9b8f7c6040a7 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -122,8 +122,15 @@ print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, R
 
   echo > /sys/kernel/debug/tracing/kprobe_events
 
- This clears all probe points. and you can see the traced information via
-/sys/kernel/debug/tracing/trace.
+ This clears all probe points.
+
+ Right after definition, each event is disabled by default. For tracing these
+events, you need to enable it.
+
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
+
+ And you can see the traced information via /sys/kernel/debug/tracing/trace.
 
   cat /sys/kernel/debug/tracing/trace
 # tracer: nop
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d8db9357489b..f6821f16227e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -383,7 +383,7 @@ static int register_trace_probe(struct trace_probe *tp)
 		goto end;
 	}
 
-	tp->flags = TP_FLAG_TRACE;
+	tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
 	if (probe_is_return(tp))
 		ret = register_kretprobe(&tp->rp);
 	else
@@ -1298,7 +1298,7 @@ static int register_probe_event(struct trace_probe *tp)
 	call->id = register_ftrace_event(&tp->event);
 	if (!call->id)
 		return -ENODEV;
-	call->enabled = 1;
+	call->enabled = 0;
 	call->regfunc = probe_event_enable;
 	call->unregfunc = probe_event_disable;
 
-- 
cgit v1.2.3-71-gd317


From 1f0ab40976460bc4673fa204ce917a725185d8f2 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Tue, 15 Sep 2009 10:43:07 +0530
Subject: kprobes: Prevent re-registration of the same kprobe

Prevent re-registration of the same kprobe. This situation, though
unlikely, needs to be flagged since it can lead to a system crash if
it's not handled.

The core change itself is small, but the helper routine needed to be
moved around a bit; hence the diffstat.

Signed-off-by: Ananth N Mavinakayanahalli<ananth@in.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090915051307.GB26458@in.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/kprobes.c | 58 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 00d01b0f9fee..b946761f84bd 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -676,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
 	return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
 
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+	struct kprobe *old_p, *list_p;
+
+	old_p = get_kprobe(p->addr);
+	if (unlikely(!old_p))
+		return NULL;
+
+	if (p != old_p) {
+		list_for_each_entry_rcu(list_p, &old_p->list, list)
+			if (list_p == p)
+			/* kprobe p is a valid probe */
+				goto valid;
+		return NULL;
+	}
+valid:
+	return old_p;
+}
+
+/* Return error if the kprobe is being re-registered */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+	int ret = 0;
+	struct kprobe *old_p;
+
+	mutex_lock(&kprobe_mutex);
+	old_p = __get_valid_kprobe(p);
+	if (old_p)
+		ret = -EINVAL;
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+
 int __kprobes register_kprobe(struct kprobe *p)
 {
 	int ret = 0;
@@ -688,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
 		return -EINVAL;
 	p->addr = addr;
 
+	ret = check_kprobe_rereg(p);
+	if (ret)
+		return ret;
+
 	preempt_disable();
 	if (!kernel_text_address((unsigned long) p->addr) ||
 	    in_kprobes_functions((unsigned long) p->addr)) {
@@ -757,26 +795,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
-	struct kprobe *old_p, *list_p;
-
-	old_p = get_kprobe(p->addr);
-	if (unlikely(!old_p))
-		return NULL;
-
-	if (p != old_p) {
-		list_for_each_entry_rcu(list_p, &old_p->list, list)
-			if (list_p == p)
-			/* kprobe p is a valid probe */
-				goto valid;
-		return NULL;
-	}
-valid:
-	return old_p;
-}
-
 /*
  * Unregister a kprobe without a scheduler synchronization.
  */
-- 
cgit v1.2.3-71-gd317


From d01d4827858cdc2e1c437c87ab65ec0a00fd40f8 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 21 Sep 2009 11:06:27 +0200
Subject: sched: Always show Cpus_allowed field in /proc/<pid>/status

The Cpus_allowed fields in /proc/<pid>/status is currently only
shown in case of CONFIG_CPUSETS. However their contents are also
useful for the !CONFIG_CPUSETS case.

So change the current behaviour and always show these fields.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090921090627.GD4649@osiris.boeblingen.de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c | 11 +++++++++++
 kernel/cpuset.c |  8 +-------
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650bbbb8..762aea9c9c71 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -321,6 +321,16 @@ static inline void task_context_switch_counts(struct seq_file *m,
 			p->nivcsw);
 }
 
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+	seq_printf(m, "Cpus_allowed:\t");
+	seq_cpumask(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+	seq_printf(m, "Cpus_allowed_list:\t");
+	seq_cpumask_list(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -335,6 +345,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	}
 	task_sig(m, task);
 	task_cap(m, task);
+	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 #if defined(CONFIG_S390)
 	task_show_regs(m, task);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..b81f7f096e1c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2499,15 +2499,9 @@ const struct file_operations proc_cpuset_operations = {
 };
 #endif /* CONFIG_PROC_PID_CPUSET */
 
-/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+/* Display task mems_allowed in /proc/<pid>/status file. */
 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 {
-	seq_printf(m, "Cpus_allowed:\t");
-	seq_cpumask(m, &task->cpus_allowed);
-	seq_printf(m, "\n");
-	seq_printf(m, "Cpus_allowed_list:\t");
-	seq_cpumask_list(m, &task->cpus_allowed);
-	seq_printf(m, "\n");
 	seq_printf(m, "Mems_allowed:\t");
 	seq_nodemask(m, &task->mems_allowed);
 	seq_printf(m, "\n");
-- 
cgit v1.2.3-71-gd317


From 3fff4c42bd0a89869a0eb1e7874cc06ffa4aa0f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 22 Sep 2009 16:18:09 +0200
Subject: printk: Remove ratelimit.h from kernel.h

Decouple kernel.h from ratelimit.h: the global declaration of
printk's ratelimit_state is not needed, and it leads to messy
circular dependencies due to ratelimit.h's (new) adding of a
spinlock_types.h include.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David S. Miller <davem@davemloft.net>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h     | 2 --
 include/linux/net.h        | 1 +
 kernel/printk.c            | 1 +
 kernel/sysctl.c            | 3 +++
 lib/ratelimit.c            | 2 +-
 net/core/sysctl_net_core.c | 2 ++
 net/core/utils.c           | 2 ++
 7 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b5b1e0899a8..3305f33201be 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,7 +15,6 @@
 #include <linux/bitops.h>
 #include <linux/log2.h>
 #include <linux/typecheck.h>
-#include <linux/ratelimit.h>
 #include <linux/dynamic_debug.h>
 #include <asm/byteorder.h>
 #include <asm/bug.h>
@@ -241,7 +240,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 asmlinkage int printk(const char * fmt, ...)
 	__attribute__ ((format (printf, 1, 2))) __cold;
 
-extern struct ratelimit_state printk_ratelimit_state;
 extern int printk_ratelimit(void);
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
diff --git a/include/linux/net.h b/include/linux/net.h
index 9040a10584f7..df20f680f455 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -358,6 +358,7 @@ static const struct proto_ops name##_ops = {			\
 
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
+#include <linux/ratelimit.h>
 extern struct ratelimit_state net_ratelimit_state;
 #endif
 
diff --git a/kernel/printk.c b/kernel/printk.c
index 602033acd6c7..b997c893cdcf 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/kexec.h>
+#include <linux/ratelimit.h>
 
 #include <asm/uaccess.h>
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1a631ba684a4..6c37048b9db9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,6 +37,7 @@
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
 #include <linux/writeback.h>
+#include <linux/ratelimit.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
 #include <linux/key.h>
@@ -155,6 +156,8 @@ extern int no_unaligned_warning;
 extern int unaligned_dump_stack;
 #endif
 
+extern struct ratelimit_state printk_ratelimit_state;
+
 #ifdef CONFIG_RT_MUTEXES
 extern int max_lock_depth;
 #endif
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 69bfcacda16d..5551731ae1d4 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -9,7 +9,7 @@
  * This file is released under the GPLv2.
  */
 
-#include <linux/kernel.h>
+#include <linux/ratelimit.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7db1de0497c6..887c03c4e3c6 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -10,7 +10,9 @@
 #include <linux/module.h>
 #include <linux/socket.h>
 #include <linux/netdevice.h>
+#include <linux/ratelimit.h>
 #include <linux/init.h>
+
 #include <net/ip.h>
 #include <net/sock.h>
 
diff --git a/net/core/utils.c b/net/core/utils.c
index 83221aee7084..838250241d26 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -24,6 +24,8 @@
 #include <linux/types.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
+#include <linux/ratelimit.h>
+
 #include <net/sock.h>
 
 #include <asm/byteorder.h>
-- 
cgit v1.2.3-71-gd317


From 737f453fd115ea0c9642ed6b30e37e296a4e3ed7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 1 Aug 2009 03:42:44 +0200
Subject: tracing/filters: Cleanup useless headers

Cleanup remaining headers inclusion that were only useful when
the filter framework and its tracing related filesystem user interface
weren't yet separated.

v2: Keep module.h, needed for EXPORT_SYMBOL_GPL

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace_events_filter.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 23245785927f..189663d82aa7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,8 +18,6 @@
  * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
  */
 
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/mutex.h>
-- 
cgit v1.2.3-71-gd317


From f3f3f0092477d0165f3f1bf0fd518550b2abd097 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 24 Sep 2009 15:27:41 +0200
Subject: tracing/event: Cleanup the useless dentry variable

Cleanup the useless dentry variable while creating a kernel
event set of files. trace_create_file() warns if it fails to
create the file anyway, and we don't store the dentry anywhere.

v2: Fix a small conflict in kernel/trace/trace_events.c

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace_events.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 56c260b83a9c..8c91b7c8f047 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -898,9 +898,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 			   "'%s/filter' entry\n", name);
 	}
 
-	entry = trace_create_file("enable", 0644, system->entry,
-				  (void *)system->name,
-				  &ftrace_system_enable_fops);
+	trace_create_file("enable", 0644, system->entry,
+			  (void *)system->name,
+			  &ftrace_system_enable_fops);
 
 	return system->entry;
 }
@@ -912,7 +912,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 		 const struct file_operations *filter,
 		 const struct file_operations *format)
 {
-	struct dentry *entry;
 	int ret;
 
 	/*
@@ -930,12 +929,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 	}
 
 	if (call->regfunc)
-		entry = trace_create_file("enable", 0644, call->dir, call,
-					  enable);
+		trace_create_file("enable", 0644, call->dir, call,
+				  enable);
 
 	if (call->id && call->profile_enable)
-		entry = trace_create_file("id", 0444, call->dir, call,
-					  id);
+		trace_create_file("id", 0444, call->dir, call,
+		 		  id);
 
 	if (call->define_fields) {
 		ret = call->define_fields(call);
@@ -944,16 +943,16 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 				   " events/%s\n", call->name);
 			return ret;
 		}
-		entry = trace_create_file("filter", 0644, call->dir, call,
-					  filter);
+		trace_create_file("filter", 0644, call->dir, call,
+				  filter);
 	}
 
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
 
-	entry = trace_create_file("format", 0444, call->dir, call,
-				  format);
+	trace_create_file("format", 0444, call->dir, call,
+			  format);
 
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317


From 1889d20922d14a97b2099fa4d47587217c0ba48b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 24 Sep 2009 21:10:44 +0200
Subject: tracing/filters: Provide basic regex support

This patch provides basic support for regular expressions in filters.

It supports the following types of regexp:

- *match_beginning
- *match_middle*
- match_end*
- !don't match

Example:
	cd /debug/tracing/events/bkl/lock_kernel
	echo 'file == "*reiserfs*"' > filter
	echo 1 > enable

           gedit-4941  [000]   457.735437: lock_kernel: depth: 0, fs/reiserfs/namei.c:334 reiserfs_lookup()
     sync_supers-227   [001]   461.379985: lock_kernel: depth: 0, fs/reiserfs/super.c:69 reiserfs_sync_fs()
     sync_supers-227   [000]   461.383096: lock_kernel: depth: 0, fs/reiserfs/journal.c:1069 flush_commit_list()
      reiserfs/1-1369  [001]   461.479885: lock_kernel: depth: 0, fs/reiserfs/journal.c:3509 flush_async_commits()

Every string is now handled as a regexp in the filter framework, which
helps to factorize the code for handling both simple strings and
regexp comparisons.

(The regexp parsing code has been wildly cherry picked from ftrace.c
written by Steve.)

v2: Simplify the whole and drop the filter_regex file

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace.h               |  27 ++++---
 kernel/trace/trace_events_filter.c | 155 +++++++++++++++++++++++++++++++++----
 2 files changed, 157 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 86bcff94791a..8d0db6018fe4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -702,20 +702,29 @@ struct event_subsystem {
 };
 
 struct filter_pred;
+struct regex;
 
 typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
 				 int val1, int val2);
 
+typedef int (*regex_match_func)(char *str, struct regex *r, int len);
+
+struct regex {
+	char			pattern[MAX_FILTER_STR_VAL];
+	int			len;
+	int			field_len;
+	regex_match_func	match;
+};
+
 struct filter_pred {
-	filter_pred_fn_t fn;
-	u64 val;
-	char str_val[MAX_FILTER_STR_VAL];
-	int str_len;
-	char *field_name;
-	int offset;
-	int not;
-	int op;
-	int pop_n;
+	filter_pred_fn_t 	fn;
+	u64 			val;
+	struct regex		regex;
+	char 			*field_name;
+	int 			offset;
+	int 			not;
+	int 			op;
+	int 			pop_n;
 };
 
 extern void print_event_filter(struct ftrace_event_call *call,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 189663d82aa7..d3c94c139567 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -195,9 +195,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
 	char *addr = (char *)(event + pred->offset);
 	int cmp, match;
 
-	cmp = strncmp(addr, pred->str_val, pred->str_len);
+	cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
 
-	match = (!cmp) ^ pred->not;
+	match = cmp ^ pred->not;
 
 	return match;
 }
@@ -209,9 +209,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
 	char **addr = (char **)(event + pred->offset);
 	int cmp, match;
 
-	cmp = strncmp(*addr, pred->str_val, pred->str_len);
+	cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len);
 
-	match = (!cmp) ^ pred->not;
+	match = cmp ^ pred->not;
 
 	return match;
 }
@@ -235,9 +235,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
 	char *addr = (char *)(event + str_loc);
 	int cmp, match;
 
-	cmp = strncmp(addr, pred->str_val, str_len);
+	cmp = pred->regex.match(addr, &pred->regex, str_len);
 
-	match = (!cmp) ^ pred->not;
+	match = cmp ^ pred->not;
 
 	return match;
 }
@@ -248,6 +248,126 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
 	return 0;
 }
 
+/* Basic regex callbacks */
+static int regex_match_full(char *str, struct regex *r, int len)
+{
+	if (strncmp(str, r->pattern, len) == 0)
+		return 1;
+	return 0;
+}
+
+static int regex_match_front(char *str, struct regex *r, int len)
+{
+	if (strncmp(str, r->pattern, len) == 0)
+		return 1;
+	return 0;
+}
+
+static int regex_match_middle(char *str, struct regex *r, int len)
+{
+	if (strstr(str, r->pattern))
+		return 1;
+	return 0;
+}
+
+static int regex_match_end(char *str, struct regex *r, int len)
+{
+	char *ptr = strstr(str, r->pattern);
+
+	if (ptr && (ptr[r->len] == 0))
+		return 1;
+	return 0;
+}
+
+enum regex_type {
+	MATCH_FULL,
+	MATCH_FRONT_ONLY,
+	MATCH_MIDDLE_ONLY,
+	MATCH_END_ONLY,
+};
+
+/*
+ * Pass in a buffer containing a regex and this function will
+ * set search to point to the search part of the buffer and
+ * return the type of search it is (see enum above).
+ * This does modify buff.
+ *
+ * Returns enum type.
+ *  search returns the pointer to use for comparison.
+ *  not returns 1 if buff started with a '!'
+ *     0 otherwise.
+ */
+static enum regex_type
+filter_parse_regex(char *buff, int len, char **search, int *not)
+{
+	int type = MATCH_FULL;
+	int i;
+
+	if (buff[0] == '!') {
+		*not = 1;
+		buff++;
+		len--;
+	} else
+		*not = 0;
+
+	*search = buff;
+
+	for (i = 0; i < len; i++) {
+		if (buff[i] == '*') {
+			if (!i) {
+				*search = buff + 1;
+				type = MATCH_END_ONLY;
+			} else {
+				if (type == MATCH_END_ONLY)
+					type = MATCH_MIDDLE_ONLY;
+				else
+					type = MATCH_FRONT_ONLY;
+				buff[i] = 0;
+				break;
+			}
+		}
+	}
+
+	return type;
+}
+
+static int filter_build_regex(struct filter_pred *pred)
+{
+	struct regex *r = &pred->regex;
+	char *search, *dup;
+	enum regex_type type;
+	int not;
+
+	type = filter_parse_regex(r->pattern, r->len, &search, &not);
+	dup = kstrdup(search, GFP_KERNEL);
+	if (!dup)
+		return -ENOMEM;
+
+	strcpy(r->pattern, dup);
+	kfree(dup);
+
+	r->len = strlen(r->pattern);
+
+	switch (type) {
+	case MATCH_FULL:
+		r->match = regex_match_full;
+		break;
+	case MATCH_FRONT_ONLY:
+		r->match = regex_match_front;
+		break;
+	case MATCH_MIDDLE_ONLY:
+		r->match = regex_match_middle;
+		break;
+	case MATCH_END_ONLY:
+		r->match = regex_match_end;
+		break;
+	}
+
+	pred->not ^= not;
+
+	return 0;
+}
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
@@ -394,7 +514,7 @@ static void filter_clear_pred(struct filter_pred *pred)
 {
 	kfree(pred->field_name);
 	pred->field_name = NULL;
-	pred->str_len = 0;
+	pred->regex.len = 0;
 }
 
 static int filter_set_pred(struct filter_pred *dest,
@@ -658,21 +778,24 @@ static int filter_add_pred(struct filter_parse_state *ps,
 	}
 
 	if (is_string_field(field)) {
-		pred->str_len = field->size;
+		ret = filter_build_regex(pred);
+		if (ret)
+			return ret;
 
-		if (field->filter_type == FILTER_STATIC_STRING)
+		if (field->filter_type == FILTER_STATIC_STRING) {
 			fn = filter_pred_string;
-		else if (field->filter_type == FILTER_DYN_STRING)
-			fn = filter_pred_strloc;
+			pred->regex.field_len = field->size;
+		} else if (field->filter_type == FILTER_DYN_STRING)
+				fn = filter_pred_strloc;
 		else {
 			fn = filter_pred_pchar;
-			pred->str_len = strlen(pred->str_val);
+			pred->regex.field_len = strlen(pred->regex.pattern);
 		}
 	} else {
 		if (field->is_signed)
-			ret = strict_strtoll(pred->str_val, 0, &val);
+			ret = strict_strtoll(pred->regex.pattern, 0, &val);
 		else
-			ret = strict_strtoull(pred->str_val, 0, &val);
+			ret = strict_strtoull(pred->regex.pattern, 0, &val);
 		if (ret) {
 			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
 			return -EINVAL;
@@ -1042,8 +1165,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
 		return NULL;
 	}
 
-	strcpy(pred->str_val, operand2);
-	pred->str_len = strlen(operand2);
+	strcpy(pred->regex.pattern, operand2);
+	pred->regex.len = strlen(pred->regex.pattern);
 
 	pred->op = op;
 
-- 
cgit v1.2.3-71-gd317


From 3f6fe06dbf67b46d36fedec502300e04dffeb67a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 24 Sep 2009 21:31:51 +0200
Subject: tracing/filters: Unify the regex parsing helpers

The filter code has stolen the regex parsing function from ftrace to
get the regex support.
We have duplicated this code, so factorize it in the filter area and
make it generally available, as the filter code is the most suited to
host this feature.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/ftrace.c              | 64 +++-----------------------------------
 kernel/trace/trace.h               |  9 ++++++
 kernel/trace/trace_events_filter.c | 20 ++++++------
 3 files changed, 23 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cc615f84751b..ddf23a225b52 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1655,60 +1655,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
 	return ret;
 }
 
-enum {
-	MATCH_FULL,
-	MATCH_FRONT_ONLY,
-	MATCH_MIDDLE_ONLY,
-	MATCH_END_ONLY,
-};
-
-/*
- * (static function - no need for kernel doc)
- *
- * Pass in a buffer containing a glob and this function will
- * set search to point to the search part of the buffer and
- * return the type of search it is (see enum above).
- * This does modify buff.
- *
- * Returns enum type.
- *  search returns the pointer to use for comparison.
- *  not returns 1 if buff started with a '!'
- *     0 otherwise.
- */
-static int
-ftrace_setup_glob(char *buff, int len, char **search, int *not)
-{
-	int type = MATCH_FULL;
-	int i;
-
-	if (buff[0] == '!') {
-		*not = 1;
-		buff++;
-		len--;
-	} else
-		*not = 0;
-
-	*search = buff;
-
-	for (i = 0; i < len; i++) {
-		if (buff[i] == '*') {
-			if (!i) {
-				*search = buff + 1;
-				type = MATCH_END_ONLY;
-			} else {
-				if (type == MATCH_END_ONLY)
-					type = MATCH_MIDDLE_ONLY;
-				else
-					type = MATCH_FRONT_ONLY;
-				buff[i] = 0;
-				break;
-			}
-		}
-	}
-
-	return type;
-}
-
 static int ftrace_match(char *str, char *regex, int len, int type)
 {
 	int matched = 0;
@@ -1757,7 +1703,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
 	int not;
 
 	flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	type = ftrace_setup_glob(buff, len, &search, &not);
+	type = filter_parse_regex(buff, len, &search, &not);
 
 	search_len = strlen(search);
 
@@ -1825,7 +1771,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 	}
 
 	if (strlen(buff)) {
-		type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
+		type = filter_parse_regex(buff, strlen(buff), &search, &not);
 		search_len = strlen(search);
 	}
 
@@ -1990,7 +1936,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 	int count = 0;
 	char *search;
 
-	type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+	type = filter_parse_regex(glob, strlen(glob), &search, &not);
 	len = strlen(search);
 
 	/* we do not support '!' for function probes */
@@ -2067,7 +2013,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 	else if (glob) {
 		int not;
 
-		type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+		type = filter_parse_regex(glob, strlen(glob), &search, &not);
 		len = strlen(search);
 
 		/* we do not support '!' for function probes */
@@ -2520,7 +2466,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
 		return -ENODEV;
 
 	/* decode regex */
-	type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
+	type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
 	if (not)
 		return -EINVAL;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8d0db6018fe4..db6b83edd49b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -709,6 +709,13 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
 
 typedef int (*regex_match_func)(char *str, struct regex *r, int len);
 
+enum regex_type {
+	MATCH_FULL,
+	MATCH_FRONT_ONLY,
+	MATCH_MIDDLE_ONLY,
+	MATCH_END_ONLY,
+};
+
 struct regex {
 	char			pattern[MAX_FILTER_STR_VAL];
 	int			len;
@@ -727,6 +734,8 @@ struct filter_pred {
 	int 			pop_n;
 };
 
+extern enum regex_type
+filter_parse_regex(char *buff, int len, char **search, int *not);
 extern void print_event_filter(struct ftrace_event_call *call,
 			       struct trace_seq *s);
 extern int apply_event_filter(struct ftrace_event_call *call,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d3c94c139567..8c194de675b0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -279,15 +279,14 @@ static int regex_match_end(char *str, struct regex *r, int len)
 	return 0;
 }
 
-enum regex_type {
-	MATCH_FULL,
-	MATCH_FRONT_ONLY,
-	MATCH_MIDDLE_ONLY,
-	MATCH_END_ONLY,
-};
-
-/*
- * Pass in a buffer containing a regex and this function will
+/**
+ * filter_parse_regex - parse a basic regex
+ * @buff:   the raw regex
+ * @len:    length of the regex
+ * @search: will point to the beginning of the string to compare
+ * @not:    tell whether the match will have to be inverted
+ *
+ * This passes in a buffer containing a regex and this function will
  * set search to point to the search part of the buffer and
  * return the type of search it is (see enum above).
  * This does modify buff.
@@ -297,8 +296,7 @@ enum regex_type {
  *  not returns 1 if buff started with a '!'
  *     0 otherwise.
  */
-static enum regex_type
-filter_parse_regex(char *buff, int len, char **search, int *not)
+enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
 {
 	int type = MATCH_FULL;
 	int i;
-- 
cgit v1.2.3-71-gd317


From 7c68af6e32c73992bad24107311f3433c89016e2 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sat, 19 Sep 2009 09:40:22 +0300
Subject: core, x86: Add user return notifiers

Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace.  The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.

This will be used initially to speed up KVM task switching by lazily
updating MSRs.

Signed-off-by: Avi Kivity <avi@redhat.com>
LKML-Reference: <1253342422-13811-1-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/Kconfig                         | 10 ++++++++
 arch/x86/Kconfig                     |  1 +
 arch/x86/include/asm/thread_info.h   |  7 ++++--
 arch/x86/kernel/process.c            |  2 ++
 arch/x86/kernel/signal.c             |  3 +++
 include/linux/user-return-notifier.h | 42 ++++++++++++++++++++++++++++++++
 kernel/user-return-notifier.c        | 46 ++++++++++++++++++++++++++++++++++++
 7 files changed, 109 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/user-return-notifier.h
 create mode 100644 kernel/user-return-notifier.c

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 7f418bbc261a..4e312fffbfd7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -83,6 +83,13 @@ config KRETPROBES
 	def_bool y
 	depends on KPROBES && HAVE_KRETPROBES
 
+config USER_RETURN_NOTIFIER
+	bool
+	depends on HAVE_USER_RETURN_NOTIFIER
+	help
+	  Provide a kernel-internal notification when a cpu is about to
+	  switch to user mode.
+
 config HAVE_IOREMAP_PROT
 	bool
 
@@ -126,4 +133,7 @@ config HAVE_DMA_API_DEBUG
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
 
+config HAVE_USER_RETURN_NOTIFIER
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8da93745c087..1df175d15aa8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_USER_RETURN_NOTIFIER
 
 config OUTPUT_FORMAT
 	string
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..375c917c37d2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
+	 _TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
 #define PREEMPT_ACTIVE		0x10000000
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..e51b056fc88f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
+#include <linux/user-return-notifier.h>
 #include <trace/events/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -224,6 +225,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
+	propagate_user_return_notify(prev_p, next_p);
 }
 
 int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..c49f90f7957a 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		if (current->replacement_session_keyring)
 			key_replace_session_keyring();
 	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
 
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
new file mode 100644
index 000000000000..b6ac056291d7
--- /dev/null
+++ b/include/linux/user-return-notifier.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_USER_RETURN_NOTIFIER_H
+#define _LINUX_USER_RETURN_NOTIFIER_H
+
+#ifdef CONFIG_USER_RETURN_NOTIFIER
+
+#include <linux/list.h>
+#include <linux/sched.h>
+
+struct user_return_notifier {
+	void (*on_user_return)(struct user_return_notifier *urn);
+	struct hlist_node link;
+};
+
+
+void user_return_notifier_register(struct user_return_notifier *urn);
+void user_return_notifier_unregister(struct user_return_notifier *urn);
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+						struct task_struct *next)
+{
+	if (test_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY)) {
+		clear_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY);
+		set_tsk_thread_flag(next, TIF_USER_RETURN_NOTIFY);
+	}
+}
+
+void fire_user_return_notifiers(void);
+
+#else
+
+struct user_return_notifier {};
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+						struct task_struct *next)
+{
+}
+
+static inline void fire_user_return_notifiers(void) {}
+
+#endif
+
+#endif
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000000..530ccb816513
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,46 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
+
+/*
+ * Request a notification when the current cpu returns to userspace.  Must be
+ * called in atomic context.  The notifier will also be called in atomic
+ * context.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+	set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+	hlist_add_head(&urn->link, &URN_LIST_HEAD);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier.  Must be called from atomic
+ * context, and from the same cpu registration occured in.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+	hlist_del(&urn->link);
+	if (hlist_empty(&URN_LIST_HEAD))
+		clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/* Calls registered user return notifiers */
+void fire_user_return_notifiers(void)
+{
+	struct user_return_notifier *urn;
+	struct hlist_node *tmp1, *tmp2;
+	struct hlist_head *head;
+
+	head = &get_cpu_var(return_notifier_list);
+	hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+		urn->on_user_return(urn);
+	put_cpu_var();
+}
-- 
cgit v1.2.3-71-gd317


From a1a138d05fa060ac4238c19a1e890aacc25ed3ba Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 25 Sep 2009 11:20:12 -0700
Subject: tracing/kprobes: Use global event perf buffers in kprobe tracer

Use new percpu global event buffer instead of stack in kprobe
tracer while tracing through perf.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090925182011.10157.60140.stgit@omoto>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 115 ++++++++++++++++++++++++++++----------------
 1 file changed, 73 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 09cba270392d..97309d4714f7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1149,35 +1149,49 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry *entry;
-	int size, __size, i, pc;
+	struct trace_entry *ent;
+	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
+	char *raw_data;
 
-	local_save_flags(irq_flags);
 	pc = preempt_count();
-
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		     "profile buffer not large enough"))
+		return 0;
 
-	do {
-		char raw_data[size];
-		struct trace_entry *ent;
-		/*
-		 * Zero dead bytes from alignment to avoid stack leak
-		 * to userspace
-		 */
-		*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-		entry = (struct kprobe_trace_entry *)raw_data;
-		ent = &entry->ent;
-
-		tracing_generic_entry_update(ent, irq_flags, pc);
-		ent->type = call->id;
-		entry->nargs = tp->nr_args;
-		entry->ip = (unsigned long)kp->addr;
-		for (i = 0; i < tp->nr_args; i++)
-			entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
-		perf_tp_event(call->id, entry->ip, 1, entry, size);
-	} while (0);
+	/*
+	 * Protect the non nmi buffer
+	 * This also protects the rcu read side
+	 */
+	local_irq_save(irq_flags);
+	__cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
+
+	raw_data = per_cpu_ptr(raw_data, __cpu);
+	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+	entry = (struct kprobe_trace_entry *)raw_data;
+	ent = &entry->ent;
+
+	tracing_generic_entry_update(ent, irq_flags, pc);
+	ent->type = call->id;
+	entry->nargs = tp->nr_args;
+	entry->ip = (unsigned long)kp->addr;
+	for (i = 0; i < tp->nr_args; i++)
+		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+	perf_tp_event(call->id, entry->ip, 1, entry, size);
+end:
+	local_irq_restore(irq_flags);
 	return 0;
 }
 
@@ -1188,33 +1202,50 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry *entry;
-	int size, __size, i, pc;
+	struct trace_entry *ent;
+	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
+	char *raw_data;
 
-	local_save_flags(irq_flags);
 	pc = preempt_count();
-
 	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		     "profile buffer not large enough"))
+		return 0;
+
+	/*
+	 * Protect the non nmi buffer
+	 * This also protects the rcu read side
+	 */
+	local_irq_save(irq_flags);
+	__cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
+
+	raw_data = per_cpu_ptr(raw_data, __cpu);
+	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+	entry = (struct kretprobe_trace_entry *)raw_data;
+	ent = &entry->ent;
 
-	do {
-		char raw_data[size];
-		struct trace_entry *ent;
-
-		*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-		entry = (struct kretprobe_trace_entry *)raw_data;
-		ent = &entry->ent;
-
-		tracing_generic_entry_update(ent, irq_flags, pc);
-		ent->type = call->id;
-		entry->nargs = tp->nr_args;
-		entry->func = (unsigned long)tp->rp.kp.addr;
-		entry->ret_ip = (unsigned long)ri->ret_addr;
-		for (i = 0; i < tp->nr_args; i++)
-			entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
-		perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
-	} while (0);
+	tracing_generic_entry_update(ent, irq_flags, pc);
+	ent->type = call->id;
+	entry->nargs = tp->nr_args;
+	entry->func = (unsigned long)tp->rp.kp.addr;
+	entry->ret_ip = (unsigned long)ri->ret_addr;
+	for (i = 0; i < tp->nr_args; i++)
+		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
+end:
+	local_irq_restore(irq_flags);
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From 88f70d7590538e427c8405a2e02ac2624847386c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 25 Sep 2009 11:20:54 -0700
Subject: tracing/ftrace: Fix to check create_event_dir() when adding new
 events

Check result of event_create_dir() and add ftrace_event_call to
ftrace_events list only if it is succeeded. Thanks to Li for pointing
it out.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20090925182054.10157.55219.stgit@omoto>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a4b7c9a9130c..155b5d5a4e45 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -957,12 +957,12 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
 	if (!d_events)
 		return -ENOENT;
 
-	list_add(&call->list, &ftrace_events);
 	ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
 				&ftrace_enable_fops, &ftrace_event_filter_fops,
 				&ftrace_event_format_fops);
-	if (ret < 0)
-		list_del(&call->list);
+	if (!ret)
+		list_add(&call->list, &ftrace_events);
+
 	return ret;
 }
 
@@ -1124,10 +1124,11 @@ static void trace_module_add_events(struct module *mod)
 				return;
 		}
 		call->mod = mod;
-		list_add(&call->list, &ftrace_events);
-		event_create_dir(call, d_events,
-				 &file_ops->id, &file_ops->enable,
-				 &file_ops->filter, &file_ops->format);
+		ret = event_create_dir(call, d_events,
+				       &file_ops->id, &file_ops->enable,
+				       &file_ops->filter, &file_ops->format);
+		if (!ret)
+			list_add(&call->list, &ftrace_events);
 	}
 }
 
@@ -1267,10 +1268,12 @@ static __init int event_trace_init(void)
 				continue;
 			}
 		}
-		list_add(&call->list, &ftrace_events);
-		event_create_dir(call, d_events, &ftrace_event_id_fops,
-				 &ftrace_enable_fops, &ftrace_event_filter_fops,
-				 &ftrace_event_format_fops);
+		ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+				       &ftrace_enable_fops,
+				       &ftrace_event_filter_fops,
+				       &ftrace_event_format_fops);
+		if (!ret)
+			list_add(&call->list, &ftrace_events);
 	}
 
 	while (true) {
-- 
cgit v1.2.3-71-gd317


From a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Fri, 2 Oct 2009 16:17:53 -0700
Subject: time: Implement logarithmic time accumulation

Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.

The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.

For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.

Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.

So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.

Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.

An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/timex.h     |  4 ---
 kernel/time/timekeeping.c | 85 +++++++++++++++++++++++++++++++++--------------
 2 files changed, 60 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index e6967d10d9e5..0c0ef7d4db7c 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -261,11 +261,7 @@ static inline int ntp_synced(void)
 
 #define NTP_SCALE_SHIFT		32
 
-#ifdef CONFIG_NO_HZ
-#define NTP_INTERVAL_FREQ  (2)
-#else
 #define NTP_INTERVAL_FREQ  (HZ)
-#endif
 #define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ)
 
 /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb0f46fa1ecd..5fdd78e0858a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -721,6 +721,51 @@ static void timekeeping_adjust(s64 offset)
 				timekeeper.ntp_error_shift;
 }
 
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This functions accumulates a shifted interval of cycles into
+ * into a shifted interval nanoseconds. Allows for O(log) accumulation
+ * loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+	u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+
+	/* If the offset is smaller then a shifted interval, do nothing */
+	if (offset < timekeeper.cycle_interval<<shift)
+		return offset;
+
+	/* Accumulate one shifted interval */
+	offset -= timekeeper.cycle_interval << shift;
+	timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+
+	timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+	while (timekeeper.xtime_nsec >= nsecps) {
+		timekeeper.xtime_nsec -= nsecps;
+		xtime.tv_sec++;
+		second_overflow();
+	}
+
+	/* Accumulate into raw time */
+	raw_time.tv_nsec += timekeeper.raw_interval << shift;;
+	while (raw_time.tv_nsec >= NSEC_PER_SEC) {
+		raw_time.tv_nsec -= NSEC_PER_SEC;
+		raw_time.tv_sec++;
+	}
+
+	/* Accumulate error between NTP and clock interval */
+	timekeeper.ntp_error += tick_length << shift;
+	timekeeper.ntp_error -= timekeeper.xtime_interval <<
+				(timekeeper.ntp_error_shift + shift);
+
+	return offset;
+}
+
+
 /**
  * update_wall_time - Uses the current clocksource to increment the wall time
  *
@@ -731,6 +776,7 @@ void update_wall_time(void)
 	struct clocksource *clock;
 	cycle_t offset;
 	u64 nsecs;
+	int shift = 0, maxshift;
 
 	/* Make sure we're fully resumed: */
 	if (unlikely(timekeeping_suspended))
@@ -744,33 +790,22 @@ void update_wall_time(void)
 #endif
 	timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
 
-	/* normally this loop will run just once, however in the
-	 * case of lost or late ticks, it will accumulate correctly.
+	/*
+	 * With NO_HZ we may have to accumulate many cycle_intervals
+	 * (think "ticks") worth of time at once. To do this efficiently,
+	 * we calculate the largest doubling multiple of cycle_intervals
+	 * that is smaller then the offset. We then accumulate that
+	 * chunk in one go, and then try to consume the next smaller
+	 * doubled multiple.
 	 */
+	shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+	shift = max(0, shift);
+	/* Bound shift to one less then what overflows tick_length */
+	maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+	shift = min(shift, maxshift);
 	while (offset >= timekeeper.cycle_interval) {
-		u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
-
-		/* accumulate one interval */
-		offset -= timekeeper.cycle_interval;
-		clock->cycle_last += timekeeper.cycle_interval;
-
-		timekeeper.xtime_nsec += timekeeper.xtime_interval;
-		if (timekeeper.xtime_nsec >= nsecps) {
-			timekeeper.xtime_nsec -= nsecps;
-			xtime.tv_sec++;
-			second_overflow();
-		}
-
-		raw_time.tv_nsec += timekeeper.raw_interval;
-		if (raw_time.tv_nsec >= NSEC_PER_SEC) {
-			raw_time.tv_nsec -= NSEC_PER_SEC;
-			raw_time.tv_sec++;
-		}
-
-		/* accumulate error between NTP and clock interval */
-		timekeeper.ntp_error += tick_length;
-		timekeeper.ntp_error -= timekeeper.xtime_interval <<
-					timekeeper.ntp_error_shift;
+		offset = logarithmic_accumulation(offset, shift);
+		shift--;
 	}
 
 	/* correct the clock when NTP error is too big */
-- 
cgit v1.2.3-71-gd317


From 7bc7d637452383d56ba4368d4336b0dde1bb476d Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Fri, 2 Oct 2009 16:24:15 -0700
Subject: time: Remove xtime_cache

With the prior logarithmic time accumulation patch, xtime will now
always be within one "tick" of the current time, instead of
possibly half a second off.

This removes the need for the xtime_cache value, which always
stored the time at the last interrupt, so this patch cleans that up
removing the xtime_cache related code.

This is a bit simpler, but still could use some wider testing.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525855.7741.95.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time.c             |  1 -
 kernel/time/timekeeping.c | 27 ++++-----------------------
 2 files changed, 4 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..2ef4fe2079b6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
 	write_seqlock_irq(&xtime_lock);
 	wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
 	xtime.tv_sec += sys_tz.tz_minuteswest * 60;
-	update_xtime_cache(0);
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5fdd78e0858a..96b3f0dfa5dc 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,13 +164,6 @@ struct timespec raw_time;
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
 
-static struct timespec xtime_cache __attribute__ ((aligned (16)));
-void update_xtime_cache(u64 nsec)
-{
-	xtime_cache = xtime;
-	timespec_add_ns(&xtime_cache, nsec);
-}
-
 /* must hold xtime_lock */
 void timekeeping_leap_insert(int leapsecond)
 {
@@ -331,8 +324,6 @@ int do_settimeofday(struct timespec *tv)
 
 	xtime = *tv;
 
-	update_xtime_cache(0);
-
 	timekeeper.ntp_error = 0;
 	ntp_clear();
 
@@ -547,7 +538,6 @@ void __init timekeeping_init(void)
 	}
 	set_normalized_timespec(&wall_to_monotonic,
 				-boot.tv_sec, -boot.tv_nsec);
-	update_xtime_cache(0);
 	total_sleep_time.tv_sec = 0;
 	total_sleep_time.tv_nsec = 0;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -581,7 +571,6 @@ static int timekeeping_resume(struct sys_device *dev)
 		wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
 		total_sleep_time = timespec_add_safe(total_sleep_time, ts);
 	}
-	update_xtime_cache(0);
 	/* re-base the last cycle value */
 	timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
 	timekeeper.ntp_error = 0;
@@ -721,7 +710,6 @@ static void timekeeping_adjust(s64 offset)
 				timekeeper.ntp_error_shift;
 }
 
-
 /**
  * logarithmic_accumulation - shifted accumulation of cycles
  *
@@ -765,7 +753,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
 	return offset;
 }
 
-
 /**
  * update_wall_time - Uses the current clocksource to increment the wall time
  *
@@ -775,7 +762,6 @@ void update_wall_time(void)
 {
 	struct clocksource *clock;
 	cycle_t offset;
-	u64 nsecs;
 	int shift = 0, maxshift;
 
 	/* Make sure we're fully resumed: */
@@ -841,9 +827,6 @@ void update_wall_time(void)
 	timekeeper.ntp_error +=	timekeeper.xtime_nsec <<
 				timekeeper.ntp_error_shift;
 
-	nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
-	update_xtime_cache(nsecs);
-
 	/* check to see if there is a new clocksource to use */
 	update_vsyscall(&xtime, timekeeper.clock);
 }
@@ -880,13 +863,13 @@ void monotonic_to_bootbased(struct timespec *ts)
 
 unsigned long get_seconds(void)
 {
-	return xtime_cache.tv_sec;
+	return xtime.tv_sec;
 }
 EXPORT_SYMBOL(get_seconds);
 
 struct timespec __current_kernel_time(void)
 {
-	return xtime_cache;
+	return xtime;
 }
 
 struct timespec current_kernel_time(void)
@@ -896,8 +879,7 @@ struct timespec current_kernel_time(void)
 
 	do {
 		seq = read_seqbegin(&xtime_lock);
-
-		now = xtime_cache;
+		now = xtime;
 	} while (read_seqretry(&xtime_lock, seq));
 
 	return now;
@@ -911,8 +893,7 @@ struct timespec get_monotonic_coarse(void)
 
 	do {
 		seq = read_seqbegin(&xtime_lock);
-
-		now = xtime_cache;
+		now = xtime;
 		mono = wall_to_monotonic;
 	} while (read_seqretry(&xtime_lock, seq));
 
-- 
cgit v1.2.3-71-gd317


From cf82ff7ea7695b0e82ba07bc5e9f1bd03a74e1aa Mon Sep 17 00:00:00 2001
From: "Jayson R. King" <dev@jaysonking.com>
Date: Mon, 5 Oct 2009 05:21:26 -0500
Subject: sched: Remove obsolete comment in sched_init()

Remove the comment about calling alloc_bootmem() as it is not
called here since commit 36b7b6d465489c4754c4fd66fcec6086eba87896.

Signed-off-by: Jayson R. King <dev@jaysonking.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Kosina <trivial@kernel.org>
LKML-Reference: <4AC9C8A6.6010209@jaysonking.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 830967e18285..a56446d7fda2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9322,10 +9322,6 @@ void __init sched_init(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-	/*
-	 * As sched_init() is called before page_alloc is setup,
-	 * we use alloc_bootmem().
-	 */
 	if (alloc_size) {
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
-- 
cgit v1.2.3-71-gd317


From 26a50744b21fff65bd754874072857bee8967f4d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 6 Oct 2009 01:09:50 -0500
Subject: tracing/events: Add 'signed' field to format files

The sign info used for filters in the kernel is also useful to
applications that process the trace stream.  Add it to the format
files and make it available to userspace.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: rostedt@goodmis.org
Cc: lizf@cn.fujitsu.com
Cc: hch@infradead.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1254809398-8078-2-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/ftrace.h              | 15 +++++++++------
 kernel/trace/ring_buffer.c          | 15 +++++++++------
 kernel/trace/trace_events.c         | 24 ++++++++++++------------
 kernel/trace/trace_export.c         | 25 ++++++++++++++-----------
 kernel/trace/trace_syscalls.c       | 20 +++++++++++++-------
 tools/perf/util/trace-event-parse.c | 24 ++++++++++++++++++++++++
 tools/perf/util/trace-event.h       |  1 +
 7 files changed, 82 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index cc0d9667e182..c9bbcab95fbe 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -120,9 +120,10 @@
 #undef __field
 #define __field(type, item)					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
+			       "offset:%u;\tsize:%u;\tsigned:%u;\n",	\
 			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
+			       (unsigned int)sizeof(field.item),	\
+			       (unsigned int)is_signed_type(type));	\
 	if (!ret)							\
 		return 0;
 
@@ -132,19 +133,21 @@
 #undef __array
 #define __array(type, item, len)						\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
+			       "offset:%u;\tsize:%u;\tsigned:%u;\n",	\
 			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
+			       (unsigned int)sizeof(field.item),	\
+			       (unsigned int)is_signed_type(type));	\
 	if (!ret)							\
 		return 0;
 
 #undef __dynamic_array
 #define __dynamic_array(type, item, len)				       \
 	ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\
-			       "offset:%u;\tsize:%u;\n",		       \
+			       "offset:%u;\tsize:%u;\tsigned:%u;\n",	       \
 			       (unsigned int)offsetof(typeof(field),	       \
 					__data_loc_##item),		       \
-			       (unsigned int)sizeof(field.__data_loc_##item)); \
+			       (unsigned int)sizeof(field.__data_loc_##item), \
+			       (unsigned int)is_signed_type(type));	\
 	if (!ret)							       \
 		return 0;
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d4ff01970547..e43c928356ee 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 	int ret;
 
 	ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
-			       "offset:0;\tsize:%u;\n",
-			       (unsigned int)sizeof(field.time_stamp));
+			       "offset:0;\tsize:%u;\tsigned:%u;\n",
+			       (unsigned int)sizeof(field.time_stamp),
+			       (unsigned int)is_signed_type(u64));
 
 	ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
-			       "offset:%u;\tsize:%u;\n",
+			       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 			       (unsigned int)offsetof(typeof(field), commit),
-			       (unsigned int)sizeof(field.commit));
+			       (unsigned int)sizeof(field.commit),
+			       (unsigned int)is_signed_type(long));
 
 	ret = trace_seq_printf(s, "\tfield: char data;\t"
-			       "offset:%u;\tsize:%u;\n",
+			       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 			       (unsigned int)offsetof(typeof(field), data),
-			       (unsigned int)BUF_PAGE_SIZE);
+			       (unsigned int)BUF_PAGE_SIZE,
+			       (unsigned int)is_signed_type(char));
 
 	return ret;
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d128f65778e6..cf3cabf6ce14 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -507,7 +507,7 @@ extern char *__bad_type_size(void);
 #define FIELD(type, name)						\
 	sizeof(type) != sizeof(field.name) ? __bad_type_size() :	\
 	#type, "common_" #name, offsetof(typeof(field), name),		\
-		sizeof(field.name)
+		sizeof(field.name), is_signed_type(type)
 
 static int trace_write_header(struct trace_seq *s)
 {
@@ -515,17 +515,17 @@ static int trace_write_header(struct trace_seq *s)
 
 	/* struct trace_entry */
 	return trace_seq_printf(s,
-				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-				"\n",
-				FIELD(unsigned short, type),
-				FIELD(unsigned char, flags),
-				FIELD(unsigned char, preempt_count),
-				FIELD(int, pid),
-				FIELD(int, lock_depth));
+			"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+			"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+			"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+			"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+			"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+			"\n",
+			FIELD(unsigned short, type),
+			FIELD(unsigned char, flags),
+			FIELD(unsigned char, preempt_count),
+			FIELD(int, pid),
+			FIELD(int, lock_depth));
 }
 
 static ssize_t
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9753fcc61bc5..31da218ee10f 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void)		\
 #undef __field
 #define __field(type, item)						\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%zu;\tsize:%zu;\n",		\
+			       "offset:%zu;\tsize:%zu;\tsigned:%u;\n",	\
 			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
+			       sizeof(field.item), is_signed_type(type)); \
 	if (!ret)							\
 		return 0;
 
 #undef __field_desc
 #define __field_desc(type, container, item)				\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%zu;\tsize:%zu;\n",		\
+			       "offset:%zu;\tsize:%zu;\tsigned:%u;\n",	\
 			       offsetof(typeof(field), container.item),	\
-			       sizeof(field.container.item));		\
+			       sizeof(field.container.item),		\
+			       is_signed_type(type));			\
 	if (!ret)							\
 		return 0;
 
 #undef __array
 #define __array(type, item, len)					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
-			       "offset:%zu;\tsize:%zu;\n",		\
-			       offsetof(typeof(field), item),	\
-			       sizeof(field.item));		\
+			       "offset:%zu;\tsize:%zu;\tsigned:%u;\n",	\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item), is_signed_type(type)); \
 	if (!ret)							\
 		return 0;
 
 #undef __array_desc
 #define __array_desc(type, container, item, len)			\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
-			       "offset:%zu;\tsize:%zu;\n",		\
+			       "offset:%zu;\tsize:%zu;\tsigned:%u;\n",	\
 			       offsetof(typeof(field), container.item),	\
-			       sizeof(field.container.item));		\
+			       sizeof(field.container.item),		\
+			       is_signed_type(type));			\
 	if (!ret)							\
 		return 0;
 
 #undef __dynamic_array
 #define __dynamic_array(type, item)					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%zu;\tsize:0;\n",		\
-			       offsetof(typeof(field), item));		\
+			       "offset:%zu;\tsize:0;\tsigned:%u;\n",	\
+			       offsetof(typeof(field), item),		\
+			       is_signed_type(type));			\
 	if (!ret)							\
 		return 0;
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 527e17eae575..d99abc427c39 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -103,7 +103,8 @@ extern char *__bad_type_size(void);
 #define SYSCALL_FIELD(type, name)					\
 	sizeof(type) != sizeof(trace.name) ?				\
 		__bad_type_size() :					\
-		#type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
+		#type, #name, offsetof(typeof(trace), name),		\
+		sizeof(trace.name), is_signed_type(type)
 
 int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
 {
@@ -120,7 +121,8 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
 	if (!entry)
 		return 0;
 
-	ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
+	ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
+			       "\tsigned:%u;\n",
 			       SYSCALL_FIELD(int, nr));
 	if (!ret)
 		return 0;
@@ -130,8 +132,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
 				        entry->args[i]);
 		if (!ret)
 			return 0;
-		ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
-				       sizeof(unsigned long));
+		ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
+				       "\tsigned:%u;\n", offset,
+				       sizeof(unsigned long),
+				       is_signed_type(unsigned long));
 		if (!ret)
 			return 0;
 		offset += sizeof(unsigned long);
@@ -163,8 +167,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
 	struct syscall_trace_exit trace;
 
 	ret = trace_seq_printf(s,
-			       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-			       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
+			       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
+			       "\tsigned:%u;\n"
+			       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
+			       "\tsigned:%u;\n",
 			       SYSCALL_FIELD(int, nr),
 			       SYSCALL_FIELD(long, ret));
 	if (!ret)
@@ -212,7 +218,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
 	if (ret)
 		return ret;
 
-	ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0,
+	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
 				 FILTER_OTHER);
 
 	return ret;
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 55b41b9e3834..be8412d699a1 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -894,6 +894,21 @@ static int event_read_fields(struct event *event, struct format_field **fields)
 		field->size = strtoul(token, NULL, 0);
 		free_token(token);
 
+		if (read_expected(EVENT_OP, (char *)";") < 0)
+			goto fail_expect;
+
+		if (read_expected(EVENT_ITEM, (char *)"signed") < 0)
+			goto fail_expect;
+
+		if (read_expected(EVENT_OP, (char *)":") < 0)
+			goto fail_expect;
+
+		if (read_expect_type(EVENT_ITEM, &token))
+			goto fail;
+		if (strtoul(token, NULL, 0))
+			field->flags |= FIELD_IS_SIGNED;
+		free_token(token);
+
 		if (read_expected(EVENT_OP, (char *)";") < 0)
 			goto fail_expect;
 
@@ -2843,6 +2858,15 @@ static void parse_header_field(char *type,
 		return;
 	*size = atoi(token);
 	free_token(token);
+	if (read_expected(EVENT_OP, (char *)";") < 0)
+		return;
+	if (read_expected(EVENT_ITEM, (char *)"signed") < 0)
+		return;
+	if (read_expected(EVENT_OP, (char *)":") < 0)
+		return;
+	if (read_expect_type(EVENT_ITEM, &token) < 0)
+		return;
+	free_token(token);
 	if (read_expected(EVENT_OP, (char *)";") < 0)
 		return;
 	if (read_expect_type(EVENT_NEWLINE, &token) < 0)
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 162c3e6deb93..00b440df66d8 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -26,6 +26,7 @@ enum {
 enum format_flags {
 	FIELD_IS_ARRAY		= 1,
 	FIELD_IS_POINTER	= 2,
+	FIELD_IS_SIGNED		= 4,
 };
 
 struct format_field {
-- 
cgit v1.2.3-71-gd317


From 405b2651e4bedf8d3932b64cad649b4d26b067f5 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 7 Oct 2009 18:27:40 -0400
Subject: tracing/kprobes: Add $ prefix to special variables

Add $ prefix to the special variables(e.g. sa, rv) of kprobe-tracer.
This resolves consistency issues between kprobe_events and perf-kprobe.

The main goal is to avoid conflicts between local variable names of
probed functions, used by perf probe, and special variables used
in the kprobe event creation interface (stack values, etc...) and
also available from perf probe.

ie: we don't want rv (return value) to conflict with a local variable
named rv in a probed function.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
LKML-Reference: <20091007222740.1684.91170.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt | 20 ++++++-------
 kernel/trace/trace_kprobe.c         | 60 +++++++++++++++++++++++--------------
 2 files changed, 47 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 9b8f7c6040a7..33f531858c56 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -36,13 +36,13 @@ Synopsis of kprobe_events
 
  FETCHARGS	: Arguments. Each probe can have up to 128 args.
   %REG	: Fetch register REG
-  sN	: Fetch Nth entry of stack (N >= 0)
-  sa	: Fetch stack address.
   @ADDR	: Fetch memory at ADDR (ADDR should be in kernel)
   @SYM[+|-offs]	: Fetch memory at SYM +|- offs (SYM should be a data symbol)
-  aN	: Fetch function argument. (N >= 0)(*)
-  rv	: Fetch return value.(**)
-  ra	: Fetch return address.(**)
+  $sN	: Fetch Nth entry of stack (N >= 0)
+  $sa	: Fetch stack address.
+  $aN	: Fetch function argument. (N >= 0)(*)
+  $rv	: Fetch return value.(**)
+  $ra	: Fetch return address.(**)
   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
   NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
 
@@ -85,13 +85,13 @@ Usage examples
 To add a probe as a new event, write a new definition to kprobe_events
 as below.
 
-  echo p:myprobe do_sys_open dfd=a0 filename=a1 flags=a2 mode=a3 > /sys/kernel/debug/tracing/kprobe_events
+  echo p:myprobe do_sys_open dfd=$a0 filename=$a1 flags=$a2 mode=$a3 > /sys/kernel/debug/tracing/kprobe_events
 
  This sets a kprobe on the top of do_sys_open() function with recording
 1st to 4th arguments as "myprobe" event. As this example shows, users can
 choose more familiar names for each arguments.
 
-  echo r:myretprobe do_sys_open rv ra >> /sys/kernel/debug/tracing/kprobe_events
+  echo r:myretprobe do_sys_open $rv $ra >> /sys/kernel/debug/tracing/kprobe_events
 
  This sets a kretprobe on the return point of do_sys_open() function with
 recording return value and return address as "myretprobe" event.
@@ -138,11 +138,11 @@ events, you need to enable it.
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
            <...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
-           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) rv=fffffffffffffffe ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $rv=fffffffffffffffe $ra=ffffffff81367a3a
            <...>-1447  [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6
-           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) rv=3 ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 $ra=ffffffff81367a3a
            <...>-1447  [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10
-           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) rv=3 ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 $ra=ffffffff81367a3a
 
 
  Each line shows when the kernel hits an event, and <- SYMBOL means kernel
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 97309d4714f7..f63ead0cc5cf 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -220,24 +220,24 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
 	int ret = -EINVAL;
 
 	if (ff->func == fetch_argument)
-		ret = snprintf(buf, n, "a%lu", (unsigned long)ff->data);
+		ret = snprintf(buf, n, "$a%lu", (unsigned long)ff->data);
 	else if (ff->func == fetch_register) {
 		const char *name;
 		name = regs_query_register_name((unsigned int)((long)ff->data));
 		ret = snprintf(buf, n, "%%%s", name);
 	} else if (ff->func == fetch_stack)
-		ret = snprintf(buf, n, "s%lu", (unsigned long)ff->data);
+		ret = snprintf(buf, n, "$s%lu", (unsigned long)ff->data);
 	else if (ff->func == fetch_memory)
 		ret = snprintf(buf, n, "@0x%p", ff->data);
 	else if (ff->func == fetch_symbol) {
 		struct symbol_cache *sc = ff->data;
 		ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
 	} else if (ff->func == fetch_retvalue)
-		ret = snprintf(buf, n, "rv");
+		ret = snprintf(buf, n, "$rv");
 	else if (ff->func == fetch_ip)
-		ret = snprintf(buf, n, "ra");
+		ret = snprintf(buf, n, "$ra");
 	else if (ff->func == fetch_stack_address)
-		ret = snprintf(buf, n, "sa");
+		ret = snprintf(buf, n, "$sa");
 	else if (ff->func == fetch_indirect) {
 		struct indirect_fetch_data *id = ff->data;
 		size_t l = 0;
@@ -429,12 +429,10 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
 #define PARAM_MAX_ARGS 16
 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
 
-static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
 {
 	int ret = 0;
 	unsigned long param;
-	long offset;
-	char *tmp;
 
 	switch (arg[0]) {
 	case 'a':	/* argument */
@@ -456,14 +454,6 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 		} else
 			ret = -EINVAL;
 		break;
-	case '%':	/* named register */
-		ret = regs_query_register_offset(arg + 1);
-		if (ret >= 0) {
-			ff->func = fetch_register;
-			ff->data = (void *)(unsigned long)ret;
-			ret = 0;
-		}
-		break;
 	case 's':	/* stack */
 		if (arg[1] == 'a') {
 			ff->func = fetch_stack_address;
@@ -478,6 +468,31 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 			}
 		}
 		break;
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+	int ret = 0;
+	unsigned long param;
+	long offset;
+	char *tmp;
+
+	switch (arg[0]) {
+	case '$':
+		ret = parse_probe_vars(arg + 1, ff, is_return);
+		break;
+	case '%':	/* named register */
+		ret = regs_query_register_offset(arg + 1);
+		if (ret >= 0) {
+			ff->func = fetch_register;
+			ff->data = (void *)(unsigned long)ret;
+			ret = 0;
+		}
+		break;
 	case '@':	/* memory or symbol */
 		if (isdigit(arg[1])) {
 			ret = strict_strtoul(arg + 1, 0, &param);
@@ -489,8 +504,7 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 			ret = split_symbol_offset(arg + 1, &offset);
 			if (ret)
 				break;
-			ff->data = alloc_symbol_cache(arg + 1,
-							      offset);
+			ff->data = alloc_symbol_cache(arg + 1, offset);
 			if (ff->data)
 				ff->func = fetch_symbol;
 			else
@@ -544,11 +558,11 @@ static int create_trace_probe(int argc, char **argv)
 	 *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
 	 *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
 	 * Fetch args:
-	 *  aN	: fetch Nth of function argument. (N:0-)
-	 *  rv	: fetch return value
-	 *  ra	: fetch return address
-	 *  sa	: fetch stack address
-	 *  sN	: fetch Nth of stack (N:0-)
+	 *  $aN	: fetch Nth of function argument. (N:0-)
+	 *  $rv	: fetch return value
+	 *  $ra	: fetch return address
+	 *  $sa	: fetch stack address
+	 *  $sN	: fetch Nth of stack (N:0-)
 	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
 	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
 	 *  %REG	: fetch register REG
-- 
cgit v1.2.3-71-gd317


From 99329c44f28a1b7ac83beebfb4319e612042e319 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 7 Oct 2009 18:27:48 -0400
Subject: tracing/kprobes: Remove '$ra' special variable

Remove '$ra' (return address) because it is already shown at the head of
each entry.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
LKML-Reference: <20091007222748.1684.12711.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt | 13 ++++++-------
 kernel/trace/trace_kprobe.c         | 11 -----------
 2 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 33f531858c56..4208253b5a53 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -1,4 +1,4 @@
-                         Kprobe-based Event Tracer
+                        Kprobe-based Event Tracer
                          =========================
 
                  Documentation is written by Masami Hiramatsu
@@ -42,7 +42,6 @@ Synopsis of kprobe_events
   $sa	: Fetch stack address.
   $aN	: Fetch function argument. (N >= 0)(*)
   $rv	: Fetch return value.(**)
-  $ra	: Fetch return address.(**)
   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
   NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
 
@@ -91,10 +90,10 @@ as below.
 1st to 4th arguments as "myprobe" event. As this example shows, users can
 choose more familiar names for each arguments.
 
-  echo r:myretprobe do_sys_open $rv $ra >> /sys/kernel/debug/tracing/kprobe_events
+  echo r:myretprobe do_sys_open $rv >> /sys/kernel/debug/tracing/kprobe_events
 
  This sets a kretprobe on the return point of do_sys_open() function with
-recording return value and return address as "myretprobe" event.
+recording return value as "myretprobe" event.
  You can see the format of these events via
 /sys/kernel/debug/tracing/events/kprobes/<EVENT>/format.
 
@@ -138,11 +137,11 @@ events, you need to enable it.
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
            <...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
-           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $rv=fffffffffffffffe $ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $rv=fffffffffffffffe
            <...>-1447  [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6
-           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 $ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3
            <...>-1447  [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10
-           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 $ra=ffffffff81367a3a
+           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3
 
 
  Each line shows when the kernel hits an event, and <- SYMBOL means kernel
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f63ead0cc5cf..ba6d3bd48889 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -85,11 +85,6 @@ static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
 	return regs_return_value(regs);
 }
 
-static __kprobes unsigned long fetch_ip(struct pt_regs *regs, void *dummy)
-{
-	return instruction_pointer(regs);
-}
-
 static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
 						   void *dummy)
 {
@@ -234,8 +229,6 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
 		ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
 	} else if (ff->func == fetch_retvalue)
 		ret = snprintf(buf, n, "$rv");
-	else if (ff->func == fetch_ip)
-		ret = snprintf(buf, n, "$ra");
 	else if (ff->func == fetch_stack_address)
 		ret = snprintf(buf, n, "$sa");
 	else if (ff->func == fetch_indirect) {
@@ -448,9 +441,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
 		if (is_return && arg[1] == 'v') {
 			ff->func = fetch_retvalue;
 			ff->data = NULL;
-		} else if (is_return && arg[1] == 'a') {
-			ff->func = fetch_ip;
-			ff->data = NULL;
 		} else
 			ret = -EINVAL;
 		break;
@@ -560,7 +550,6 @@ static int create_trace_probe(int argc, char **argv)
 	 * Fetch args:
 	 *  $aN	: fetch Nth of function argument. (N:0-)
 	 *  $rv	: fetch return value
-	 *  $ra	: fetch return address
 	 *  $sa	: fetch stack address
 	 *  $sN	: fetch Nth of stack (N:0-)
 	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
-- 
cgit v1.2.3-71-gd317


From 369bc18f9a6c4e2686204c1d7476ab684a720968 Mon Sep 17 00:00:00 2001
From: Stefan Assmann <sassmann@redhat.com>
Date: Mon, 12 Oct 2009 22:17:21 +0200
Subject: ftrace: add kernel command line graph function filtering

Add a command line parameter to allow limiting the function graphs
that are traced on boot up from the given top-level callers , when
ftrace=function_graph is specified.

This patch adds the following command line option:
ftrace_graph_filter=function-list

Where function-list is a comma separated list of functions to filter.

[fweisbec@gmail.com: picked the documentation changes from the v2 patch]

Signed-off-by: Stefan Assmann <sassmann@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4AD2DEB9.2@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/kernel-parameters.txt |  7 +++++++
 kernel/trace/ftrace.c               | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6fa7292947e5..1dc4b9cc20e5 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -778,6 +778,13 @@ and is between 256 and 4096 characters. It is defined in the file
 			by the set_ftrace_notrace file in the debugfs
 			tracing directory.
 
+	ftrace_graph_filter=[function-list]
+			[FTRACE] Limit the top level callers functions traced
+			by the function graph tracer at boot up.
+			function-list is a comma separated list of functions
+			that can be changed at run time by the
+			set_graph_function file in the debugfs tracing directory.
+
 	gamecon.map[2|3]=
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
 			support via parallel port (up to 5 devices per port)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9a72853a8f0a..91283d40821e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -78,6 +78,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
 ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
+#endif
+
 static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_ops *op = ftrace_list;
@@ -2248,6 +2252,7 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
 #define FTRACE_FILTER_SIZE		COMMAND_LINE_SIZE
 static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
 static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
 
 static int __init set_ftrace_notrace(char *str)
 {
@@ -2263,6 +2268,31 @@ static int __init set_ftrace_filter(char *str)
 }
 __setup("ftrace_filter=", set_ftrace_filter);
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int __init set_graph_function(char *str)
+{
+	strncpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_graph_filter=", set_graph_function);
+
+static void __init set_ftrace_early_graph(char *buf)
+{
+	int ret;
+	char *func;
+
+	while (buf) {
+		func = strsep(&buf, ",");
+		/* we allow only one expression at a time */
+		ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
+				      func);
+		if (ret)
+			printk(KERN_DEBUG "ftrace: function %s not "
+					  "traceable\n", func);
+	}
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 static void __init set_ftrace_early_filter(char *buf, int enable)
 {
 	char *func;
@@ -2279,6 +2309,10 @@ static void __init set_ftrace_early_filters(void)
 		set_ftrace_early_filter(ftrace_filter_buf, 1);
 	if (ftrace_notrace_buf[0])
 		set_ftrace_early_filter(ftrace_notrace_buf, 0);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	if (ftrace_graph_buf[0])
+		set_ftrace_early_graph(ftrace_graph_buf);
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 }
 
 static int
-- 
cgit v1.2.3-71-gd317


From 2e06ff6389aedafc4a3a374344ac70672252f9b5 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 7 Oct 2009 18:27:59 -0400
Subject: tracing/kprobes: Make special variable names more self-explainable

Rename special variables to more self-explainable names as below:
- $rv to $retval
- $sa to $stack
- $aN to $argN
- $sN to $stackN

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
LKML-Reference: <20091007222759.1684.3319.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/trace/kprobetrace.txt | 22 ++++++++--------
 kernel/trace/trace_kprobe.c         | 52 +++++++++++++++++--------------------
 2 files changed, 35 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 4208253b5a53..15415243a9a3 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -35,13 +35,13 @@ Synopsis of kprobe_events
  MEMADDR	: Address where the probe is inserted.
 
  FETCHARGS	: Arguments. Each probe can have up to 128 args.
-  %REG	: Fetch register REG
-  @ADDR	: Fetch memory at ADDR (ADDR should be in kernel)
+  %REG		: Fetch register REG
+  @ADDR		: Fetch memory at ADDR (ADDR should be in kernel)
   @SYM[+|-offs]	: Fetch memory at SYM +|- offs (SYM should be a data symbol)
-  $sN	: Fetch Nth entry of stack (N >= 0)
-  $sa	: Fetch stack address.
-  $aN	: Fetch function argument. (N >= 0)(*)
-  $rv	: Fetch return value.(**)
+  $stackN	: Fetch Nth entry of stack (N >= 0)
+  $stack	: Fetch stack address.
+  $argN		: Fetch function argument. (N >= 0)(*)
+  $retval	: Fetch return value.(**)
   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
   NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
 
@@ -84,13 +84,13 @@ Usage examples
 To add a probe as a new event, write a new definition to kprobe_events
 as below.
 
-  echo p:myprobe do_sys_open dfd=$a0 filename=$a1 flags=$a2 mode=$a3 > /sys/kernel/debug/tracing/kprobe_events
+  echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events
 
  This sets a kprobe on the top of do_sys_open() function with recording
 1st to 4th arguments as "myprobe" event. As this example shows, users can
 choose more familiar names for each arguments.
 
-  echo r:myretprobe do_sys_open $rv >> /sys/kernel/debug/tracing/kprobe_events
+  echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events
 
  This sets a kretprobe on the return point of do_sys_open() function with
 recording return value as "myretprobe" event.
@@ -137,11 +137,11 @@ events, you need to enable it.
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
            <...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
-           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $rv=fffffffffffffffe
+           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $retval=fffffffffffffffe
            <...>-1447  [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6
-           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3
+           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
            <...>-1447  [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10
-           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3
+           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
 
 
  Each line shows when the kernel hits an event, and <- SYMBOL means kernel
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ba6d3bd48889..3313fa74ce5f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -215,22 +215,22 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
 	int ret = -EINVAL;
 
 	if (ff->func == fetch_argument)
-		ret = snprintf(buf, n, "$a%lu", (unsigned long)ff->data);
+		ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
 	else if (ff->func == fetch_register) {
 		const char *name;
 		name = regs_query_register_name((unsigned int)((long)ff->data));
 		ret = snprintf(buf, n, "%%%s", name);
 	} else if (ff->func == fetch_stack)
-		ret = snprintf(buf, n, "$s%lu", (unsigned long)ff->data);
+		ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
 	else if (ff->func == fetch_memory)
 		ret = snprintf(buf, n, "@0x%p", ff->data);
 	else if (ff->func == fetch_symbol) {
 		struct symbol_cache *sc = ff->data;
 		ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
 	} else if (ff->func == fetch_retvalue)
-		ret = snprintf(buf, n, "$rv");
+		ret = snprintf(buf, n, "$retval");
 	else if (ff->func == fetch_stack_address)
-		ret = snprintf(buf, n, "$sa");
+		ret = snprintf(buf, n, "$stack");
 	else if (ff->func == fetch_indirect) {
 		struct indirect_fetch_data *id = ff->data;
 		size_t l = 0;
@@ -427,40 +427,36 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
 	int ret = 0;
 	unsigned long param;
 
-	switch (arg[0]) {
-	case 'a':	/* argument */
-		ret = strict_strtoul(arg + 1, 10, &param);
-		if (ret || param > PARAM_MAX_ARGS)
-			ret = -EINVAL;
-		else {
-			ff->func = fetch_argument;
-			ff->data = (void *)param;
-		}
-		break;
-	case 'r':	/* retval or retaddr */
-		if (is_return && arg[1] == 'v') {
+	if (strcmp(arg, "retval") == 0) {
+		if (is_return) {
 			ff->func = fetch_retvalue;
 			ff->data = NULL;
 		} else
 			ret = -EINVAL;
-		break;
-	case 's':	/* stack */
-		if (arg[1] == 'a') {
+	} else if (strncmp(arg, "stack", 5) == 0) {
+		if (arg[5] == '\0') {
 			ff->func = fetch_stack_address;
 			ff->data = NULL;
-		} else {
-			ret = strict_strtoul(arg + 1, 10, &param);
+		} else if (isdigit(arg[5])) {
+			ret = strict_strtoul(arg + 5, 10, &param);
 			if (ret || param > PARAM_MAX_STACK)
 				ret = -EINVAL;
 			else {
 				ff->func = fetch_stack;
 				ff->data = (void *)param;
 			}
+		} else
+			ret = -EINVAL;
+	} else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
+		ret = strict_strtoul(arg + 3, 10, &param);
+		if (ret || param > PARAM_MAX_ARGS)
+			ret = -EINVAL;
+		else {
+			ff->func = fetch_argument;
+			ff->data = (void *)param;
 		}
-		break;
-	default:
+	} else
 		ret = -EINVAL;
-	}
 	return ret;
 }
 
@@ -548,10 +544,10 @@ static int create_trace_probe(int argc, char **argv)
 	 *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
 	 *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
 	 * Fetch args:
-	 *  $aN	: fetch Nth of function argument. (N:0-)
-	 *  $rv	: fetch return value
-	 *  $sa	: fetch stack address
-	 *  $sN	: fetch Nth of stack (N:0-)
+	 *  $argN	: fetch Nth of function argument. (N:0-)
+	 *  $retval	: fetch return value
+	 *  $stack	: fetch stack address
+	 *  $stackN	: fetch Nth of stack (N:0-)
 	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
 	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
 	 *  %REG	: fetch register REG
-- 
cgit v1.2.3-71-gd317


From a703d946e883d8e447d0597de556e2effd110372 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 7 Oct 2009 18:28:07 -0400
Subject: tracing/kprobes: Avoid field name confliction

Check whether the argument name is in conflict with other field names
while creating a kprobe through the debugfs interface.

Changes in v3:
 - Check strcmp() == 0 instead of !strcmp().

Changes in v2:
 - Add common_lock_depth to reserved name list.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
LKML-Reference: <20091007222807.1684.26880.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 65 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3313fa74ce5f..bb6cb2bc6fae 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -38,6 +38,25 @@
 #define MAX_EVENT_NAME_LEN 64
 #define KPROBE_EVENT_SYSTEM "kprobes"
 
+/* Reserved field names */
+#define FIELD_STRING_IP "ip"
+#define FIELD_STRING_NARGS "nargs"
+#define FIELD_STRING_RETIP "ret_ip"
+#define FIELD_STRING_FUNC "func"
+
+const char *reserved_field_names[] = {
+	"common_type",
+	"common_flags",
+	"common_preempt_count",
+	"common_pid",
+	"common_tgid",
+	"common_lock_depth",
+	FIELD_STRING_IP,
+	FIELD_STRING_NARGS,
+	FIELD_STRING_RETIP,
+	FIELD_STRING_FUNC,
+};
+
 /* currently, trace_kprobe only supports X86. */
 
 struct fetch_func {
@@ -537,6 +556,20 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 	return ret;
 }
 
+/* Return 1 if name is reserved or already used by another argument */
+static int conflict_field_name(const char *name,
+			       struct probe_arg *args, int narg)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
+		if (strcmp(reserved_field_names[i], name) == 0)
+			return 1;
+	for (i = 0; i < narg; i++)
+		if (strcmp(args[i].name, name) == 0)
+			return 1;
+	return 0;
+}
+
 static int create_trace_probe(int argc, char **argv)
 {
 	/*
@@ -637,6 +670,12 @@ static int create_trace_probe(int argc, char **argv)
 			*arg++ = '\0';
 		else
 			arg = argv[i];
+
+		if (conflict_field_name(argv[i], tp->args, i)) {
+			ret = -EINVAL;
+			goto error;
+		}
+
 		tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
 
 		/* Parse fetch argument */
@@ -1039,8 +1078,8 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
 	if (!ret)
 		return ret;
 
-	DEFINE_FIELD(unsigned long, ip, "ip", 0);
-	DEFINE_FIELD(int, nargs, "nargs", 1);
+	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+	DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
 	/* Set argument names as fields */
 	for (i = 0; i < tp->nr_args; i++)
 		DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
@@ -1057,9 +1096,9 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
 	if (!ret)
 		return ret;
 
-	DEFINE_FIELD(unsigned long, func, "func", 0);
-	DEFINE_FIELD(unsigned long, ret_ip, "ret_ip", 0);
-	DEFINE_FIELD(int, nargs, "nargs", 1);
+	DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+	DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+	DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
 	/* Set argument names as fields */
 	for (i = 0; i < tp->nr_args; i++)
 		DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
@@ -1108,15 +1147,16 @@ static int kprobe_event_show_format(struct ftrace_event_call *call,
 	int ret, i;
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
-	SHOW_FIELD(unsigned long, ip, "ip");
-	SHOW_FIELD(int, nargs, "nargs");
+	SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
+	SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
 
 	/* Show fields */
 	for (i = 0; i < tp->nr_args; i++)
 		SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
 	trace_seq_puts(s, "\n");
 
-	return __probe_event_show_format(s, tp, "(%lx)", "REC->ip");
+	return __probe_event_show_format(s, tp, "(%lx)",
+					 "REC->" FIELD_STRING_IP);
 }
 
 static int kretprobe_event_show_format(struct ftrace_event_call *call,
@@ -1126,9 +1166,9 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 	int ret, i;
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
-	SHOW_FIELD(unsigned long, func, "func");
-	SHOW_FIELD(unsigned long, ret_ip, "ret_ip");
-	SHOW_FIELD(int, nargs, "nargs");
+	SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
+	SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
+	SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
 
 	/* Show fields */
 	for (i = 0; i < tp->nr_args; i++)
@@ -1136,7 +1176,8 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 	trace_seq_puts(s, "\n");
 
 	return __probe_event_show_format(s, tp, "(%lx <- %lx)",
-					  "REC->func, REC->ret_ip");
+					 "REC->" FIELD_STRING_FUNC
+					 ", REC->" FIELD_STRING_RETIP);
 }
 
 #ifdef CONFIG_EVENT_PROFILE
-- 
cgit v1.2.3-71-gd317


From e93f4d8539d5e9dd59f4af9d8ef4e9b62cfa1f81 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 7 Oct 2009 18:28:14 -0400
Subject: tracing/kprobes: Robustify fixed field names against variable field
 names conflicts

Rename probe-common fixed field names to harder conflictable names,
because current 'ip', 'func', and other probe field names are easily in
conflict with user-specified variable names.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
LKML-Reference: <20091007222814.1684.407.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index bb6cb2bc6fae..739f70e8e924 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -39,10 +39,10 @@
 #define KPROBE_EVENT_SYSTEM "kprobes"
 
 /* Reserved field names */
-#define FIELD_STRING_IP "ip"
-#define FIELD_STRING_NARGS "nargs"
-#define FIELD_STRING_RETIP "ret_ip"
-#define FIELD_STRING_FUNC "func"
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_NARGS "__probe_nargs"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
 
 const char *reserved_field_names[] = {
 	"common_type",
-- 
cgit v1.2.3-71-gd317


From a2e2725541fad72416326798c2d7fa4dafb7d337 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 12 Oct 2009 23:40:10 -0700
Subject: net: Introduce recvmmsg socket syscall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Meaning receive multiple messages, reducing the number of syscalls and
net stack entry/exit operations.

Next patches will introduce mechanisms where protocols that want to
optimize this operation will provide an unlocked_recvmsg operation.

This takes into account comments made by:

. Paul Moore: sock_recvmsg is called only for the first datagram,
  sock_recvmsg_nosec is used for the rest.

. Caitlin Bestler: recvmmsg now has a struct timespec timeout, that
  works in the same fashion as the ppoll one.

  If the underlying protocol returns a datagram with MSG_OOB set, this
  will make recvmmsg return right away with as many datagrams (+ the OOB
  one) it has received so far.

. Rémi Denis-Courmont & Steven Whitehouse: If we receive N < vlen
  datagrams and then recvmsg returns an error, recvmmsg will return
  the successfully received datagrams, store the error and return it
  in the next call.

This paves the way for a subsequent optimization, sk_prot->unlocked_recvmsg,
where we will be able to acquire the lock only at batch start and end, not at
every underlying recvmsg call.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/kernel/systbls.S            |   1 +
 arch/arm/kernel/calls.S                |   1 +
 arch/avr32/kernel/syscall_table.S      |   1 +
 arch/blackfin/mach-common/entry.S      |   1 +
 arch/ia64/kernel/entry.S               |   1 +
 arch/microblaze/kernel/syscall_table.S |   1 +
 arch/mips/kernel/scall32-o32.S         |   1 +
 arch/mips/kernel/scall64-64.S          |   1 +
 arch/mips/kernel/scall64-n32.S         |   1 +
 arch/mips/kernel/scall64-o32.S         |   1 +
 arch/sh/kernel/syscalls_64.S           |   1 +
 arch/sparc/kernel/systbls_32.S         |   2 +-
 arch/sparc/kernel/systbls_64.S         |   4 +-
 arch/x86/ia32/ia32entry.S              |   1 +
 arch/x86/include/asm/unistd_32.h       |   3 +-
 arch/x86/include/asm/unistd_64.h       |   2 +
 arch/x86/kernel/syscall_table_32.S     |   1 +
 arch/xtensa/include/asm/unistd.h       |   4 +-
 include/linux/net.h                    |   1 +
 include/linux/socket.h                 |  10 ++
 include/linux/syscalls.h               |   4 +
 include/net/compat.h                   |   8 ++
 kernel/sys_ni.c                        |   2 +
 net/compat.c                           |  33 ++++-
 net/socket.c                           | 225 +++++++++++++++++++++++++++------
 25 files changed, 261 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S
index 95c9aef1c106..cda6b8b3d573 100644
--- a/arch/alpha/kernel/systbls.S
+++ b/arch/alpha/kernel/systbls.S
@@ -497,6 +497,7 @@ sys_call_table:
 	.quad sys_signalfd
 	.quad sys_ni_syscall
 	.quad sys_eventfd
+	.quad sys_recvmmsg
 
 	.size sys_call_table, . - sys_call_table
 	.type sys_call_table, @object
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index fafce1b5c69f..f58c1156e779 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -374,6 +374,7 @@
 		CALL(sys_pwritev)
 		CALL(sys_rt_tgsigqueueinfo)
 		CALL(sys_perf_event_open)
+/* 365 */	CALL(sys_recvmmsg)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S
index 7ee0057613b3..e76bad16b0f0 100644
--- a/arch/avr32/kernel/syscall_table.S
+++ b/arch/avr32/kernel/syscall_table.S
@@ -295,4 +295,5 @@ sys_call_table:
 	.long	sys_signalfd
 	.long	sys_ni_syscall		/* 280, was sys_timerfd */
 	.long	sys_eventfd
+	.long	sys_recvmmsg
 	.long	sys_ni_syscall		/* r8 is saturated at nr_syscalls */
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index 1e7cac23e25f..48692724b74c 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1621,6 +1621,7 @@ ENTRY(_sys_call_table)
 	.long _sys_pwritev
 	.long _sys_rt_tgsigqueueinfo
 	.long _sys_perf_event_open
+	.long _sys_recvmmsg		/* 370 */
 
 	.rept NR_syscalls-(.-_sys_call_table)/4
 	.long _sys_ni_syscall
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index d0e7d37017b4..d75b872ca4dc 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1806,6 +1806,7 @@ sys_call_table:
 	data8 sys_preadv
 	data8 sys_pwritev			// 1320
 	data8 sys_rt_tgsigqueueinfo
+	data8 sys_recvmmsg
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
 #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S
index ecec19155135..c1ab1dc10898 100644
--- a/arch/microblaze/kernel/syscall_table.S
+++ b/arch/microblaze/kernel/syscall_table.S
@@ -371,3 +371,4 @@ ENTRY(sys_call_table)
 	.long sys_ni_syscall
 	.long sys_rt_tgsigqueueinfo	/* 365 */
 	.long sys_perf_event_open
+	.long sys_recvmmsg
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index fd2a9bb620d6..17202bbe843f 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -583,6 +583,7 @@ einval:	li	v0, -ENOSYS
 	sys	sys_rt_tgsigqueueinfo	4
 	sys	sys_perf_event_open	5
 	sys	sys_accept4		4
+	sys     sys_recvmmsg            5
 	.endm
 
 	/* We pre-compute the number of _instruction_ bytes needed to
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S
index 18bf7f32c5e4..a8a6c596eb04 100644
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -420,4 +420,5 @@ sys_call_table:
 	PTR	sys_rt_tgsigqueueinfo
 	PTR	sys_perf_event_open
 	PTR	sys_accept4
+	PTR     sys_recvmmsg
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 6ebc07976694..5154e64f7cfe 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -418,4 +418,5 @@ EXPORT(sysn32_call_table)
 	PTR	compat_sys_rt_tgsigqueueinfo	/* 5295 */
 	PTR	sys_perf_event_open
 	PTR	sys_accept4
+	PTR     compat_sys_recvmmsg
 	.size	sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index 9bbf9775e0bd..d0eff53d7cb9 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -538,4 +538,5 @@ sys_call_table:
 	PTR	compat_sys_rt_tgsigqueueinfo
 	PTR	sys_perf_event_open
 	PTR	sys_accept4
+	PTR     compat_sys_recvmmsg
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S
index 5bfde6c77498..07d2aaea9ae8 100644
--- a/arch/sh/kernel/syscalls_64.S
+++ b/arch/sh/kernel/syscalls_64.S
@@ -391,3 +391,4 @@ sys_call_table:
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo
 	.long sys_perf_event_open
+	.long sys_recvmmsg		/* 365 */
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index 0f1658d37490..ceb1530f8aa6 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -82,5 +82,5 @@ sys_call_table:
 /*310*/	.long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 /*315*/	.long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-/*325*/	.long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
+/*325*/	.long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg
 
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 009825f6e73c..f37bef747e60 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -83,7 +83,7 @@ sys_call_table32:
 /*310*/	.word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate
 	.word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv
-	.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open
+	.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open, compat_sys_recvmmsg
 
 #endif /* CONFIG_COMPAT */
 
@@ -158,4 +158,4 @@ sys_call_table:
 /*310*/	.word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 	.word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-	.word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
+	.word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 74619c4f9fda..11a6c79d5f46 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -832,4 +832,5 @@ ia32_sys_call_table:
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_event_open
+	.quad compat_sys_recvmmsg
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c209a7e3..3baf379fa840 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,11 @@
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
+#define __NR_recvmmsg		337
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 337
+#define NR_syscalls 338
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0adbc68..4843f7ba754a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 #define __NR_perf_event_open			298
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_recvmmsg				299
+__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..70c2125d55b9 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
+	.long sys_recvmmsg
diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h
index c092c8fbb2cf..4e55dc763021 100644
--- a/arch/xtensa/include/asm/unistd.h
+++ b/arch/xtensa/include/asm/unistd.h
@@ -681,8 +681,10 @@ __SYSCALL(304, sys_signalfd, 3)
 __SYSCALL(305, sys_ni_syscall, 0)
 #define __NR_eventfd				306
 __SYSCALL(306, sys_eventfd, 1)
+#define __NR_recvmmsg				307
+__SYSCALL(307, sys_recvmmsg, 5)
 
-#define __NR_syscall_count			307
+#define __NR_syscall_count			308
 
 /*
  * sysxtensa syscall handler
diff --git a/include/linux/net.h b/include/linux/net.h
index 529a0931711d..b42bb60fe92f 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -41,6 +41,7 @@
 #define SYS_SENDMSG	16		/* sys_sendmsg(2)		*/
 #define SYS_RECVMSG	17		/* sys_recvmsg(2)		*/
 #define SYS_ACCEPT4	18		/* sys_accept4(2)		*/
+#define SYS_RECVMMSG	19		/* sys_recvmmsg(2)		*/
 
 typedef enum {
 	SS_FREE = 0,			/* not allocated		*/
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 3273a0c5043b..59966f12990c 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -65,6 +65,12 @@ struct msghdr {
 	unsigned	msg_flags;
 };
 
+/* For recvmmsg/sendmmsg */
+struct mmsghdr {
+	struct msghdr   msg_hdr;
+	unsigned        msg_len;
+};
+
 /*
  *	POSIX 1003.1g - ancillary data object information
  *	Ancillary data consits of a sequence of pairs of
@@ -312,6 +318,10 @@ extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uadd
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
+struct timespec;
+
+extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+			  unsigned int flags, struct timespec *timeout);
 #endif
 #endif /* not kernel and not glibc */
 #endif /* _LINUX_SOCKET_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a990ace1a838..714f063a3e6d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -25,6 +25,7 @@ struct linux_dirent64;
 struct list_head;
 struct msgbuf;
 struct msghdr;
+struct mmsghdr;
 struct msqid_ds;
 struct new_utsname;
 struct nfsctl_arg;
@@ -677,6 +678,9 @@ asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
 				struct sockaddr __user *, int __user *);
 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
+asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
+			     unsigned int vlen, unsigned flags,
+			     struct timespec __user *timeout);
 asmlinkage long sys_socket(int, int, int);
 asmlinkage long sys_socketpair(int, int, int, int __user *);
 asmlinkage long sys_socketcall(int call, unsigned long __user *args);
diff --git a/include/net/compat.h b/include/net/compat.h
index 7c3002832d05..9679f05e9896 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -18,6 +18,11 @@ struct compat_msghdr {
 	compat_uint_t	msg_flags;
 };
 
+struct compat_mmsghdr {
+	struct compat_msghdr msg_hdr;
+	compat_uint_t        msg_len;
+};
+
 struct compat_cmsghdr {
 	compat_size_t	cmsg_len;
 	compat_int_t	cmsg_level;
@@ -35,6 +40,9 @@ extern int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *);
 extern int verify_compat_iovec(struct msghdr *, struct iovec *, struct sockaddr *, int);
 extern asmlinkage long compat_sys_sendmsg(int,struct compat_msghdr __user *,unsigned);
 extern asmlinkage long compat_sys_recvmsg(int,struct compat_msghdr __user *,unsigned);
+extern asmlinkage long compat_sys_recvmmsg(int, struct compat_mmsghdr __user *,
+					   unsigned, unsigned,
+					   struct timespec __user *);
 extern asmlinkage long compat_sys_getsockopt(int, int, int, char __user *, int __user *);
 extern int put_cmsg_compat(struct msghdr*, int, int, int, void *);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8d1951..f050ba85d420 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,8 +48,10 @@ cond_syscall(sys_shutdown);
 cond_syscall(sys_sendmsg);
 cond_syscall(compat_sys_sendmsg);
 cond_syscall(sys_recvmsg);
+cond_syscall(sys_recvmmsg);
 cond_syscall(compat_sys_recvmsg);
 cond_syscall(compat_sys_recvfrom);
+cond_syscall(compat_sys_recvmmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
diff --git a/net/compat.c b/net/compat.c
index a407c3addbae..e13f5256fd20 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -727,10 +727,10 @@ EXPORT_SYMBOL(compat_mc_getsockopt);
 
 /* Argument list sizes for compat_sys_socketcall */
 #define AL(x) ((x) * sizeof(u32))
-static unsigned char nas[19]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
+static unsigned char nas[20]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 				AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
 				AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
-				AL(4)};
+				AL(4),AL(5)};
 #undef AL
 
 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
@@ -755,13 +755,36 @@ asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len,
 	return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen);
 }
 
+asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+				    unsigned vlen, unsigned int flags,
+				    struct timespec __user *timeout)
+{
+	int datagrams;
+	struct timespec ktspec;
+	struct compat_timespec __user *utspec =
+			(struct compat_timespec __user *)timeout;
+
+	if (get_user(ktspec.tv_sec, &utspec->tv_sec) ||
+	    get_user(ktspec.tv_nsec, &utspec->tv_nsec))
+		return -EFAULT;
+
+	datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+				   flags | MSG_CMSG_COMPAT, &ktspec);
+	if (datagrams > 0 &&
+	    (put_user(ktspec.tv_sec, &utspec->tv_sec) ||
+	     put_user(ktspec.tv_nsec, &utspec->tv_nsec)))
+		datagrams = -EFAULT;
+
+	return datagrams;
+}
+
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 {
 	int ret;
 	u32 a[6];
 	u32 a0, a1;
 
-	if (call < SYS_SOCKET || call > SYS_ACCEPT4)
+	if (call < SYS_SOCKET || call > SYS_RECVMMSG)
 		return -EINVAL;
 	if (copy_from_user(a, args, nas[call]))
 		return -EFAULT;
@@ -823,6 +846,10 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 	case SYS_RECVMSG:
 		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
 		break;
+	case SYS_RECVMMSG:
+		ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
+					  compat_ptr(a[4]));
+		break;
 	case SYS_ACCEPT4:
 		ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
 		break;
diff --git a/net/socket.c b/net/socket.c
index 807935693846..9dff31c9b799 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -683,10 +683,9 @@ void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(sock_recv_ts_and_drops);
 
-static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
-				 struct msghdr *msg, size_t size, int flags)
+static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
+				       struct msghdr *msg, size_t size, int flags)
 {
-	int err;
 	struct sock_iocb *si = kiocb_to_siocb(iocb);
 
 	si->sock = sock;
@@ -695,13 +694,17 @@ static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	si->size = size;
 	si->flags = flags;
 
-	err = security_socket_recvmsg(sock, msg, size, flags);
-	if (err)
-		return err;
-
 	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
 }
 
+static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+				 struct msghdr *msg, size_t size, int flags)
+{
+	int err = security_socket_recvmsg(sock, msg, size, flags);
+
+	return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
+}
+
 int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 		 size_t size, int flags)
 {
@@ -717,6 +720,21 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 	return ret;
 }
 
+static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
+			      size_t size, int flags)
+{
+	struct kiocb iocb;
+	struct sock_iocb siocb;
+	int ret;
+
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&iocb);
+	return ret;
+}
+
 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 		   struct kvec *vec, size_t num, size_t size, int flags)
 {
@@ -1983,22 +2001,15 @@ out:
 	return err;
 }
 
-/*
- *	BSD recvmsg interface
- */
-
-SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
-		unsigned int, flags)
+static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
+			 struct msghdr *msg_sys, unsigned flags, int nosec)
 {
 	struct compat_msghdr __user *msg_compat =
 	    (struct compat_msghdr __user *)msg;
-	struct socket *sock;
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
-	struct msghdr msg_sys;
 	unsigned long cmsg_ptr;
 	int err, iov_size, total_len, len;
-	int fput_needed;
 
 	/* kernel mode address */
 	struct sockaddr_storage addr;
@@ -2008,27 +2019,23 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 	int __user *uaddr_len;
 
 	if (MSG_CMSG_COMPAT & flags) {
-		if (get_compat_msghdr(&msg_sys, msg_compat))
+		if (get_compat_msghdr(msg_sys, msg_compat))
 			return -EFAULT;
 	}
-	else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
+	else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
 		return -EFAULT;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
-
 	err = -EMSGSIZE;
-	if (msg_sys.msg_iovlen > UIO_MAXIOV)
-		goto out_put;
+	if (msg_sys->msg_iovlen > UIO_MAXIOV)
+		goto out;
 
 	/* Check whether to allocate the iovec area */
 	err = -ENOMEM;
-	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
-	if (msg_sys.msg_iovlen > UIO_FASTIOV) {
+	iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
+	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
 		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
 		if (!iov)
-			goto out_put;
+			goto out;
 	}
 
 	/*
@@ -2036,46 +2043,47 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 	 *      kernel msghdr to use the kernel address space)
 	 */
 
-	uaddr = (__force void __user *)msg_sys.msg_name;
+	uaddr = (__force void __user *)msg_sys->msg_name;
 	uaddr_len = COMPAT_NAMELEN(msg);
 	if (MSG_CMSG_COMPAT & flags) {
-		err = verify_compat_iovec(&msg_sys, iov,
+		err = verify_compat_iovec(msg_sys, iov,
 					  (struct sockaddr *)&addr,
 					  VERIFY_WRITE);
 	} else
-		err = verify_iovec(&msg_sys, iov,
+		err = verify_iovec(msg_sys, iov,
 				   (struct sockaddr *)&addr,
 				   VERIFY_WRITE);
 	if (err < 0)
 		goto out_freeiov;
 	total_len = err;
 
-	cmsg_ptr = (unsigned long)msg_sys.msg_control;
-	msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+	cmsg_ptr = (unsigned long)msg_sys->msg_control;
+	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
 
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-	err = sock_recvmsg(sock, &msg_sys, total_len, flags);
+	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
+							  total_len, flags);
 	if (err < 0)
 		goto out_freeiov;
 	len = err;
 
 	if (uaddr != NULL) {
 		err = move_addr_to_user((struct sockaddr *)&addr,
-					msg_sys.msg_namelen, uaddr,
+					msg_sys->msg_namelen, uaddr,
 					uaddr_len);
 		if (err < 0)
 			goto out_freeiov;
 	}
-	err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
+	err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
 			 COMPAT_FLAGS(msg));
 	if (err)
 		goto out_freeiov;
 	if (MSG_CMSG_COMPAT & flags)
-		err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg_compat->msg_controllen);
 	else
-		err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg->msg_controllen);
 	if (err)
 		goto out_freeiov;
@@ -2084,21 +2092,150 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 out_freeiov:
 	if (iov != iovstack)
 		sock_kfree_s(sock->sk, iov, iov_size);
-out_put:
+out:
+	return err;
+}
+
+/*
+ *	BSD recvmsg interface
+ */
+
+SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
+		unsigned int, flags)
+{
+	int fput_needed, err;
+	struct msghdr msg_sys;
+	struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
+
+	if (!sock)
+		goto out;
+
+	err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+
 	fput_light(sock->file, fput_needed);
 out:
 	return err;
 }
 
-#ifdef __ARCH_WANT_SYS_SOCKETCALL
+/*
+ *     Linux recvmmsg interface
+ */
+
+int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+		   unsigned int flags, struct timespec *timeout)
+{
+	int fput_needed, err, datagrams;
+	struct socket *sock;
+	struct mmsghdr __user *entry;
+	struct msghdr msg_sys;
+	struct timespec end_time;
+
+	if (timeout &&
+	    poll_select_set_timeout(&end_time, timeout->tv_sec,
+				    timeout->tv_nsec))
+		return -EINVAL;
+
+	datagrams = 0;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = sock_error(sock->sk);
+	if (err)
+		goto out_put;
+
+	entry = mmsg;
+
+	while (datagrams < vlen) {
+		/*
+		 * No need to ask LSM for more than the first datagram.
+		 */
+		err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
+				    &msg_sys, flags, datagrams);
+		if (err < 0)
+			break;
+		err = put_user(err, &entry->msg_len);
+		if (err)
+			break;
+		++entry;
+		++datagrams;
+
+		if (timeout) {
+			ktime_get_ts(timeout);
+			*timeout = timespec_sub(end_time, *timeout);
+			if (timeout->tv_sec < 0) {
+				timeout->tv_sec = timeout->tv_nsec = 0;
+				break;
+			}
+
+			/* Timeout, return less than vlen datagrams */
+			if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
+				break;
+		}
+
+		/* Out of band data, return right away */
+		if (msg_sys.msg_flags & MSG_OOB)
+			break;
+	}
+
+out_put:
+	fput_light(sock->file, fput_needed);
 
+	if (err == 0)
+		return datagrams;
+
+	if (datagrams != 0) {
+		/*
+		 * We may return less entries than requested (vlen) if the
+		 * sock is non block and there aren't enough datagrams...
+		 */
+		if (err != -EAGAIN) {
+			/*
+			 * ... or  if recvmsg returns an error after we
+			 * received some datagrams, where we record the
+			 * error to return on the next call or if the
+			 * app asks about it using getsockopt(SO_ERROR).
+			 */
+			sock->sk->sk_err = -err;
+		}
+
+		return datagrams;
+	}
+
+	return err;
+}
+
+SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
+		unsigned int, vlen, unsigned int, flags,
+		struct timespec __user *, timeout)
+{
+	int datagrams;
+	struct timespec timeout_sys;
+
+	if (!timeout)
+		return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
+
+	if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
+		return -EFAULT;
+
+	datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
+
+	if (datagrams > 0 &&
+	    copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
+		datagrams = -EFAULT;
+
+	return datagrams;
+}
+
+#ifdef __ARCH_WANT_SYS_SOCKETCALL
 /* Argument list sizes for sys_socketcall */
 #define AL(x) ((x) * sizeof(unsigned long))
-static const unsigned char nargs[19]={
+static const unsigned char nargs[20] = {
 	AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 	AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
 	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
-	AL(4)
+	AL(4),AL(5)
 };
 
 #undef AL
@@ -2118,7 +2255,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 	int err;
 	unsigned int len;
 
-	if (call < 1 || call > SYS_ACCEPT4)
+	if (call < 1 || call > SYS_RECVMMSG)
 		return -EINVAL;
 
 	len = nargs[call];
@@ -2196,6 +2333,10 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 	case SYS_RECVMSG:
 		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
 		break;
+	case SYS_RECVMMSG:
+		err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
+				   (struct timespec __user *)a[4]);
+		break;
 	case SYS_ACCEPT4:
 		err = sys_accept4(a0, (struct sockaddr __user *)a1,
 				  (int __user *)a[2], a[3]);
-- 
cgit v1.2.3-71-gd317


From aef6f81b55f462082699c06e8e67e6eb5630ed45 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 12 Oct 2009 22:23:24 +0200
Subject: tracing: Rename set_ftrace to set_bootup_ftrace

Do this rename because set_ftrace is too much generic and not enough
self-explainable as a name.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45068269ebb1..866daf8497f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf);
 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
 static char *default_bootup_tracer;
 
-static int __init set_ftrace(char *str)
+static int __init set_bootup_ftrace(char *str)
 {
 	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +137,7 @@ static int __init set_ftrace(char *str)
 	ring_buffer_expanded = 1;
 	return 1;
 }
-__setup("ftrace=", set_ftrace);
+__setup("ftrace=", set_bootup_ftrace);
 
 static int __init set_ftrace_dump_on_oops(char *str)
 {
-- 
cgit v1.2.3-71-gd317


From bf7c5b43a12614847b83f507fb169ad30640e406 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 12 Oct 2009 22:31:32 +0200
Subject: tracing: Remove unused ftrace_trace_addr helper

Remove the ftrace_trace_addr() function as only its off-case is
implemented and there are no users of it currently.

But we keep ftrace_graph_addr() off-case, in case someone come to use
the function graph tracer to profit from top-level callers filtering.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 365fb19d9e11..f22a7ac32380 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -483,10 +483,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
 	return 0;
 }
 #else
-static inline int ftrace_trace_addr(unsigned long addr)
-{
-	return 1;
-}
 static inline int ftrace_graph_addr(unsigned long addr)
 {
 	return 1;
-- 
cgit v1.2.3-71-gd317


From 825332e4ff1373c55d931b49408df7ec2298f71e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Wed, 14 Oct 2009 08:17:36 +1100
Subject: capabilities: simplify bound checks for copy_from_user()

The capabilities syscall has a copy_from_user() call where gcc currently
cannot prove to itself that the copy is always within bounds.

This patch adds a very explicity bound check to prove to gcc that this
copy_from_user cannot overflow its destination buffer.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/capability.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f5..c2316d3fa094 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -238,7 +238,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 {
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
-	unsigned i, tocopy;
+	unsigned i, tocopy, copybytes;
 	kernel_cap_t inheritable, permitted, effective;
 	struct cred *new;
 	int ret;
@@ -255,8 +255,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 	if (pid != 0 && pid != task_pid_vnr(current))
 		return -EPERM;
 
-	if (copy_from_user(&kdata, data,
-			   tocopy * sizeof(struct __user_cap_data_struct)))
+	copybytes = tocopy * sizeof(struct __user_cap_data_struct);
+	if (copybytes > sizeof(kdata))
+		return -EFAULT;
+
+	if (copy_from_user(&kdata, data, copybytes))
 		return -EFAULT;
 
 	for (i = 0; i < tocopy; i++) {
-- 
cgit v1.2.3-71-gd317


From 756d17ee7ee4fbc8238bdf97100af63e6ac441ef Mon Sep 17 00:00:00 2001
From: "jolsa@redhat.com" <jolsa@redhat.com>
Date: Tue, 13 Oct 2009 16:33:52 -0400
Subject: tracing: Support multiple pids in set_pid_ftrace file

Adding the possibility to set more than 1 pid in the set_pid_ftrace
file, thus allowing to trace more than 1 independent processes.

Usage:

 sh-4.0# echo 284 > ./set_ftrace_pid
 sh-4.0# cat ./set_ftrace_pid
 284
 sh-4.0# echo 1 >> ./set_ftrace_pid
 sh-4.0# echo 0 >> ./set_ftrace_pid
 sh-4.0# cat ./set_ftrace_pid
 swapper tasks
 1
 284
 sh-4.0# echo 4 > ./set_ftrace_pid
 sh-4.0# cat ./set_ftrace_pid
 4
 sh-4.0# echo > ./set_ftrace_pid
 sh-4.0# cat ./set_ftrace_pid
 no pid
 sh-4.0#

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091013203425.565454612@goodmis.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 234 +++++++++++++++++++++++++++++++++++---------------
 kernel/trace/trace.h  |   4 +-
 2 files changed, 167 insertions(+), 71 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 45c965919cff..0c799d1af702 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -60,6 +60,13 @@ static int last_ftrace_enabled;
 /* Quick disabling of function tracer. */
 int function_trace_stop;
 
+/* List for set_ftrace_pid's pids. */
+LIST_HEAD(ftrace_pids);
+struct ftrace_pid {
+	struct list_head list;
+	struct pid *pid;
+};
+
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -159,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 		else
 			func = ftrace_list_func;
 
-		if (ftrace_pid_trace) {
+		if (!list_empty(&ftrace_pids)) {
 			set_ftrace_pid_function(func);
 			func = ftrace_pid_func;
 		}
@@ -207,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 		if (ftrace_list->next == &ftrace_list_end) {
 			ftrace_func_t func = ftrace_list->func;
 
-			if (ftrace_pid_trace) {
+			if (!list_empty(&ftrace_pids)) {
 				set_ftrace_pid_function(func);
 				func = ftrace_pid_func;
 			}
@@ -235,7 +242,7 @@ static void ftrace_update_pid_func(void)
 	func = __ftrace_trace_function;
 #endif
 
-	if (ftrace_pid_trace) {
+	if (!list_empty(&ftrace_pids)) {
 		set_ftrace_pid_function(func);
 		func = ftrace_pid_func;
 	} else {
@@ -825,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
 }
 #endif /* CONFIG_FUNCTION_PROFILER */
 
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -2758,23 +2763,6 @@ static inline void ftrace_startup_enable(int command) { }
 # define ftrace_shutdown_sysctl()	do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
-static ssize_t
-ftrace_pid_read(struct file *file, char __user *ubuf,
-		       size_t cnt, loff_t *ppos)
-{
-	char buf[64];
-	int r;
-
-	if (ftrace_pid_trace == ftrace_swapper_pid)
-		r = sprintf(buf, "swapper tasks\n");
-	else if (ftrace_pid_trace)
-		r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
-	else
-		r = sprintf(buf, "no pid\n");
-
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
 static void clear_ftrace_swapper(void)
 {
 	struct task_struct *p;
@@ -2825,14 +2813,12 @@ static void set_ftrace_pid(struct pid *pid)
 	rcu_read_unlock();
 }
 
-static void clear_ftrace_pid_task(struct pid **pid)
+static void clear_ftrace_pid_task(struct pid *pid)
 {
-	if (*pid == ftrace_swapper_pid)
+	if (pid == ftrace_swapper_pid)
 		clear_ftrace_swapper();
 	else
-		clear_ftrace_pid(*pid);
-
-	*pid = NULL;
+		clear_ftrace_pid(pid);
 }
 
 static void set_ftrace_pid_task(struct pid *pid)
@@ -2843,11 +2829,140 @@ static void set_ftrace_pid_task(struct pid *pid)
 		set_ftrace_pid(pid);
 }
 
+static int ftrace_pid_add(int p)
+{
+	struct pid *pid;
+	struct ftrace_pid *fpid;
+	int ret = -EINVAL;
+
+	mutex_lock(&ftrace_lock);
+
+	if (!p)
+		pid = ftrace_swapper_pid;
+	else
+		pid = find_get_pid(p);
+
+	if (!pid)
+		goto out;
+
+	ret = 0;
+
+	list_for_each_entry(fpid, &ftrace_pids, list)
+		if (fpid->pid == pid)
+			goto out_put;
+
+	ret = -ENOMEM;
+
+	fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
+	if (!fpid)
+		goto out_put;
+
+	list_add(&fpid->list, &ftrace_pids);
+	fpid->pid = pid;
+
+	set_ftrace_pid_task(pid);
+
+	ftrace_update_pid_func();
+	ftrace_startup_enable(0);
+
+	mutex_unlock(&ftrace_lock);
+	return 0;
+
+out_put:
+	if (pid != ftrace_swapper_pid)
+		put_pid(pid);
+
+out:
+	mutex_unlock(&ftrace_lock);
+	return ret;
+}
+
+static void ftrace_pid_reset(void)
+{
+	struct ftrace_pid *fpid, *safe;
+
+	mutex_lock(&ftrace_lock);
+	list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
+		struct pid *pid = fpid->pid;
+
+		clear_ftrace_pid_task(pid);
+
+		list_del(&fpid->list);
+		kfree(fpid);
+	}
+
+	ftrace_update_pid_func();
+	ftrace_startup_enable(0);
+
+	mutex_unlock(&ftrace_lock);
+}
+
+static void *fpid_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&ftrace_lock);
+
+	if (list_empty(&ftrace_pids) && (!*pos))
+		return (void *) 1;
+
+	return seq_list_start(&ftrace_pids, *pos);
+}
+
+static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	if (v == (void *)1)
+		return NULL;
+
+	return seq_list_next(v, &ftrace_pids, pos);
+}
+
+static void fpid_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&ftrace_lock);
+}
+
+static int fpid_show(struct seq_file *m, void *v)
+{
+	const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
+
+	if (v == (void *)1) {
+		seq_printf(m, "no pid\n");
+		return 0;
+	}
+
+	if (fpid->pid == ftrace_swapper_pid)
+		seq_printf(m, "swapper tasks\n");
+	else
+		seq_printf(m, "%u\n", pid_vnr(fpid->pid));
+
+	return 0;
+}
+
+static const struct seq_operations ftrace_pid_sops = {
+	.start = fpid_start,
+	.next = fpid_next,
+	.stop = fpid_stop,
+	.show = fpid_show,
+};
+
+static int
+ftrace_pid_open(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (file->f_flags & O_TRUNC))
+		ftrace_pid_reset();
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_open(file, &ftrace_pid_sops);
+
+	return ret;
+}
+
 static ssize_t
 ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
-	struct pid *pid;
 	char buf[64];
 	long val;
 	int ret;
@@ -2860,57 +2975,38 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 
 	buf[cnt] = 0;
 
+	/*
+	 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
+	 * to clean the filter quietly.
+	 */
+	strstrip(buf);
+	if (strlen(buf) == 0)
+		return 1;
+
 	ret = strict_strtol(buf, 10, &val);
 	if (ret < 0)
 		return ret;
 
-	mutex_lock(&ftrace_lock);
-	if (val < 0) {
-		/* disable pid tracing */
-		if (!ftrace_pid_trace)
-			goto out;
-
-		clear_ftrace_pid_task(&ftrace_pid_trace);
-
-	} else {
-		/* swapper task is special */
-		if (!val) {
-			pid = ftrace_swapper_pid;
-			if (pid == ftrace_pid_trace)
-				goto out;
-		} else {
-			pid = find_get_pid(val);
-
-			if (pid == ftrace_pid_trace) {
-				put_pid(pid);
-				goto out;
-			}
-		}
-
-		if (ftrace_pid_trace)
-			clear_ftrace_pid_task(&ftrace_pid_trace);
-
-		if (!pid)
-			goto out;
-
-		ftrace_pid_trace = pid;
-
-		set_ftrace_pid_task(ftrace_pid_trace);
-	}
+	ret = ftrace_pid_add(val);
 
-	/* update the function call */
-	ftrace_update_pid_func();
-	ftrace_startup_enable(0);
+	return ret ? ret : cnt;
+}
 
- out:
-	mutex_unlock(&ftrace_lock);
+static int
+ftrace_pid_release(struct inode *inode, struct file *file)
+{
+	if (file->f_mode & FMODE_READ)
+		seq_release(inode, file);
 
-	return cnt;
+	return 0;
 }
 
 static const struct file_operations ftrace_pid_fops = {
-	.read = ftrace_pid_read,
-	.write = ftrace_pid_write,
+	.open		= ftrace_pid_open,
+	.write		= ftrace_pid_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= ftrace_pid_release,
 };
 
 static __init int ftrace_init_debugfs(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f22a7ac32380..acef8b4636f0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -496,12 +496,12 @@ print_graph_function(struct trace_iterator *iter)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-extern struct pid *ftrace_pid_trace;
+extern struct list_head ftrace_pids;
 
 #ifdef CONFIG_FUNCTION_TRACER
 static inline int ftrace_trace_task(struct task_struct *task)
 {
-	if (!ftrace_pid_trace)
+	if (list_empty(&ftrace_pids))
 		return 1;
 
 	return test_tsk_trace_trace(task);
-- 
cgit v1.2.3-71-gd317


From 5cb084bb1f3fd4dcdaf7e4cf564994346ec8f783 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 13 Oct 2009 16:33:53 -0400
Subject: tracing: Enable records during the module load

I was debuging some module using "function" and "function_graph"
tracers and noticed, that if you load module after you enabled
tracing, the module's hooks will convert only to NOP instructions.

The attached patch enables modules' hooks if there's function trace
allready on, thus allowing to trace module functions.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20091013203425.896285120@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0c799d1af702..aaea9cda8781 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1270,12 +1270,34 @@ static int ftrace_update_code(struct module *mod)
 		ftrace_new_addrs = p->newlist;
 		p->flags = 0L;
 
-		/* convert record (i.e, patch mcount-call with NOP) */
-		if (ftrace_code_disable(mod, p)) {
-			p->flags |= FTRACE_FL_CONVERTED;
-			ftrace_update_cnt++;
-		} else
+		/*
+		 * Do the initial record convertion from mcount jump
+		 * to the NOP instructions.
+		 */
+		if (!ftrace_code_disable(mod, p)) {
 			ftrace_free_rec(p);
+			continue;
+		}
+
+		p->flags |= FTRACE_FL_CONVERTED;
+		ftrace_update_cnt++;
+
+		/*
+		 * If the tracing is enabled, go ahead and enable the record.
+		 *
+		 * The reason not to enable the record immediatelly is the
+		 * inherent check of ftrace_make_nop/ftrace_make_call for
+		 * correct previous instructions.  Making first the NOP
+		 * conversion puts the module to the correct state, thus
+		 * passing the ftrace_make_call check.
+		 */
+		if (ftrace_start_up) {
+			int failed = __ftrace_replace_code(p, 1);
+			if (failed) {
+				ftrace_bug(failed, p->ip);
+				ftrace_free_rec(p);
+			}
+		}
 	}
 
 	stop = ftrace_now(raw_smp_processor_id());
@@ -2609,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 	return 0;
 }
 
-static int ftrace_convert_nops(struct module *mod,
+static int ftrace_process_locs(struct module *mod,
 			       unsigned long *start,
 			       unsigned long *end)
 {
@@ -2669,7 +2691,7 @@ static void ftrace_init_module(struct module *mod,
 {
 	if (ftrace_disabled || start == end)
 		return;
-	ftrace_convert_nops(mod, start, end);
+	ftrace_process_locs(mod, start, end);
 }
 
 static int ftrace_module_notify(struct notifier_block *self,
@@ -2730,7 +2752,7 @@ void __init ftrace_init(void)
 
 	last_ftrace_enabled = ftrace_enabled = 1;
 
-	ret = ftrace_convert_nops(NULL,
+	ret = ftrace_process_locs(NULL,
 				  __start_mcount_loc,
 				  __stop_mcount_loc);
 
-- 
cgit v1.2.3-71-gd317


From c44fc770845163f8d9e573f37f92a7b7a7ade14e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 19 Sep 2009 06:50:42 +0200
Subject: tracing: Move syscalls metadata handling from arch to core

Most of the syscalls metadata processing is done from arch.
But these operations are mostly generic accross archs. Especially now
that we have a common variable name that expresses the number of
syscalls supported by an arch: NR_syscalls, the only remaining bits
that need to reside in arch is the syscall nr to addr translation.

v2: Compare syscalls symbols only after the "sys" prefix so that we
    avoid spurious mismatches with archs that have syscalls wrappers,
    in which case syscalls symbols have "SyS" prefixed aliases.
    (Reported by: Heiko Carstens)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
---
 arch/s390/kernel/ftrace.c     | 67 +--------------------------------
 arch/x86/kernel/ftrace.c      | 76 +-------------------------------------
 include/trace/syscall.h       |  2 +-
 kernel/trace/trace_syscalls.c | 86 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 91 insertions(+), 140 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 57bdcb1e3cdf..7c5752c3423d 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -206,73 +206,10 @@ out:
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
 extern unsigned int sys_call_table[];
 
-static struct syscall_metadata **syscalls_metadata;
-
-struct syscall_metadata *syscall_nr_to_meta(int nr)
-{
-	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
-		return NULL;
-
-	return syscalls_metadata[nr];
-}
-
-int syscall_name_to_nr(char *name)
-{
-	int i;
-
-	if (!syscalls_metadata)
-		return -1;
-	for (i = 0; i < NR_syscalls; i++)
-		if (syscalls_metadata[i])
-			if (!strcmp(syscalls_metadata[i]->name, name))
-				return i;
-	return -1;
-}
-
-void set_syscall_enter_id(int num, int id)
-{
-	syscalls_metadata[num]->enter_id = id;
-}
-
-void set_syscall_exit_id(int num, int id)
+unsigned long __init arch_syscall_addr(int nr)
 {
-	syscalls_metadata[num]->exit_id = id;
-}
-
-static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
-{
-	struct syscall_metadata *start;
-	struct syscall_metadata *stop;
-	char str[KSYM_SYMBOL_LEN];
-
-	start = (struct syscall_metadata *)__start_syscalls_metadata;
-	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
-	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
-
-	for ( ; start < stop; start++) {
-		if (start->name && !strcmp(start->name + 3, str + 3))
-			return start;
-	}
-	return NULL;
-}
-
-static int __init arch_init_ftrace_syscalls(void)
-{
-	struct syscall_metadata *meta;
-	int i;
-	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls,
-				    GFP_KERNEL);
-	if (!syscalls_metadata)
-		return -ENOMEM;
-	for (i = 0; i < NR_syscalls; i++) {
-		meta = find_syscall_meta((unsigned long)sys_call_table[i]);
-		syscalls_metadata[i] = meta;
-	}
-	return 0;
+	return (unsigned long)sys_call_table[nr];
 }
-arch_initcall(arch_init_ftrace_syscalls);
 #endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 25e6f5fc4b1e..5a1b9758fd62 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -470,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
 extern unsigned long *sys_call_table;
 
-static struct syscall_metadata **syscalls_metadata;
-
-static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
-{
-	struct syscall_metadata *start;
-	struct syscall_metadata *stop;
-	char str[KSYM_SYMBOL_LEN];
-
-
-	start = (struct syscall_metadata *)__start_syscalls_metadata;
-	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
-	kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
-
-	for ( ; start < stop; start++) {
-		if (start->name && !strcmp(start->name, str))
-			return start;
-	}
-	return NULL;
-}
-
-struct syscall_metadata *syscall_nr_to_meta(int nr)
-{
-	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
-		return NULL;
-
-	return syscalls_metadata[nr];
-}
-
-int syscall_name_to_nr(char *name)
-{
-	int i;
-
-	if (!syscalls_metadata)
-		return -1;
-
-	for (i = 0; i < NR_syscalls; i++) {
-		if (syscalls_metadata[i]) {
-			if (!strcmp(syscalls_metadata[i]->name, name))
-				return i;
-		}
-	}
-	return -1;
-}
-
-void set_syscall_enter_id(int num, int id)
-{
-	syscalls_metadata[num]->enter_id = id;
-}
-
-void set_syscall_exit_id(int num, int id)
+unsigned long __init arch_syscall_addr(int nr)
 {
-	syscalls_metadata[num]->exit_id = id;
-}
-
-static int __init arch_init_ftrace_syscalls(void)
-{
-	int i;
-	struct syscall_metadata *meta;
-	unsigned long **psys_syscall_table = &sys_call_table;
-
-	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
-					NR_syscalls, GFP_KERNEL);
-	if (!syscalls_metadata) {
-		WARN_ON(1);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < NR_syscalls; i++) {
-		meta = find_syscall_meta(psys_syscall_table[i]);
-		syscalls_metadata[i] = meta;
-	}
-	return 0;
+	return (unsigned long)(&sys_call_table)[nr];
 }
-arch_initcall(arch_init_ftrace_syscalls);
 #endif
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 5dc283ba5ae0..e972f0a40f8d 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -33,7 +33,7 @@ struct syscall_metadata {
 };
 
 #ifdef CONFIG_FTRACE_SYSCALLS
-extern struct syscall_metadata *syscall_nr_to_meta(int nr);
+extern unsigned long arch_syscall_addr(int nr);
 extern int syscall_name_to_nr(char *name);
 void set_syscall_enter_id(int num, int id);
 void set_syscall_exit_id(int num, int id);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9fbce6c9d2e1..8bda4bff2286 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -14,6 +14,69 @@ static int sys_refcount_exit;
 static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
 static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
 
+extern unsigned long __start_syscalls_metadata[];
+extern unsigned long __stop_syscalls_metadata[];
+
+static struct syscall_metadata **syscalls_metadata;
+
+static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
+{
+	struct syscall_metadata *start;
+	struct syscall_metadata *stop;
+	char str[KSYM_SYMBOL_LEN];
+
+
+	start = (struct syscall_metadata *)__start_syscalls_metadata;
+	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
+	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
+
+	for ( ; start < stop; start++) {
+		/*
+		 * Only compare after the "sys" prefix. Archs that use
+		 * syscall wrappers may have syscalls symbols aliases prefixed
+		 * with "SyS" instead of "sys", leading to an unwanted
+		 * mismatch.
+		 */
+		if (start->name && !strcmp(start->name + 3, str + 3))
+			return start;
+	}
+	return NULL;
+}
+
+static struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
+		return NULL;
+
+	return syscalls_metadata[nr];
+}
+
+int syscall_name_to_nr(char *name)
+{
+	int i;
+
+	if (!syscalls_metadata)
+		return -1;
+
+	for (i = 0; i < NR_syscalls; i++) {
+		if (syscalls_metadata[i]) {
+			if (!strcmp(syscalls_metadata[i]->name, name))
+				return i;
+		}
+	}
+	return -1;
+}
+
+void set_syscall_enter_id(int num, int id)
+{
+	syscalls_metadata[num]->enter_id = id;
+}
+
+void set_syscall_exit_id(int num, int id)
+{
+	syscalls_metadata[num]->exit_id = id;
+}
+
 enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags)
 {
@@ -375,6 +438,29 @@ struct trace_event event_syscall_exit = {
 	.trace			= print_syscall_exit,
 };
 
+int __init init_ftrace_syscalls(void)
+{
+	struct syscall_metadata *meta;
+	unsigned long addr;
+	int i;
+
+	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
+					NR_syscalls, GFP_KERNEL);
+	if (!syscalls_metadata) {
+		WARN_ON(1);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < NR_syscalls; i++) {
+		addr = arch_syscall_addr(i);
+		meta = find_syscall_meta(addr);
+		syscalls_metadata[i] = meta;
+	}
+
+	return 0;
+}
+core_initcall(init_ftrace_syscalls);
+
 #ifdef CONFIG_EVENT_PROFILE
 
 static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
-- 
cgit v1.2.3-71-gd317


From e6fe07a014c7a3466dcd1a387a9ac04d84c2703c Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Thu, 6 Aug 2009 13:22:40 -0600
Subject: pm_qos: remove BKL

pm_qos_power_open got its lock_kernel() calls from the open() pushdown.  A
look at the code shows that the only global resources accessed are
pm_qos_array and "name".  pm_qos_array doesn't change (things pointed to
therein do change, but they are atomics and/or are protected by
pm_qos_lock).  Accesses to "name" are totally unprotected with or without
the BKL; that will be fixed shortly.  The BKL is not helpful here; take it
out.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
LKML-Reference: <20091010153349.071381158@linutronix.de>
Acked-by: Mark Gross <mgross@linux.intel.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/pm_qos_params.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..d96b83ed21cb 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
 
 #include <linux/pm_qos_params.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/time.h>
@@ -352,20 +351,15 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
 	int ret;
 	long pm_qos_class;
 
-	lock_kernel();
 	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
 	if (pm_qos_class >= 0) {
 		filp->private_data = (void *)pm_qos_class;
 		sprintf(name, "process_%d", current->pid);
 		ret = pm_qos_add_requirement(pm_qos_class, name,
 					PM_QOS_DEFAULT_VALUE);
-		if (ret >= 0) {
-			unlock_kernel();
+		if (ret >= 0)
 			return 0;
-		}
 	}
-	unlock_kernel();
-
 	return -EPERM;
 }
 
-- 
cgit v1.2.3-71-gd317


From 1a6deaea3584fd7af1cad492b1fe0867060b45db Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Thu, 6 Aug 2009 13:35:44 -0600
Subject: pm_qos: clean up racy global "name" variable

"name" is a poor name for a file-global variable.  It was used in three
different functions, with no mutual exclusion.  But it's just a tiny,
temporary string; let's just move it onto the stack in the functions that
need it.  Also use snprintf() just in case.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
LKML-Reference: <20091010153349.113570550@linutronix.de>
Acked-by: Mark Gross <mgross@linux.intel.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/pm_qos_params.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index d96b83ed21cb..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -343,18 +343,18 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
 }
 EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
 
-#define PID_NAME_LEN sizeof("process_1234567890")
-static char name[PID_NAME_LEN];
+#define PID_NAME_LEN 32
 
 static int pm_qos_power_open(struct inode *inode, struct file *filp)
 {
 	int ret;
 	long pm_qos_class;
+	char name[PID_NAME_LEN];
 
 	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
 	if (pm_qos_class >= 0) {
 		filp->private_data = (void *)pm_qos_class;
-		sprintf(name, "process_%d", current->pid);
+		snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
 		ret = pm_qos_add_requirement(pm_qos_class, name,
 					PM_QOS_DEFAULT_VALUE);
 		if (ret >= 0)
@@ -366,9 +366,10 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
 static int pm_qos_power_release(struct inode *inode, struct file *filp)
 {
 	int pm_qos_class;
+	char name[PID_NAME_LEN];
 
 	pm_qos_class = (long)filp->private_data;
-	sprintf(name, "process_%d", current->pid);
+	snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
 	pm_qos_remove_requirement(pm_qos_class, name);
 
 	return 0;
@@ -379,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
 {
 	s32 value;
 	int pm_qos_class;
+	char name[PID_NAME_LEN];
 
 	pm_qos_class = (long)filp->private_data;
 	if (count != sizeof(s32))
 		return -EINVAL;
 	if (copy_from_user(&value, buf, sizeof(s32)))
 		return -EFAULT;
-	sprintf(name, "process_%d", current->pid);
+	snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
 	pm_qos_update_requirement(pm_qos_class, name, value);
 
 	return  sizeof(s32);
-- 
cgit v1.2.3-71-gd317


From 6f15fa50087c8317e353145319466afbeb27a75d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 9 Oct 2009 20:31:33 +0200
Subject: sys: Remove BKL from sys_reboot

Serialization of sys_reboot can be done local. The BKL is not
protecting anything else.

LKML-Reference: <20091010153349.405590702@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sys.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d163e0..22ea9553c3bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/utsname.h>
 #include <linux/mman.h>
-#include <linux/smp_lock.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
@@ -349,6 +348,9 @@ void kernel_power_off(void)
 	machine_power_off();
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
+
+static DEFINE_MUTEX(reboot_mutex);
+
 /*
  * Reboot system call: for obvious reasons only root may call it,
  * and even root needs to set up some magic numbers in the registers
@@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
 		cmd = LINUX_REBOOT_CMD_HALT;
 
-	lock_kernel();
+	mutex_lock(&reboot_mutex);
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
 		kernel_restart(NULL);
@@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 
 	case LINUX_REBOOT_CMD_HALT:
 		kernel_halt();
-		unlock_kernel();
 		do_exit(0);
 		panic("cannot halt");
 
 	case LINUX_REBOOT_CMD_POWER_OFF:
 		kernel_power_off();
-		unlock_kernel();
 		do_exit(0);
 		break;
 
 	case LINUX_REBOOT_CMD_RESTART2:
 		if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
-			unlock_kernel();
-			return -EFAULT;
+			ret = -EFAULT;
+			break;
 		}
 		buffer[sizeof(buffer) - 1] = '\0';
 
@@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		ret = -EINVAL;
 		break;
 	}
-	unlock_kernel();
+	mutex_unlock(&reboot_mutex);
 	return ret;
 }
 
-- 
cgit v1.2.3-71-gd317


From 06f43d66ec36388056f5c697bf1e67c0e0a1645c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Oct 2009 20:43:39 +0200
Subject: ftrace: Copy ftrace_graph_filter boot param using strlcpy

We are using strncpy in the wrong way to copy the ftrace_graph_filter
boot param because we pass the buffer size instead of the max string
size it can contain (buffer size - 1). The end result might not be
NULL terminated as we are abusing the max string size.

Lets use strlcpy() instead.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index aaea9cda8781..b10c0d90a6ff 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2293,7 +2293,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static int __init set_graph_function(char *str)
 {
-	strncpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_graph_filter=", set_graph_function);
-- 
cgit v1.2.3-71-gd317


From 1beee96bae0daf7f491356777c3080cc436950f5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Oct 2009 20:50:32 +0200
Subject: ftrace: Rename set_bootup_ftrace into set_cmdline_ftrace

set_cmdline_ftrace is a better match against what does this function:
apply a tracer name from the kernel command line.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4311ec3062f0..026e715a0c7a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf);
 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
 static char *default_bootup_tracer;
 
-static int __init set_bootup_ftrace(char *str)
+static int __init set_cmdline_ftrace(char *str)
 {
 	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +137,7 @@ static int __init set_bootup_ftrace(char *str)
 	ring_buffer_expanded = 1;
 	return 1;
 }
-__setup("ftrace=", set_bootup_ftrace);
+__setup("ftrace=", set_cmdline_ftrace);
 
 static int __init set_ftrace_dump_on_oops(char *str)
 {
-- 
cgit v1.2.3-71-gd317


From 3397e040dfacbb303498ced1baa96be983dcea06 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 14 Oct 2009 16:36:38 -0700
Subject: rcu: Add rnp->blocked_tasks to tracing

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: npiggin@suse.de
Cc: jens.axboe@oracle.com
Cc: Josh Triplett <josh@joshtriplett.org>
LKML-Reference: <20091014233638.GE6763@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
 kernel/rcutree_trace.c |    8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
---
 kernel/rcutree_trace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b31c779e62e..1984cdc51e9a 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -155,12 +155,14 @@ static const struct file_operations rcudata_csv_fops = {
 
 static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 {
+	long gpnum;
 	int level = 0;
 	struct rcu_node *rnp;
 
+	gpnum = rsp->gpnum;
 	seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
 		      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
-		   rsp->completed, rsp->gpnum, rsp->signaled,
+		   rsp->completed, gpnum, rsp->signaled,
 		   (long)(rsp->jiffies_force_qs - jiffies),
 		   (int)(jiffies & 0xffff),
 		   rsp->n_force_qs, rsp->n_force_qs_ngp,
@@ -171,8 +173,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 			seq_puts(m, "\n");
 			level = rnp->level;
 		}
-		seq_printf(m, "%lx/%lx %d:%d ^%d    ",
+		seq_printf(m, "%lx/%lx %c>%c %d:%d ^%d    ",
 			   rnp->qsmask, rnp->qsmaskinit,
+			   "T."[list_empty(&rnp->blocked_tasks[gpnum & 1])],
+			   "T."[list_empty(&rnp->blocked_tasks[!(gpnum & 1)])],
 			   rnp->grplo, rnp->grphi, rnp->grpnum);
 	}
 	seq_puts(m, "\n");
-- 
cgit v1.2.3-71-gd317


From fce29d15b59245597f7f320db4a9f2be0f5fb512 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 15 Oct 2009 11:20:34 +0800
Subject: tracing/filters: Refactor subsystem filter code

Change:
	for_each_pred
		for_each_subsystem
To:
	for_each_subsystem
		for_each_pred

This change also prepares for later patches.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4AD69502.8060903@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |   1 -
 kernel/trace/trace_events_filter.c | 124 ++++++++++++++-----------------------
 2 files changed, 45 insertions(+), 80 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index acef8b4636f0..628614532d16 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -683,7 +683,6 @@ struct event_filter {
 	int			n_preds;
 	struct filter_pred	**preds;
 	char			*filter_string;
-	bool			no_reset;
 };
 
 struct event_subsystem {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 92672016da28..9c4f9a0dae2b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -615,14 +615,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
 	return 0;
 }
 
-enum {
-	FILTER_DISABLE_ALL,
-	FILTER_INIT_NO_RESET,
-	FILTER_SKIP_NO_RESET,
-};
-
-static void filter_free_subsystem_preds(struct event_subsystem *system,
-					int flag)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
 {
 	struct ftrace_event_call *call;
 
@@ -633,14 +626,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
 		if (strcmp(call->system, system->name) != 0)
 			continue;
 
-		if (flag == FILTER_INIT_NO_RESET) {
-			call->filter->no_reset = false;
-			continue;
-		}
-
-		if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
-			continue;
-
 		filter_disable_preds(call);
 		remove_filter_string(call->filter);
 	}
@@ -817,44 +802,6 @@ add_pred_fn:
 	return 0;
 }
 
-static int filter_add_subsystem_pred(struct filter_parse_state *ps,
-				     struct event_subsystem *system,
-				     struct filter_pred *pred,
-				     char *filter_string,
-				     bool dry_run)
-{
-	struct ftrace_event_call *call;
-	int err = 0;
-	bool fail = true;
-
-	list_for_each_entry(call, &ftrace_events, list) {
-
-		if (!call->define_fields)
-			continue;
-
-		if (strcmp(call->system, system->name))
-			continue;
-
-		if (call->filter->no_reset)
-			continue;
-
-		err = filter_add_pred(ps, call, pred, dry_run);
-		if (err)
-			call->filter->no_reset = true;
-		else
-			fail = false;
-
-		if (!dry_run)
-			replace_filter_string(call->filter, filter_string);
-	}
-
-	if (fail) {
-		parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-		return err;
-	}
-	return 0;
-}
-
 static void parse_init(struct filter_parse_state *ps,
 		       struct filter_op *ops,
 		       char *infix_string)
@@ -1209,8 +1156,7 @@ static int check_preds(struct filter_parse_state *ps)
 	return 0;
 }
 
-static int replace_preds(struct event_subsystem *system,
-			 struct ftrace_event_call *call,
+static int replace_preds(struct ftrace_event_call *call,
 			 struct filter_parse_state *ps,
 			 char *filter_string,
 			 bool dry_run)
@@ -1257,11 +1203,7 @@ static int replace_preds(struct event_subsystem *system,
 add_pred:
 		if (!pred)
 			return -ENOMEM;
-		if (call)
-			err = filter_add_pred(ps, call, pred, false);
-		else
-			err = filter_add_subsystem_pred(ps, system, pred,
-						filter_string, dry_run);
+		err = filter_add_pred(ps, call, pred, dry_run);
 		filter_free_pred(pred);
 		if (err)
 			return err;
@@ -1272,6 +1214,44 @@ add_pred:
 	return 0;
 }
 
+static int replace_system_preds(struct event_subsystem *system,
+				struct filter_parse_state *ps,
+				char *filter_string)
+{
+	struct ftrace_event_call *call;
+	int err;
+	bool fail = true;
+
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		if (!call->define_fields)
+			continue;
+
+		if (strcmp(call->system, system->name) != 0)
+			continue;
+
+		/* try to see if the filter can be applied */
+		err = replace_preds(call, ps, filter_string, true);
+		if (err)
+			continue;
+
+		/* really apply the filter */
+		filter_disable_preds(call);
+		err = replace_preds(call, ps, filter_string, false);
+		if (err)
+			filter_disable_preds(call);
+		else
+			replace_filter_string(call->filter, filter_string);
+		fail = false;
+	}
+
+	if (fail) {
+		parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+		return err;
+	}
+	return 0;
+}
+
 int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 {
 	int err;
@@ -1306,7 +1286,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 		goto out;
 	}
 
-	err = replace_preds(NULL, call, ps, filter_string, false);
+	err = replace_preds(call, ps, filter_string, false);
 	if (err)
 		append_filter_err(ps, call->filter);
 
@@ -1334,7 +1314,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 		goto out_unlock;
 
 	if (!strcmp(strstrip(filter_string), "0")) {
-		filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
+		filter_free_subsystem_preds(system);
 		remove_filter_string(system->filter);
 		mutex_unlock(&event_mutex);
 		return 0;
@@ -1354,23 +1334,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 		goto out;
 	}
 
-	filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
-
-	/* try to see the filter can be applied to which events */
-	err = replace_preds(system, NULL, ps, filter_string, true);
-	if (err) {
-		append_filter_err(ps, system->filter);
-		goto out;
-	}
-
-	filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
-
-	/* really apply the filter to the events */
-	err = replace_preds(system, NULL, ps, filter_string, false);
-	if (err) {
+	err = replace_system_preds(system, ps, filter_string);
+	if (err)
 		append_filter_err(ps, system->filter);
-		filter_free_subsystem_preds(system, 2);
-	}
 
 out:
 	filter_opstack_clear(ps);
-- 
cgit v1.2.3-71-gd317


From b0f1a59a98d7ac2102e7e4f22904c26d564a5628 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 15 Oct 2009 11:21:12 +0800
Subject: tracing/filters: Use a different op for glob match

"==" will always do a full match, and "~" will do a glob match.

In the future, we may add "=~" for regex match.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4AD69528.3050309@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |  2 +-
 kernel/trace/trace_events_filter.c | 59 ++++++++++++++++++--------------------
 2 files changed, 29 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 628614532d16..ffe53ddbe67a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -702,7 +702,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
 typedef int (*regex_match_func)(char *str, struct regex *r, int len);
 
 enum regex_type {
-	MATCH_FULL,
+	MATCH_FULL = 0,
 	MATCH_FRONT_ONLY,
 	MATCH_MIDDLE_ONLY,
 	MATCH_END_ONLY,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9c4f9a0dae2b..273845fce393 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -29,6 +29,7 @@ enum filter_op_ids
 {
 	OP_OR,
 	OP_AND,
+	OP_GLOB,
 	OP_NE,
 	OP_EQ,
 	OP_LT,
@@ -46,16 +47,17 @@ struct filter_op {
 };
 
 static struct filter_op filter_ops[] = {
-	{ OP_OR, "||", 1 },
-	{ OP_AND, "&&", 2 },
-	{ OP_NE, "!=", 4 },
-	{ OP_EQ, "==", 4 },
-	{ OP_LT, "<", 5 },
-	{ OP_LE, "<=", 5 },
-	{ OP_GT, ">", 5 },
-	{ OP_GE, ">=", 5 },
-	{ OP_NONE, "OP_NONE", 0 },
-	{ OP_OPEN_PAREN, "(", 0 },
+	{ OP_OR,	"||",		1 },
+	{ OP_AND,	"&&",		2 },
+	{ OP_GLOB,	"~",		4 },
+	{ OP_NE,	"!=",		4 },
+	{ OP_EQ,	"==",		4 },
+	{ OP_LT,	"<",		5 },
+	{ OP_LE,	"<=",		5 },
+	{ OP_GT,	">",		5 },
+	{ OP_GE,	">=",		5 },
+	{ OP_NONE,	"OP_NONE",	0 },
+	{ OP_OPEN_PAREN, "(",		0 },
 };
 
 enum {
@@ -329,22 +331,18 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
 	return type;
 }
 
-static int filter_build_regex(struct filter_pred *pred)
+static void filter_build_regex(struct filter_pred *pred)
 {
 	struct regex *r = &pred->regex;
-	char *search, *dup;
-	enum regex_type type;
-	int not;
-
-	type = filter_parse_regex(r->pattern, r->len, &search, &not);
-	dup = kstrdup(search, GFP_KERNEL);
-	if (!dup)
-		return -ENOMEM;
-
-	strcpy(r->pattern, dup);
-	kfree(dup);
-
-	r->len = strlen(r->pattern);
+	char *search;
+	enum regex_type type = MATCH_FULL;
+	int not = 0;
+
+	if (pred->op == OP_GLOB) {
+		type = filter_parse_regex(r->pattern, r->len, &search, &not);
+		r->len = strlen(search);
+		memmove(r->pattern, search, r->len+1);
+	}
 
 	switch (type) {
 	case MATCH_FULL:
@@ -362,8 +360,6 @@ static int filter_build_regex(struct filter_pred *pred)
 	}
 
 	pred->not ^= not;
-
-	return 0;
 }
 
 /* return 1 if event matches, 0 otherwise (discard) */
@@ -676,7 +672,10 @@ static bool is_string_field(struct ftrace_event_field *field)
 
 static int is_legal_op(struct ftrace_event_field *field, int op)
 {
-	if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
+	if (is_string_field(field) &&
+	    (op != OP_EQ && op != OP_NE && op != OP_GLOB))
+		return 0;
+	if (!is_string_field(field) && op == OP_GLOB)
 		return 0;
 
 	return 1;
@@ -761,15 +760,13 @@ static int filter_add_pred(struct filter_parse_state *ps,
 	}
 
 	if (is_string_field(field)) {
-		ret = filter_build_regex(pred);
-		if (ret)
-			return ret;
+		filter_build_regex(pred);
 
 		if (field->filter_type == FILTER_STATIC_STRING) {
 			fn = filter_pred_string;
 			pred->regex.field_len = field->size;
 		} else if (field->filter_type == FILTER_DYN_STRING)
-				fn = filter_pred_strloc;
+			fn = filter_pred_strloc;
 		else {
 			fn = filter_pred_pchar;
 			pred->regex.field_len = strlen(pred->regex.pattern);
-- 
cgit v1.2.3-71-gd317


From 6fb2915df7f0747d9044da9dbff5b46dc2e20830 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 15 Oct 2009 11:21:42 +0800
Subject: tracing/profile: Add filter support

- Add an ioctl to allocate a filter for a perf event.

- Free the filter when the associated perf event is to be freed.

- Do the filtering in perf_swevent_match().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4AD69546.8050401@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  11 ++-
 include/linux/perf_counter.h       |   1 +
 include/linux/perf_event.h         |   6 ++
 kernel/perf_event.c                |  80 ++++++++++++++++++++--
 kernel/trace/trace.h               |   3 +-
 kernel/trace/trace_events_filter.c | 133 +++++++++++++++++++++++++++++--------
 6 files changed, 199 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4ec5e67e18cf..d11770472bc8 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -144,7 +144,7 @@ extern char			*trace_profile_buf_nmi;
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 
 extern void destroy_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_match_preds(struct event_filter *filter, void *rec);
 extern int filter_current_check_discard(struct ring_buffer *buffer,
 					struct ftrace_event_call *call,
 					void *rec,
@@ -186,4 +186,13 @@ do {									\
 		__trace_printk(ip, fmt, ##args);			\
 } while (0)
 
+#ifdef CONFIG_EVENT_PROFILE
+struct perf_event;
+extern int ftrace_profile_enable(int event_id);
+extern void ftrace_profile_disable(int event_id);
+extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+				     char *filter_str);
+extern void ftrace_profile_free_filter(struct perf_event *event);
+#endif
+
 #endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7b7fbf433cff..91a2b4309e7a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -225,6 +225,7 @@ struct perf_counter_attr {
 #define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
 #define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, u64)
 #define PERF_COUNTER_IOC_SET_OUTPUT	_IO ('$', 5)
+#define PERF_COUNTER_IOC_SET_FILTER	_IOW('$', 6, char *)
 
 enum perf_counter_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2e6d95f97419..df9d964c15fc 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -221,6 +221,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_RESET		_IO ('$', 3)
 #define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, u64)
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
+#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
@@ -633,7 +634,12 @@ struct perf_event {
 
 	struct pid_namespace		*ns;
 	u64				id;
+
+#ifdef CONFIG_EVENT_PROFILE
+	struct event_filter		*filter;
 #endif
+
+#endif /* CONFIG_PERF_EVENTS */
 };
 
 /**
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9d0b5c665883..12b5ec39bf97 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -28,6 +28,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
+#include <linux/ftrace_event.h>
 
 #include <asm/irq_regs.h>
 
@@ -1658,6 +1659,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
 	return ERR_PTR(err);
 }
 
+static void perf_event_free_filter(struct perf_event *event);
+
 static void free_event_rcu(struct rcu_head *head)
 {
 	struct perf_event *event;
@@ -1665,6 +1668,7 @@ static void free_event_rcu(struct rcu_head *head)
 	event = container_of(head, struct perf_event, rcu_head);
 	if (event->ns)
 		put_pid_ns(event->ns);
+	perf_event_free_filter(event);
 	kfree(event);
 }
 
@@ -1974,7 +1978,8 @@ unlock:
 	return ret;
 }
 
-int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -2002,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_EVENT_IOC_SET_OUTPUT:
 		return perf_event_set_output(event, arg);
 
+	case PERF_EVENT_IOC_SET_FILTER:
+		return perf_event_set_filter(event, (void __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -3806,9 +3814,14 @@ static int perf_swevent_is_counting(struct perf_event *event)
 	return 1;
 }
 
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data);
+
 static int perf_swevent_match(struct perf_event *event,
 				enum perf_type_id type,
-				u32 event_id, struct pt_regs *regs)
+				u32 event_id,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
 {
 	if (!perf_swevent_is_counting(event))
 		return 0;
@@ -3826,6 +3839,10 @@ static int perf_swevent_match(struct perf_event *event,
 			return 0;
 	}
 
+	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
+	    !perf_tp_event_match(event, data))
+		return 0;
+
 	return 1;
 }
 
@@ -3842,7 +3859,7 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (perf_swevent_match(event, type, event_id, regs))
+		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
 	rcu_read_unlock();
@@ -4086,6 +4103,7 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
+
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 			  int entry_size)
 {
@@ -4109,8 +4127,15 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-extern int ftrace_profile_enable(int);
-extern void ftrace_profile_disable(int);
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	void *record = data->raw->data;
+
+	if (likely(!event->filter) || filter_match_preds(event->filter, record))
+		return 1;
+	return 0;
+}
 
 static void tp_perf_event_destroy(struct perf_event *event)
 {
@@ -4135,12 +4160,53 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 
 	return &perf_ops_generic;
 }
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+	char *filter_str;
+	int ret;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	filter_str = strndup_user(arg, PAGE_SIZE);
+	if (IS_ERR(filter_str))
+		return PTR_ERR(filter_str);
+
+	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+	kfree(filter_str);
+	return ret;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+	ftrace_profile_free_filter(event);
+}
+
 #else
+
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	return 1;
+}
+
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
 	return NULL;
 }
-#endif
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+#endif /* CONFIG_EVENT_PROFILE */
 
 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
@@ -4394,7 +4460,7 @@ err_size:
 	goto out;
 }
 
-int perf_event_set_output(struct perf_event *event, int output_fd)
+static int perf_event_set_output(struct perf_event *event, int output_fd)
 {
 	struct perf_event *output_event = NULL;
 	struct file *output_file = NULL;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ffe53ddbe67a..4959ada9e0bb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -743,7 +743,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+	if (unlikely(call->filter_active) &&
+	    !filter_match_preds(call->filter, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
 	}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 273845fce393..e27bb6acc2dd 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/mutex.h>
+#include <linux/perf_event.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -363,9 +364,8 @@ static void filter_build_regex(struct filter_pred *pred)
 }
 
 /* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+int filter_match_preds(struct event_filter *filter, void *rec)
 {
-	struct event_filter *filter = call->filter;
 	int match, top = 0, val1 = 0, val2 = 0;
 	int stack[MAX_FILTER_PRED];
 	struct filter_pred *pred;
@@ -538,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
 		filter->preds[i]->fn = filter_pred_none;
 }
 
-void destroy_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
 {
-	struct event_filter *filter = call->filter;
 	int i;
 
 	if (!filter)
@@ -553,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
 	kfree(filter->preds);
 	kfree(filter->filter_string);
 	kfree(filter);
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+	__free_preds(call->filter);
 	call->filter = NULL;
+	call->filter_active = 0;
 }
 
-static int init_preds(struct ftrace_event_call *call)
+static struct event_filter *__alloc_preds(void)
 {
 	struct event_filter *filter;
 	struct filter_pred *pred;
 	int i;
 
-	if (call->filter)
-		return 0;
-
-	filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-	if (!call->filter)
-		return -ENOMEM;
+	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!filter)
+		return ERR_PTR(-ENOMEM);
 
 	filter->n_preds = 0;
 
@@ -583,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
 		filter->preds[i] = pred;
 	}
 
-	return 0;
+	return filter;
 
 oom:
-	destroy_preds(call);
+	__free_preds(filter);
+	return ERR_PTR(-ENOMEM);
+}
+
+static int init_preds(struct ftrace_event_call *call)
+{
+	if (call->filter)
+		return 0;
+
+	call->filter_active = 0;
+	call->filter = __alloc_preds();
+	if (IS_ERR(call->filter))
+		return PTR_ERR(call->filter);
 
-	return -ENOMEM;
+	return 0;
 }
 
 static int init_subsystem_preds(struct event_subsystem *system)
@@ -629,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 
 static int filter_add_pred_fn(struct filter_parse_state *ps,
 			      struct ftrace_event_call *call,
+			      struct event_filter *filter,
 			      struct filter_pred *pred,
 			      filter_pred_fn_t fn)
 {
-	struct event_filter *filter = call->filter;
 	int idx, err;
 
 	if (filter->n_preds == MAX_FILTER_PRED) {
@@ -647,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 		return err;
 
 	filter->n_preds++;
-	call->filter_active = 1;
 
 	return 0;
 }
@@ -726,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
 
 static int filter_add_pred(struct filter_parse_state *ps,
 			   struct ftrace_event_call *call,
+			   struct event_filter *filter,
 			   struct filter_pred *pred,
 			   bool dry_run)
 {
@@ -795,7 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 
 add_pred_fn:
 	if (!dry_run)
-		return filter_add_pred_fn(ps, call, pred, fn);
+		return filter_add_pred_fn(ps, call, filter, pred, fn);
 	return 0;
 }
 
@@ -1154,6 +1168,7 @@ static int check_preds(struct filter_parse_state *ps)
 }
 
 static int replace_preds(struct ftrace_event_call *call,
+			 struct event_filter *filter,
 			 struct filter_parse_state *ps,
 			 char *filter_string,
 			 bool dry_run)
@@ -1200,7 +1215,7 @@ static int replace_preds(struct ftrace_event_call *call,
 add_pred:
 		if (!pred)
 			return -ENOMEM;
-		err = filter_add_pred(ps, call, pred, dry_run);
+		err = filter_add_pred(ps, call, filter, pred, dry_run);
 		filter_free_pred(pred);
 		if (err)
 			return err;
@@ -1216,6 +1231,7 @@ static int replace_system_preds(struct event_subsystem *system,
 				char *filter_string)
 {
 	struct ftrace_event_call *call;
+	struct event_filter *filter;
 	int err;
 	bool fail = true;
 
@@ -1228,17 +1244,19 @@ static int replace_system_preds(struct event_subsystem *system,
 			continue;
 
 		/* try to see if the filter can be applied */
-		err = replace_preds(call, ps, filter_string, true);
+		err = replace_preds(call, filter, ps, filter_string, true);
 		if (err)
 			continue;
 
 		/* really apply the filter */
 		filter_disable_preds(call);
-		err = replace_preds(call, ps, filter_string, false);
+		err = replace_preds(call, filter, ps, filter_string, false);
 		if (err)
 			filter_disable_preds(call);
-		else
-			replace_filter_string(call->filter, filter_string);
+		else {
+			call->filter_active = 1;
+			replace_filter_string(filter, filter_string);
+		}
 		fail = false;
 	}
 
@@ -1252,7 +1270,6 @@ static int replace_system_preds(struct event_subsystem *system,
 int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 {
 	int err;
-
 	struct filter_parse_state *ps;
 
 	mutex_lock(&event_mutex);
@@ -1283,10 +1300,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 		goto out;
 	}
 
-	err = replace_preds(call, ps, filter_string, false);
+	err = replace_preds(call, call->filter, ps, filter_string, false);
 	if (err)
 		append_filter_err(ps, call->filter);
-
+	else
+		call->filter_active = 1;
 out:
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
@@ -1301,7 +1319,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 				 char *filter_string)
 {
 	int err;
-
 	struct filter_parse_state *ps;
 
 	mutex_lock(&event_mutex);
@@ -1345,3 +1362,67 @@ out_unlock:
 	return err;
 }
 
+#ifdef CONFIG_EVENT_PROFILE
+
+void ftrace_profile_free_filter(struct perf_event *event)
+{
+	struct event_filter *filter = event->filter;
+
+	event->filter = NULL;
+	__free_preds(filter);
+}
+
+int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+			      char *filter_str)
+{
+	int err;
+	struct event_filter *filter;
+	struct filter_parse_state *ps;
+	struct ftrace_event_call *call = NULL;
+
+	mutex_lock(&event_mutex);
+
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (call->id == event_id)
+			break;
+	}
+	if (!call)
+		return -EINVAL;
+
+	if (event->filter)
+		return -EEXIST;
+
+	filter = __alloc_preds();
+	if (IS_ERR(filter))
+		return PTR_ERR(filter);
+
+	err = -ENOMEM;
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		goto free_preds;
+
+	parse_init(ps, filter_ops, filter_str);
+	err = filter_parse(ps);
+	if (err)
+		goto free_ps;
+
+	err = replace_preds(call, filter, ps, filter_str, false);
+	if (!err)
+		event->filter = filter;
+
+free_ps:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+free_preds:
+	if (err)
+		__free_preds(filter);
+
+	mutex_unlock(&event_mutex);
+
+	return err;
+}
+
+#endif /* CONFIG_EVENT_PROFILE */
+
-- 
cgit v1.2.3-71-gd317


From a66abe7fbf7805a1a02f241bd5283265ff6706ec Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 15 Oct 2009 12:24:04 +0200
Subject: tracing/events: Fix locking imbalance in the filter code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Américo Wang noticed that we have a locking imbalance in the
error paths of ftrace_profile_set_filter(), causing potential
leakage of event_mutex.

Also clean up other error codepaths related to event_mutex
while at it.

Plus fix an initialized variable in the subsystem filter code.

Reported-by: Américo Wang <xiyou.wangcong@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <2375c9f90910150247u5ccb8e2at58c764e385ffa490@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e27bb6acc2dd..21d34757b955 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1230,10 +1230,10 @@ static int replace_system_preds(struct event_subsystem *system,
 				struct filter_parse_state *ps,
 				char *filter_string)
 {
+	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
-	struct event_filter *filter;
-	int err;
 	bool fail = true;
+	int err;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 
@@ -1262,7 +1262,7 @@ static int replace_system_preds(struct event_subsystem *system,
 
 	if (fail) {
 		parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-		return err;
+		return -EINVAL;
 	}
 	return 0;
 }
@@ -1281,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 	if (!strcmp(strstrip(filter_string), "0")) {
 		filter_disable_preds(call);
 		remove_filter_string(call->filter);
-		mutex_unlock(&event_mutex);
-		return 0;
+		goto out_unlock;
 	}
 
 	err = -ENOMEM;
@@ -1330,8 +1329,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 	if (!strcmp(strstrip(filter_string), "0")) {
 		filter_free_subsystem_preds(system);
 		remove_filter_string(system->filter);
-		mutex_unlock(&event_mutex);
-		return 0;
+		goto out_unlock;
 	}
 
 	err = -ENOMEM;
@@ -1386,15 +1384,20 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 		if (call->id == event_id)
 			break;
 	}
+
+	err = -EINVAL;
 	if (!call)
-		return -EINVAL;
+		goto out_unlock;
 
+	err = -EEXIST;
 	if (event->filter)
-		return -EEXIST;
+		goto out_unlock;
 
 	filter = __alloc_preds();
-	if (IS_ERR(filter))
-		return PTR_ERR(filter);
+	if (IS_ERR(filter)) {
+		err = PTR_ERR(filter);
+		goto out_unlock;
+	}
 
 	err = -ENOMEM;
 	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
@@ -1419,6 +1422,7 @@ free_preds:
 	if (err)
 		__free_preds(filter);
 
+out_unlock:
 	mutex_unlock(&event_mutex);
 
 	return err;
-- 
cgit v1.2.3-71-gd317


From f397af06e4c9bf5a0bc92facb8cb29905e338ab0 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 16 Oct 2009 20:07:20 -0400
Subject: tracing/kprobes: Update kprobe-tracer selftest against new syntax

Update kprobe-tracer selftest since command syntax has been
changed.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20091017000720.16556.26343.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 739f70e8e924..cdacdab10020 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1438,12 +1438,12 @@ static __init int kprobe_trace_self_tests_init(void)
 	pr_info("Testing kprobe tracing: ");
 
 	ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
-				  "a1 a2 a3 a4 a5 a6");
+				  "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
 	if (WARN_ON_ONCE(ret))
 		pr_warning("error enabling function entry\n");
 
 	ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
-				  "ra rv");
+				  "$retval");
 	if (WARN_ON_ONCE(ret))
 		pr_warning("error enabling function return\n");
 
-- 
cgit v1.2.3-71-gd317


From e63cc2397ecc0f2b604f22fb9cdbb05911c1e5d4 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 16 Oct 2009 20:07:28 -0400
Subject: tracing/kprobes: Add failure messages for debugging

Add verbose failure messages to kprobe-tracer for debugging.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20091017000728.16556.16713.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index cdacdab10020..b8ef707c84d7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -597,15 +597,19 @@ static int create_trace_probe(int argc, char **argv)
 	void *addr = NULL;
 	char buf[MAX_EVENT_NAME_LEN];
 
-	if (argc < 2)
+	if (argc < 2) {
+		pr_info("Probe point is not specified.\n");
 		return -EINVAL;
+	}
 
 	if (argv[0][0] == 'p')
 		is_return = 0;
 	else if (argv[0][0] == 'r')
 		is_return = 1;
-	else
+	else {
+		pr_info("Probe definition must be started with 'p' or 'r'.\n");
 		return -EINVAL;
+	}
 
 	if (argv[0][1] == ':') {
 		event = &argv[0][2];
@@ -625,21 +629,29 @@ static int create_trace_probe(int argc, char **argv)
 	}
 
 	if (isdigit(argv[1][0])) {
-		if (is_return)
+		if (is_return) {
+			pr_info("Return probe point must be a symbol.\n");
 			return -EINVAL;
+		}
 		/* an address specified */
 		ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
-		if (ret)
+		if (ret) {
+			pr_info("Failed to parse address.\n");
 			return ret;
+		}
 	} else {
 		/* a symbol specified */
 		symbol = argv[1];
 		/* TODO: support .init module functions */
 		ret = split_symbol_offset(symbol, &offset);
-		if (ret)
+		if (ret) {
+			pr_info("Failed to parse symbol.\n");
 			return ret;
-		if (offset && is_return)
+		}
+		if (offset && is_return) {
+			pr_info("Return probe must be used without offset.\n");
 			return -EINVAL;
+		}
 	}
 	argc -= 2; argv += 2;
 
@@ -658,8 +670,11 @@ static int create_trace_probe(int argc, char **argv)
 	}
 	tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
 			       is_return);
-	if (IS_ERR(tp))
+	if (IS_ERR(tp)) {
+		pr_info("Failed to allocate trace_probe.(%d)\n",
+			(int)PTR_ERR(tp));
 		return PTR_ERR(tp);
+	}
 
 	/* parse arguments */
 	ret = 0;
@@ -672,6 +687,8 @@ static int create_trace_probe(int argc, char **argv)
 			arg = argv[i];
 
 		if (conflict_field_name(argv[i], tp->args, i)) {
+			pr_info("Argument%d name '%s' conflicts with "
+				"another field.\n", i, argv[i]);
 			ret = -EINVAL;
 			goto error;
 		}
@@ -685,8 +702,10 @@ static int create_trace_probe(int argc, char **argv)
 			goto error;
 		}
 		ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
-		if (ret)
+		if (ret) {
+			pr_info("Parse error at argument%d. (%d)\n", i, ret);
 			goto error;
+		}
 	}
 	tp->nr_args = i;
 
-- 
cgit v1.2.3-71-gd317


From 72f279b256d520e321a850880d094bc0bcbf45d6 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 22 Oct 2009 19:19:34 +0800
Subject: generic-ipi: Fix misleading smp_call_function*() description

After commit:8969a5ede0f9e17da4b943712429aef2c9bcd82b
"generic-ipi: remove kmalloc()", wait = 0 can be guaranteed.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Nick Piggin <npiggin@suse.de>
LKML-Reference: <1256210374-25354-1-git-send-email-sheng@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index c9d1c7835c2f..8bd618f0364d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -265,9 +265,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
  * @info: An arbitrary pointer to pass to the function.
  * @wait: If true, wait until function has completed on other CPUs.
  *
- * Returns 0 on success, else a negative status code. Note that @wait
- * will be implicitly turned on in case of allocation failures, since
- * we fall back to on-stack allocation.
+ * Returns 0 on success, else a negative status code.
  */
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 			     int wait)
@@ -355,9 +353,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
  * @wait: If true, wait (atomically) until function has completed
  *        on other CPUs.
  *
- * If @wait is true, then returns once @func has returned. Note that @wait
- * will be implicitly turned on in case of allocation failures, since
- * we fall back to on-stack allocation.
+ * If @wait is true, then returns once @func has returned.
  *
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler. Preemption
@@ -443,8 +439,7 @@ EXPORT_SYMBOL(smp_call_function_many);
  * Returns 0.
  *
  * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func. In case of allocation
- * failure, @wait will be implicitly turned on.
+ * it returns just before the target cpu calls @func.
  *
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
-- 
cgit v1.2.3-71-gd317


From 5c828713358cb9df8aa174371edcbbb62203a490 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 23 Oct 2009 14:58:11 +0200
Subject: ratelimit: Make suppressed output messages more useful

Today I got:

  [39648.224782] Registered led device: iwl-phy0::TX
  [40676.545099] __ratelimit: 246 callbacks suppressed
  [40676.545103] abcdef[23675]: segfault at 0 ...

as you can see the ratelimit message contains a function prefix.
Since this is always __ratelimit, this wont help much.

This patch changes __ratelimit and printk_ratelimit to print the
function name that calls ratelimit.

This will pinpoint the responsible function, as long as not several
different places call ratelimit with the same ratelimit state at
the same time. In that case we catch only one random function that
calls ratelimit after the wait period.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Young <hidave.darkstar@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
CC: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <200910231458.11832.borntraeger@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h    | 3 ++-
 include/linux/ratelimit.h | 3 ++-
 kernel/printk.c           | 6 +++---
 lib/ratelimit.c           | 6 +++---
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3305f33201be..21d0d822c716 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -240,7 +240,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 asmlinkage int printk(const char * fmt, ...)
 	__attribute__ ((format (printf, 1, 2))) __cold;
 
-extern int printk_ratelimit(void);
+extern int __printk_ratelimit(const char *func);
+#define printk_ratelimit() __printk_ratelimit(__func__)
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
 
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 187bc16c1f15..668cf1bef030 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -25,6 +25,7 @@ struct ratelimit_state {
 		.burst		= burst_init,				\
 	}
 
-extern int __ratelimit(struct ratelimit_state *rs);
+extern int ___ratelimit(struct ratelimit_state *rs, const char *func);
+#define __ratelimit(state) ___ratelimit(state, __func__)
 
 #endif /* _LINUX_RATELIMIT_H */
diff --git a/kernel/printk.c b/kernel/printk.c
index b997c893cdcf..8283dbe15af4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1364,11 +1364,11 @@ late_initcall(disable_boot_consoles);
  */
 DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
 
-int printk_ratelimit(void)
+int __printk_ratelimit(const char *func)
 {
-	return __ratelimit(&printk_ratelimit_state);
+	return ___ratelimit(&printk_ratelimit_state, func);
 }
-EXPORT_SYMBOL(printk_ratelimit);
+EXPORT_SYMBOL(__printk_ratelimit);
 
 /**
  * printk_timed_ratelimit - caller-controlled printk ratelimiting
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 5551731ae1d4..09f5ce1810dc 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -20,7 +20,7 @@
  * This enforces a rate limit: not more than @rs->ratelimit_burst callbacks
  * in every @rs->ratelimit_jiffies
  */
-int __ratelimit(struct ratelimit_state *rs)
+int ___ratelimit(struct ratelimit_state *rs, const char *func)
 {
 	unsigned long flags;
 	int ret;
@@ -43,7 +43,7 @@ int __ratelimit(struct ratelimit_state *rs)
 	if (time_is_before_jiffies(rs->begin + rs->interval)) {
 		if (rs->missed)
 			printk(KERN_WARNING "%s: %d callbacks suppressed\n",
-				__func__, rs->missed);
+				func, rs->missed);
 		rs->begin   = 0;
 		rs->printed = 0;
 		rs->missed  = 0;
@@ -59,4 +59,4 @@ int __ratelimit(struct ratelimit_state *rs)
 
 	return ret;
 }
-EXPORT_SYMBOL(__ratelimit);
+EXPORT_SYMBOL(___ratelimit);
-- 
cgit v1.2.3-71-gd317


From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001
From: Ryota Ozaki <ozaki.ryota@gmail.com>
Date: Sat, 24 Oct 2009 01:20:10 +0900
Subject: sched, cpuacct: Fix niced guest time accounting

CPU time of a guest is always accounted in 'user' time
without concern for the nice value of its counterpart
process although the guest is scheduled under the nice
value.

This patch fixes the defect and accounts cpu time of
a niced guest in 'nice' time as same as a niced process.

And also the patch adds 'guest_nice' to cpuacct. The
value provides niced guest cpu time which is like 'nice'
to 'user'.

The original discussions can be found here:

  http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html
  http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html

Signed-off-by: Ryota Ozaki <ozaki.ryota@gmail.com>
Acked-by: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/filesystems/proc.txt |  3 ++-
 fs/proc/stat.c                     | 19 +++++++++++++------
 include/linux/kernel_stat.h        |  1 +
 kernel/sched.c                     |  9 +++++++--
 4 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2c48f945546b..4af0018533f2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1072,7 +1072,8 @@ second).  The meanings of the columns are as follows, from left to right:
 - irq: servicing interrupts
 - softirq: servicing softirqs
 - steal: involuntary wait
-- guest: running a guest
+- guest: running a normal guest
+- guest_nice: running a niced guest
 
 The "intr" line gives counts of interrupts  serviced since boot time, for each
 of the  possible system interrupts.   The first  column  is the  total of  all
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..b9b7aad2003d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
 	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest;
+	cputime64_t guest, guest_nice;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
-	guest = cputime64_zero;
+	guest = guest_nice = cputime64_zero;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;
 
@@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+		guest_nice = cputime64_add(guest_nice,
+			kstat_cpu(i).cpustat.guest_nice);
 		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
 		}
@@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	sum += arch_irq_stat();
 
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+		"%llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(irq),
 		(unsigned long long)cputime64_to_clock_t(softirq),
 		(unsigned long long)cputime64_to_clock_t(steal),
-		(unsigned long long)cputime64_to_clock_t(guest));
+		(unsigned long long)cputime64_to_clock_t(guest),
+		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
 		guest = kstat_cpu(i).cpustat.guest;
+		guest_nice = kstat_cpu(i).cpustat.guest_nice;
 		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+			"%llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v)
 			(unsigned long long)cputime64_to_clock_t(irq),
 			(unsigned long long)cputime64_to_clock_t(softirq),
 			(unsigned long long)cputime64_to_clock_t(steal),
-			(unsigned long long)cputime64_to_clock_t(guest));
+			(unsigned long long)cputime64_to_clock_t(guest),
+			(unsigned long long)cputime64_to_clock_t(guest_nice));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 348fa8874b52..c059044bc6dc 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ struct cpu_usage_stat {
 	cputime64_t iowait;
 	cputime64_t steal;
 	cputime64_t guest;
+	cputime64_t guest_nice;
 };
 
 struct kernel_stat {
diff --git a/kernel/sched.c b/kernel/sched.c
index e5205811c19e..67be4d0dddaa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5017,8 +5017,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	/* Add guest time to cpustat. */
-	cpustat->user = cputime64_add(cpustat->user, tmp);
-	cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	if (TASK_NICE(p) > 0) {
+		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+		cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+	} else {
+		cpustat->user = cputime64_add(cpustat->user, tmp);
+		cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	}
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From 7a041097518a120f92af631a83db4d41e07ee51b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 25 Oct 2009 14:24:45 +0200
Subject: x86: Fix user return notifier build

When CONFIG_USER_RETURN_NOTIFIER is set, we need to link
kernel/user-return-notifier.o.

Signed-off-by: Avi Kivity <avi@redhat.com>
LKML-Reference: <1256473485-23109-1-git-send-email-avi@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..0ae57a83d481 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-- 
cgit v1.2.3-71-gd317


From 9b1d82fa1611706fa7ee1505f290160a18caf95d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Oct 2009 19:03:50 -0700
Subject: rcu: "Tiny RCU", The Bloatwatch Edition

This patch is a version of RCU designed for !SMP provided for a
small-footprint RCU implementation.  In particular, the
implementation of synchronize_rcu() is extremely lightweight and
high performance. It passes rcutorture testing in each of the
four relevant configurations (combinations of NO_HZ and PREEMPT)
on x86.  This saves about 1K bytes compared to old Classic RCU
(which is no longer in mainline), and more than three kilobytes
compared to Hierarchical RCU (updated to 2.6.30):

	CONFIG_TREE_RCU:

	   text	   data	    bss	    dec	    filename
	    183       4       0     187     kernel/rcupdate.o
	   2783     520      36    3339     kernel/rcutree.o
				   3526 Total (vs 4565 for v7)

	CONFIG_TREE_PREEMPT_RCU:

	   text	   data	    bss	    dec	    filename
	    263       4       0     267     kernel/rcupdate.o
	   4594     776      52    5422     kernel/rcutree.o
	   			   5689 Total (6155 for v7)

	CONFIG_TINY_RCU:

	   text	   data	    bss	    dec	    filename
	     96       4       0     100     kernel/rcupdate.o
	    734      24       0     758     kernel/rcutiny.o
	    			    858 Total (vs 848 for v7)

The above is for x86.  Your mileage may vary on other platforms.
Further compression is possible, but is being procrastinated.

Changes from v7 (http://lkml.org/lkml/2009/10/9/388)

o	Apply Lai Jiangshan's review comments (aside from
might_sleep() 	in synchronize_sched(), which is covered by SMP builds).

o	Fix up expedited primitives.

Changes from v6 (http://lkml.org/lkml/2009/9/23/293).

o	Forward ported to put it into the 2.6.33 stream.

o	Added lockdep support.

o	Make lightweight rcu_barrier.

Changes from v5 (http://lkml.org/lkml/2009/6/23/12).

o	Ported to latest pre-2.6.32 merge window kernel.

	- Renamed rcu_qsctr_inc() to rcu_sched_qs().
	- Renamed rcu_bh_qsctr_inc() to rcu_bh_qs().
	- Provided trivial rcu_cpu_notify().
	- Provided trivial exit_rcu().
	- Provided trivial rcu_needs_cpu().
	- Fixed up the rcu_*_enter/exit() functions in linux/hardirq.h.

o	Removed the dependence on EMBEDDED, with a view to making
	TINY_RCU default for !SMP at some time in the future.

o	Added (trivial) support for expedited grace periods.

Changes from v4 (http://lkml.org/lkml/2009/5/2/91) include:

o	Squeeze the size down a bit further by removing the
	->completed field from struct rcu_ctrlblk.

o	This permits synchronize_rcu() to become the empty function.
	Previous concerns about rcutorture were unfounded, as
	rcutorture correctly handles a constant value from
	rcu_batches_completed() and rcu_batches_completed_bh().

Changes from v3 (http://lkml.org/lkml/2009/3/29/221) include:

o	Changed rcu_batches_completed(), rcu_batches_completed_bh()
	rcu_enter_nohz(), rcu_exit_nohz(), rcu_nmi_enter(), and
	rcu_nmi_exit(), to be static inlines, as suggested by David
	Howells.  Doing this saves about 100 bytes from rcutiny.o.
	(The numbers between v3 and this v4 of the patch are not directly
	comparable, since they are against different versions of Linux.)

Changes from v2 (http://lkml.org/lkml/2009/2/3/333) include:

o	Fix whitespace issues.

o	Change short-circuit "||" operator to instead be "+" in order
to 	fix performance bug noted by "kraai" on LWN.

		(http://lwn.net/Articles/324348/)

Changes from v1 (http://lkml.org/lkml/2009/1/13/440) include:

o	This version depends on EMBEDDED as well as !SMP, as suggested
	by Ingo.

o	Updated rcu_needs_cpu() to unconditionally return zero,
	permitting the CPU to enter dynticks-idle mode at any time.
	This works because callbacks can be invoked upon entry to
	dynticks-idle mode.

o	Paul is now OK with this being included, based on a poll at
the 	Kernel Miniconf at linux.conf.au, where about ten people said
	that they cared about saving 900 bytes on single-CPU systems.

o	Applies to both mainline and tip/core/rcu.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: avi@redhat.com
Cc: mtosatti@redhat.com
LKML-Reference: <12565226351355-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hardirq.h  |  24 ++++
 include/linux/rcupdate.h |   6 +
 include/linux/rcutiny.h  |  97 ++++++++++++++++
 init/Kconfig             |   9 ++
 kernel/Makefile          |   1 +
 kernel/rcupdate.c        |   4 +
 kernel/rcutiny.c         | 294 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 435 insertions(+)
 create mode 100644 include/linux/rcutiny.h
 create mode 100644 kernel/rcutiny.c

(limited to 'kernel')

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 6d527ee82b2b..d5b387669dab 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -139,10 +139,34 @@ static inline void account_system_vtime(struct task_struct *tsk)
 #endif
 
 #if defined(CONFIG_NO_HZ)
+#if defined(CONFIG_TINY_RCU)
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+
+static inline void rcu_irq_enter(void)
+{
+	rcu_exit_nohz();
+}
+
+static inline void rcu_irq_exit(void)
+{
+	rcu_enter_nohz();
+}
+
+static inline void rcu_nmi_enter(void)
+{
+}
+
+static inline void rcu_nmi_exit(void)
+{
+}
+
+#else
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
 extern void rcu_nmi_enter(void);
 extern void rcu_nmi_exit(void);
+#endif
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 3ebd0b7bcb08..6dd71fa48429 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -68,11 +68,17 @@ extern int sched_expedited_torture_stats(char *page);
 /* Internal to kernel */
 extern void rcu_init(void);
 extern void rcu_scheduler_starting(void);
+#ifndef CONFIG_TINY_RCU
 extern int rcu_needs_cpu(int cpu);
+#else
+static inline int rcu_needs_cpu(int cpu) { return 0; }
+#endif
 extern int rcu_scheduler_active;
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
+#elif CONFIG_TINY_RCU
+#include <linux/rcutiny.h>
 #else
 #error "Unknown RCU implementation specified to kernel configuration"
 #endif
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
new file mode 100644
index 000000000000..891073c264dc
--- /dev/null
+++ b/include/linux/rcutiny.h
@@ -0,0 +1,97 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ */
+
+#ifndef __LINUX_TINY_H
+#define __LINUX_TINY_H
+
+#include <linux/cache.h>
+
+void rcu_sched_qs(int cpu);
+void rcu_bh_qs(int cpu);
+
+#define __rcu_read_lock()	preempt_disable()
+#define __rcu_read_unlock()	preempt_enable()
+#define __rcu_read_lock_bh()	local_bh_disable()
+#define __rcu_read_unlock_bh()	local_bh_enable()
+#define call_rcu_sched		call_rcu
+
+#define rcu_init_sched()	do { } while (0)
+extern void rcu_check_callbacks(int cpu, int user);
+extern void __rcu_init(void);
+
+/*
+ * Return the number of grace periods.
+ */
+static inline long rcu_batches_completed(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods.
+ */
+static inline long rcu_batches_completed_bh(void)
+{
+	return 0;
+}
+
+extern int rcu_expedited_torture_stats(char *page);
+
+static inline void synchronize_rcu_expedited(void)
+{
+	synchronize_sched();
+}
+
+static inline void synchronize_rcu_bh_expedited(void)
+{
+	synchronize_sched();
+}
+
+struct notifier_block;
+extern int rcu_cpu_notify(struct notifier_block *self,
+			  unsigned long action, void *hcpu);
+
+#ifdef CONFIG_NO_HZ
+
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static inline void rcu_enter_nohz(void)
+{
+}
+
+static inline void rcu_exit_nohz(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+static inline void exit_rcu(void)
+{
+}
+
+#endif /* __LINUX_RCUTINY_H */
diff --git a/init/Kconfig b/init/Kconfig
index 09c5c6431f42..d367bf6bcc34 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -334,6 +334,15 @@ config TREE_PREEMPT_RCU
 	  is also required.  It also scales down nicely to
 	  smaller systems.
 
+config TINY_RCU
+	bool "UP-only small-memory-footprint RCU"
+	depends on !SMP
+	help
+	  This option selects the RCU implementation that is
+	  designed for UP systems from which real-time response
+	  is not required.  This option greatly reduces the
+	  memory footprint of RCU.
+
 endchoice
 
 config RCU_TRACE
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..e9a0e6ed25cc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_TINY_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 400183346ad2..7625f207f65e 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -67,6 +67,8 @@ void wakeme_after_rcu(struct rcu_head  *head)
 	complete(&rcu->completion);
 }
 
+#ifndef CONFIG_TINY_RCU
+
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
 /**
@@ -157,6 +159,8 @@ void synchronize_rcu_bh(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
+#endif /* #ifndef CONFIG_TINY_RCU */
+
 static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 		unsigned long action, void *hcpu)
 {
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..0b54efde66ac
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,294 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
+	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
+	struct rcu_head **curtail;	/* ->next pointer of last CB. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.rcucblist = NULL,
+	.donetail = &rcu_ctrlblk.rcucblist,
+	.curtail = &rcu_ctrlblk.rcucblist,
+};
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+	.rcucblist = NULL,
+	.donetail = &rcu_bh_ctrlblk.rcucblist,
+	.curtail = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_NO_HZ
+
+static long rcu_dynticks_nesting = 1;
+
+/*
+ * Enter dynticks-idle mode, which is an extended quiescent state
+ * if we have fully entered that mode (i.e., if the new value of
+ * dynticks_nesting is zero).
+ */
+void rcu_enter_nohz(void)
+{
+	if (--rcu_dynticks_nesting == 0)
+		rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+}
+
+/*
+ * Exit dynticks-idle mode, so that we are no longer in an extended
+ * quiescent state.
+ */
+void rcu_exit_nohz(void)
+{
+	rcu_dynticks_nesting++;
+}
+
+#endif /* #ifdef CONFIG_NO_HZ */
+
+/*
+ * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
+ * Also disable irqs to avoid confusion due to interrupt handlers invoking
+ * call_rcu().
+ */
+static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (rcp->rcucblist != NULL &&
+	    rcp->donetail != rcp->curtail) {
+		rcp->donetail = rcp->curtail;
+		local_irq_restore(flags);
+		return 1;
+	}
+	local_irq_restore(flags);
+	return 0;
+}
+
+/*
+ * Record an rcu quiescent state.  And an rcu_bh quiescent state while we
+ * are at it, given that any rcu quiescent state is also an rcu_bh
+ * quiescent state.  Use "+" instead of "||" to defeat short circuiting.
+ */
+void rcu_sched_qs(int cpu)
+{
+	if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Record an rcu_bh quiescent state.
+ */
+void rcu_bh_qs(int cpu)
+{
+	if (rcu_qsctr_help(&rcu_bh_ctrlblk))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Check to see if the scheduling-clock interrupt came from an extended
+ * quiescent state, and, if so, tell RCU about it.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) &&
+	     !in_softirq() &&
+	     hardirq_count() <= (1 << HARDIRQ_SHIFT)))
+		rcu_sched_qs(cpu);
+	else if (!in_softirq())
+		rcu_bh_qs(cpu);
+}
+
+/*
+ * Helper function for rcu_process_callbacks() that operates on the
+ * specified rcu_ctrlkblk structure.
+ */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list;
+
+	/* If no RCU callbacks ready to invoke, just return. */
+	if (&rcp->rcucblist == rcp->donetail)
+		return;
+
+	/* Move the ready-to-invoke callbacks to a local list. */
+	local_irq_save(flags);
+	list = rcp->rcucblist;
+	rcp->rcucblist = *rcp->donetail;
+	*rcp->donetail = NULL;
+	if (rcp->curtail == rcp->donetail)
+		rcp->curtail = &rcp->rcucblist;
+	rcp->donetail = &rcp->rcucblist;
+	local_irq_restore(flags);
+
+	/* Invoke the callbacks on the local list. */
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+	}
+}
+
+/*
+ * Invoke any callbacks whose grace period has completed.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	__rcu_process_callbacks(&rcu_ctrlblk);
+	__rcu_process_callbacks(&rcu_bh_ctrlblk);
+}
+
+/*
+ * Null function to handle CPU being onlined.  Longer term, we want to
+ * make TINY_RCU avoid using rcupdate.c, but later...
+ */
+int rcu_cpu_notify(struct notifier_block *self,
+		   unsigned long action, void *hcpu)
+{
+	return NOTIFY_OK;
+}
+
+/*
+ * Wait for a grace period to elapse.  But it is illegal to invoke
+ * synchronize_sched() from within an RCU read-side critical section.
+ * Therefore, any legal call to synchronize_sched() is a quiescent
+ * state, and so on a UP system, synchronize_sched() need do nothing.
+ * Ditto for synchronize_rcu_bh().  (But Lai Jiangshan points out the
+ * benefits of doing might_sleep() to reduce latency.)
+ *
+ * Cool, huh?  (Due to Josh Triplett.)
+ *
+ * But we want to make this a static inline later.
+ */
+void synchronize_sched(void)
+{
+	cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
+void synchronize_rcu_bh(void)
+{
+	synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
+/*
+ * Helper function for call_rcu() and call_rcu_bh().
+ */
+static void __call_rcu(struct rcu_head *head,
+		       void (*func)(struct rcu_head *rcu),
+		       struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	*rcp->curtail = head;
+	rcp->curtail = &head->next;
+	local_irq_restore(flags);
+}
+
+/*
+ * Post an RCU callback to be invoked after the end of an RCU grace
+ * period.  But since we have but one CPU, that would be after any
+ * quiescent state.
+ */
+void call_rcu(struct rcu_head *head,
+	      void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_ctrlblk);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Post an RCU bottom-half callback to be invoked after any subsequent
+ * quiescent state.
+ */
+void call_rcu_bh(struct rcu_head *head,
+		 void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_bh_ctrlblk);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+void rcu_barrier(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+void rcu_barrier_bh(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_bh(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+void rcu_barrier_sched(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_sched(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
+void __rcu_init(void)
+{
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
-- 
cgit v1.2.3-71-gd317


From 0cd397d33608ae6c97d2ee6c8c43462b419b7e26 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Oct 2009 19:03:51 -0700
Subject: rcu: Add synchronize_srcu_expedited()

This patch creates a synchronize_srcu_expedited() that uses
synchronize_sched_expedited() where synchronize_srcu()
uses synchronize_sched().  The synchronize_srcu() and
synchronize_srcu_expedited() functions become one-liners that
pass synchronize_sched() or synchronize_sched_expedited(),
repectively, to a new __synchronize_srcu() function.

While in the file, move the EXPORT_SYMBOL_GPL()s to immediately
follow the corresponding functions.

Requested-by: Avi Kivity <avi@redhat.com>
Tested-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: avi@redhat.com
LKML-Reference: <12565226354038-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/srcu.h |  1 +
 kernel/srcu.c        | 74 ++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 52 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index aca0eee53930..4765d97dcafb 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -48,6 +48,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp);
 int srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
 void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
 void synchronize_srcu(struct srcu_struct *sp);
+void synchronize_srcu_expedited(struct srcu_struct *sp);
 long srcu_batches_completed(struct srcu_struct *sp);
 
 #endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..818d7d9aa03c 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
 	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
 	return (sp->per_cpu_ref ? 0 : -ENOMEM);
 }
+EXPORT_SYMBOL_GPL(init_srcu_struct);
 
 /*
  * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 	free_percpu(sp->per_cpu_ref);
 	sp->per_cpu_ref = NULL;
 }
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
 /**
  * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
 	preempt_enable();
 	return idx;
 }
+EXPORT_SYMBOL_GPL(srcu_read_lock);
 
 /**
  * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
 	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(srcu_read_unlock);
 
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
- *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates.  Can block; must be called from
- * process context.
- *
- * Note that it is illegal to call synchornize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section.
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
  */
-void synchronize_srcu(struct srcu_struct *sp)
+void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
 {
 	int idx;
 
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
 		return;
 	}
 
-	synchronize_sched();  /* Force memory barrier on all CPUs. */
+	sync_func();  /* Force memory barrier on all CPUs. */
 
 	/*
 	 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
 	idx = sp->completed & 0x1;
 	sp->completed++;
 
-	synchronize_sched();  /* Force memory barrier on all CPUs. */
+	sync_func();  /* Force memory barrier on all CPUs. */
 
 	/*
 	 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
 	while (srcu_readers_active_idx(sp, idx))
 		schedule_timeout_interruptible(1);
 
-	synchronize_sched();  /* Force memory barrier on all CPUs. */
+	sync_func();  /* Force memory barrier on all CPUs. */
 
 	/*
 	 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -236,6 +229,47 @@ void synchronize_srcu(struct srcu_struct *sp)
 	mutex_unlock(&sp->mutex);
 }
 
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates.  Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+	__synchronize_srcu(sp, synchronize_sched);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/**
+ * synchronize_srcu_expedited - like synchronize_srcu, but less patient
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates.  Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchronize_srcu_expedited()
+ * from the corresponding SRCU read-side critical section; doing so
+ * will result in deadlock.  However, it is perfectly legal to call
+ * synchronize_srcu_expedited() on one srcu_struct from some other
+ * srcu_struct's read-side critical section.
+ */
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+	__synchronize_srcu(sp, synchronize_sched_expedited);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
 /**
  * srcu_batches_completed - return batches completed.
  * @sp: srcu_struct on which to report batch completion.
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
 {
 	return sp->completed;
 }
-
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-EXPORT_SYMBOL_GPL(srcu_read_lock);
-EXPORT_SYMBOL_GPL(srcu_read_unlock);
-EXPORT_SYMBOL_GPL(synchronize_srcu);
 EXPORT_SYMBOL_GPL(srcu_batches_completed);
-- 
cgit v1.2.3-71-gd317


From 804bb8370522a569bd3a732b9de5fbd55e26f155 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Oct 2009 19:03:52 -0700
Subject: rcu: Add synchronize_srcu_expedited() to the rcutorture test suite

Adds the "srcu_expedited" torture type, and also renames
sched_ops_sync to sched_sync_ops for consistency while we are in
this file.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: avi@redhat.com
Cc: mtosatti@redhat.com
LKML-Reference: <12565226353636-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 697c0a0229d4..14480e8b2a24 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -547,6 +547,25 @@ static struct rcu_torture_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+static void srcu_torture_synchronize_expedited(void)
+{
+	synchronize_srcu_expedited(&srcu_ctl);
+}
+
+static struct rcu_torture_ops srcu_expedited_ops = {
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= srcu_torture_synchronize_expedited,
+	.cb_barrier	= NULL,
+	.stats		= srcu_torture_stats,
+	.name		= "srcu_expedited"
+};
+
 /*
  * Definitions for sched torture testing.
  */
@@ -592,7 +611,7 @@ static struct rcu_torture_ops sched_ops = {
 	.name		= "sched"
 };
 
-static struct rcu_torture_ops sched_ops_sync = {
+static struct rcu_torture_ops sched_sync_ops = {
 	.init		= rcu_sync_torture_init,
 	.cleanup	= NULL,
 	.readlock	= sched_torture_read_lock,
@@ -1098,8 +1117,8 @@ rcu_torture_init(void)
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] =
 		{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
-		  &sched_expedited_ops,
-		  &srcu_ops, &sched_ops, &sched_ops_sync, };
+		  &srcu_ops, &srcu_expedited_ops,
+		  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
 
 	mutex_lock(&fullstop_mutex);
 
-- 
cgit v1.2.3-71-gd317


From cf886c44ec418a01b2c52493465accb81acbf930 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Oct 2009 19:03:54 -0700
Subject: rcu: Improve rcutorture diagnostics when bad torture_type specified

Make rcutorture list the available torture_type values when it
doesn't like the one specified.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: avi@redhat.com
Cc: mtosatti@redhat.com
LKML-Reference: <12565226351868-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 14480e8b2a24..3dd0ca23e191 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1129,8 +1129,12 @@ rcu_torture_init(void)
 			break;
 	}
 	if (i == ARRAY_SIZE(torture_ops)) {
-		printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
+		printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
 		       torture_type);
+		printk(KERN_ALERT "rcu-torture types:");
+		for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+			printk(KERN_ALERT " %s", torture_ops[i]->name);
+		printk(KERN_ALERT "\n");
 		mutex_unlock(&fullstop_mutex);
 		return -EINVAL;
 	}
-- 
cgit v1.2.3-71-gd317


From 4ce5b90340879ce93d169b7b523c2cbbe7c45843 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 26 Oct 2009 07:55:55 +0100
Subject: rcu: Do tiny cleanups in rcutiny

No change in functionality - just straighten out a few small
stylistic details.

Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: avi@redhat.com
Cc: mtosatti@redhat.com
LKML-Reference: <12565226351355-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcutiny.h |  6 ++----
 kernel/rcutiny.c        | 49 +++++++++++++++++++++++--------------------------
 2 files changed, 25 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 891073c264dc..2c1fe8373e71 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -20,9 +20,8 @@
  * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
+ *		Documentation/RCU
  */
-
 #ifndef __LINUX_TINY_H
 #define __LINUX_TINY_H
 
@@ -70,8 +69,7 @@ static inline void synchronize_rcu_bh_expedited(void)
 }
 
 struct notifier_block;
-extern int rcu_cpu_notify(struct notifier_block *self,
-			  unsigned long action, void *hcpu);
+extern int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu);
 
 #ifdef CONFIG_NO_HZ
 
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0b54efde66ac..b33ec3aa377a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -20,22 +20,21 @@
  * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
+ *		Documentation/RCU
  */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/completion.h>
 #include <linux/moduleparam.h>
+#include <linux/completion.h>
+#include <linux/interrupt.h>
 #include <linux/notifier.h>
-#include <linux/cpu.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/time.h>
+#include <linux/cpu.h>
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
@@ -46,14 +45,13 @@ struct rcu_ctrlblk {
 
 /* Definition for rcupdate control block. */
 static struct rcu_ctrlblk rcu_ctrlblk = {
-	.rcucblist = NULL,
-	.donetail = &rcu_ctrlblk.rcucblist,
-	.curtail = &rcu_ctrlblk.rcucblist,
+	.donetail	= &rcu_ctrlblk.rcucblist,
+	.curtail	= &rcu_ctrlblk.rcucblist,
 };
+
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.rcucblist = NULL,
-	.donetail = &rcu_bh_ctrlblk.rcucblist,
-	.curtail = &rcu_bh_ctrlblk.rcucblist,
+	.donetail	= &rcu_bh_ctrlblk.rcucblist,
+	.curtail	= &rcu_bh_ctrlblk.rcucblist,
 };
 
 #ifdef CONFIG_NO_HZ
@@ -84,8 +82,8 @@ void rcu_exit_nohz(void)
 
 /*
  * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
- * Also disable irqs to avoid confusion due to interrupt handlers invoking
- * call_rcu().
+ * Also disable irqs to avoid confusion due to interrupt handlers
+ * invoking call_rcu().
  */
 static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
 {
@@ -99,6 +97,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
 		return 1;
 	}
 	local_irq_restore(flags);
+
 	return 0;
 }
 
@@ -143,8 +142,8 @@ void rcu_check_callbacks(int cpu, int user)
  */
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
-	unsigned long flags;
 	struct rcu_head *next, *list;
+	unsigned long flags;
 
 	/* If no RCU callbacks ready to invoke, just return. */
 	if (&rcp->rcucblist == rcp->donetail)
@@ -182,8 +181,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  * Null function to handle CPU being onlined.  Longer term, we want to
  * make TINY_RCU avoid using rcupdate.c, but later...
  */
-int rcu_cpu_notify(struct notifier_block *self,
-		   unsigned long action, void *hcpu)
+int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	return NOTIFY_OK;
 }
@@ -223,6 +221,7 @@ static void __call_rcu(struct rcu_head *head,
 
 	head->func = func;
 	head->next = NULL;
+
 	local_irq_save(flags);
 	*rcp->curtail = head;
 	rcp->curtail = &head->next;
@@ -234,8 +233,7 @@ static void __call_rcu(struct rcu_head *head,
  * period.  But since we have but one CPU, that would be after any
  * quiescent state.
  */
-void call_rcu(struct rcu_head *head,
-	      void (*func)(struct rcu_head *rcu))
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
 	__call_rcu(head, func, &rcu_ctrlblk);
 }
@@ -245,8 +243,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * Post an RCU bottom-half callback to be invoked after any subsequent
  * quiescent state.
  */
-void call_rcu_bh(struct rcu_head *head,
-		 void (*func)(struct rcu_head *rcu))
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
 	__call_rcu(head, func, &rcu_bh_ctrlblk);
 }
-- 
cgit v1.2.3-71-gd317


From 88b91c7ca49bc8600cf1106eb891d08c1965b9ce Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 26 Oct 2009 10:24:31 -0700
Subject: rcu: Simplify creating of lockdep class for root rcu_node

Use lockdep_set_class() to simplify the code and to avoid any
additional overhead in the !LOCKDEP case.  Also move the
definition of rcu_root_class into kernel/rcutree.c, as suggested
by Lai Jiangshan.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1256577871443-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ddbf111e9e18..055f1a941a9e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -51,6 +51,8 @@
 
 /* Data structures. */
 
+static struct lock_class_key rcu_root_class;
+
 #define RCU_STATE_INITIALIZER(name) { \
 	.level = { &name.node[0] }, \
 	.levelcnt = { \
@@ -1666,8 +1668,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 		cpustride *= rsp->levelspread[i];
 		rnp = rsp->level[i];
 		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
-			if (rnp != rcu_get_root(rsp))
-				spin_lock_init(&rnp->lock);
+			spin_lock_init(&rnp->lock);
 			rnp->gpnum = 0;
 			rnp->qsmask = 0;
 			rnp->qsmaskinit = 0;
@@ -1690,7 +1691,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 			INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
 		}
 	}
-	spin_lock_init(&rcu_get_root(rsp)->lock);
+	lockdep_set_class(&rcu_get_root(rsp)->lock, &rcu_root_class);
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From f7d7986060b2890fc26db6ab5203efbd33aa2497 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 18 Oct 2009 01:09:29 +0000
Subject: perf_event: Add alignment-faults and emulation-faults software events

Add two more software events that are common to many cpus.

Alignment faults: When a load or store is not aligned properly.

Emulation faults: When an instruction is emulated in software.

Both cause a very significant slowdown (100x or worse), so identifying and
fixing them is very important.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/perf_counter.h   | 2 ++
 include/linux/perf_event.h     | 2 ++
 kernel/perf_event.c            | 2 ++
 tools/perf/design.txt          | 2 ++
 tools/perf/util/parse-events.c | 4 ++++
 5 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7b7fbf433cff..d6b95d1e79f0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -106,6 +106,8 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
 	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
 	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
+	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2e6d95f97419..a33707a3a788 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -102,6 +102,8 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
 	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
 	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
+	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9d0b5c665883..0683b33cbb28 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4186,6 +4186,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
 	case PERF_COUNT_SW_CONTEXT_SWITCHES:
 	case PERF_COUNT_SW_CPU_MIGRATIONS:
+	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
+	case PERF_COUNT_SW_EMULATION_FAULTS:
 		if (!event->parent) {
 			atomic_inc(&perf_swevent_enabled[event_id]);
 			event->destroy = sw_perf_event_destroy;
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index fdd42a824c98..f000c30877ac 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -137,6 +137,8 @@ enum sw_event_ids {
 	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
 	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
 	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
+	PERF_COUNT_SW_ALIGNMENT_FAULTS	= 7,
+	PERF_COUNT_SW_EMULATION_FAULTS	= 8,
 };
 
 Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 8cfb48cbbea0..34bd84423933 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -46,6 +46,8 @@ static struct event_symbol event_symbols[] = {
   { CSW(PAGE_FAULTS_MAJ),	"major-faults",		""		},
   { CSW(CONTEXT_SWITCHES),	"context-switches",	"cs"		},
   { CSW(CPU_MIGRATIONS),	"cpu-migrations",	"migrations"	},
+  { CSW(ALIGNMENT_FAULTS),	"alignment-faults",	""		},
+  { CSW(EMULATION_FAULTS),	"emulation-faults",	""		},
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
@@ -74,6 +76,8 @@ static const char *sw_event_names[] = {
 	"CPU-migrations",
 	"minor-faults",
 	"major-faults",
+	"alignment-faults",
+	"emulation-faults",
 };
 
 #define MAX_ALIASES 8
-- 
cgit v1.2.3-71-gd317


From be404f0212ffa8f67361f8ee460a25d901d88991 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 8 Oct 2009 22:47:30 +0200
Subject: PM / freezer: Don't get over-anxious while waiting

Freezing isn't exactly the most latency sensitive operation and
there's no reason to burn cpu cycles and power waiting for it to
complete.  msleep(10) instead of yield().  This should improve
reliability of emergency hibernation.

[rjw: Modified the comment next to the msleep(10).]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/process.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index cc2e55373b68..5ade1bdcf366 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/freezer.h>
+#include <linux/delay.h>
 
 /* 
  * Timeout for stopping processes
@@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only)
 	do_gettimeofday(&start);
 
 	end_time = jiffies + TIMEOUT;
-	do {
+	while (true) {
 		todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
@@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only)
 				todo++;
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
-		yield();			/* Yield is okay here */
-		if (time_after(jiffies, end_time))
+		if (!todo || time_after(jiffies, end_time))
 			break;
-	} while (todo);
+
+		/*
+		 * We need to retry, but first give the freezing tasks some
+		 * time to enter the regrigerator.
+		 */
+		msleep(10);
+	}
 
 	do_gettimeofday(&end);
 	elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
-- 
cgit v1.2.3-71-gd317


From dd004c475cd15a5749b04b0283d41ffdfa57d658 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 27 Oct 2009 16:42:44 -0400
Subject: kprobe-tracer: Compare both of event-name and event-group to find
 probe

Fix find_probe_event() to compare both of event-name and
event-group. Without this fix, kprobe-tracer overwrites existing
same event-name probe even if its group-name is different.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
LKML-Reference: <20091027204244.30545.27516.stgit@harusame>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b8ef707c84d7..a86c3ac0df21 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,12 +353,14 @@ static void free_trace_probe(struct trace_probe *tp)
 	kfree(tp);
 }
 
-static struct trace_probe *find_probe_event(const char *event)
+static struct trace_probe *find_probe_event(const char *event,
+					    const char *group)
 {
 	struct trace_probe *tp;
 
 	list_for_each_entry(tp, &probe_list, list)
-		if (!strcmp(tp->call.name, event))
+		if (strcmp(tp->call.name, event) == 0 &&
+		    strcmp(tp->call.system, group) == 0)
 			return tp;
 	return NULL;
 }
@@ -383,7 +385,7 @@ static int register_trace_probe(struct trace_probe *tp)
 	mutex_lock(&probe_lock);
 
 	/* register as an event */
-	old_tp = find_probe_event(tp->call.name);
+	old_tp = find_probe_event(tp->call.name, tp->call.system);
 	if (old_tp) {
 		/* delete old event */
 		unregister_trace_probe(old_tp);
-- 
cgit v1.2.3-71-gd317


From 3ed67776fc23061180896086a206a02be649dd26 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 28 Oct 2009 17:37:01 +0800
Subject: tracing/filters: Fix to make system filter work

commit fce29d15b59245597f7f320db4a9f2be0f5fb512
("tracing/filters: Refactor subsystem filter code")
broke system filter accidentally.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4AE810BD.3070009@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 21d34757b955..50504cb228de 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1230,12 +1230,12 @@ static int replace_system_preds(struct event_subsystem *system,
 				struct filter_parse_state *ps,
 				char *filter_string)
 {
-	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 	bool fail = true;
 	int err;
 
 	list_for_each_entry(call, &ftrace_events, list) {
+		struct event_filter *filter = call->filter;
 
 		if (!call->define_fields)
 			continue;
-- 
cgit v1.2.3-71-gd317


From 3c912b6edaac56cb451e7571c95c15cbb6bd0c81 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 2 Nov 2009 16:17:22 +1100
Subject: x86: Fix user return notifier put_cpu_var() invocation

Today's linux-next build (x86_64 allmodconfig) failed like this:

  kernel/user-return-notifier.c: In function
  'fire_user_return_notifiers': kernel/user-return-notifier.c:45:
  error: expected expression before ')' token

Introduced by commit 7c68af6e32c73992bad24107311f3433c89016e2
("core, x86: Add user return notifiers") from the tip and kvm trees
but revealed by commit e0fdb0e050eae331046385643618f12452aa7e73
("percpu: add __percpu for sparse") from the percpu tree.

Before that percpu tree commit, "put_cpu_var()" would compile
without error (even though it really needs a parameter).

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
LKML-Reference: <20091102161722.eea4358d.sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/user-return-notifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 530ccb816513..03e2d6fd9b18 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -42,5 +42,5 @@ void fire_user_return_notifiers(void)
 	head = &get_cpu_var(return_notifier_list);
 	hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
 		urn->on_user_return(urn);
-	put_cpu_var();
+	put_cpu_var(return_notifier_list);
 }
-- 
cgit v1.2.3-71-gd317


From 5e9b397292ca0b9409dced33e3a22ec993377064 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 2 Nov 2009 08:51:13 +0800
Subject: tracing: Fix to use __always_unused attribute

____ftrace_check_##name() is used for compile-time check on
F_printk() only, so it should be marked as __unused instead
of __used.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4AEE2D01.4010305@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_export.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9753fcc61bc5..c74848ddb85a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -48,11 +48,11 @@
 struct ____ftrace_##name {					\
 	tstruct							\
 };								\
-static void __used ____ftrace_check_##name(void)		\
+static void __always_unused ____ftrace_check_##name(void)	\
 {								\
 	struct ____ftrace_##name *__entry = NULL;		\
 								\
-	/* force cmpile-time check on F_printk() */		\
+	/* force compile-time check on F_printk() */		\
 	printk(print);						\
 }
 
-- 
cgit v1.2.3-71-gd317


From c5e0cb3ddc5f14cedcfc50c0fb3b5fc6b56576da Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 28 Oct 2009 08:14:48 -0700
Subject: rcu: Cleanup: balance rcu_irq_enter()/rcu_irq_exit() calls

Currently, rcu_irq_exit() is invoked only for CONFIG_NO_HZ,
while rcu_irq_enter() is invoked unconditionally.  This patch
moves rcu_irq_exit() out from under CONFIG_NO_HZ so that the
calls are balanced.

This patch has no effect on the behavior of the kernel because
both rcu_irq_enter() and rcu_irq_exit() are empty for
!CONFIG_NO_HZ, but the code is easier to understand if the calls
are obviously balanced in all cases.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12567428891605-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softirq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e0..21939d9e830e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
 
+	rcu_irq_exit();
 #ifdef CONFIG_NO_HZ
 	/* Make sure that timer wheel updates are propagated */
-	rcu_irq_exit();
 	if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
 		tick_nohz_stop_sched_tick(0);
 #endif
-- 
cgit v1.2.3-71-gd317


From 4dae560f97fa438f373b53e14b30149c9e44a600 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Fri, 30 Oct 2009 19:23:10 +0530
Subject: kprobes: Sanitize struct kretprobe_instance allocations

For as long as kretprobes have existed, we've allocated NR_CPUS
instances of kretprobe_instance structures. With the default
value of CONFIG_NR_CPUS increasing on certain architectures, we
are potentially wasting kernel memory.

See http://sourceware.org/bugzilla/show_bug.cgi?id=10839#c3 for
more details.

Use a saner num_possible_cpus() instead of NR_CPUS for
allocation.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: fweisbec@gmail.com
LKML-Reference: <20091030135310.GA22230@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75f4c60..1494e85b35f2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1014,9 +1014,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
 	/* Pre-allocate memory for max kretprobe instances */
 	if (rp->maxactive <= 0) {
 #ifdef CONFIG_PREEMPT
-		rp->maxactive = max(10, 2 * NR_CPUS);
+		rp->maxactive = max(10, 2 * num_possible_cpus());
 #else
-		rp->maxactive = NR_CPUS;
+		rp->maxactive = num_possible_cpus();
 #endif
 	}
 	spin_lock_init(&rp->lock);
-- 
cgit v1.2.3-71-gd317


From fb0459d75c1d0a4ba3cafdd2c754e7486968a676 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 25 Sep 2009 12:25:56 +0200
Subject: perf/core: Provide a kernel-internal interface to get to performance
 counters

There are reasons for kernel code to ask for, and use, performance
counters.
For example, in CPU freq governors this tends to be a good idea, but
there are other examples possible as well of course.

This patch adds the needed bits to do enable this functionality; they
have been tested in an experimental cpufreq driver that I'm working on,
and the changes are all that I needed to access counters properly.

[fweisbec@gmail.com: added pid to perf_event_create_kernel_counter so
that we can profile a particular task too

TODO: Have a better error reporting, don't just return NULL in fail
case.]

v2: Remove the wrong comment about the fact
    perf_event_create_kernel_counter must be called from a kernel
    thread.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jan Kiszka <jan.kiszka@web.de>
Cc: Avi Kivity <avi@redhat.com>
LKML-Reference: <20090925122556.2f8bd939@infradead.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/perf_event.h |  6 ++++
 kernel/perf_event.c        | 75 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index df9d964c15fc..fa151d49a2ee 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -744,6 +744,12 @@ extern int hw_perf_group_sched_in(struct perf_event *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_event_context *ctx, int cpu);
 extern void perf_event_update_userpage(struct perf_event *event);
+extern int perf_event_release_kernel(struct perf_event *event);
+extern struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr,
+				int cpu,
+				pid_t pid);
+extern u64 perf_event_read_value(struct perf_event *event);
 
 struct perf_sample_data {
 	u64				type;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 12b5ec39bf97..02d4ff041b01 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1725,6 +1725,26 @@ static int perf_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+int perf_event_release_kernel(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_event_remove_from_context(event);
+	mutex_unlock(&ctx->mutex);
+
+	mutex_lock(&event->owner->perf_event_mutex);
+	list_del_init(&event->owner_entry);
+	mutex_unlock(&event->owner->perf_event_mutex);
+	put_task_struct(event->owner);
+
+	free_event(event);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
 static int perf_event_read_size(struct perf_event *event)
 {
 	int entry = sizeof(u64); /* value */
@@ -1750,7 +1770,7 @@ static int perf_event_read_size(struct perf_event *event)
 	return size;
 }
 
-static u64 perf_event_read_value(struct perf_event *event)
+u64 perf_event_read_value(struct perf_event *event)
 {
 	struct perf_event *child;
 	u64 total = 0;
@@ -1761,6 +1781,7 @@ static u64 perf_event_read_value(struct perf_event *event)
 
 	return total;
 }
+EXPORT_SYMBOL_GPL(perf_event_read_value);
 
 static int perf_event_read_entry(struct perf_event *event,
 				   u64 read_format, char __user *buf)
@@ -4638,6 +4659,58 @@ err_put_context:
 	return err;
 }
 
+/**
+ * perf_event_create_kernel_counter
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu in which the counter is bound
+ * @pid: task to profile
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+				 pid_t pid)
+{
+	struct perf_event *event;
+	struct perf_event_context *ctx;
+	int err;
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
+
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return NULL ;
+
+	event = perf_event_alloc(attr, cpu, ctx, NULL,
+				     NULL, GFP_KERNEL);
+	err = PTR_ERR(event);
+	if (IS_ERR(event))
+		goto err_put_context;
+
+	event->filp = NULL;
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_install_in_context(ctx, event, cpu);
+	++ctx->generation;
+	mutex_unlock(&ctx->mutex);
+
+	event->owner = current;
+	get_task_struct(current);
+	mutex_lock(&current->perf_event_mutex);
+	list_add_tail(&event->owner_entry, &current->perf_event_list);
+	mutex_unlock(&current->perf_event_mutex);
+
+	return event;
+
+err_put_context:
+	if (err < 0)
+		put_ctx(ctx);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+
 /*
  * inherit a event from parent task to child task:
  */
-- 
cgit v1.2.3-71-gd317


From 97eaf5300b9d0cd99c310bf8c4a0f2f3296d88a3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 18 Oct 2009 15:33:50 +0200
Subject: perf/core: Add a callback to perf events

A simple callback in a perf event can be used for multiple purposes.
For example it is useful for triggered based events like hardware
breakpoints that need a callback to dispatch a triggered breakpoint
event.

v2: Simplify a bit the callback attribution as suggested by Paul
    Mackerras

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mundt <lethal@linux-sh.org>
---
 include/linux/perf_event.h |  7 ++++++-
 kernel/perf_event.c        | 14 ++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fa151d49a2ee..8d54e6d25eeb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -544,6 +544,8 @@ struct perf_pending_entry {
 	void (*func)(struct perf_pending_entry *);
 };
 
+typedef void (*perf_callback_t)(struct perf_event *, void *);
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -639,6 +641,8 @@ struct perf_event {
 	struct event_filter		*filter;
 #endif
 
+	perf_callback_t			callback;
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -748,7 +752,8 @@ extern int perf_event_release_kernel(struct perf_event *event);
 extern struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
-				pid_t pid);
+				pid_t pid,
+				perf_callback_t callback);
 extern u64 perf_event_read_value(struct perf_event *event);
 
 struct perf_sample_data {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 02d4ff041b01..5087125e2a00 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4293,6 +4293,7 @@ perf_event_alloc(struct perf_event_attr *attr,
 		   struct perf_event_context *ctx,
 		   struct perf_event *group_leader,
 		   struct perf_event *parent_event,
+		   perf_callback_t callback,
 		   gfp_t gfpflags)
 {
 	const struct pmu *pmu;
@@ -4335,6 +4336,11 @@ perf_event_alloc(struct perf_event_attr *attr,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (!callback && parent_event)
+		callback = parent_event->callback;
+	
+	event->callback	= callback;
+
 	if (attr->disabled)
 		event->state = PERF_EVENT_STATE_OFF;
 
@@ -4611,7 +4617,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-				     NULL, GFP_KERNEL);
+				     NULL, NULL, GFP_KERNEL);
 	err = PTR_ERR(event);
 	if (IS_ERR(event))
 		goto err_put_context;
@@ -4668,7 +4674,7 @@ err_put_context:
  */
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-				 pid_t pid)
+				 pid_t pid, perf_callback_t callback)
 {
 	struct perf_event *event;
 	struct perf_event_context *ctx;
@@ -4683,7 +4689,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		return NULL ;
 
 	event = perf_event_alloc(attr, cpu, ctx, NULL,
-				     NULL, GFP_KERNEL);
+				     NULL, callback, GFP_KERNEL);
 	err = PTR_ERR(event);
 	if (IS_ERR(event))
 		goto err_put_context;
@@ -4736,7 +4742,7 @@ inherit_event(struct perf_event *parent_event,
 	child_event = perf_event_alloc(&parent_event->attr,
 					   parent_event->cpu, child_ctx,
 					   group_leader, parent_event,
-					   GFP_KERNEL);
+					   NULL, GFP_KERNEL);
 	if (IS_ERR(child_event))
 		return child_event;
 	get_ctx(child_ctx);
-- 
cgit v1.2.3-71-gd317


From 1477b6a7edd9ffa7bba4f9779ce9a76ce92761ed Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 4 Nov 2009 16:14:16 +0900
Subject: sched: Remove unused __schedule() declaration

__schedule() had been removed.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AF129C8.3030008@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 -
 kernel/kgdb.c         | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60bf583..f18102c4d0b8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -349,7 +349,6 @@ extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
-asmlinkage void __schedule(void);
 asmlinkage void schedule(void);
 extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
 
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..7d7014634022 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
 
 	/*
 	 * All threads that don't have debuggerinfo should be
-	 * in __schedule() sleeping, since all other CPUs
+	 * in schedule() sleeping, since all other CPUs
 	 * are in kgdb_wait, and thus have debuggerinfo.
 	 */
 	if (local_debuggerinfo) {
-- 
cgit v1.2.3-71-gd317


From 9824a2b728b63e7ff586b9fd9293c819be79f0f3 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 4 Nov 2009 16:16:54 +0900
Subject: sched: Remove unused cpu_nr_migrations()

cpu_nr_migrations() is not used, remove it.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AF12A66.6020609@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 -
 kernel/sched.c        | 11 -----------
 2 files changed, 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 754b3deed02b..dfc21fb76bf1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -145,7 +145,6 @@ extern unsigned long this_cpu_load(void);
 
 
 extern void calc_global_load(void);
-extern u64 cpu_nr_migrations(int cpu);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 67be4d0dddaa..30fd0ba5f603 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -541,7 +541,6 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
-	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -2049,7 +2048,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
-		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2988,15 +2986,6 @@ static void calc_load_account_active(struct rq *this_rq)
 	}
 }
 
-/*
- * Externally visible per-cpu scheduler statistics:
- * cpu_nr_migrations(cpu) - number of migrations into that cpu
- */
-u64 cpu_nr_migrations(int cpu)
-{
-	return cpu_rq(cpu)->nr_migrations_in;
-}
-
 /*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
-- 
cgit v1.2.3-71-gd317


From 77b44d1b7c28360910cdbd427fb62d485c08674c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 3 Nov 2009 19:12:47 -0500
Subject: tracing/kprobes: Rename Kprobe-tracer to kprobe-event

Rename Kprobes-based event tracer to kprobes-based tracing event
(kprobe-event), since it is not a tracer but an extensible
tracing event interface.

This also changes CONFIG_KPROBE_TRACER to CONFIG_KPROBE_EVENT
and sets it y by default.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
LKML-Reference: <20091104001247.3454.14131.stgit@harusame>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/trace/kprobetrace.txt | 34 ++++++++++++++++------------------
 kernel/trace/Kconfig                | 19 ++++++++++++-------
 kernel/trace/Makefile               |  2 +-
 kernel/trace/trace_kprobe.c         |  6 ++----
 4 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 15415243a9a3..47aabeebbdf6 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -1,26 +1,23 @@
-                        Kprobe-based Event Tracer
-                         =========================
+                        Kprobe-based Event Tracing
+                        ==========================
 
                  Documentation is written by Masami Hiramatsu
 
 
 Overview
 --------
-This tracer is similar to the events tracer which is based on Tracepoint
-infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe
-and kretprobe). It probes anywhere where kprobes can probe(this means, all
-functions body except for __kprobes functions).
+These events are similar to tracepoint based events. Instead of Tracepoint,
+this is based on kprobes (kprobe and kretprobe). So it can probe wherever
+kprobes can probe (this means, all functions body except for __kprobes
+functions). Unlike the Tracepoint based event, this can be added and removed
+dynamically, on the fly.
 
-Unlike the function tracer, this tracer can probe instructions inside of
-kernel functions. It allows you to check which instruction has been executed.
+To enable this feature, build your kernel with CONFIG_KPROBE_TRACING=y.
 
-Unlike the Tracepoint based events tracer, this tracer can add and remove
-probe points on the fly.
-
-Similar to the events tracer, this tracer doesn't need to be activated via
-current_tracer, instead of that, just set probe points via
-/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
-probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter.
+Similar to the events tracer, this doesn't need to be activated via
+current_tracer. Instead of that, add probe points via
+/sys/kernel/debug/tracing/kprobe_events, and enable it via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/enabled.
 
 
 Synopsis of kprobe_events
@@ -55,9 +52,9 @@ Per-Probe Event Filtering
 -------------------------
  Per-probe event filtering feature allows you to set different filter on each
 probe and gives you what arguments will be shown in trace buffer. If an event
-name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds
-an event under tracing/events/kprobes/<EVENT>, at the directory you can see
-'id', 'enabled', 'format' and 'filter'.
+name is specified right after 'p:' or 'r:' in kprobe_events, it adds an event
+under tracing/events/kprobes/<EVENT>, at the directory you can see 'id',
+'enabled', 'format' and 'filter'.
 
 enabled:
   You can enable/disable the probe by writing 1 or 0 on it.
@@ -71,6 +68,7 @@ filter:
 id:
   This shows the id of this probe event.
 
+
 Event Profiling
 ---------------
  You can check the total number of probe hits and probe miss-hits via
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 15372a9f2399..f05671609a89 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -428,17 +428,22 @@ config BLK_DEV_IO_TRACE
 
 	  If unsure, say N.
 
-config KPROBE_TRACER
+config KPROBE_EVENT
 	depends on KPROBES
 	depends on X86
-	bool "Trace kprobes"
+	bool "Enable kprobes-based dynamic events"
 	select TRACING
-	select GENERIC_TRACER
+	default y
 	help
-	  This tracer probes everywhere where kprobes can probe it, and
-	  records various registers and memories specified by user.
-	  This also allows you to trace kprobe probe points as a dynamic
-	  defined events. It provides per-probe event filtering interface.
+	  This allows the user to add tracing events (similar to tracepoints) on the fly
+	  via the ftrace interface. See Documentation/trace/kprobetrace.txt
+	  for more details.
+
+	  Those events can be inserted wherever kprobes can probe, and record
+	  various register and memory values.
+
+	  This option is also required by perf-probe subcommand of perf tools. If
+	  you want to use perf tools, this option is strongly recommended.
 
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c8cb75d7f280..edc3a3cca1a1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,7 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
-obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o
+obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a86c3ac0df21..cf17a6694f32 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1,5 +1,5 @@
 /*
- * kprobe based kernel tracer
+ * Kprobes-based tracing events
  *
  * Created by Masami Hiramatsu <mhiramat@redhat.com>
  *
@@ -57,8 +57,6 @@ const char *reserved_field_names[] = {
 	FIELD_STRING_FUNC,
 };
 
-/* currently, trace_kprobe only supports X86. */
-
 struct fetch_func {
 	unsigned long (*func)(struct pt_regs *, void *);
 	void *data;
@@ -191,7 +189,7 @@ static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
 }
 
 /**
- * Kprobe tracer core functions
+ * Kprobe event core functions
  */
 
 struct probe_arg {
-- 
cgit v1.2.3-71-gd317


From e2c880630438f80b474378d5487b511b07665051 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 3 Nov 2009 14:53:15 +1030
Subject: cpumask: Simplify sched_rt.c

find_lowest_rq() wants to call pick_optimal_cpu() on the
intersection of sched_domain_span(sd) and lowest_mask.  Rather
than doing a cpus_and into a temporary, we can open-code it.

This actually makes the code slightly clearer, IMHO.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <200911031453.15350.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 61 ++++++++++++++++++++++---------------------------------
 1 file changed, 24 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb19..5c5fef378415 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
-static inline int pick_optimal_cpu(int this_cpu,
-				   const struct cpumask *mask)
-{
-	int first;
-
-	/* "this_cpu" is cheaper to preempt than a remote processor */
-	if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
-		return this_cpu;
-
-	first = cpumask_first(mask);
-	if (first < nr_cpu_ids)
-		return first;
-
-	return -1;
-}
-
 static int find_lowest_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
 	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
-	cpumask_var_t domain_mask;
 
 	if (task->rt.nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
 	 * Otherwise, we consult the sched_domains span maps to figure
 	 * out which cpu is logically closest to our hot cache data.
 	 */
-	if (this_cpu == cpu)
-		this_cpu = -1; /* Skip this_cpu opt if the same */
-
-	if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
-		for_each_domain(cpu, sd) {
-			if (sd->flags & SD_WAKE_AFFINE) {
-				int best_cpu;
+	if (!cpumask_test_cpu(this_cpu, lowest_mask))
+		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
 
-				cpumask_and(domain_mask,
-					    sched_domain_span(sd),
-					    lowest_mask);
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_AFFINE) {
+			int best_cpu;
 
-				best_cpu = pick_optimal_cpu(this_cpu,
-							    domain_mask);
-
-				if (best_cpu != -1) {
-					free_cpumask_var(domain_mask);
-					return best_cpu;
-				}
-			}
+			/*
+			 * "this_cpu" is cheaper to preempt than a
+			 * remote processor.
+			 */
+			if (this_cpu != -1 &&
+			    cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+				return this_cpu;
+
+			best_cpu = cpumask_first_and(lowest_mask,
+						     sched_domain_span(sd));
+			if (best_cpu < nr_cpu_ids)
+				return best_cpu;
 		}
-		free_cpumask_var(domain_mask);
 	}
 
 	/*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
 	 * just give the caller *something* to work with from the compatible
 	 * locations.
 	 */
-	return pick_optimal_cpu(this_cpu, lowest_mask);
+	if (this_cpu != -1)
+		return this_cpu;
+
+	cpu = cpumask_any(lowest_mask);
+	if (cpu < nr_cpu_ids)
+		return cpu;
+	return -1;
 }
 
 /* Will lock the rq it finds */
-- 
cgit v1.2.3-71-gd317


From acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 3 Nov 2009 14:53:40 +1030
Subject: cpumask: Partition_sched_domains takes array of cpumask_var_t

Currently partition_sched_domains() takes a 'struct cpumask
*doms_new' which is a kmalloc'ed array of cpumask_t.  You can't
have such an array if 'struct cpumask' is undefined, as we plan
for CONFIG_CPUMASK_OFFSTACK=y.

So, we make this an array of cpumask_var_t instead: this is the
same for the CONFIG_CPUMASK_OFFSTACK=n case, but requires
multiple allocations for the CONFIG_CPUMASK_OFFSTACK=y case.
Hence we add alloc_sched_domains() and free_sched_domains()
functions.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <200911031453.40668.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  8 ++++--
 kernel/cpuset.c       | 19 +++++++-------
 kernel/sched.c        | 68 ++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 61 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfc21fb76bf1..78ba664474f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1009,9 +1009,13 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }
 
-extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
 
+/* Allocate an array of sched domains, for partition_sched_domains(). */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+
 /* Test a flag in parent sched domain */
 static inline int test_sd_parent(struct sched_domain *sd, int flag)
 {
@@ -1029,7 +1033,7 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			struct sched_domain_attr *dattr_new)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d247381e7371..3cf2183b472d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  *	element of the partition (one sched domain) to be passed to
  *	partition_sched_domains().
  */
-/* FIXME: see the FIXME in partition_sched_domains() */
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
 			struct sched_domain_attr **attributes)
 {
 	LIST_HEAD(q);		/* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
-	struct cpumask *doms;	/* resulting partition; i.e. sched domains */
+	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
 
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		doms = kmalloc(cpumask_size(), GFP_KERNEL);
+		ndoms = 1;
+		doms = alloc_sched_domains(ndoms);
 		if (!doms)
 			goto done;
 
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms, top_cpuset.cpus_allowed);
+		cpumask_copy(doms[0], top_cpuset.cpus_allowed);
 
-		ndoms = 1;
 		goto done;
 	}
 
@@ -636,7 +635,7 @@ restart:
 	 * Now we know how many domains to create.
 	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 	 */
-	doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
+	doms = alloc_sched_domains(ndoms);
 	if (!doms)
 		goto done;
 
@@ -656,7 +655,7 @@ restart:
 			continue;
 		}
 
-		dp = doms + nslot;
+		dp = doms[nslot];
 
 		if (nslot == ndoms) {
 			static int warnings = 10;
@@ -718,7 +717,7 @@ done:
 static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	get_online_cpus();
@@ -2052,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	switch (phase) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 30fd0ba5f603..ae026aad145b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8846,7 +8846,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
 	return __build_sched_domains(cpu_map, NULL);
 }
 
-static struct cpumask *doms_cur;	/* current sched domains */
+static cpumask_var_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attribues of custom domains in 'doms_cur' */
@@ -8868,6 +8868,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
 	return 0;
 }
 
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -8879,12 +8904,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
 
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
-	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
+	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (!doms_cur)
-		doms_cur = fallback_doms;
-	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
 	dattr_cur = NULL;
-	err = build_sched_domains(doms_cur);
+	err = build_sched_domains(doms_cur[0]);
 	register_sched_domain_sysctl();
 
 	return err;
@@ -8934,19 +8959,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
- * The passed in 'doms_new' should be kmalloc'd. This routine takes
- * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL &&
- * ndoms_new == 1, and partition_sched_domains() will fallback to
- * the single partition 'fallback_doms', it also forces the domains
- * to be rebuilt.
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8954,8 +8979,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  *
  * Call with hotplug lock held
  */
-/* FIXME: Change to struct cpumask *doms_new[] */
-void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
@@ -8974,40 +8998,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(&doms_cur[i], &doms_new[j])
+			if (cpumask_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* no match - a current sched domain not in new doms_new[] */
-		detach_destroy_domains(doms_cur + i);
+		detach_destroy_domains(doms_cur[i]);
 match1:
 		;
 	}
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		doms_new = fallback_doms;
-		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur && !new_topology; j++) {
-			if (cpumask_equal(&doms_new[i], &doms_cur[j])
+			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* no match - add a new doms_new */
-		__build_sched_domains(doms_new + i,
+		__build_sched_domains(doms_new[i],
 					dattr_new ? dattr_new + i : NULL);
 match2:
 		;
 	}
 
 	/* Remember the new sched domains */
-	if (doms_cur != fallback_doms)
-		kfree(doms_cur);
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
 	kfree(dattr_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
 	dattr_cur = dattr_new;
-- 
cgit v1.2.3-71-gd317


From 24b26d4211130b6455692804c14d537158855cd7 Mon Sep 17 00:00:00 2001
From: Liuweni <qingshenlwy@gmail.com>
Date: Wed, 4 Nov 2009 20:11:05 +0800
Subject: irq: Fix docbook comments

Fix docbook comments to match the actual function names
(set_irq_msi, handle_percpu_irq).

Signed-off-by: Liuwenyi <qingshenlwy@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/chip.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..ba566c261adc 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -166,11 +166,11 @@ int set_irq_data(unsigned int irq, void *data)
 EXPORT_SYMBOL(set_irq_data);
 
 /**
- *	set_irq_data - set irq type data for an irq
+ *	set_irq_msi - set MSI descriptor data for an irq
  *	@irq:	Interrupt number
  *	@entry:	Pointer to MSI descriptor data
  *
- *	Set the hardware irq controller data for an irq
+ *	Set the MSI descriptor entry for an irq
  */
 int set_irq_msi(unsigned int irq, struct msi_desc *entry)
 {
@@ -590,7 +590,7 @@ out_unlock:
 }
 
 /**
- *	handle_percpu_IRQ - Per CPU local irq handler
+ *	handle_percpu_irq - Per CPU local irq handler
  *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
  *
-- 
cgit v1.2.3-71-gd317


From 663e69592856df53ef52969482ef413a96bc4e06 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 4 Nov 2009 14:22:21 +0100
Subject: irq: Remove unused debug_poll_all_shared_irqs()

commit 74296a8ed added this function for debug purposes, but it was
never used for anything. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |  6 ------
 kernel/irq/spurious.c     | 14 +-------------
 2 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7ca72b74eec7..75f3f00ac1e5 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -603,12 +603,6 @@ static inline void init_irq_proc(void)
 }
 #endif
 
-#if defined(CONFIG_GENERIC_HARDIRQS) && defined(CONFIG_DEBUG_SHIRQ)
-extern void debug_poll_all_shared_irqs(void);
-#else
-static inline void debug_poll_all_shared_irqs(void) { }
-#endif
-
 struct seq_file;
 int show_interrupts(struct seq_file *p, void *v);
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..8996b98f9eb2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
 	return ok;
 }
 
-static void poll_all_shared_irqs(void)
+static void poll_spurious_irqs(unsigned long dummy)
 {
 	struct irq_desc *desc;
 	int i;
@@ -123,23 +123,11 @@ static void poll_all_shared_irqs(void)
 
 		try_one_irq(i, desc);
 	}
-}
-
-static void poll_spurious_irqs(unsigned long dummy)
-{
-	poll_all_shared_irqs();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
 
-#ifdef CONFIG_DEBUG_SHIRQ
-void debug_poll_all_shared_irqs(void)
-{
-	poll_all_shared_irqs();
-}
-#endif
-
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
-- 
cgit v1.2.3-71-gd317


From a1f84a3ab8e002159498814eaa7e48c33752b04b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 27 Oct 2009 15:35:38 +0100
Subject: sched: Check for an idle shared cache in select_task_rq_fair()

When waking affine, check for an idle shared cache, and if
found, wake to that CPU/sibling instead of the waker's CPU.

This improves pgsql+oltp ramp up by roughly 8%. Possibly more
for other loads, depending on overlap. The trade-off is a
roughly 1% peak downturn if tasks are truly synchronous.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@kernel.org>
LKML-Reference: <1256654138.17752.7.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eeda..da87385683cc 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1372,11 +1372,36 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 				want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
+			int candidate = -1, i;
 
-			affine_sd = tmp;
-			want_affine = 0;
+			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+				candidate = cpu;
+
+			/*
+			 * Check for an idle shared cache.
+			 */
+			if (tmp->flags & SD_PREFER_SIBLING) {
+				if (candidate == cpu) {
+					if (!cpu_rq(prev_cpu)->cfs.nr_running)
+						candidate = prev_cpu;
+				}
+
+				if (candidate == -1 || candidate == cpu) {
+					for_each_cpu(i, sched_domain_span(tmp)) {
+						if (!cpu_rq(i)->cfs.nr_running) {
+							candidate = i;
+							break;
+						}
+					}
+				}
+			}
+
+			if (candidate >= 0) {
+				affine_sd = tmp;
+				want_affine = 0;
+				cpu = candidate;
+			}
 		}
 
 		if (!want_sd && !want_affine)
-- 
cgit v1.2.3-71-gd317


From 1b9508f6831e10d53256825de8904caa22d1ca2c Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 4 Nov 2009 17:53:50 +0100
Subject: sched: Rate-limit newidle

Rate limit newidle to migration_cost. It's a win for all
stages of sysbench oltp tests.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 22 +++++++++++++++++++++-
 kernel/sched_debug.c |  4 ++++
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ae026aad145b..f8492123b5d1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -589,6 +589,8 @@ struct rq {
 
 	u64 rt_avg;
 	u64 age_stamp;
+	u64 idle_stamp;
+	u64 avg_idle;
 #endif
 
 	/* calc_load related fields */
@@ -2353,6 +2355,17 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	if (rq != orig_rq)
 		update_rq_clock(rq);
 
+	if (rq->idle_stamp) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
+
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
@@ -4389,6 +4402,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 
+	this_rq->idle_stamp = this_rq->clock;
+
+	if (this_rq->avg_idle < sysctl_sched_migration_cost)
+		return;
+
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 
@@ -4403,8 +4421,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task)
+		if (pulled_task) {
+			this_rq->idle_stamp = 0;
 			break;
+		}
 	}
 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
 		/*
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..6988cf08f705 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
 
 #ifdef CONFIG_SCHEDSTATS
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 
 	P(yld_count);
 
 	P(sched_switch);
 	P(sched_count);
 	P(sched_goidle);
+#ifdef CONFIG_SMP
+	P64(avg_idle);
+#endif
 
 	P(ttwu_count);
 	P(ttwu_local);
-- 
cgit v1.2.3-71-gd317


From eed3b9cf3fe3fcc7a50238dfcab63a63914e8f42 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 29 Sep 2009 14:25:15 +0200
Subject: nohz: Reuse ktime in sub-functions of tick_check_idle.

On a system with NOHZ=y tick_check_idle calls tick_nohz_stop_idle and
tick_nohz_update_jiffies. Given the right conditions (ts->idle_active
and/or ts->tick_stopped) both function get a time stamp with ktime_get.
The same time stamp can be reused if both function require one.

On s390 this change has the additional benefit that gcc inlines the
tick_nohz_stop_idle function into tick_check_idle. The number of
instructions to execute tick_check_idle drops from 225 to 144
(without the ktime_get optimization it is 367 vs 215 instructions).

before:

 0)               |  tick_check_idle() {
 0)               |    tick_nohz_stop_idle() {
 0)               |      ktime_get() {
 0)               |        read_tod_clock() {
 0)   0.601 us    |        }
 0)   1.765 us    |      }
 0)   3.047 us    |    }
 0)               |    ktime_get() {
 0)               |      read_tod_clock() {
 0)   0.570 us    |      }
 0)   1.727 us    |    }
 0)               |    tick_do_update_jiffies64() {
 0)   0.609 us    |    }
 0)   8.055 us    |  }

after:

 0)               |  tick_check_idle() {
 0)               |    ktime_get() {
 0)               |      read_tod_clock() {
 0)   0.617 us    |      }
 0)   1.773 us    |    }
 0)               |    tick_do_update_jiffies64() {
 0)   0.593 us    |    }
 0)   4.477 us    |  }

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: john stultz <johnstul@us.ibm.com>
LKML-Reference: <20090929122533.206589318@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 62 ++++++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e0f59a21c061..7378e2c71ca6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
  * value. We do this unconditionally on any cpu, as we don't know whether the
  * cpu, which has the update task assigned is in a long sleep.
  */
-static void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(ktime_t now)
 {
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long flags;
-	ktime_t now;
-
-	if (!ts->tick_stopped)
-		return;
 
 	cpumask_clear_cpu(cpu, nohz_cpu_mask);
-	now = ktime_get();
 	ts->idle_waketime = now;
 
 	local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
 	touch_softlockup_watchdog();
 }
 
-static void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t delta;
 
-	if (ts->idle_active) {
-		ktime_t now, delta;
-		now = ktime_get();
-		delta = ktime_sub(now, ts->idle_entrytime);
-		ts->idle_lastupdate = now;
-		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-		ts->idle_active = 0;
+	delta = ktime_sub(now, ts->idle_entrytime);
+	ts->idle_lastupdate = now;
+	ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+	ts->idle_active = 0;
 
-		sched_clock_idle_wakeup_event(0);
-	}
+	sched_clock_idle_wakeup_event(0);
 }
 
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -431,7 +423,11 @@ void tick_nohz_restart_sched_tick(void)
 	ktime_t now;
 
 	local_irq_disable();
-	tick_nohz_stop_idle(cpu);
+	if (ts->idle_active || (ts->inidle && ts->tick_stopped))
+		now = ktime_get();
+
+	if (ts->idle_active)
+		tick_nohz_stop_idle(cpu, now);
 
 	if (!ts->inidle || !ts->tick_stopped) {
 		ts->inidle = 0;
@@ -445,7 +441,6 @@ void tick_nohz_restart_sched_tick(void)
 
 	/* Update jiffies first */
 	select_nohz_load_balancer(0);
-	now = ktime_get();
 	tick_do_update_jiffies64(now);
 	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
@@ -579,22 +574,18 @@ static void tick_nohz_switch_to_nohz(void)
  * timer and do not touch the other magic bits which need to be done
  * when idle is left.
  */
-static void tick_nohz_kick_tick(int cpu)
+static void tick_nohz_kick_tick(int cpu, ktime_t now)
 {
 #if 0
 	/* Switch back to 2.6.27 behaviour */
 
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t delta, now;
-
-	if (!ts->tick_stopped)
-		return;
+	ktime_t delta;
 
 	/*
 	 * Do not touch the tick device, when the next expiry is either
 	 * already reached or less/equal than the tick period.
 	 */
-	now = ktime_get();
 	delta =	ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
 	if (delta.tv64 <= tick_period.tv64)
 		return;
@@ -603,9 +594,26 @@ static void tick_nohz_kick_tick(int cpu)
 #endif
 }
 
+static inline void tick_check_nohz(int cpu)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now;
+
+	if (!ts->idle_active && !ts->tick_stopped)
+		return;
+	now = ktime_get();
+	if (ts->idle_active)
+		tick_nohz_stop_idle(cpu, now);
+	if (ts->tick_stopped) {
+		tick_nohz_update_jiffies(now);
+		tick_nohz_kick_tick(cpu, now);
+	}
+}
+
 #else
 
 static inline void tick_nohz_switch_to_nohz(void) { }
+static inline void tick_check_nohz(int cpu) { }
 
 #endif /* NO_HZ */
 
@@ -615,11 +623,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 void tick_check_idle(int cpu)
 {
 	tick_check_oneshot_broadcast(cpu);
-#ifdef CONFIG_NO_HZ
-	tick_nohz_stop_idle(cpu);
-	tick_nohz_update_jiffies();
-	tick_nohz_kick_tick(cpu);
-#endif
+	tick_check_nohz(cpu);
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From 3c5d92a0cfb5103c0d5ab74d4ae6373d3af38148 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 29 Sep 2009 14:25:16 +0200
Subject: nohz: Introduce arch_needs_cpu

Allow the architecture to request a normal jiffy tick when the system
goes idle and tick_nohz_stop_sched_tick is called . On s390 the hook is
used to prevent the system going fully idle if there has been an
interrupt other than a clock comparator interrupt since the last wakeup.

On s390 the HiperSockets response time for 1 connection ping-pong goes
down from 42 to 34 microseconds. The CPU cost decreases by 27%.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
LKML-Reference: <20090929122533.402715150@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/s390/include/asm/cputime.h |  8 ++++++++
 arch/s390/kernel/s390_ext.c     |  2 ++
 arch/s390/kernel/vtime.c        |  2 ++
 drivers/s390/cio/cio.c          |  1 +
 include/linux/tick.h            |  3 +++
 kernel/time/tick-sched.c        | 13 ++++++++-----
 6 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 24b1244aadb9..95f3561517c7 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -183,6 +183,7 @@ struct s390_idle_data {
 	unsigned long long idle_count;
 	unsigned long long idle_enter;
 	unsigned long long idle_time;
+	int nohz_delay;
 };
 
 DECLARE_PER_CPU(struct s390_idle_data, s390_idle);
@@ -198,4 +199,11 @@ static inline void s390_idle_check(void)
 		vtime_start_cpu();
 }
 
+static inline int s390_nohz_delay(int cpu)
+{
+	return per_cpu(s390_idle, cpu).nohz_delay != 0;
+}
+
+#define arch_needs_cpu(cpu) s390_nohz_delay(cpu)
+
 #endif /* _S390_CPUTIME_H */
diff --git a/arch/s390/kernel/s390_ext.c b/arch/s390/kernel/s390_ext.c
index 0de305b598ce..59618bcd99b7 100644
--- a/arch/s390/kernel/s390_ext.c
+++ b/arch/s390/kernel/s390_ext.c
@@ -126,6 +126,8 @@ void __irq_entry do_extint(struct pt_regs *regs, unsigned short code)
 		/* Serve timer interrupts first. */
 		clock_comparator_work();
 	kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++;
+	if (code != 0x1004)
+		__get_cpu_var(s390_idle).nohz_delay = 1;
         index = ext_hash(code);
 	for (p = ext_int_hash[index]; p; p = p->next) {
 		if (likely(p->code == code))
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index c41bb0d416e1..b59a812a010e 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -167,6 +167,8 @@ void vtime_stop_cpu(void)
 	/* Wait for external, I/O or machine check interrupt. */
 	psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT;
 
+	idle->nohz_delay = 0;
+
 	/* Check if the CPU timer needs to be reprogrammed. */
 	if (vq->do_spt) {
 		__u64 vmax = VTIMER_MAX_SLICE;
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 138124fcfcad..126f240715a4 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -618,6 +618,7 @@ void __irq_entry do_IRQ(struct pt_regs *regs)
 	old_regs = set_irq_regs(regs);
 	s390_idle_check();
 	irq_enter();
+	__get_cpu_var(s390_idle).nohz_delay = 1;
 	if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator)
 		/* Serve timer interrupts first. */
 		clock_comparator_work();
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0482229c07db..8dc082194b22 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -98,6 +98,9 @@ extern int tick_check_oneshot_change(int allow_nohz);
 extern struct tick_sched *tick_get_tick_sched(int cpu);
 extern void tick_check_idle(int cpu);
 extern int tick_oneshot_mode_active(void);
+#  ifndef arch_needs_cpu
+#   define arch_needs_cpu(cpu) (0)
+#  endif
 # else
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7378e2c71ca6..3840f6dff7eb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -264,12 +264,15 @@ void tick_nohz_stop_sched_tick(int inidle)
 		last_jiffies = jiffies;
 	} while (read_seqretry(&xtime_lock, seq));
 
-	/* Get the next timer wheel timer */
-	next_jiffies = get_next_timer_interrupt(last_jiffies);
-	delta_jiffies = next_jiffies - last_jiffies;
-
-	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
+	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
+	    arch_needs_cpu(cpu)) {
+		next_jiffies = last_jiffies + 1;
 		delta_jiffies = 1;
+	} else {
+		/* Get the next timer wheel timer */
+		next_jiffies = get_next_timer_interrupt(last_jiffies);
+		delta_jiffies = next_jiffies - last_jiffies;
+	}
 	/*
 	 * Do not stop the tick, if we are only one off
 	 * or if the cpu is required for rcu
-- 
cgit v1.2.3-71-gd317


From fd210738f6601d0fb462df9a2fe5a41896ff6a8f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 5 Nov 2009 10:57:46 +0100
Subject: sched: Fix affinity logic in select_task_rq_fair()

Ingo Molnar reported:

[   26.804000] BUG: using smp_processor_id() in preemptible [00000000] code: events/1/10
[   26.808000] caller is vmstat_update+0x26/0x70
[   26.812000] Pid: 10, comm: events/1 Not tainted 2.6.32-rc5 #6887
[   26.816000] Call Trace:
[   26.820000]  [<c1924a24>] ? printk+0x28/0x3c
[   26.824000]  [<c13258a0>] debug_smp_processor_id+0xf0/0x110
[   26.824000] mount used greatest stack depth: 1464 bytes left
[   26.828000]  [<c111d086>] vmstat_update+0x26/0x70
[   26.832000]  [<c1086418>] worker_thread+0x188/0x310
[   26.836000]  [<c10863b7>] ? worker_thread+0x127/0x310
[   26.840000]  [<c108d310>] ? autoremove_wake_function+0x0/0x60
[   26.844000]  [<c1086290>] ? worker_thread+0x0/0x310
[   26.848000]  [<c108cf0c>] kthread+0x7c/0x90
[   26.852000]  [<c108ce90>] ? kthread+0x0/0x90
[   26.856000]  [<c100c0a7>] kernel_thread_helper+0x7/0x10
[   26.860000] BUG: using smp_processor_id() in preemptible [00000000] code: events/1/10
[   26.864000] caller is vmstat_update+0x3c/0x70

Because this commit:

  a1f84a3: sched: Check for an idle shared cache in select_task_rq_fair()

broke ->cpus_allowed.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: arjan@infradead.org
Cc: <stable@kernel.org>
LKML-Reference: <1257415066.12867.1.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index da87385683cc..e4d4483fd617 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1389,6 +1389,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 
 				if (candidate == -1 || candidate == cpu) {
 					for_each_cpu(i, sched_domain_span(tmp)) {
+						if (!cpumask_test_cpu(i, &p->cpus_allowed))
+							continue;
 						if (!cpu_rq(i)->cfs.nr_running) {
 							candidate = i;
 							break;
-- 
cgit v1.2.3-71-gd317


From afa588b2651a03da4bc601a17a244b1cd97264f2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 2 Apr 2009 23:44:59 -0700
Subject: sysctl: Separate the binary sysctl logic into it's own file.

In preparation for more invasive cleanups separate the core
binary sysctl logic into it's own file.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/Makefile        |   2 +-
 kernel/sysctl.c        | 165 -------------------------------------------
 kernel/sysctl_binary.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 186 insertions(+), 166 deletions(-)
 create mode 100644 kernel/sysctl_binary.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..986a5c197346 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,7 +4,7 @@
 
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
-	    sysctl.o capability.o ptrace.o timer.o user.o \
+	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c517412..6a642d7ffa85 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,7 +27,6 @@
 #include <linux/security.h>
 #include <linux/ctype.h>
 #include <linux/kmemcheck.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -60,7 +59,6 @@
 #include <asm/io.h>
 #endif
 
-static int deprecated_sysctl_warning(struct __sysctl_args *args);
 
 #if defined(CONFIG_SYSCTL)
 
@@ -1766,122 +1764,6 @@ void register_sysctl_root(struct ctl_table_root *root)
 	spin_unlock(&sysctl_lock);
 }
 
-#ifdef CONFIG_SYSCTL_SYSCALL
-/* Perform the actual read/write of a sysctl table entry. */
-static int do_sysctl_strategy(struct ctl_table_root *root,
-			struct ctl_table *table,
-			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen)
-{
-	int op = 0, rc;
-
-	if (oldval)
-		op |= MAY_READ;
-	if (newval)
-		op |= MAY_WRITE;
-	if (sysctl_perm(root, table, op))
-		return -EPERM;
-
-	if (table->strategy) {
-		rc = table->strategy(table, oldval, oldlenp, newval, newlen);
-		if (rc < 0)
-			return rc;
-		if (rc > 0)
-			return 0;
-	}
-
-	/* If there is no strategy routine, or if the strategy returns
-	 * zero, proceed with automatic r/w */
-	if (table->data && table->maxlen) {
-		rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
-		if (rc < 0)
-			return rc;
-	}
-	return 0;
-}
-
-static int parse_table(int __user *name, int nlen,
-		       void __user *oldval, size_t __user *oldlenp,
-		       void __user *newval, size_t newlen,
-		       struct ctl_table_root *root,
-		       struct ctl_table *table)
-{
-	int n;
-repeat:
-	if (!nlen)
-		return -ENOTDIR;
-	if (get_user(n, name))
-		return -EFAULT;
-	for ( ; table->ctl_name || table->procname; table++) {
-		if (!table->ctl_name)
-			continue;
-		if (n == table->ctl_name) {
-			int error;
-			if (table->child) {
-				if (sysctl_perm(root, table, MAY_EXEC))
-					return -EPERM;
-				name++;
-				nlen--;
-				table = table->child;
-				goto repeat;
-			}
-			error = do_sysctl_strategy(root, table,
-						   oldval, oldlenp,
-						   newval, newlen);
-			return error;
-		}
-	}
-	return -ENOTDIR;
-}
-
-int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
-	       void __user *newval, size_t newlen)
-{
-	struct ctl_table_header *head;
-	int error = -ENOTDIR;
-
-	if (nlen <= 0 || nlen >= CTL_MAXNAME)
-		return -ENOTDIR;
-	if (oldval) {
-		int old_len;
-		if (!oldlenp || get_user(old_len, oldlenp))
-			return -EFAULT;
-	}
-
-	for (head = sysctl_head_next(NULL); head;
-			head = sysctl_head_next(head)) {
-		error = parse_table(name, nlen, oldval, oldlenp, 
-					newval, newlen,
-					head->root, head->ctl_table);
-		if (error != -ENOTDIR) {
-			sysctl_head_finish(head);
-			break;
-		}
-	}
-	return error;
-}
-
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
-	struct __sysctl_args tmp;
-	int error;
-
-	if (copy_from_user(&tmp, args, sizeof(tmp)))
-		return -EFAULT;
-
-	error = deprecated_sysctl_warning(&tmp);
-	if (error)
-		goto out;
-
-	lock_kernel();
-	error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
-			  tmp.newval, tmp.newlen);
-	unlock_kernel();
-out:
-	return error;
-}
-#endif /* CONFIG_SYSCTL_SYSCALL */
-
 /*
  * sysctl_perm does NOT grant the superuser all rights automatically, because
  * some sysctl variables are readonly even to root.
@@ -3148,23 +3030,6 @@ int sysctl_ms_jiffies(struct ctl_table *table,
 #else /* CONFIG_SYSCTL_SYSCALL */
 
 
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
-	struct __sysctl_args tmp;
-	int error;
-
-	if (copy_from_user(&tmp, args, sizeof(tmp)))
-		return -EFAULT;
-
-	error = deprecated_sysctl_warning(&tmp);
-
-	/* If no error reading the parameters then just -ENOSYS ... */
-	if (!error)
-		error = -ENOSYS;
-
-	return error;
-}
-
 int sysctl_data(struct ctl_table *table,
 		  void __user *oldval, size_t __user *oldlenp,
 		  void __user *newval, size_t newlen)
@@ -3202,36 +3067,6 @@ int sysctl_ms_jiffies(struct ctl_table *table,
 
 #endif /* CONFIG_SYSCTL_SYSCALL */
 
-static int deprecated_sysctl_warning(struct __sysctl_args *args)
-{
-	static int msg_count;
-	int name[CTL_MAXNAME];
-	int i;
-
-	/* Check args->nlen. */
-	if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
-		return -ENOTDIR;
-
-	/* Read in the sysctl name for better debug message logging */
-	for (i = 0; i < args->nlen; i++)
-		if (get_user(name[i], args->name + i))
-			return -EFAULT;
-
-	/* Ignore accesses to kernel.version */
-	if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
-		return 0;
-
-	if (msg_count < 5) {
-		msg_count++;
-		printk(KERN_INFO
-			"warning: process `%s' used the deprecated sysctl "
-			"system call with ", current->comm);
-		for (i = 0; i < args->nlen; i++)
-			printk("%d.", name[i]);
-		printk("\n");
-	}
-	return 0;
-}
 
 /*
  * No sense putting this after each symbol definition, twice,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
new file mode 100644
index 000000000000..eceeed20ca88
--- /dev/null
+++ b/kernel/sysctl_binary.c
@@ -0,0 +1,185 @@
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
+#include <linux/sunrpc/debug.h>
+#include <linux/string.h>
+#include <net/ip_vs.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+#include <linux/file.h>
+#include <linux/ctype.h>
+#include <linux/smp_lock.h>
+
+static int deprecated_sysctl_warning(struct __sysctl_args *args);
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+
+/* Perform the actual read/write of a sysctl table entry. */
+static int do_sysctl_strategy(struct ctl_table_root *root,
+			struct ctl_table *table,
+			void __user *oldval, size_t __user *oldlenp,
+			void __user *newval, size_t newlen)
+{
+	int op = 0, rc;
+
+	if (oldval)
+		op |= MAY_READ;
+	if (newval)
+		op |= MAY_WRITE;
+	if (sysctl_perm(root, table, op))
+		return -EPERM;
+
+	if (table->strategy) {
+		rc = table->strategy(table, oldval, oldlenp, newval, newlen);
+		if (rc < 0)
+			return rc;
+		if (rc > 0)
+			return 0;
+	}
+
+	/* If there is no strategy routine, or if the strategy returns
+	 * zero, proceed with automatic r/w */
+	if (table->data && table->maxlen) {
+		rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
+		if (rc < 0)
+			return rc;
+	}
+	return 0;
+}
+
+static int parse_table(int __user *name, int nlen,
+		       void __user *oldval, size_t __user *oldlenp,
+		       void __user *newval, size_t newlen,
+		       struct ctl_table_root *root,
+		       struct ctl_table *table)
+{
+	int n;
+repeat:
+	if (!nlen)
+		return -ENOTDIR;
+	if (get_user(n, name))
+		return -EFAULT;
+	for ( ; table->ctl_name || table->procname; table++) {
+		if (!table->ctl_name)
+			continue;
+		if (n == table->ctl_name) {
+			int error;
+			if (table->child) {
+				if (sysctl_perm(root, table, MAY_EXEC))
+					return -EPERM;
+				name++;
+				nlen--;
+				table = table->child;
+				goto repeat;
+			}
+			error = do_sysctl_strategy(root, table,
+						   oldval, oldlenp,
+						   newval, newlen);
+			return error;
+		}
+	}
+	return -ENOTDIR;
+}
+
+int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
+	       void __user *newval, size_t newlen)
+{
+	struct ctl_table_header *head;
+	int error = -ENOTDIR;
+
+	if (nlen <= 0 || nlen >= CTL_MAXNAME)
+		return -ENOTDIR;
+	if (oldval) {
+		int old_len;
+		if (!oldlenp || get_user(old_len, oldlenp))
+			return -EFAULT;
+	}
+
+	for (head = sysctl_head_next(NULL); head;
+			head = sysctl_head_next(head)) {
+		error = parse_table(name, nlen, oldval, oldlenp, 
+					newval, newlen,
+					head->root, head->ctl_table);
+		if (error != -ENOTDIR) {
+			sysctl_head_finish(head);
+			break;
+		}
+	}
+	return error;
+}
+
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
+{
+	struct __sysctl_args tmp;
+	int error;
+
+	if (copy_from_user(&tmp, args, sizeof(tmp)))
+		return -EFAULT;
+
+	error = deprecated_sysctl_warning(&tmp);
+	if (error)
+		goto out;
+
+	lock_kernel();
+	error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
+			  tmp.newval, tmp.newlen);
+	unlock_kernel();
+out:
+	return error;
+}
+
+#else /* CONFIG_SYSCTL_SYSCALL */
+
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
+{
+	struct __sysctl_args tmp;
+	int error;
+
+	if (copy_from_user(&tmp, args, sizeof(tmp)))
+		return -EFAULT;
+
+	error = deprecated_sysctl_warning(&tmp);
+
+	/* If no error reading the parameters then just -ENOSYS ... */
+	if (!error)
+		error = -ENOSYS;
+
+	return error;
+}
+
+#endif /* CONFIG_SYSCTL_SYSCALL */
+
+static int deprecated_sysctl_warning(struct __sysctl_args *args)
+{
+	static int msg_count;
+	int name[CTL_MAXNAME];
+	int i;
+
+	/* Check args->nlen. */
+	if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
+		return -ENOTDIR;
+
+	/* Read in the sysctl name for better debug message logging */
+	for (i = 0; i < args->nlen; i++)
+		if (get_user(name[i], args->name + i))
+			return -EFAULT;
+
+	/* Ignore accesses to kernel.version */
+	if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
+		return 0;
+
+	if (msg_count < 5) {
+		msg_count++;
+		printk(KERN_INFO
+			"warning: process `%s' used the deprecated sysctl "
+			"system call with ", current->comm);
+		for (i = 0; i < args->nlen; i++)
+			printk("%d.", name[i]);
+		printk("\n");
+	}
+	return 0;
+}
-- 
cgit v1.2.3-71-gd317


From 2830b68361a9f58354ad043c6d85043ea917f907 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 00:09:33 -0700
Subject: sysctl: Refactor the binary sysctl handling to remove duplicate code

Read in the binary sysctl path once, instead of reread it
from user space each time the code needs to access a path
element.

The deprecated sysctl warning is moved to do_sysctl so
that the compat_sysctl entries syscalls will also warn.

The return of -ENOSYS when !CONFIG_SYSCTL_SYSCALL is moved
to binary_sysctl.  Always leaving a do_sysctl available
that handles !CONFIG_SYSCTL_SYSCALL and printing the
deprecated sysctl warning allows for a single defitition
of the sysctl syscall.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sysctl_binary.c | 123 +++++++++++++++++++++++--------------------------
 1 file changed, 58 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index eceeed20ca88..930a31cd708b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -14,8 +14,6 @@
 #include <linux/ctype.h>
 #include <linux/smp_lock.h>
 
-static int deprecated_sysctl_warning(struct __sysctl_args *args);
-
 #ifdef CONFIG_SYSCTL_SYSCALL
 
 /* Perform the actual read/write of a sysctl table entry. */
@@ -51,7 +49,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
 	return 0;
 }
 
-static int parse_table(int __user *name, int nlen,
+static int parse_table(const int *name, int nlen,
 		       void __user *oldval, size_t __user *oldlenp,
 		       void __user *newval, size_t newlen,
 		       struct ctl_table_root *root,
@@ -61,8 +59,7 @@ static int parse_table(int __user *name, int nlen,
 repeat:
 	if (!nlen)
 		return -ENOTDIR;
-	if (get_user(n, name))
-		return -EFAULT;
+	n = *name;
 	for ( ; table->ctl_name || table->procname; table++) {
 		if (!table->ctl_name)
 			continue;
@@ -85,19 +82,13 @@ repeat:
 	return -ENOTDIR;
 }
 
-int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
-	       void __user *newval, size_t newlen)
+static ssize_t binary_sysctl(const int *name, int nlen,
+	void __user *oldval, size_t __user *oldlenp,
+	void __user *newval, size_t newlen)
+
 {
 	struct ctl_table_header *head;
-	int error = -ENOTDIR;
-
-	if (nlen <= 0 || nlen >= CTL_MAXNAME)
-		return -ENOTDIR;
-	if (oldval) {
-		int old_len;
-		if (!oldlenp || get_user(old_len, oldlenp))
-			return -EFAULT;
-	}
+	ssize_t error = -ENOTDIR;
 
 	for (head = sysctl_head_next(NULL); head;
 			head = sysctl_head_next(head)) {
@@ -112,74 +103,76 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 	return error;
 }
 
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
-	struct __sysctl_args tmp;
-	int error;
-
-	if (copy_from_user(&tmp, args, sizeof(tmp)))
-		return -EFAULT;
-
-	error = deprecated_sysctl_warning(&tmp);
-	if (error)
-		goto out;
-
-	lock_kernel();
-	error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
-			  tmp.newval, tmp.newlen);
-	unlock_kernel();
-out:
-	return error;
-}
-
 #else /* CONFIG_SYSCTL_SYSCALL */
 
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
+static ssize_t binary_sysctl(const int *ctl_name, int nlen,
+	void __user *oldval, size_t __user *oldlenp,
+	void __user *newval, size_t newlen)
 {
-	struct __sysctl_args tmp;
-	int error;
-
-	if (copy_from_user(&tmp, args, sizeof(tmp)))
-		return -EFAULT;
-
-	error = deprecated_sysctl_warning(&tmp);
-
-	/* If no error reading the parameters then just -ENOSYS ... */
-	if (!error)
-		error = -ENOSYS;
-
-	return error;
+	return -ENOSYS;
 }
 
 #endif /* CONFIG_SYSCTL_SYSCALL */
 
-static int deprecated_sysctl_warning(struct __sysctl_args *args)
+static void deprecated_sysctl_warning(const int *name, int nlen)
 {
 	static int msg_count;
-	int name[CTL_MAXNAME];
 	int i;
 
-	/* Check args->nlen. */
-	if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
-		return -ENOTDIR;
-
-	/* Read in the sysctl name for better debug message logging */
-	for (i = 0; i < args->nlen; i++)
-		if (get_user(name[i], args->name + i))
-			return -EFAULT;
-
 	/* Ignore accesses to kernel.version */
-	if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
-		return 0;
+	if ((nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
+		return;
 
 	if (msg_count < 5) {
 		msg_count++;
 		printk(KERN_INFO
 			"warning: process `%s' used the deprecated sysctl "
 			"system call with ", current->comm);
-		for (i = 0; i < args->nlen; i++)
+		for (i = 0; i < nlen; i++)
 			printk("%d.", name[i]);
 		printk("\n");
 	}
-	return 0;
+	return;
+}
+
+int do_sysctl(int __user *args_name, int nlen,
+	void __user *oldval, size_t __user *oldlenp,
+	void __user *newval, size_t newlen)
+{
+	int name[CTL_MAXNAME];
+	size_t oldlen = 0;
+	int i;
+
+	if (nlen <= 0 || nlen >= CTL_MAXNAME)
+		return -ENOTDIR;
+	if (oldval && !oldlenp)
+		return -EFAULT;
+	if (oldlenp && get_user(oldlen, oldlenp))
+		return -EFAULT;
+
+	/* Read in the sysctl name for simplicity */
+	for (i = 0; i < nlen; i++)
+		if (get_user(name[i], args_name + i))
+			return -EFAULT;
+
+	deprecated_sysctl_warning(name, nlen);
+
+	return binary_sysctl(name, nlen, oldval, oldlenp, newval, newlen);
+}
+
+
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
+{
+	struct __sysctl_args tmp;
+	int error;
+
+	if (copy_from_user(&tmp, args, sizeof(tmp)))
+		return -EFAULT;
+
+	lock_kernel();
+	error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
+			  tmp.newval, tmp.newlen);
+	unlock_kernel();
+
+	return error;
 }
-- 
cgit v1.2.3-71-gd317


From da3f6f9b3e0d1e73975ca81ae124406bf1587d40 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 00:36:27 -0700
Subject: sysctl: Introduce a generic compat sysctl sysctl

This uses compat_alloc_userspace to remove the various
hacks to allow do_sysctl to write to throuh oldlenp.

The rest of our mature compat syscall helper facitilies
are used as well to ensure we have a nice clean maintainable
compat syscall that can be used on all architectures.

The motiviation for a generic compat sysctl (besides the
obvious hack removal) is to reduce the number of compat
sysctl defintions out there so I can refactor the
binary sysctl implementation.

ppc already used the name compat_sys_sysctl so I remove the
ppcs version here.

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/powerpc/kernel/sys_ppc32.c | 52 -----------------------------------------
 kernel/sysctl_binary.c          | 50 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index b97c2d67f4ac..c5a4732bcc48 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -520,58 +520,6 @@ asmlinkage long compat_sys_umask(u32 mask)
 	return sys_umask((int)mask);
 }
 
-#ifdef CONFIG_SYSCTL_SYSCALL
-struct __sysctl_args32 {
-	u32 name;
-	int nlen;
-	u32 oldval;
-	u32 oldlenp;
-	u32 newval;
-	u32 newlen;
-	u32 __unused[4];
-};
-
-asmlinkage long compat_sys_sysctl(struct __sysctl_args32 __user *args)
-{
-	struct __sysctl_args32 tmp;
-	int error;
-	size_t oldlen;
-	size_t __user *oldlenp = NULL;
-	unsigned long addr = (((unsigned long)&args->__unused[0]) + 7) & ~7;
-
-	if (copy_from_user(&tmp, args, sizeof(tmp)))
-		return -EFAULT;
-
-	if (tmp.oldval && tmp.oldlenp) {
-		/* Duh, this is ugly and might not work if sysctl_args
-		   is in read-only memory, but do_sysctl does indirectly
-		   a lot of uaccess in both directions and we'd have to
-		   basically copy the whole sysctl.c here, and
-		   glibc's __sysctl uses rw memory for the structure
-		   anyway.  */
-		oldlenp = (size_t __user *)addr;
-		if (get_user(oldlen, (compat_size_t __user *)compat_ptr(tmp.oldlenp)) ||
-		    put_user(oldlen, oldlenp))
-			return -EFAULT;
-	}
-
-	lock_kernel();
-	error = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
-			  compat_ptr(tmp.oldval), oldlenp,
-			  compat_ptr(tmp.newval), tmp.newlen);
-	unlock_kernel();
-	if (oldlenp) {
-		if (!error) {
-			if (get_user(oldlen, oldlenp) ||
-			    put_user(oldlen, (compat_size_t __user *)compat_ptr(tmp.oldlenp)))
-				error = -EFAULT;
-		}
-		copy_to_user(args->__unused, tmp.__unused, sizeof(tmp.__unused));
-	}
-	return error;
-}
-#endif
-
 unsigned long compat_sys_mmap2(unsigned long addr, size_t len,
 			  unsigned long prot, unsigned long flags,
 			  unsigned long fd, unsigned long pgoff)
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 930a31cd708b..775cc49da622 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -176,3 +176,53 @@ SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
 
 	return error;
 }
+
+#ifdef CONFIG_COMPAT
+#include <asm/compat.h>
+
+struct compat_sysctl_args {
+	compat_uptr_t	name;
+	int		nlen;
+	compat_uptr_t	oldval;
+	compat_uptr_t	oldlenp;
+	compat_uptr_t	newval;
+	compat_size_t	newlen;
+	compat_ulong_t	__unused[4];
+};
+
+asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
+{
+	struct compat_sysctl_args tmp;
+	compat_size_t __user *compat_oldlenp;
+	size_t __user *oldlenp = NULL;
+	size_t oldlen = 0;
+	ssize_t result;
+
+	if (copy_from_user(&tmp, args, sizeof(tmp)))
+		return -EFAULT;
+
+	compat_oldlenp = compat_ptr(tmp.oldlenp);
+	if (compat_oldlenp) {
+		oldlenp = compat_alloc_user_space(sizeof(*compat_oldlenp));
+
+		if (get_user(oldlen, compat_oldlenp) ||
+		    put_user(oldlen, oldlenp))
+			return -EFAULT;
+	}
+
+	lock_kernel();
+	result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
+			   compat_ptr(tmp.oldval), oldlenp,
+			   compat_ptr(tmp.newval), tmp.newlen);
+	unlock_kernel();
+
+	if (oldlenp && !result) {
+		if (get_user(oldlen, oldlenp) ||
+		    put_user(oldlen, compat_oldlenp))
+			return -EFAULT;
+	}
+
+	return result;
+}
+
+#endif /* CONFIG_COMPAT */
-- 
cgit v1.2.3-71-gd317


From 942405f36038b8f930ab67e24aa1ad72bee96a25 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 01:01:43 -0700
Subject: sysctl: Remove the cond_syscall entry for sys32_sysctl

Now that all architechtures are use compat_sys_sysctl and sys32_sysctl
does not exist there is not point in retaining a cond_syscall
entry for it.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sys_ni.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8d1951..de5bf1448238 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -139,7 +139,6 @@ cond_syscall(sys_pciconfig_read);
 cond_syscall(sys_pciconfig_write);
 cond_syscall(sys_pciconfig_iobase);
 cond_syscall(sys32_ipc);
-cond_syscall(sys32_sysctl);
 cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
-- 
cgit v1.2.3-71-gd317


From 642c6d946b5cdc27d0146c41dc20b7c4d4c3ccd8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 01:08:48 -0700
Subject: sysctl: Make do_sysctl static

Now that all of the architectures use compat_sys_sysctl do_sysctl
can become static.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sysctl.h | 4 ----
 kernel/sysctl_binary.c | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1e4743ee6831..82c32b89932d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -996,10 +996,6 @@ extern int proc_doulongvec_minmax(struct ctl_table *, int,
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 				      void __user *, size_t *, loff_t *);
 
-extern int do_sysctl (int __user *name, int nlen,
-		      void __user *oldval, size_t __user *oldlenp,
-		      void __user *newval, size_t newlen);
-
 extern ctl_handler sysctl_data;
 extern ctl_handler sysctl_string;
 extern ctl_handler sysctl_intvec;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 775cc49da622..642019894299 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -135,7 +135,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
 	return;
 }
 
-int do_sysctl(int __user *args_name, int nlen,
+static int do_sysctl(int __user *args_name, int nlen,
 	void __user *oldval, size_t __user *oldlenp,
 	void __user *newval, size_t newlen)
 {
-- 
cgit v1.2.3-71-gd317


From 444a2a3bcd6d5bed5c823136f68fcc93c0fe283f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 6 Nov 2009 04:13:05 +0100
Subject: tracing, perf_events: Protect the buffer from recursion in perf

While tracing using events with perf, if one enables the
lockdep:lock_acquire event, it will infect every other perf
trace events.

Basically, you can enable whatever set of trace events through
perf but if this event is part of the set, the only result we
can get is a long list of lock_acquire events of rcu read lock,
and only that.

This is because of a recursion inside perf.

1) When a trace event is triggered, it will fill a per cpu
   buffer and submit it to perf.

2) Perf will commit this event but will also protect some data
   using rcu_read_lock

3) A recursion appears: rcu_read_lock triggers a lock_acquire
   event that will fill the per cpu event and then submit the
   buffer to perf.

4) Perf detects a recursion and ignores it

5) Perf continues its work on the previous event, but its buffer
   has been overwritten by the lock_acquire event, it has then
   been turned into a lock_acquire event of rcu read lock

Such scenario also happens with lock_release with
rcu_read_unlock().

We could turn the rcu_read_lock() into __rcu_read_lock() to drop
the lock debugging from perf fast path, but that would make us
lose the rcu debugging and that doesn't prevent from other
possible kind of recursion from perf in the future.

This patch adds a recursion protection based on a counter on the
perf trace per cpu buffers to solve the problem.

-v2: Fixed lost whitespace, added reviewed-by tag

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jason Baron <jbaron@redhat.com>
LKML-Reference: <1257477185-7838-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  9 +++++--
 include/trace/ftrace.h             | 39 ++++++++++++++++++++++-------
 kernel/trace/trace_event_profile.c | 41 ++++++++++++++-----------------
 kernel/trace/trace_kprobe.c        | 50 ++++++++++++++++++++++++++++++++------
 kernel/trace/trace_syscalls.c      | 44 +++++++++++++++++++++++++++------
 5 files changed, 133 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index f7b47c336703..43360c1d8f70 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -137,8 +137,13 @@ struct ftrace_event_call {
 
 #define FTRACE_MAX_PROFILE_SIZE	2048
 
-extern char			*trace_profile_buf;
-extern char			*trace_profile_buf_nmi;
+struct perf_trace_buf {
+	char	buf[FTRACE_MAX_PROFILE_SIZE];
+	int	recursion;
+};
+
+extern struct perf_trace_buf	*perf_trace_buf;
+extern struct perf_trace_buf	*perf_trace_buf_nmi;
 
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a7f946094128..4945d1c99864 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -649,6 +649,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *	struct ftrace_event_call *event_call = &event_<call>;
  *	extern void perf_tp_event(int, u64, u64, void *, int);
  *	struct ftrace_raw_##call *entry;
+ *	struct perf_trace_buf *trace_buf;
  *	u64 __addr = 0, __count = 1;
  *	unsigned long irq_flags;
  *	struct trace_entry *ent;
@@ -673,14 +674,25 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *	__cpu = smp_processor_id();
  *
  *	if (in_nmi())
- *		raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *		trace_buf = rcu_dereference(perf_trace_buf_nmi);
  *	else
- *		raw_data = rcu_dereference(trace_profile_buf);
+ *		trace_buf = rcu_dereference(perf_trace_buf);
  *
- *	if (!raw_data)
+ *	if (!trace_buf)
  *		goto end;
  *
- *	raw_data = per_cpu_ptr(raw_data, __cpu);
+ *	trace_buf = per_cpu_ptr(trace_buf, __cpu);
+ *
+ * 	// Avoid recursion from perf that could mess up the buffer
+ * 	if (trace_buf->recursion++)
+ *		goto end_recursion;
+ *
+ * 	raw_data = trace_buf->buf;
+ *
+ *	// Make recursion update visible before entering perf_tp_event
+ *	// so that we protect from perf recursions.
+ *
+ *	barrier();
  *
  *	//zero dead bytes from alignment to avoid stack leak to userspace:
  *	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
@@ -713,8 +725,9 @@ static void ftrace_profile_##call(proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_event_call *event_call = &event_##call;		\
-	extern void perf_tp_event(int, u64, u64, void *, int);	\
+	extern void perf_tp_event(int, u64, u64, void *, int);		\
 	struct ftrace_raw_##call *entry;				\
+	struct perf_trace_buf *trace_buf;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
 	struct trace_entry *ent;					\
@@ -739,14 +752,20 @@ static void ftrace_profile_##call(proto)				\
 	__cpu = smp_processor_id();					\
 									\
 	if (in_nmi())							\
-		raw_data = rcu_dereference(trace_profile_buf_nmi);		\
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);	\
 	else								\
-		raw_data = rcu_dereference(trace_profile_buf);		\
+		trace_buf = rcu_dereference(perf_trace_buf);		\
 									\
-	if (!raw_data)							\
+	if (!trace_buf)							\
 		goto end;						\
 									\
-	raw_data = per_cpu_ptr(raw_data, __cpu);			\
+	trace_buf = per_cpu_ptr(trace_buf, __cpu);			\
+	if (trace_buf->recursion++)					\
+		goto end_recursion;					\
+									\
+	barrier();							\
+									\
+	raw_data = trace_buf->buf;					\
 									\
 	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;		\
 	entry = (struct ftrace_raw_##call *)raw_data;			\
@@ -761,6 +780,8 @@ static void ftrace_profile_##call(proto)				\
 	perf_tp_event(event_call->id, __addr, __count, entry,		\
 			     __entry_size);				\
 									\
+end_recursion:								\
+	trace_buf->recursion--;						\
 end:									\
 	local_irq_restore(irq_flags);					\
 									\
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index c9f687ab0d4f..e0d351b01f5a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,41 +8,36 @@
 #include <linux/module.h>
 #include "trace.h"
 
-/*
- * We can't use a size but a type in alloc_percpu()
- * So let's create a dummy type that matches the desired size
- */
-typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
 
-char		*trace_profile_buf;
-EXPORT_SYMBOL_GPL(trace_profile_buf);
+struct perf_trace_buf *perf_trace_buf;
+EXPORT_SYMBOL_GPL(perf_trace_buf);
 
-char		*trace_profile_buf_nmi;
-EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
+struct perf_trace_buf *perf_trace_buf_nmi;
+EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
 
 /* Count the events in use (per event id, not per instance) */
 static int	total_profile_count;
 
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
-	char *buf;
+	struct perf_trace_buf *buf;
 	int ret = -ENOMEM;
 
 	if (atomic_inc_return(&event->profile_count))
 		return 0;
 
 	if (!total_profile_count) {
-		buf = (char *)alloc_percpu(profile_buf_t);
+		buf = alloc_percpu(struct perf_trace_buf);
 		if (!buf)
 			goto fail_buf;
 
-		rcu_assign_pointer(trace_profile_buf, buf);
+		rcu_assign_pointer(perf_trace_buf, buf);
 
-		buf = (char *)alloc_percpu(profile_buf_t);
+		buf = alloc_percpu(struct perf_trace_buf);
 		if (!buf)
 			goto fail_buf_nmi;
 
-		rcu_assign_pointer(trace_profile_buf_nmi, buf);
+		rcu_assign_pointer(perf_trace_buf_nmi, buf);
 	}
 
 	ret = event->profile_enable(event);
@@ -53,10 +48,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 
 fail_buf_nmi:
 	if (!total_profile_count) {
-		free_percpu(trace_profile_buf_nmi);
-		free_percpu(trace_profile_buf);
-		trace_profile_buf_nmi = NULL;
-		trace_profile_buf = NULL;
+		free_percpu(perf_trace_buf_nmi);
+		free_percpu(perf_trace_buf);
+		perf_trace_buf_nmi = NULL;
+		perf_trace_buf = NULL;
 	}
 fail_buf:
 	atomic_dec(&event->profile_count);
@@ -84,7 +79,7 @@ int ftrace_profile_enable(int event_id)
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
-	char *buf, *nmi_buf;
+	struct perf_trace_buf *buf, *nmi_buf;
 
 	if (!atomic_add_negative(-1, &event->profile_count))
 		return;
@@ -92,11 +87,11 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 	event->profile_disable(event);
 
 	if (!--total_profile_count) {
-		buf = trace_profile_buf;
-		rcu_assign_pointer(trace_profile_buf, NULL);
+		buf = perf_trace_buf;
+		rcu_assign_pointer(perf_trace_buf, NULL);
 
-		nmi_buf = trace_profile_buf_nmi;
-		rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+		nmi_buf = perf_trace_buf_nmi;
+		rcu_assign_pointer(perf_trace_buf_nmi, NULL);
 
 		/*
 		 * Ensure every events in profiling have finished before
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index cf17a6694f32..3696476f307d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1208,6 +1208,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry *entry;
+	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
@@ -1229,14 +1230,26 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
-		raw_data = rcu_dereference(trace_profile_buf_nmi);
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
 	else
-		raw_data = rcu_dereference(trace_profile_buf);
+		trace_buf = rcu_dereference(perf_trace_buf);
 
-	if (!raw_data)
+	if (!trace_buf)
 		goto end;
 
-	raw_data = per_cpu_ptr(raw_data, __cpu);
+	trace_buf = per_cpu_ptr(trace_buf, __cpu);
+
+	if (trace_buf->recursion++)
+		goto end_recursion;
+
+	/*
+	 * Make recursion update visible before entering perf_tp_event
+	 * so that we protect from perf recursions.
+	 */
+	barrier();
+
+	raw_data = trace_buf->buf;
+
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
 	entry = (struct kprobe_trace_entry *)raw_data;
@@ -1249,8 +1262,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ip, 1, entry, size);
+
+end_recursion:
+	trace_buf->recursion--;
 end:
 	local_irq_restore(irq_flags);
+
 	return 0;
 }
 
@@ -1261,6 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry *entry;
+	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
@@ -1282,14 +1300,26 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
-		raw_data = rcu_dereference(trace_profile_buf_nmi);
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
 	else
-		raw_data = rcu_dereference(trace_profile_buf);
+		trace_buf = rcu_dereference(perf_trace_buf);
 
-	if (!raw_data)
+	if (!trace_buf)
 		goto end;
 
-	raw_data = per_cpu_ptr(raw_data, __cpu);
+	trace_buf = per_cpu_ptr(trace_buf, __cpu);
+
+	if (trace_buf->recursion++)
+		goto end_recursion;
+
+	/*
+	 * Make recursion update visible before entering perf_tp_event
+	 * so that we protect from perf recursions.
+	 */
+	barrier();
+
+	raw_data = trace_buf->buf;
+
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
 	entry = (struct kretprobe_trace_entry *)raw_data;
@@ -1303,8 +1333,12 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
+
+end_recursion:
+	trace_buf->recursion--;
 end:
 	local_irq_restore(irq_flags);
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 58b8e5370767..51213b0aa81b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -477,6 +477,7 @@ static int sys_prof_refcount_exit;
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
+	struct perf_trace_buf *trace_buf;
 	struct syscall_trace_enter *rec;
 	unsigned long flags;
 	char *raw_data;
@@ -507,14 +508,25 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	cpu = smp_processor_id();
 
 	if (in_nmi())
-		raw_data = rcu_dereference(trace_profile_buf_nmi);
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
 	else
-		raw_data = rcu_dereference(trace_profile_buf);
+		trace_buf = rcu_dereference(perf_trace_buf);
 
-	if (!raw_data)
+	if (!trace_buf)
 		goto end;
 
-	raw_data = per_cpu_ptr(raw_data, cpu);
+	trace_buf = per_cpu_ptr(trace_buf, cpu);
+
+	if (trace_buf->recursion++)
+		goto end_recursion;
+
+	/*
+	 * Make recursion update visible before entering perf_tp_event
+	 * so that we protect from perf recursions.
+	 */
+	barrier();
+
+	raw_data = trace_buf->buf;
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -527,6 +539,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 			       (unsigned long *)&rec->args);
 	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 
+end_recursion:
+	trace_buf->recursion--;
 end:
 	local_irq_restore(flags);
 }
@@ -574,6 +588,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
+	struct perf_trace_buf *trace_buf;
 	unsigned long flags;
 	int syscall_nr;
 	char *raw_data;
@@ -605,14 +620,25 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	cpu = smp_processor_id();
 
 	if (in_nmi())
-		raw_data = rcu_dereference(trace_profile_buf_nmi);
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
 	else
-		raw_data = rcu_dereference(trace_profile_buf);
+		trace_buf = rcu_dereference(perf_trace_buf);
 
-	if (!raw_data)
+	if (!trace_buf)
 		goto end;
 
-	raw_data = per_cpu_ptr(raw_data, cpu);
+	trace_buf = per_cpu_ptr(trace_buf, cpu);
+
+	if (trace_buf->recursion++)
+		goto end_recursion;
+
+	/*
+	 * Make recursion update visible before entering perf_tp_event
+	 * so that we protect from perf recursions.
+	 */
+	barrier();
+
+	raw_data = trace_buf->buf;
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -626,6 +652,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
 	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
 
+end_recursion:
+	trace_buf->recursion--;
 end:
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3-71-gd317


From c82a43d40b93200a10a9fec0a489791e65e135ca Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 26 Oct 2009 23:28:11 +0300
Subject: irq: Do not attempt to create subdirectories if /proc/irq/<irq>
 failed

If a parent directory (ie /proc/irq/<irq>) could not be created
we should not attempt to create subdirectories. Otherwise it
would lead that "smp_affinity" and "spurious" entries are may be
registered under /proc root instead of a proper place.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <20091026202811.GD5321@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/proc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591f..dfef5b9f3845 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,6 +214,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 
 	/* create /proc/irq/1234 */
 	desc->dir = proc_mkdir(name, root_irq_dir);
+	if (!desc->dir)
+		return;
 
 #ifdef CONFIG_SMP
 	/* create /proc/irq/<irq>/smp_affinity */
-- 
cgit v1.2.3-71-gd317


From d8c80ce091f6ead6710bc71b58f2c32e5bf855e4 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 27 Oct 2009 15:45:23 +0800
Subject: sched, no_hz: Remove unused rq->last_tick_seen field

In 15934a37324f32e0fda633dc7984a671ea81cd75,
field last_tick_seen is added to struct rq.
But it is unused now.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Guillaume Chazarain <guichaz@yahoo.fr>
LKML-Reference: <4AE6A513.6010100@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f8492123b5d1..23e353568d8e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -534,7 +534,6 @@ struct rq {
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ
-	unsigned long last_tick_seen;
 	unsigned char in_nohz_recently;
 #endif
 	/* capture load from *all* tasks on this cpu: */
-- 
cgit v1.2.3-71-gd317


From 24f1e32c60c45c89a997c73395b69c8af6f0a84e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 9 Sep 2009 19:22:48 +0200
Subject: hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf
 events

This patch rebase the implementation of the breakpoints API on top of
perf events instances.

Each breakpoints are now perf events that handle the
register scheduling, thread/cpu attachment, etc..

The new layering is now made as follows:

       ptrace       kgdb      ftrace   perf syscall
          \          |          /         /
           \         |         /         /
                                        /
            Core breakpoint API        /
                                      /
                     |               /
                     |              /

              Breakpoints perf events

                     |
                     |

               Breakpoints PMU ---- Debug Register constraints handling
                                    (Part of core breakpoint API)
                     |
                     |

             Hardware debug registers

Reasons of this rewrite:

- Use the centralized/optimized pmu registers scheduling,
  implying an easier arch integration
- More powerful register handling: perf attributes (pinned/flexible
  events, exclusive/non-exclusive, tunable period, etc...)

Impact:

- New perf ABI: the hardware breakpoints counters
- Ptrace breakpoints setting remains tricky and still needs some per
  thread breakpoints references.

Todo (in the order):

- Support breakpoints perf counter events for perf tools (ie: implement
  perf_bpcounter_event())
- Support from perf tools

Changes in v2:

- Follow the perf "event " rename
- The ptrace regression have been fixed (ptrace breakpoint perf events
  weren't released when a task ended)
- Drop the struct hw_breakpoint and store generic fields in
  perf_event_attr.
- Separate core and arch specific headers, drop
  asm-generic/hw_breakpoint.h and create linux/hw_breakpoint.h
- Use new generic len/type for breakpoint
- Handle off case: when breakpoints api is not supported by an arch

Changes in v3:

- Fix broken CONFIG_KVM, we need to propagate the breakpoint api
  changes to kvm when we exit the guest and restore the bp registers
  to the host.

Changes in v4:

- Drop the hw_breakpoint_restore() stub as it is only used by KVM
- EXPORT_SYMBOL_GPL hw_breakpoint_restore() as KVM can be built as a
  module
- Restore the breakpoints unconditionally on kvm guest exit:
  TIF_DEBUG_THREAD doesn't anymore cover every cases of running
  breakpoints and vcpu->arch.switch_db_regs might not always be
  set when the guest used debug registers.
  (Waiting for a reliable optimization)

Changes in v5:

- Split-up the asm-generic/hw-breakpoint.h moving to
  linux/hw_breakpoint.h into a separate patch
- Optimize the breakpoints restoring while switching from kvm guest
  to host. We only want to restore the state if we have active
  breakpoints to the host, otherwise we don't care about messed-up
  address registers.
- Add asm/hw_breakpoint.h to Kbuild
- Fix bad breakpoint type in trace_selftest.c

Changes in v6:

- Fix wrong header inclusion in trace.h (triggered a build
  error with CONFIG_FTRACE_SELFTEST

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jan Kiszka <jan.kiszka@web.de>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
---
 arch/Kconfig                         |   3 +
 arch/x86/include/asm/Kbuild          |   1 +
 arch/x86/include/asm/debugreg.h      |  11 +-
 arch/x86/include/asm/hw_breakpoint.h |  58 +++--
 arch/x86/include/asm/processor.h     |  12 +-
 arch/x86/kernel/hw_breakpoint.c      | 391 +++++++++++++++++++++-----------
 arch/x86/kernel/process.c            |   7 +-
 arch/x86/kernel/process_32.c         |  26 +--
 arch/x86/kernel/process_64.c         |  26 +--
 arch/x86/kernel/ptrace.c             | 182 ++++++++++-----
 arch/x86/kernel/smpboot.c            |   3 -
 arch/x86/kvm/x86.c                   |  18 +-
 arch/x86/power/cpu.c                 |   6 -
 include/linux/hw_breakpoint.h        | 243 ++++++++++----------
 include/linux/perf_event.h           |  26 ++-
 kernel/exit.c                        |   5 +
 kernel/hw_breakpoint.c               | 424 ++++++++++++++---------------------
 kernel/perf_event.c                  |  53 ++++-
 kernel/trace/trace.h                 |   5 +-
 kernel/trace/trace_entries.h         |   6 +-
 kernel/trace/trace_ksym.c            | 126 +++++------
 kernel/trace/trace_selftest.c        |   3 +-
 22 files changed, 885 insertions(+), 750 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index acb664397945..eef3bbb97075 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -128,6 +128,9 @@ config HAVE_DEFAULT_NO_SPIN_MUTEXES
 
 config HAVE_HW_BREAKPOINT
 	bool
+	depends on HAVE_PERF_EVENTS
+	select ANON_INODES
+	select PERF_EVENTS
 
 
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..9f828f87ca35 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
 header-y += processor-flags.h
+header-y += hw_breakpoint.h
 
 unifdef-y += e820.h
 unifdef-y += ist.h
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 23439fbb1d0e..9a3333c91f9a 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -75,13 +75,8 @@
  */
 #ifdef __KERNEL__
 
-/* For process management */
-extern void flush_thread_hw_breakpoint(struct task_struct *tsk);
-extern int copy_thread_hw_breakpoint(struct task_struct *tsk,
-		struct task_struct *child, unsigned long clone_flags);
+DECLARE_PER_CPU(unsigned long, dr7);
 
-/* For CPU management */
-extern void load_debug_registers(void);
 static inline void hw_breakpoint_disable(void)
 {
 	/* Zero the control register for HW Breakpoint */
@@ -94,6 +89,10 @@ static inline void hw_breakpoint_disable(void)
 	set_debugreg(0UL, 3);
 }
 
+#ifdef CONFIG_KVM
+extern void hw_breakpoint_restore(void);
+#endif
+
 #endif	/* __KERNEL__ */
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 3cfca8e2b5f6..0675a7c4c20e 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -4,6 +4,11 @@
 #ifdef	__KERNEL__
 #define	__ARCH_HW_BREAKPOINT_H
 
+/*
+ * The name should probably be something dealt in
+ * a higher level. While dealing with the user
+ * (display/resolving)
+ */
 struct arch_hw_breakpoint {
 	char		*name; /* Contains name of the symbol to set bkpt */
 	unsigned long	address;
@@ -12,44 +17,57 @@ struct arch_hw_breakpoint {
 };
 
 #include <linux/kdebug.h>
-#include <linux/hw_breakpoint.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
 
 /* Available HW breakpoint length encodings */
-#define HW_BREAKPOINT_LEN_1		0x40
-#define HW_BREAKPOINT_LEN_2		0x44
-#define HW_BREAKPOINT_LEN_4		0x4c
-#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+#define X86_BREAKPOINT_LEN_1		0x40
+#define X86_BREAKPOINT_LEN_2		0x44
+#define X86_BREAKPOINT_LEN_4		0x4c
+#define X86_BREAKPOINT_LEN_EXECUTE	0x40
 
 #ifdef CONFIG_X86_64
-#define HW_BREAKPOINT_LEN_8		0x48
+#define X86_BREAKPOINT_LEN_8		0x48
 #endif
 
 /* Available HW breakpoint type encodings */
 
 /* trigger on instruction execute */
-#define HW_BREAKPOINT_EXECUTE	0x80
+#define X86_BREAKPOINT_EXECUTE	0x80
 /* trigger on memory write */
-#define HW_BREAKPOINT_WRITE	0x81
+#define X86_BREAKPOINT_WRITE	0x81
 /* trigger on memory read or write */
-#define HW_BREAKPOINT_RW	0x83
+#define X86_BREAKPOINT_RW	0x83
 
 /* Total number of available HW breakpoint registers */
 #define HBP_NUM 4
 
-extern struct hw_breakpoint *hbp_kernel[HBP_NUM];
-DECLARE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]);
-extern unsigned int hbp_user_refcount[HBP_NUM];
+struct perf_event;
+struct pmu;
 
-extern void arch_install_thread_hw_breakpoint(struct task_struct *tsk);
-extern void arch_uninstall_thread_hw_breakpoint(void);
 extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
-						struct task_struct *tsk);
-extern void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk);
-extern void arch_flush_thread_hw_breakpoint(struct task_struct *tsk);
-extern void arch_update_kernel_hw_breakpoint(void *);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
+					 struct task_struct *tsk);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
-				     unsigned long val, void *data);
+					   unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+				  int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
 #endif	/* __KERNEL__ */
 #endif	/* _I386_HW_BREAKPOINT_H */
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 61aafb71c7ef..820f3000f736 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -423,6 +423,8 @@ extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
 
+struct perf_event;
+
 struct thread_struct {
 	/* Cached TLS descriptors: */
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -444,12 +446,10 @@ struct thread_struct {
 	unsigned long		fs;
 #endif
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg[HBP_NUM];
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
-	/* Hardware breakpoint info */
-	struct hw_breakpoint	*hbp[HBP_NUM];
+	/* Save middle states of ptrace breakpoints */
+	struct perf_event	*ptrace_bps[HBP_NUM];
+	/* Debug status used for traps, single steps, etc... */
+	unsigned long           debugreg6;
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 9316a9de4de3..e622620790bd 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -15,6 +15,7 @@
  *
  * Copyright (C) 2007 Alan Stern
  * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
  */
 
 /*
@@ -22,6 +23,8 @@
  * using the CPU's debug registers.
  */
 
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 #include <linux/irqflags.h>
 #include <linux/notifier.h>
 #include <linux/kallsyms.h>
@@ -38,26 +41,24 @@
 #include <asm/processor.h>
 #include <asm/debugreg.h>
 
-/* Unmasked kernel DR7 value */
-static unsigned long kdr7;
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
 
 /*
- * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
- * Used to clear and verify the status of bits corresponding to DR0 - DR3
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpus
  */
-static const unsigned long	dr7_masks[HBP_NUM] = {
-	0x000f0003,	/* LEN0, R/W0, G0, L0 */
-	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
-	0x0f000030,	/* LEN2, R/W2, G2, L2 */
-	0xf00000c0	/* LEN3, R/W3, G3, L3 */
-};
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
 
 
 /*
  * Encode the length, type, Exact, and Enable bits for a particular breakpoint
  * as stored in debug register 7.
  */
-static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
 {
 	unsigned long bp_info;
 
@@ -68,64 +69,89 @@ static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
 	return bp_info;
 }
 
-void arch_update_kernel_hw_breakpoint(void *unused)
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
 {
-	struct hw_breakpoint *bp;
-	int i, cpu = get_cpu();
-	unsigned long temp_kdr7 = 0;
-
-	/* Don't allow debug exceptions while we update the registers */
-	set_debugreg(0UL, 7);
+	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
 
-	for (i = hbp_kernel_pos; i < HBP_NUM; i++) {
-		per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i];
-		if (bp) {
-			temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type);
-			set_debugreg(bp->info.address, i);
-		}
-	}
+	*len = (bp_info & 0xc) | 0x40;
+	*type = (bp_info & 0x3) | 0x80;
 
-	/* No need to set DR6. Update the debug registers with kernel-space
-	 * breakpoint values from kdr7 and user-space requests from the
-	 * current process
-	 */
-	kdr7 = temp_kdr7;
-	set_debugreg(kdr7 | current->thread.debugreg7, 7);
-	put_cpu();
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
 }
 
 /*
- * Install the thread breakpoints in their debug registers.
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
  */
-void arch_install_thread_hw_breakpoint(struct task_struct *tsk)
+int arch_install_hw_breakpoint(struct perf_event *bp)
 {
-	struct thread_struct *thread = &(tsk->thread);
-
-	switch (hbp_kernel_pos) {
-	case 4:
-		set_debugreg(thread->debugreg[3], 3);
-	case 3:
-		set_debugreg(thread->debugreg[2], 2);
-	case 2:
-		set_debugreg(thread->debugreg[1], 1);
-	case 1:
-		set_debugreg(thread->debugreg[0], 0);
-	default:
-		break;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (!*slot) {
+			*slot = bp;
+			break;
+		}
 	}
 
-	/* No need to set DR6 */
-	set_debugreg((kdr7 | thread->debugreg7), 7);
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return -EBUSY;
+
+	set_debugreg(info->address, i);
+	__get_cpu_var(cpu_debugreg[i]) = info->address;
+
+	dr7 = &__get_cpu_var(dr7);
+	*dr7 |= encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
+
+	return 0;
 }
 
 /*
- * Install the debug register values for just the kernel, no thread.
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
  */
-void arch_uninstall_thread_hw_breakpoint(void)
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
 {
-	/* Clear the user-space portion of debugreg7 by setting only kdr7 */
-	set_debugreg(kdr7, 7);
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (*slot == bp) {
+			*slot = NULL;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return;
 
+	dr7 = &__get_cpu_var(dr7);
+	*dr7 &= ~encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
 }
 
 static int get_hbp_len(u8 hbp_len)
@@ -133,17 +159,17 @@ static int get_hbp_len(u8 hbp_len)
 	unsigned int len_in_bytes = 0;
 
 	switch (hbp_len) {
-	case HW_BREAKPOINT_LEN_1:
+	case X86_BREAKPOINT_LEN_1:
 		len_in_bytes = 1;
 		break;
-	case HW_BREAKPOINT_LEN_2:
+	case X86_BREAKPOINT_LEN_2:
 		len_in_bytes = 2;
 		break;
-	case HW_BREAKPOINT_LEN_4:
+	case X86_BREAKPOINT_LEN_4:
 		len_in_bytes = 4;
 		break;
 #ifdef CONFIG_X86_64
-	case HW_BREAKPOINT_LEN_8:
+	case X86_BREAKPOINT_LEN_8:
 		len_in_bytes = 8;
 		break;
 #endif
@@ -178,67 +204,146 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
 /*
  * Store a breakpoint's encoded address, length, and type.
  */
-static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk)
+static int arch_store_info(struct perf_event *bp)
 {
-	/*
-	 * User-space requests will always have the address field populated
-	 * Symbol names from user-space are rejected
-	 */
-	if (tsk && bp->info.name)
-		return -EINVAL;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	/*
 	 * For kernel-addresses, either the address or symbol name can be
 	 * specified.
 	 */
-	if (bp->info.name)
-		bp->info.address = (unsigned long)
-					kallsyms_lookup_name(bp->info.name);
-	if (bp->info.address)
+	if (info->name)
+		info->address = (unsigned long)
+				kallsyms_lookup_name(info->name);
+	if (info->address)
 		return 0;
+
 	return -EINVAL;
 }
 
-/*
- * Validate the arch-specific HW Breakpoint register settings
- */
-int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
-						struct task_struct *tsk)
+int arch_bp_generic_fields(int x86_len, int x86_type,
+			   int *gen_len, int *gen_type)
 {
-	unsigned int align;
-	int ret = -EINVAL;
+	/* Len */
+	switch (x86_len) {
+	case X86_BREAKPOINT_LEN_1:
+		*gen_len = HW_BREAKPOINT_LEN_1;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		*gen_len = HW_BREAKPOINT_LEN_2;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		*gen_len = HW_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		*gen_len = HW_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
 
-	switch (bp->info.type) {
-	/*
-	 * Ptrace-refactoring code
-	 * For now, we'll allow instruction breakpoint only for user-space
-	 * addresses
-	 */
-	case HW_BREAKPOINT_EXECUTE:
-		if ((!arch_check_va_in_userspace(bp->info.address,
-							bp->info.len)) &&
-			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
-			return ret;
+	/* Type */
+	switch (x86_type) {
+	case X86_BREAKPOINT_EXECUTE:
+		*gen_type = HW_BREAKPOINT_X;
 		break;
-	case HW_BREAKPOINT_WRITE:
+	case X86_BREAKPOINT_WRITE:
+		*gen_type = HW_BREAKPOINT_W;
 		break;
-	case HW_BREAKPOINT_RW:
+	case X86_BREAKPOINT_RW:
+		*gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
 		break;
 	default:
-		return ret;
+		return -EINVAL;
 	}
 
-	switch (bp->info.len) {
+	return 0;
+}
+
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+	info->address = bp->attr.bp_addr;
+
+	/* Len */
+	switch (bp->attr.bp_len) {
 	case HW_BREAKPOINT_LEN_1:
-		align = 0;
+		info->len = X86_BREAKPOINT_LEN_1;
 		break;
 	case HW_BREAKPOINT_LEN_2:
-		align = 1;
+		info->len = X86_BREAKPOINT_LEN_2;
 		break;
 	case HW_BREAKPOINT_LEN_4:
-		align = 3;
+		info->len = X86_BREAKPOINT_LEN_4;
 		break;
 #ifdef CONFIG_X86_64
 	case HW_BREAKPOINT_LEN_8:
+		info->len = X86_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Type */
+	switch (bp->attr.bp_type) {
+	case HW_BREAKPOINT_W:
+		info->type = X86_BREAKPOINT_WRITE;
+		break;
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		info->type = X86_BREAKPOINT_RW;
+		break;
+	case HW_BREAKPOINT_X:
+		info->type = X86_BREAKPOINT_EXECUTE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+				  struct task_struct *tsk)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned int align;
+	int ret;
+
+
+	ret = arch_build_bp_info(bp);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+
+	if (info->type == X86_BREAKPOINT_EXECUTE)
+		/*
+		 * Ptrace-refactoring code
+		 * For now, we'll allow instruction breakpoint only for user-space
+		 * addresses
+		 */
+		if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+			info->len != X86_BREAKPOINT_EXECUTE)
+			return ret;
+
+	switch (info->len) {
+	case X86_BREAKPOINT_LEN_1:
+		align = 0;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		align = 1;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		align = 3;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
 		align = 7;
 		break;
 #endif
@@ -246,8 +351,8 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
 		return ret;
 	}
 
-	if (bp->triggered)
-		ret = arch_store_info(bp, tsk);
+	if (bp->callback)
+		ret = arch_store_info(bp);
 
 	if (ret < 0)
 		return ret;
@@ -255,44 +360,47 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
 	 * Check that the low-order bits of the address are appropriate
 	 * for the alignment implied by len.
 	 */
-	if (bp->info.address & align)
+	if (info->address & align)
 		return -EINVAL;
 
 	/* Check that the virtual address is in the proper range */
 	if (tsk) {
-		if (!arch_check_va_in_userspace(bp->info.address, bp->info.len))
+		if (!arch_check_va_in_userspace(info->address, info->len))
 			return -EFAULT;
 	} else {
-		if (!arch_check_va_in_kernelspace(bp->info.address,
-								bp->info.len))
+		if (!arch_check_va_in_kernelspace(info->address, info->len))
 			return -EFAULT;
 	}
+
 	return 0;
 }
 
-void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk)
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	struct hw_breakpoint *bp = thread->hbp[pos];
-
-	thread->debugreg7 &= ~dr7_masks[pos];
-	if (bp) {
-		thread->debugreg[pos] = bp->info.address;
-		thread->debugreg7 |= encode_dr7(pos, bp->info.len,
-							bp->info.type);
-	} else
-		thread->debugreg[pos] = 0;
+	int i;
+	struct thread_struct *t = &tsk->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		unregister_hw_breakpoint(t->ptrace_bps[i]);
+		t->ptrace_bps[i] = NULL;
+	}
 }
 
-void arch_flush_thread_hw_breakpoint(struct task_struct *tsk)
+#ifdef CONFIG_KVM
+void hw_breakpoint_restore(void)
 {
-	int i;
-	struct thread_struct *thread = &(tsk->thread);
-
-	thread->debugreg7 = 0;
-	for (i = 0; i < HBP_NUM; i++)
-		thread->debugreg[i] = 0;
+	set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+	set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+	set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+	set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+	set_debugreg(current->thread.debugreg6, 6);
+	set_debugreg(__get_cpu_var(dr7), 7);
 }
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+#endif
 
 /*
  * Handle debug exception notifications.
@@ -313,7 +421,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk)
 static int __kprobes hw_breakpoint_handler(struct die_args *args)
 {
 	int i, cpu, rc = NOTIFY_STOP;
-	struct hw_breakpoint *bp;
+	struct perf_event *bp;
 	unsigned long dr7, dr6;
 	unsigned long *dr6_p;
 
@@ -325,10 +433,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 	if ((dr6 & DR_TRAP_BITS) == 0)
 		return NOTIFY_DONE;
 
-	/* Lazy debug register switching */
-	if (!test_tsk_thread_flag(current, TIF_DEBUG))
-		arch_uninstall_thread_hw_breakpoint();
-
 	get_debugreg(dr7, 7);
 	/* Disable breakpoints during exception handling */
 	set_debugreg(0UL, 7);
@@ -344,17 +448,18 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 	for (i = 0; i < HBP_NUM; ++i) {
 		if (likely(!(dr6 & (DR_TRAP0 << i))))
 			continue;
+
 		/*
-		 * Find the corresponding hw_breakpoint structure and
-		 * invoke its triggered callback.
+		 * The counter may be concurrently released but that can only
+		 * occur from a call_rcu() path. We can then safely fetch
+		 * the breakpoint, use its callback, touch its counter
+		 * while we are in an rcu_read_lock() path.
 		 */
-		if (i >= hbp_kernel_pos)
-			bp = per_cpu(this_hbp_kernel[i], cpu);
-		else {
-			bp = current->thread.hbp[i];
-			if (bp)
-				rc = NOTIFY_DONE;
-		}
+		rcu_read_lock();
+
+		bp = per_cpu(bp_per_reg[i], cpu);
+		if (bp)
+			rc = NOTIFY_DONE;
 		/*
 		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
 		 * exception handling
@@ -362,19 +467,23 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 		(*dr6_p) &= ~(DR_TRAP0 << i);
 		/*
 		 * bp can be NULL due to lazy debug register switching
-		 * or due to the delay between updates of hbp_kernel_pos
-		 * and this_hbp_kernel.
+		 * or due to concurrent perf counter removing.
 		 */
-		if (!bp)
-			continue;
+		if (!bp) {
+			rcu_read_unlock();
+			break;
+		}
+
+		(bp->callback)(bp, args->regs);
 
-		(bp->triggered)(bp, args->regs);
+		rcu_read_unlock();
 	}
 	if (dr6 & (~DR_TRAP_BITS))
 		rc = NOTIFY_DONE;
 
 	set_debugreg(dr7, 7);
 	put_cpu();
+
 	return rc;
 }
 
@@ -389,3 +498,13 @@ int __kprobes hw_breakpoint_exceptions_notify(
 
 	return hw_breakpoint_handler(data);
 }
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+	/* TODO */
+}
+
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
+{
+	/* TODO */
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cf8ee0016307..744508e7cfdd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -10,6 +10,7 @@
 #include <linux/clockchips.h>
 #include <linux/random.h>
 #include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -18,7 +19,6 @@
 #include <asm/i387.h>
 #include <asm/ds.h>
 #include <asm/debugreg.h>
-#include <asm/hw_breakpoint.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -47,8 +47,6 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 	}
-	if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG)))
-		flush_thread_hw_breakpoint(tsk);
 
 	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
@@ -107,8 +105,7 @@ void flush_thread(void)
 	}
 #endif
 
-	if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG)))
-		flush_thread_hw_breakpoint(tsk);
+	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 209e74801763..d5bd3132ee70 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -59,7 +59,6 @@
 #include <asm/syscalls.h>
 #include <asm/ds.h>
 #include <asm/debugreg.h>
-#include <asm/hw_breakpoint.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -264,9 +263,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	p->thread.io_bitmap_ptr = NULL;
 	tsk = current;
 	err = -ENOMEM;
-	if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG)))
-		if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
-			goto out;
+
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
@@ -287,13 +285,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		err = do_set_thread_area(p, -1,
 			(struct user_desc __user *)childregs->si, 0);
 
-out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
-	if (err)
-		flush_thread_hw_breakpoint(p);
 
 	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
 	p->thread.ds_ctx = NULL;
@@ -437,23 +432,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		lazy_load_gs(next->gs);
 
 	percpu_write(current_task, next_p);
-	/*
-	 * There's a problem with moving the arch_install_thread_hw_breakpoint()
-	 * call before current is updated.  Suppose a kernel breakpoint is
-	 * triggered in between the two, the hw-breakpoint handler will see that
-	 * the 'current' task does not have TIF_DEBUG flag set and will think it
-	 * is leftover from an old task (lazy switching) and will erase it. Then
-	 * until the next context switch, no user-breakpoints will be installed.
-	 *
-	 * The real problem is that it's impossible to update both current and
-	 * physical debug registers at the same instant, so there will always be
-	 * a window in which they disagree and a breakpoint might get triggered.
-	 * Since we use lazy switching, we are forced to assume that a
-	 * disagreement means that current is correct and the exception is due
-	 * to lazy debug register switching.
-	 */
-	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
-		arch_install_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 72edac026a78..5bafdec34441 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -53,7 +53,6 @@
 #include <asm/syscalls.h>
 #include <asm/ds.h>
 #include <asm/debugreg.h>
-#include <asm/hw_breakpoint.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -244,8 +243,6 @@ void release_thread(struct task_struct *dead_task)
 			BUG();
 		}
 	}
-	if (unlikely(dead_task->thread.debugreg7))
-		flush_thread_hw_breakpoint(dead_task);
 }
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -309,9 +306,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	savesegment(ds, p->thread.ds);
 
 	err = -ENOMEM;
-	if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
-		if (copy_thread_hw_breakpoint(me, p, clone_flags))
-			goto out;
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@ -351,8 +346,6 @@ out:
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
-	if (err)
-		flush_thread_hw_breakpoint(p);
 
 	return err;
 }
@@ -508,23 +501,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	if (preload_fpu)
 		__math_state_restore();
-	/*
-	 * There's a problem with moving the arch_install_thread_hw_breakpoint()
-	 * call before current is updated.  Suppose a kernel breakpoint is
-	 * triggered in between the two, the hw-breakpoint handler will see that
-	 * the 'current' task does not have TIF_DEBUG flag set and will think it
-	 * is leftover from an old task (lazy switching) and will erase it. Then
-	 * until the next context switch, no user-breakpoints will be installed.
-	 *
-	 * The real problem is that it's impossible to update both current and
-	 * physical debug registers at the same instant, so there will always be
-	 * a window in which they disagree and a breakpoint might get triggered.
-	 * Since we use lazy switching, we are forced to assume that a
-	 * disagreement means that current is correct and the exception is due
-	 * to lazy debug register switching.
-	 */
-	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
-		arch_install_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 267cb85b479c..e79610d95971 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -441,54 +443,59 @@ static int genregs_set(struct task_struct *target,
 	return ret;
 }
 
-/*
- * Decode the length and type bits for a particular breakpoint as
- * stored in debug register 7.  Return the "enabled" status.
- */
-static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len,
-		unsigned *type)
-{
-	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
-
-	*len = (bp_info & 0xc) | 0x40;
-	*type = (bp_info & 0x3) | 0x80;
-	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
-}
-
-static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+static void ptrace_triggered(struct perf_event *bp, void *data)
 {
-	struct thread_struct *thread = &(current->thread);
 	int i;
+	struct thread_struct *thread = &(current->thread);
 
 	/*
 	 * Store in the virtual DR6 register the fact that the breakpoint
 	 * was hit so the thread's debugger will see it.
 	 */
-	for (i = 0; i < hbp_kernel_pos; i++)
-		/*
-		 * We will check bp->info.address against the address stored in
-		 * thread's hbp structure and not debugreg[i]. This is to ensure
-		 * that the corresponding bit for 'i' in DR7 register is enabled
-		 */
-		if (bp->info.address == thread->hbp[i]->info.address)
+	for (i = 0; i < HBP_NUM; i++) {
+		if (thread->ptrace_bps[i] == bp)
 			break;
+	}
 
 	thread->debugreg6 |= (DR_TRAP0 << i);
 }
 
+/*
+ * Walk through every ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
+ *
+ */
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
+{
+	int i;
+	int dr7 = 0;
+	struct arch_hw_breakpoint *info;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		if (bp[i] && !bp[i]->attr.disabled) {
+			info = counter_arch_bp(bp[i]);
+			dr7 |= encode_dr7(i, info->len, info->type);
+		}
+	}
+
+	return dr7;
+}
+
 /*
  * Handle ptrace writes to debug register 7.
  */
 static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 {
 	struct thread_struct *thread = &(tsk->thread);
-	unsigned long old_dr7 = thread->debugreg7;
+	unsigned long old_dr7;
 	int i, orig_ret = 0, rc = 0;
 	int enabled, second_pass = 0;
 	unsigned len, type;
-	struct hw_breakpoint *bp;
+	int gen_len, gen_type;
+	struct perf_event *bp;
 
 	data &= ~DR_CONTROL_RESERVED;
+	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
 restore:
 	/*
 	 * Loop through all the hardware breakpoints, making the
@@ -496,11 +503,12 @@ restore:
 	 */
 	for (i = 0; i < HBP_NUM; i++) {
 		enabled = decode_dr7(data, i, &len, &type);
-		bp = thread->hbp[i];
+		bp = thread->ptrace_bps[i];
 
 		if (!enabled) {
 			if (bp) {
-				/* Don't unregister the breakpoints right-away,
+				/*
+				 * Don't unregister the breakpoints right-away,
 				 * unless all register_user_hw_breakpoint()
 				 * requests have succeeded. This prevents
 				 * any window of opportunity for debug
@@ -508,27 +516,45 @@ restore:
 				 */
 				if (!second_pass)
 					continue;
-				unregister_user_hw_breakpoint(tsk, bp);
-				kfree(bp);
+				thread->ptrace_bps[i] = NULL;
+				unregister_hw_breakpoint(bp);
 			}
 			continue;
 		}
+
+		/*
+		 * We shoud have at least an inactive breakpoint at this
+		 * slot. It means the user is writing dr7 without having
+		 * written the address register first
+		 */
 		if (!bp) {
-			rc = -ENOMEM;
-			bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
-			if (bp) {
-				bp->info.address = thread->debugreg[i];
-				bp->triggered = ptrace_triggered;
-				bp->info.len = len;
-				bp->info.type = type;
-				rc = register_user_hw_breakpoint(tsk, bp);
-				if (rc)
-					kfree(bp);
-			}
-		} else
-			rc = modify_user_hw_breakpoint(tsk, bp);
+			rc = -EINVAL;
+			break;
+		}
+
+		rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
 		if (rc)
 			break;
+
+		/*
+		 * This is a temporary thing as bp is unregistered/registered
+		 * to simulate modification
+		 */
+		bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len,
+					       gen_type, bp->callback,
+					       tsk, true);
+		thread->ptrace_bps[i] = NULL;
+
+		if (!bp) { /* incorrect bp, or we have a bug in bp API */
+			rc = -EINVAL;
+			break;
+		}
+		if (IS_ERR(bp)) {
+			rc = PTR_ERR(bp);
+			bp = NULL;
+			break;
+		}
+		thread->ptrace_bps[i] = bp;
 	}
 	/*
 	 * Make a second pass to free the remaining unused breakpoints
@@ -553,15 +579,63 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 	struct thread_struct *thread = &(tsk->thread);
 	unsigned long val = 0;
 
-	if (n < HBP_NUM)
-		val = thread->debugreg[n];
-	else if (n == 6)
+	if (n < HBP_NUM) {
+		struct perf_event *bp;
+		bp = thread->ptrace_bps[n];
+		if (!bp)
+			return 0;
+		val = bp->hw.info.address;
+	} else if (n == 6) {
 		val = thread->debugreg6;
-	else if (n == 7)
-		val = thread->debugreg7;
+	 } else if (n == 7) {
+		val = ptrace_get_dr7(thread->ptrace_bps);
+	}
 	return val;
 }
 
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+				      unsigned long addr)
+{
+	struct perf_event *bp;
+	struct thread_struct *t = &tsk->thread;
+
+	if (!t->ptrace_bps[nr]) {
+		/*
+		 * Put stub len and type to register (reserve) an inactive but
+		 * correct bp
+		 */
+		bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1,
+						 HW_BREAKPOINT_W,
+						 ptrace_triggered, tsk,
+						 false);
+	} else {
+		bp = t->ptrace_bps[nr];
+		t->ptrace_bps[nr] = NULL;
+		bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len,
+					       bp->attr.bp_type,
+					       bp->callback,
+					       tsk,
+					       bp->attr.disabled);
+	}
+
+	if (!bp)
+		return -EIO;
+	/*
+	 * CHECKME: the previous code returned -EIO if the addr wasn't a
+	 * valid task virtual addr. The new one will return -EINVAL in this
+	 * case.
+	 * -EINVAL may be what we want for in-kernel breakpoints users, but
+	 * -EIO looks better for ptrace, since we refuse a register writing
+	 * for the user. And anyway this is the previous behaviour.
+	 */
+	if (IS_ERR(bp))
+		return PTR_ERR(bp);
+
+	t->ptrace_bps[nr] = bp;
+
+	return 0;
+}
+
 /*
  * Handle PTRACE_POKEUSR calls for the debug register area.
  */
@@ -575,19 +649,13 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
 		return -EIO;
 
 	if (n == 6) {
-		tsk->thread.debugreg6 = val;
+		thread->debugreg6 = val;
 		goto ret_path;
 	}
 	if (n < HBP_NUM) {
-		if (thread->hbp[n]) {
-			if (arch_check_va_in_userspace(val,
-					thread->hbp[n]->info.len) == 0) {
-				rc = -EIO;
-				goto ret_path;
-			}
-			thread->hbp[n]->info.address = val;
-		}
-		thread->debugreg[n] = val;
+		rc = ptrace_set_breakpoint_addr(tsk, n, val);
+		if (rc)
+			return rc;
 	}
 	/* All that's left is DR7 */
 	if (n == 7)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 213a7a3e4562..565ebc65920e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -64,7 +64,6 @@
 #include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
-#include <asm/debugreg.h>
 #include <linux/mc146818rtc.h>
 
 #include <asm/smpboot_hooks.h>
@@ -328,7 +327,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	x86_cpuinit.setup_percpu_clockev();
 
 	wmb();
-	load_debug_registers();
 	cpu_idle();
 }
 
@@ -1269,7 +1267,6 @@ void cpu_disable_common(void)
 	remove_cpu_from_maps(cpu);
 	unlock_vector_lock();
 	fixup_irqs();
-	hw_breakpoint_disable();
 }
 
 int native_cpu_disable(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fc2974adf9b6..22dee7aa7813 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,6 +42,7 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
+#include <asm/debugreg.h>
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
@@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	trace_kvm_entry(vcpu->vcpu_id);
 	kvm_x86_ops->run(vcpu, kvm_run);
 
-	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
-		set_debugreg(current->thread.debugreg[0], 0);
-		set_debugreg(current->thread.debugreg[1], 1);
-		set_debugreg(current->thread.debugreg[2], 2);
-		set_debugreg(current->thread.debugreg[3], 3);
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-	}
+	/*
+	 * If the guest has used debug registers, at least dr7
+	 * will be disabled while returning to the host.
+	 * If we don't have active breakpoints in the host, we don't
+	 * care about the messed up debug address registers. But if
+	 * we have some of them active, restore the old state.
+	 */
+	if (__get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK)
+		hw_breakpoint_restore();
 
 	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index e09a44fc4664..0a979f3e5b8a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -105,7 +105,6 @@ static void __save_processor_state(struct saved_context *ctxt)
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
 #endif
-	hw_breakpoint_disable();
 }
 
 /* Needed by apm.c */
@@ -144,11 +143,6 @@ static void fix_processor_context(void)
 #endif
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
-
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	load_debug_registers();
 }
 
 /**
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 61ccc8f17eac..7eba9b92e5f3 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -1,136 +1,131 @@
 #ifndef _LINUX_HW_BREAKPOINT_H
 #define _LINUX_HW_BREAKPOINT_H
 
+#include <linux/perf_event.h>
 
-#ifdef	__KERNEL__
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/kallsyms.h>
-
-/**
- * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
- * @triggered: callback invoked after target address access
- * @info: arch-specific breakpoint info (address, length, and type)
- *
- * %hw_breakpoint structures are the kernel's way of representing
- * hardware breakpoints.  These are data breakpoints
- * (also known as "watchpoints", triggered on data access), and the breakpoint's
- * target address can be located in either kernel space or user space.
- *
- * The breakpoint's address, length, and type are highly
- * architecture-specific.  The values are encoded in the @info field; you
- * specify them when registering the breakpoint.  To examine the encoded
- * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
- * below.
- *
- * The address is specified as a regular kernel pointer (for kernel-space
- * breakponts) or as an %__user pointer (for user-space breakpoints).
- * With register_user_hw_breakpoint(), the address must refer to a
- * location in user space.  The breakpoint will be active only while the
- * requested task is running.  Conversely with
- * register_kernel_hw_breakpoint(), the address must refer to a location
- * in kernel space, and the breakpoint will be active on all CPUs
- * regardless of the current task.
- *
- * The length is the breakpoint's extent in bytes, which is subject to
- * certain limitations.  include/asm/hw_breakpoint.h contains macros
- * defining the available lengths for a specific architecture.  Note that
- * the address's alignment must match the length.  The breakpoint will
- * catch accesses to any byte in the range from address to address +
- * (length - 1).
- *
- * The breakpoint's type indicates the sort of access that will cause it
- * to trigger.  Possible values may include:
- *
- * 	%HW_BREAKPOINT_RW (triggered on read or write access),
- * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
- * 	%HW_BREAKPOINT_READ (triggered on read access).
- *
- * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
- * possibilities are available on all architectures.  Execute breakpoints
- * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
- *
- * When a breakpoint gets hit, the @triggered callback is
- * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
- * processor registers.
- * Data breakpoints occur after the memory access has taken place.
- * Breakpoints are disabled during execution @triggered, to avoid
- * recursive traps and allow unhindered access to breakpointed memory.
- *
- * This sample code sets a breakpoint on pid_max and registers a callback
- * function for writes to that variable.  Note that it is not portable
- * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
- *
- * ----------------------------------------------------------------------
- *
- * #include <asm/hw_breakpoint.h>
- *
- * struct hw_breakpoint my_bp;
- *
- * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
- * {
- * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
- * 	dump_stack();
- *  	.......<more debugging output>........
- * }
- *
- * static struct hw_breakpoint my_bp;
- *
- * static int init_module(void)
- * {
- *	..........<do anything>............
- *	my_bp.info.type = HW_BREAKPOINT_WRITE;
- *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
- *
- *	my_bp.installed = (void *)my_bp_installed;
- *
- *	rc = register_kernel_hw_breakpoint(&my_bp);
- *	..........<do anything>............
- * }
- *
- * static void cleanup_module(void)
- * {
- *	..........<do anything>............
- *	unregister_kernel_hw_breakpoint(&my_bp);
- *	..........<do anything>............
- * }
- *
- * ----------------------------------------------------------------------
- */
-struct hw_breakpoint {
-	void (*triggered)(struct hw_breakpoint *, struct pt_regs *);
-	struct arch_hw_breakpoint info;
+enum {
+	HW_BREAKPOINT_LEN_1 = 1,
+	HW_BREAKPOINT_LEN_2 = 2,
+	HW_BREAKPOINT_LEN_4 = 4,
+	HW_BREAKPOINT_LEN_8 = 8,
 };
 
-/*
- * len and type values are defined in include/asm/hw_breakpoint.h.
- * Available values vary according to the architecture.  On i386 the
- * possibilities are:
- *
- *	HW_BREAKPOINT_LEN_1
- *	HW_BREAKPOINT_LEN_2
- *	HW_BREAKPOINT_LEN_4
- *	HW_BREAKPOINT_RW
- *	HW_BREAKPOINT_READ
- *
- * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
- * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
- * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
- */
+enum {
+	HW_BREAKPOINT_R = 1,
+	HW_BREAKPOINT_W = 2,
+	HW_BREAKPOINT_X = 4,
+};
+
+static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
+{
+	return &bp->hw.info;
+}
+
+static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
+{
+	return bp->attr.bp_addr;
+}
+
+static inline int hw_breakpoint_type(struct perf_event *bp)
+{
+	return bp->attr.bp_type;
+}
+
+static inline int hw_breakpoint_len(struct perf_event *bp)
+{
+	return bp->attr.bp_len;
+}
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+extern struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active);
+
+/* FIXME: only change from the attr, and don't unregister */
+extern struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active);
 
-extern int register_user_hw_breakpoint(struct task_struct *tsk,
-					struct hw_breakpoint *bp);
-extern int modify_user_hw_breakpoint(struct task_struct *tsk,
-					struct hw_breakpoint *bp);
-extern void unregister_user_hw_breakpoint(struct task_struct *tsk,
-						struct hw_breakpoint *bp);
 /*
  * Kernel breakpoints are not associated with any particular thread.
  */
-extern int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
-extern void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+extern struct perf_event *
+register_wide_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				int cpu,
+				bool active);
+
+extern struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active);
+
+extern int register_perf_hw_breakpoint(struct perf_event *bp);
+extern int __register_perf_hw_breakpoint(struct perf_event *bp);
+extern void unregister_hw_breakpoint(struct perf_event *bp);
+extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events);
+
+extern int reserve_bp_slot(struct perf_event *bp);
+extern void release_bp_slot(struct perf_event *bp);
+
+extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk);
+
+#else /* !CONFIG_HAVE_HW_BREAKPOINT */
+
+static inline struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active)		{ return NULL; }
+static inline struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active)			{ return NULL; }
+static inline struct perf_event *
+register_wide_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				int cpu,
+				bool active)		{ return NULL; }
+static inline struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active)		{ return NULL; }
+static inline int
+register_perf_hw_breakpoint(struct perf_event *bp)	{ return -ENOSYS; }
+static inline int
+__register_perf_hw_breakpoint(struct perf_event *bp) 	{ return -ENOSYS; }
+static inline void unregister_hw_breakpoint(struct perf_event *bp)	{ }
+static inline void
+unregister_wide_hw_breakpoint(struct perf_event **cpu_events)		{ }
+static inline int
+reserve_bp_slot(struct perf_event *bp)			{return -ENOSYS; }
+static inline void release_bp_slot(struct perf_event *bp) 		{ }
+
+static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk)	{ }
 
-extern unsigned int hbp_kernel_pos;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
 
-#endif	/* __KERNEL__ */
-#endif	/* _LINUX_HW_BREAKPOINT_H */
+#endif /* _LINUX_HW_BREAKPOINT_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8d54e6d25eeb..cead64ea6c15 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -18,6 +18,10 @@
 #include <linux/ioctl.h>
 #include <asm/byteorder.h>
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+#include <asm/hw_breakpoint.h>
+#endif
+
 /*
  * User-space ABI bits:
  */
@@ -31,6 +35,7 @@ enum perf_type_id {
 	PERF_TYPE_TRACEPOINT			= 2,
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
+	PERF_TYPE_BREAKPOINT			= 5,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
@@ -207,6 +212,15 @@ struct perf_event_attr {
 		__u32		wakeup_events;	  /* wakeup every n events */
 		__u32		wakeup_watermark; /* bytes before wakeup   */
 	};
+
+	union {
+		struct { /* Hardware breakpoint info */
+			__u64		bp_addr;
+			__u32		bp_type;
+			__u32		bp_len;
+		};
+	};
+
 	__u32			__reserved_2;
 
 	__u64			__reserved_3;
@@ -476,6 +490,11 @@ struct hw_perf_event {
 			atomic64_t	count;
 			struct hrtimer	hrtimer;
 		};
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+		union { /* breakpoint */
+			struct arch_hw_breakpoint	info;
+		};
+#endif
 	};
 	atomic64_t			prev_count;
 	u64				sample_period;
@@ -588,7 +607,7 @@ struct perf_event {
 	u64				tstamp_running;
 	u64				tstamp_stopped;
 
-	struct perf_event_attr	attr;
+	struct perf_event_attr		attr;
 	struct hw_perf_event		hw;
 
 	struct perf_event_context	*ctx;
@@ -643,6 +662,8 @@ struct perf_event {
 
 	perf_callback_t			callback;
 
+	perf_callback_t			event_callback;
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -831,6 +852,7 @@ extern int sysctl_perf_event_sample_rate;
 extern void perf_event_init(void);
 extern void perf_tp_event(int event_id, u64 addr, u64 count,
 				 void *record, int entry_size);
+extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
 #define perf_misc_flags(regs)	(user_mode(regs) ? PERF_RECORD_MISC_USER : \
@@ -865,6 +887,8 @@ static inline int perf_event_task_enable(void)				{ return -EINVAL; }
 static inline void
 perf_sw_event(u32 event_id, u64 nr, int nmi,
 		     struct pt_regs *regs, u64 addr)			{ }
+static inline void
+perf_bp_event(struct perf_event *event, void *data)		{ }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f80123..266f8920628a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -979,6 +980,10 @@ NORET_TYPE void do_exit(long code)
 
 	proc_exit_connector(tsk);
 
+	/*
+	 * FIXME: do that only when needed, using sched_exit tracepoint
+	 */
+	flush_ptrace_hw_breakpoint(tsk);
 	/*
 	 * Flush inherited counters to the parent - before the parent
 	 * gets woken up by child-exit notifications.
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c1f64e65a9f3..08f6d0163201 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -15,6 +15,7 @@
  *
  * Copyright (C) 2007 Alan Stern
  * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
  */
 
 /*
@@ -35,334 +36,242 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 
-#include <asm/hw_breakpoint.h>
+#include <linux/hw_breakpoint.h>
+
 #include <asm/processor.h>
 
 #ifdef CONFIG_X86
 #include <asm/debugreg.h>
 #endif
-/*
- * Spinlock that protects all (un)register operations over kernel/user-space
- * breakpoint requests
- */
-static DEFINE_SPINLOCK(hw_breakpoint_lock);
-
-/* Array of kernel-space breakpoint structures */
-struct hw_breakpoint *hbp_kernel[HBP_NUM];
-
-/*
- * Per-processor copy of hbp_kernel[]. Used only when hbp_kernel is being
- * modified but we need the older copy to handle any hbp exceptions. It will
- * sync with hbp_kernel[] value after updation is done through IPIs.
- */
-DEFINE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]);
-
-/*
- * Kernel breakpoints grow downwards, starting from HBP_NUM
- * 'hbp_kernel_pos' denotes lowest numbered breakpoint register occupied for
- * kernel-space request. We will initialise it here and not in an __init
- * routine because load_debug_registers(), which uses this variable can be
- * called very early during CPU initialisation.
- */
-unsigned int hbp_kernel_pos = HBP_NUM;
 
-/*
- * An array containing refcount of threads using a given bkpt register
- * Accesses are synchronised by acquiring hw_breakpoint_lock
- */
-unsigned int hbp_user_refcount[HBP_NUM];
+static atomic_t bp_slot;
 
-/*
- * Load the debug registers during startup of a CPU.
- */
-void load_debug_registers(void)
+int reserve_bp_slot(struct perf_event *bp)
 {
-	unsigned long flags;
-	struct task_struct *tsk = current;
-
-	spin_lock_bh(&hw_breakpoint_lock);
-
-	/* Prevent IPIs for new kernel breakpoint updates */
-	local_irq_save(flags);
-	arch_update_kernel_hw_breakpoint(NULL);
-	local_irq_restore(flags);
-
-	if (test_tsk_thread_flag(tsk, TIF_DEBUG))
-		arch_install_thread_hw_breakpoint(tsk);
-
-	spin_unlock_bh(&hw_breakpoint_lock);
-}
+	if (atomic_inc_return(&bp_slot) == HBP_NUM) {
+		atomic_dec(&bp_slot);
 
-/*
- * Erase all the hardware breakpoint info associated with a thread.
- *
- * If tsk != current then tsk must not be usable (for example, a
- * child being cleaned up from a failed fork).
- */
-void flush_thread_hw_breakpoint(struct task_struct *tsk)
-{
-	int i;
-	struct thread_struct *thread = &(tsk->thread);
-
-	spin_lock_bh(&hw_breakpoint_lock);
-
-	/* The thread no longer has any breakpoints associated with it */
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
-	for (i = 0; i < HBP_NUM; i++) {
-		if (thread->hbp[i]) {
-			hbp_user_refcount[i]--;
-			kfree(thread->hbp[i]);
-			thread->hbp[i] = NULL;
-		}
+		return -ENOSPC;
 	}
 
-	arch_flush_thread_hw_breakpoint(tsk);
-
-	/* Actually uninstall the breakpoints if necessary */
-	if (tsk == current)
-		arch_uninstall_thread_hw_breakpoint();
-	spin_unlock_bh(&hw_breakpoint_lock);
+	return 0;
 }
 
-/*
- * Copy the hardware breakpoint info from a thread to its cloned child.
- */
-int copy_thread_hw_breakpoint(struct task_struct *tsk,
-		struct task_struct *child, unsigned long clone_flags)
+void release_bp_slot(struct perf_event *bp)
 {
-	/*
-	 * We will assume that breakpoint settings are not inherited
-	 * and the child starts out with no debug registers set.
-	 * But what about CLONE_PTRACE?
-	 */
-	clear_tsk_thread_flag(child, TIF_DEBUG);
-
-	/* We will call flush routine since the debugregs are not inherited */
-	arch_flush_thread_hw_breakpoint(child);
-
-	return 0;
+	atomic_dec(&bp_slot);
 }
 
-static int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
-					struct hw_breakpoint *bp)
+int __register_perf_hw_breakpoint(struct perf_event *bp)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int rc;
+	int ret;
 
-	/* Do not overcommit. Fail if kernel has used the hbp registers */
-	if (pos >= hbp_kernel_pos)
-		return -ENOSPC;
+	ret = reserve_bp_slot(bp);
+	if (ret)
+		return ret;
 
-	rc = arch_validate_hwbkpt_settings(bp, tsk);
-	if (rc)
-		return rc;
+	if (!bp->attr.disabled)
+		ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
 
-	thread->hbp[pos] = bp;
-	hbp_user_refcount[pos]++;
+	return ret;
+}
 
-	arch_update_user_hw_breakpoint(pos, tsk);
-	/*
-	 * Does it need to be installed right now?
-	 * Otherwise it will get installed the next time tsk runs
-	 */
-	if (tsk == current)
-		arch_install_thread_hw_breakpoint(tsk);
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+	bp->callback = perf_bp_event;
 
-	return rc;
+	return __register_perf_hw_breakpoint(bp);
 }
 
 /*
- * Modify the address of a hbp register already in use by the task
- * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint()
+ * Register a breakpoint bound to a task and a given cpu.
+ * If cpu is -1, the breakpoint is active for the task in every cpu
+ * If the task is -1, the breakpoint is active for every tasks in the given
+ * cpu.
  */
-static int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
-					struct hw_breakpoint *bp)
+static struct perf_event *
+register_user_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				pid_t pid,
+				int cpu,
+				bool active)
 {
-	struct thread_struct *thread = &(tsk->thread);
-
-	if ((pos >= hbp_kernel_pos) || (arch_validate_hwbkpt_settings(bp, tsk)))
-		return -EINVAL;
-
-	if (thread->hbp[pos] == NULL)
-		return -EINVAL;
-
-	thread->hbp[pos] = bp;
+	struct perf_event_attr *attr;
+	struct perf_event *bp;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return ERR_PTR(-ENOMEM);
+
+	attr->type = PERF_TYPE_BREAKPOINT;
+	attr->size = sizeof(*attr);
+	attr->bp_addr = addr;
+	attr->bp_len = len;
+	attr->bp_type = type;
 	/*
-	 * 'pos' must be that of a hbp register already used by 'tsk'
-	 * Otherwise arch_modify_user_hw_breakpoint() will fail
+	 * Such breakpoints are used by debuggers to trigger signals when
+	 * we hit the excepted memory op. We can't miss such events, they
+	 * must be pinned.
 	 */
-	arch_update_user_hw_breakpoint(pos, tsk);
+	attr->pinned = 1;
 
-	if (tsk == current)
-		arch_install_thread_hw_breakpoint(tsk);
+	if (!active)
+		attr->disabled = 1;
 
-	return 0;
-}
-
-static void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk)
-{
-	hbp_user_refcount[pos]--;
-	tsk->thread.hbp[pos] = NULL;
+	bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered);
+	kfree(attr);
 
-	arch_update_user_hw_breakpoint(pos, tsk);
-
-	if (tsk == current)
-		arch_install_thread_hw_breakpoint(tsk);
+	return bp;
 }
 
 /**
  * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @bp: the breakpoint structure to register
- *
- * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
- * @bp->triggered must be set properly before invocation
+ * @active: should we activate it while registering it
  *
  */
-int register_user_hw_breakpoint(struct task_struct *tsk,
-					struct hw_breakpoint *bp)
+struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int i, rc = -ENOSPC;
-
-	spin_lock_bh(&hw_breakpoint_lock);
-
-	for (i = 0; i < hbp_kernel_pos; i++) {
-		if (!thread->hbp[i]) {
-			rc = __register_user_hw_breakpoint(i, tsk, bp);
-			break;
-		}
-	}
-	if (!rc)
-		set_tsk_thread_flag(tsk, TIF_DEBUG);
-
-	spin_unlock_bh(&hw_breakpoint_lock);
-	return rc;
+	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
+					       tsk->pid, -1, active);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
 /**
  * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @bp: the breakpoint structure to unregister
- *
+ * @active: should we activate it while registering it
  */
-int modify_user_hw_breakpoint(struct task_struct *tsk, struct hw_breakpoint *bp)
+struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int i, ret = -ENOENT;
+	/*
+	 * FIXME: do it without unregistering
+	 * - We don't want to lose our slot
+	 * - If the new bp is incorrect, don't lose the older one
+	 */
+	unregister_hw_breakpoint(bp);
 
-	spin_lock_bh(&hw_breakpoint_lock);
-	for (i = 0; i < hbp_kernel_pos; i++) {
-		if (bp == thread->hbp[i]) {
-			ret = __modify_user_hw_breakpoint(i, tsk, bp);
-			break;
-		}
-	}
-	spin_unlock_bh(&hw_breakpoint_lock);
-	return ret;
+	return register_user_hw_breakpoint(addr, len, type, triggered,
+					   tsk, active);
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
 
 /**
- * unregister_user_hw_breakpoint - unregister a user-space hardware breakpoint
- * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
  * @bp: the breakpoint structure to unregister
- *
  */
-void unregister_user_hw_breakpoint(struct task_struct *tsk,
-						struct hw_breakpoint *bp)
+void unregister_hw_breakpoint(struct perf_event *bp)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int i, pos = -1, hbp_counter = 0;
-
-	spin_lock_bh(&hw_breakpoint_lock);
-	for (i = 0; i < hbp_kernel_pos; i++) {
-		if (thread->hbp[i])
-			hbp_counter++;
-		if (bp == thread->hbp[i])
-			pos = i;
-	}
-	if (pos >= 0) {
-		__unregister_user_hw_breakpoint(pos, tsk);
-		hbp_counter--;
-	}
-	if (!hbp_counter)
-		clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
-	spin_unlock_bh(&hw_breakpoint_lock);
+	if (!bp)
+		return;
+	perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+static struct perf_event *
+register_kernel_hw_breakpoint_cpu(unsigned long addr,
+				  int len,
+				  int type,
+				  perf_callback_t triggered,
+				  int cpu,
+				  bool active)
+{
+	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
+					       -1, cpu, active);
 }
-EXPORT_SYMBOL_GPL(unregister_user_hw_breakpoint);
 
 /**
- * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
- * @bp: the breakpoint structure to register
- *
- * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
- * @bp->triggered must be set properly before invocation
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @active: should we activate it while registering it
  *
+ * @return a set of per_cpu pointers to perf events
  */
-int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active)
 {
-	int rc;
+	struct perf_event **cpu_events, **pevent, *bp;
+	long err;
+	int cpu;
+
+	cpu_events = alloc_percpu(typeof(*cpu_events));
+	if (!cpu_events)
+		return ERR_PTR(-ENOMEM);
 
-	rc = arch_validate_hwbkpt_settings(bp, NULL);
-	if (rc)
-		return rc;
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		bp = register_kernel_hw_breakpoint_cpu(addr, len, type,
+					triggered, cpu, active);
 
-	spin_lock_bh(&hw_breakpoint_lock);
+		*pevent = bp;
 
-	rc = -ENOSPC;
-	/* Check if we are over-committing */
-	if ((hbp_kernel_pos > 0) && (!hbp_user_refcount[hbp_kernel_pos-1])) {
-		hbp_kernel_pos--;
-		hbp_kernel[hbp_kernel_pos] = bp;
-		on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1);
-		rc = 0;
+		if (IS_ERR(bp) || !bp) {
+			err = PTR_ERR(bp);
+			goto fail;
+		}
 	}
 
-	spin_unlock_bh(&hw_breakpoint_lock);
-	return rc;
+	return cpu_events;
+
+fail:
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		if (IS_ERR(*pevent) || !*pevent)
+			break;
+		unregister_hw_breakpoint(*pevent);
+	}
+	free_percpu(cpu_events);
+	/* return the error if any */
+	return ERR_PTR(err);
 }
-EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
 
 /**
- * unregister_kernel_hw_breakpoint - unregister a HW breakpoint for kernel space
- * @bp: the breakpoint structure to unregister
- *
- * Uninstalls and unregisters @bp.
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
  */
-void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
 {
-	int i, j;
-
-	spin_lock_bh(&hw_breakpoint_lock);
-
-	/* Find the 'bp' in our list of breakpoints for kernel */
-	for (i = hbp_kernel_pos; i < HBP_NUM; i++)
-		if (bp == hbp_kernel[i])
-			break;
+	int cpu;
+	struct perf_event **pevent;
 
-	/* Check if we did not find a match for 'bp'. If so return early */
-	if (i == HBP_NUM) {
-		spin_unlock_bh(&hw_breakpoint_lock);
-		return;
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		unregister_hw_breakpoint(*pevent);
 	}
-
-	/*
-	 * We'll shift the breakpoints one-level above to compact if
-	 * unregistration creates a hole
-	 */
-	for (j = i; j > hbp_kernel_pos; j--)
-		hbp_kernel[j] = hbp_kernel[j-1];
-
-	hbp_kernel[hbp_kernel_pos] = NULL;
-	on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1);
-	hbp_kernel_pos++;
-
-	spin_unlock_bh(&hw_breakpoint_lock);
+	free_percpu(cpu_events);
 }
-EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
 
 static struct notifier_block hw_breakpoint_exceptions_nb = {
 	.notifier_call = hw_breakpoint_exceptions_notify,
@@ -374,5 +283,12 @@ static int __init init_hw_breakpoint(void)
 {
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
 }
-
 core_initcall(init_hw_breakpoint);
+
+
+struct pmu perf_ops_bp = {
+	.enable		= arch_install_hw_breakpoint,
+	.disable	= arch_uninstall_hw_breakpoint,
+	.read		= hw_breakpoint_pmu_read,
+	.unthrottle	= hw_breakpoint_pmu_unthrottle
+};
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 5087125e2a00..98dc56b2ebe4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -29,6 +29,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
@@ -4229,6 +4230,51 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #endif /* CONFIG_EVENT_PROFILE */
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+	release_bp_slot(event);
+}
+
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+	int err;
+	/*
+	 * The breakpoint is already filled if we haven't created the counter
+	 * through perf syscall
+	 * FIXME: manage to get trigerred to NULL if it comes from syscalls
+	 */
+	if (!bp->callback)
+		err = register_perf_hw_breakpoint(bp);
+	else
+		err = __register_perf_hw_breakpoint(bp);
+	if (err)
+		return ERR_PTR(err);
+
+	bp->destroy = bp_perf_event_destroy;
+
+	return &perf_ops_bp;
+}
+
+void perf_bp_event(struct perf_event *bp, void *regs)
+{
+	/* TODO */
+}
+#else
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+}
+
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+	return NULL;
+}
+
+void perf_bp_event(struct perf_event *bp, void *regs)
+{
+}
+#endif
+
 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 static void sw_perf_event_destroy(struct perf_event *event)
@@ -4375,6 +4421,11 @@ perf_event_alloc(struct perf_event_attr *attr,
 		pmu = tp_perf_event_init(event);
 		break;
 
+	case PERF_TYPE_BREAKPOINT:
+		pmu = bp_perf_event_init(event);
+		break;
+
+
 	default:
 		break;
 	}
@@ -4686,7 +4737,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	ctx = find_get_context(pid, cpu);
 	if (IS_ERR(ctx))
-		return NULL ;
+		return NULL;
 
 	event = perf_event_alloc(attr, cpu, ctx, NULL,
 				     NULL, callback, GFP_KERNEL);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 91c3d0e9a5a1..d72f06ff263f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,14 +11,11 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
+#include <linux/hw_breakpoint.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
 
-#ifdef CONFIG_KSYM_TRACER
-#include <asm/hw_breakpoint.h>
-#endif
-
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e19747d4f860..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -372,11 +372,11 @@ FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
 	F_STRUCT(
 		__field(	unsigned long,	ip			  )
 		__field(	unsigned char,	type			  )
-		__array(	char	     ,	ksym_name, KSYM_NAME_LEN  )
 		__array(	char	     ,	cmd,	   TASK_COMM_LEN  )
+		__field(	unsigned long,  addr			  )
 	),
 
-	F_printk("ip: %pF type: %d ksym_name: %s cmd: %s",
+	F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
 		(void *)__entry->ip, (unsigned int)__entry->type,
-		__entry->ksym_name, __entry->cmd)
+		(void *)__entry->addr,  __entry->cmd)
 );
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 6d5609c67378..fea83eeeef09 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -29,7 +29,11 @@
 #include "trace_stat.h"
 #include "trace.h"
 
-/* For now, let us restrict the no. of symbols traced simultaneously to number
+#include <linux/hw_breakpoint.h>
+#include <asm/hw_breakpoint.h>
+
+/*
+ * For now, let us restrict the no. of symbols traced simultaneously to number
  * of available hardware breakpoint registers.
  */
 #define KSYM_TRACER_MAX HBP_NUM
@@ -37,8 +41,10 @@
 #define KSYM_TRACER_OP_LEN 3 /* rw- */
 
 struct trace_ksym {
-	struct hw_breakpoint	*ksym_hbp;
+	struct perf_event	**ksym_hbp;
 	unsigned long		ksym_addr;
+	int			type;
+	int			len;
 #ifdef CONFIG_PROFILE_KSYM_TRACER
 	unsigned long		counter;
 #endif
@@ -75,10 +81,11 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
 }
 #endif /* CONFIG_PROFILE_KSYM_TRACER */
 
-void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
+void ksym_hbp_handler(struct perf_event *hbp, void *data)
 {
 	struct ring_buffer_event *event;
 	struct ksym_trace_entry *entry;
+	struct pt_regs *regs = data;
 	struct ring_buffer *buffer;
 	int pc;
 
@@ -96,12 +103,12 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
 
 	entry		= ring_buffer_event_data(event);
 	entry->ip	= instruction_pointer(regs);
-	entry->type	= hbp->info.type;
-	strlcpy(entry->ksym_name, hbp->info.name, KSYM_SYMBOL_LEN);
+	entry->type	= hw_breakpoint_type(hbp);
+	entry->addr	= hw_breakpoint_addr(hbp);
 	strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
 
 #ifdef CONFIG_PROFILE_KSYM_TRACER
-	ksym_collect_stats(hbp->info.address);
+	ksym_collect_stats(hw_breakpoint_addr(hbp));
 #endif /* CONFIG_PROFILE_KSYM_TRACER */
 
 	trace_buffer_unlock_commit(buffer, event, 0, pc);
@@ -120,31 +127,21 @@ static int ksym_trace_get_access_type(char *str)
 	int access = 0;
 
 	if (str[0] == 'r')
-		access += 4;
-	else if (str[0] != '-')
-		return -EINVAL;
+		access |= HW_BREAKPOINT_R;
 
 	if (str[1] == 'w')
-		access += 2;
-	else if (str[1] != '-')
-		return -EINVAL;
+		access |= HW_BREAKPOINT_W;
 
-	if (str[2] != '-')
-		return -EINVAL;
+	if (str[2] == 'x')
+		access |= HW_BREAKPOINT_X;
 
 	switch (access) {
-	case 6:
-		access = HW_BREAKPOINT_RW;
-		break;
-	case 4:
-		access = -EINVAL;
-		break;
-	case 2:
-		access = HW_BREAKPOINT_WRITE;
-		break;
+	case HW_BREAKPOINT_W:
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		return access;
+	default:
+		return -EINVAL;
 	}
-
-	return access;
 }
 
 /*
@@ -194,36 +191,33 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 	if (!entry)
 		return -ENOMEM;
 
-	entry->ksym_hbp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
-	if (!entry->ksym_hbp)
-		goto err;
-
-	entry->ksym_hbp->info.name = kstrdup(ksymname, GFP_KERNEL);
-	if (!entry->ksym_hbp->info.name)
-		goto err;
-
-	entry->ksym_hbp->info.type = op;
-	entry->ksym_addr = entry->ksym_hbp->info.address = addr;
-#ifdef CONFIG_X86
-	entry->ksym_hbp->info.len = HW_BREAKPOINT_LEN_4;
-#endif
-	entry->ksym_hbp->triggered = (void *)ksym_hbp_handler;
+	entry->type = op;
+	entry->ksym_addr = addr;
+	entry->len = HW_BREAKPOINT_LEN_4;
+
+	ret = -EAGAIN;
+	entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr,
+					entry->len, entry->type,
+					ksym_hbp_handler, true);
+	if (IS_ERR(entry->ksym_hbp)) {
+		entry->ksym_hbp = NULL;
+		ret = PTR_ERR(entry->ksym_hbp);
+	}
 
-	ret = register_kernel_hw_breakpoint(entry->ksym_hbp);
-	if (ret < 0) {
+	if (!entry->ksym_hbp) {
 		printk(KERN_INFO "ksym_tracer request failed. Try again"
 					" later!!\n");
-		ret = -EAGAIN;
 		goto err;
 	}
+
 	hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
 	ksym_filter_entry_count++;
+
 	return 0;
+
 err:
-	if (entry->ksym_hbp)
-		kfree(entry->ksym_hbp->info.name);
-	kfree(entry->ksym_hbp);
 	kfree(entry);
+
 	return ret;
 }
 
@@ -244,10 +238,10 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
 	mutex_lock(&ksym_tracer_mutex);
 
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		ret = trace_seq_printf(s, "%s:", entry->ksym_hbp->info.name);
-		if (entry->ksym_hbp->info.type == HW_BREAKPOINT_WRITE)
+		ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr);
+		if (entry->type == HW_BREAKPOINT_W)
 			ret = trace_seq_puts(s, "-w-\n");
-		else if (entry->ksym_hbp->info.type == HW_BREAKPOINT_RW)
+		else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
 			ret = trace_seq_puts(s, "rw-\n");
 		WARN_ON_ONCE(!ret);
 	}
@@ -269,12 +263,10 @@ static void __ksym_trace_reset(void)
 	mutex_lock(&ksym_tracer_mutex);
 	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
 								ksym_hlist) {
-		unregister_kernel_hw_breakpoint(entry->ksym_hbp);
+		unregister_wide_hw_breakpoint(entry->ksym_hbp);
 		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
-		kfree(entry->ksym_hbp->info.name);
-		kfree(entry->ksym_hbp);
 		kfree(entry);
 	}
 	mutex_unlock(&ksym_tracer_mutex);
@@ -327,7 +319,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
 		if (entry->ksym_addr == ksym_addr) {
 			/* Check for malformed request: (6) */
-			if (entry->ksym_hbp->info.type != op)
+			if (entry->type != op)
 				changed = 1;
 			else
 				goto out;
@@ -335,18 +327,21 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 		}
 	}
 	if (changed) {
-		unregister_kernel_hw_breakpoint(entry->ksym_hbp);
-		entry->ksym_hbp->info.type = op;
+		unregister_wide_hw_breakpoint(entry->ksym_hbp);
+		entry->type = op;
 		if (op > 0) {
-			ret = register_kernel_hw_breakpoint(entry->ksym_hbp);
-			if (ret == 0)
+			entry->ksym_hbp =
+				register_wide_hw_breakpoint(entry->ksym_addr,
+					entry->len, entry->type,
+					ksym_hbp_handler, true);
+			if (IS_ERR(entry->ksym_hbp))
+				entry->ksym_hbp = NULL;
+			if (!entry->ksym_hbp)
 				goto out;
 		}
 		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
-		kfree(entry->ksym_hbp->info.name);
-		kfree(entry->ksym_hbp);
 		kfree(entry);
 		ret = 0;
 		goto out;
@@ -413,16 +408,16 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_printf(s, "%11s-%-5d [%03d] %-30s ", field->cmd,
-				entry->pid, iter->cpu, field->ksym_name);
+	ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
+				entry->pid, iter->cpu, (char *)field->addr);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	switch (field->type) {
-	case HW_BREAKPOINT_WRITE:
+	case HW_BREAKPOINT_W:
 		ret = trace_seq_printf(s, " W  ");
 		break;
-	case HW_BREAKPOINT_RW:
+	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
 		ret = trace_seq_printf(s, " RW ");
 		break;
 	default:
@@ -490,14 +485,13 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v)
 
 	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
 
-	if (entry->ksym_hbp)
-		access_type = entry->ksym_hbp->info.type;
+	access_type = entry->type;
 
 	switch (access_type) {
-	case HW_BREAKPOINT_WRITE:
+	case HW_BREAKPOINT_W:
 		seq_puts(m, "  W           ");
 		break;
-	case HW_BREAKPOINT_RW:
+	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
 		seq_puts(m, "  RW          ");
 		break;
 	default:
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 7179c12e4f0f..27c5072c2e6b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -828,7 +828,8 @@ trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
 
 	ksym_selftest_dummy = 0;
 	/* Register the read-write tracing request */
-	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
+	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY,
+				     HW_BREAKPOINT_R | HW_BREAKPOINT_W,
 					(unsigned long)(&ksym_selftest_dummy));
 
 	if (ret < 0) {
-- 
cgit v1.2.3-71-gd317


From ba1c813a6b9a0ef14d7112daf51270eff326f037 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 10 Sep 2009 09:26:21 +0200
Subject: hw-breakpoints: Arbitrate access to pmu following registers
 constraints

Allow or refuse to build a counter using the breakpoints pmu following
given constraints.

We keep track of the pmu users by using three per cpu variables:

- nr_cpu_bp_pinned stores the number of pinned cpu breakpoints counters
  in the given cpu

- nr_bp_flexible stores the number of non-pinned breakpoints counters
  in the given cpu.

- task_bp_pinned stores the number of pinned task breakpoints in a cpu

The latter is not a simple counter but gathers the number of tasks that
have n pinned breakpoints.
Considering HBP_NUM the number of available breakpoint address
registers:
   task_bp_pinned[0] is the number of tasks having 1 breakpoint
   task_bp_pinned[1] is the number of tasks having 2 breakpoints
   [...]
   task_bp_pinned[HBP_NUM - 1] is the number of tasks having the
   maximum number of registers (HBP_NUM).

When a breakpoint counter is created and wants an access to the pmu,
we evaluate the following constraints:

== Non-pinned counter ==

- If attached to a single cpu, check:

    (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
         + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM

       -> If there are already non-pinned counters in this cpu, it
          means there is already a free slot for them.
          Otherwise, we check that the maximum number of per task
          breakpoints (for this cpu) plus the number of per cpu
          breakpoint (for this cpu) doesn't cover every registers.

- If attached to every cpus, check:

    (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
           + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM

       -> This is roughly the same, except we check the number of per
          cpu bp for every cpu and we keep the max one. Same for the
          per tasks breakpoints.

== Pinned counter ==

- If attached to a single cpu, check:

       ((per_cpu(nr_bp_flexible, cpu) > 1)
            + per_cpu(nr_cpu_bp_pinned, cpu)
            + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM

       -> Same checks as before. But now the nr_bp_flexible, if any,
          must keep one register at least (or flexible breakpoints will
          never be be fed).

- If attached to every cpus, check:

      ((per_cpu(nr_bp_flexible, *) > 1)
           + max(per_cpu(nr_cpu_bp_pinned, *))
           + max(per_cpu(task_bp_pinned, *))) < HBP_NUM

Changes in v2:

- Counter -> event rename

Changes in v5:

- Fix unreleased non-pinned task-bound-only counters. We only released
  it in the first cpu. (Thanks to Paul Mackerras for reporting that)

Changes in v6:

- Currently, events scheduling are done in this order: cpu context
  pinned + cpu context non-pinned + task context pinned + task context
  non-pinned events. Then our current constraints are right theoretically
  but not in practice, because non-pinned counters may be scheduled
  before we can apply every possible pinned counters. So consider
  non-pinned counters as pinned for now.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jan Kiszka <jan.kiszka@web.de>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
---
 kernel/hw_breakpoint.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 205 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 08f6d0163201..e662dc991c96 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -16,6 +16,8 @@
  * Copyright (C) 2007 Alan Stern
  * Copyright (C) IBM Corporation, 2009
  * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
  */
 
 /*
@@ -44,24 +46,221 @@
 #include <asm/debugreg.h>
 #endif
 
-static atomic_t bp_slot;
+/*
+ * Constraints data
+ */
+
+/* Number of pinned cpu breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
 
-int reserve_bp_slot(struct perf_event *bp)
+/* Number of pinned task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
+
+/* Number of non-pinned cpu/task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+	unsigned int pinned;
+	unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * have in this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu)
 {
-	if (atomic_inc_return(&bp_slot) == HBP_NUM) {
-		atomic_dec(&bp_slot);
+	int i;
+	unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
 
-		return -ENOSPC;
+	for (i = HBP_NUM -1; i >= 0; i--) {
+		if (tsk_pinned[i] > 0)
+			return i + 1;
 	}
 
 	return 0;
 }
 
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
+{
+	if (cpu >= 0) {
+		slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+		slots->pinned += max_task_bp_pinned(cpu);
+		slots->flexible = per_cpu(nr_bp_flexible, cpu);
+
+		return;
+	}
+
+	for_each_online_cpu(cpu) {
+		unsigned int nr;
+
+		nr = per_cpu(nr_cpu_bp_pinned, cpu);
+		nr += max_task_bp_pinned(cpu);
+
+		if (nr > slots->pinned)
+			slots->pinned = nr;
+
+		nr = per_cpu(nr_bp_flexible, cpu);
+
+		if (nr > slots->flexible)
+			slots->flexible = nr;
+	}
+}
+
+/*
+ * Add a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+{
+	int count = 0;
+	struct perf_event *bp;
+	struct perf_event_context *ctx = tsk->perf_event_ctxp;
+	unsigned int *task_bp_pinned;
+	struct list_head *list;
+	unsigned long flags;
+
+	if (WARN_ONCE(!ctx, "No perf context for this task"))
+		return;
+
+	list = &ctx->event_list;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	/*
+	 * The current breakpoint counter is not included in the list
+	 * at the open() callback time
+	 */
+	list_for_each_entry(bp, list, event_entry) {
+		if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+			count++;
+	}
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
+		return;
+
+	task_bp_pinned = per_cpu(task_bp_pinned, cpu);
+	if (enable) {
+		task_bp_pinned[count]++;
+		if (count > 0)
+			task_bp_pinned[count-1]--;
+	} else {
+		task_bp_pinned[count]--;
+		if (count > 0)
+			task_bp_pinned[count-1]++;
+	}
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void toggle_bp_slot(struct perf_event *bp, bool enable)
+{
+	int cpu = bp->cpu;
+	struct task_struct *tsk = bp->ctx->task;
+
+	/* Pinned counter task profiling */
+	if (tsk) {
+		if (cpu >= 0) {
+			toggle_bp_task_slot(tsk, cpu, enable);
+			return;
+		}
+
+		for_each_online_cpu(cpu)
+			toggle_bp_task_slot(tsk, cpu, enable);
+		return;
+	}
+
+	/* Pinned counter cpu profiling */
+	if (enable)
+		per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+	else
+		per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+}
+
+/*
+ * Contraints to check before allowing this new breakpoint counter:
+ *
+ *  == Non-pinned counter == (Considered as pinned for now)
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
+ *           + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
+ *
+ *       -> If there are already non-pinned counters in this cpu, it means
+ *          there is already a free slot for them.
+ *          Otherwise, we check that the maximum number of per task
+ *          breakpoints (for this cpu) plus the number of per cpu breakpoint
+ *          (for this cpu) doesn't cover every registers.
+ *
+ *   - If attached to every cpus, check:
+ *
+ *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
+ *           + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
+ *
+ *       -> This is roughly the same, except we check the number of per cpu
+ *          bp for every cpu and we keep the max one. Same for the per tasks
+ *          breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
+ *            + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
+ *
+ *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ *          one register at least (or they will never be fed).
+ *
+ *   - If attached to every cpus, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
+ *            + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
+ */
+int reserve_bp_slot(struct perf_event *bp)
+{
+	struct bp_busy_slots slots = {0};
+	int ret = 0;
+
+	mutex_lock(&nr_bp_mutex);
+
+	fetch_bp_busy_slots(&slots, bp->cpu);
+
+	/* Flexible counters need to keep at least one slot */
+	if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
+		ret = -ENOSPC;
+		goto end;
+	}
+
+	toggle_bp_slot(bp, true);
+
+end:
+	mutex_unlock(&nr_bp_mutex);
+
+	return ret;
+}
+
 void release_bp_slot(struct perf_event *bp)
 {
-	atomic_dec(&bp_slot);
+	mutex_lock(&nr_bp_mutex);
+
+	toggle_bp_slot(bp, false);
+
+	mutex_unlock(&nr_bp_mutex);
 }
 
+
 int __register_perf_hw_breakpoint(struct perf_event *bp)
 {
 	int ret;
-- 
cgit v1.2.3-71-gd317


From 30ff21e31fe5c8b7b1b7d30cc41e32bc4ee9f175 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 10 Sep 2009 09:35:20 +0800
Subject: ksym_tracer: Remove KSYM_SELFTEST_ENTRY

The macro used to be used in both trace_selftest.c and
trace_ksym.c, but no longer, so remove it from header file.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace.h          | 1 -
 kernel/trace/trace_selftest.c | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d72f06ff263f..ee00475742eb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -367,7 +367,6 @@ int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 int is_tracing_stopped(void);
 
-#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
 extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
 
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 27c5072c2e6b..dc98309e839a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -828,7 +828,8 @@ trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
 
 	ksym_selftest_dummy = 0;
 	/* Register the read-write tracing request */
-	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY,
+
+	ret = process_new_ksym_entry("ksym_selftest_dummy",
 				     HW_BREAKPOINT_R | HW_BREAKPOINT_W,
 					(unsigned long)(&ksym_selftest_dummy));
 
-- 
cgit v1.2.3-71-gd317


From b71a8eb0fa64ec6d00175f479e3ef851703568af Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 6 Oct 2009 12:42:51 +0200
Subject: tree-wide: fix typos "selct" + "slect" -> "select"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch was generated by

	git grep -E -i -l 's(le|el)ct' | xargs -r perl -p -i -e 's/([Ss])(le|el)ct/$1elect/

with only skipping net/netfilter/xt_SECMARK.c and
include/linux/netfilter/xt_SECMARK.h which have a struct member called
selctx.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/media/video/ov772x.c       | 2 +-
 drivers/scsi/aic7xxx/aic79xx.seq   | 2 +-
 drivers/scsi/aic7xxx/aic79xx_osm.c | 2 +-
 drivers/scsi/aic7xxx/aic7xxx.seq   | 2 +-
 drivers/scsi/aic7xxx/aic7xxx_osm.c | 2 +-
 drivers/scsi/dc395x.c              | 2 +-
 include/linux/spi/spi_bitbang.h    | 2 +-
 kernel/time/clocksource.c          | 2 +-
 sound/pci/hda/patch_cirrus.c       | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/drivers/media/video/ov772x.c b/drivers/media/video/ov772x.c
index eccb40ab7fec..205229333466 100644
--- a/drivers/media/video/ov772x.c
+++ b/drivers/media/video/ov772x.c
@@ -247,7 +247,7 @@
 
 /* COM5 */
 #define AFR_ON_OFF      0x80	/* Auto frame rate control ON/OFF selection */
-#define AFR_SPPED       0x40	/* Auto frame rate control speed slection */
+#define AFR_SPPED       0x40	/* Auto frame rate control speed selection */
 				/* Auto frame rate max rate control */
 #define AFR_NO_RATE     0x00	/*     No  reduction of frame rate */
 #define AFR_1p2         0x10	/*     Max reduction to 1/2 frame rate */
diff --git a/drivers/scsi/aic7xxx/aic79xx.seq b/drivers/scsi/aic7xxx/aic79xx.seq
index 58bc17591b54..3b66b5ae3d9f 100644
--- a/drivers/scsi/aic7xxx/aic79xx.seq
+++ b/drivers/scsi/aic7xxx/aic79xx.seq
@@ -1281,7 +1281,7 @@ END_CRITICAL;
  * Is it a disconnect message?  Set a flag in the SCB to remind us
  * and await the bus going free.  If this is an untagged transaction
  * store the SCB id for it in our untagged target table for lookup on
- * a reselction.
+ * a reselection.
  */
 mesgin_disconnect:
 	/*
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 75b23317bd26..1222a7ac698a 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -2335,7 +2335,7 @@ ahd_linux_queue_abort_cmd(struct scsi_cmnd *cmd)
 			/*
 			 * The sequencer will never re-reference the
 			 * in-core SCB.  To make sure we are notified
-			 * during reslection, set the MK_MESSAGE flag in
+			 * during reselection, set the MK_MESSAGE flag in
 			 * the card's copy of the SCB.
 			 */
 			ahd_outb(ahd, SCB_CONTROL,
diff --git a/drivers/scsi/aic7xxx/aic7xxx.seq b/drivers/scsi/aic7xxx/aic7xxx.seq
index 15196390e28d..5a4cfc954a9f 100644
--- a/drivers/scsi/aic7xxx/aic7xxx.seq
+++ b/drivers/scsi/aic7xxx/aic7xxx.seq
@@ -1693,7 +1693,7 @@ if ((ahc->flags & AHC_INITIATORROLE) != 0) {
  * Is it a disconnect message?  Set a flag in the SCB to remind us
  * and await the bus going free.  If this is an untagged transaction
  * store the SCB id for it in our untagged target table for lookup on
- * a reselction.
+ * a reselection.
  */
 mesgin_disconnect:
 	/*
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index fd2b9785ff4f..8cb05dc8e6a1 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -2290,7 +2290,7 @@ ahc_linux_queue_recovery_cmd(struct scsi_cmnd *cmd, scb_flag flag)
 		 * In the non-paging case, the sequencer will
 		 * never re-reference the in-core SCB.
 		 * To make sure we are notified during
-		 * reslection, set the MK_MESSAGE flag in
+		 * reselection, set the MK_MESSAGE flag in
 		 * the card's copy of the SCB.
 		 */
 		if ((ahc->flags & AHC_PAGESCBS) == 0) {
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index 075e2397273c..6c59c02c1ed9 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -1509,7 +1509,7 @@ static u8 start_scsi(struct AdapterCtlBlk* acb, struct DeviceCtlBlk* dcb,
 		 * Try anyway?
 		 *
 		 * We could, BUT: Sometimes the TRM_S1040 misses to produce a Selection
-		 * Timeout, a Disconnect or a Reselction IRQ, so we would be screwed!
+		 * Timeout, a Disconnect or a Reselection IRQ, so we would be screwed!
 		 * (This is likely to be a bug in the hardware. Obviously, most people
 		 *  only have one initiator per SCSI bus.)
 		 * Instead let this fail and have the timer make sure the command is 
diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index eed4254bd503..3274c507b8a9 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -15,7 +15,7 @@
  * Some hardware works well with requests at spi_transfer scope:
  *
  *   -	Drivers leveraging smarter hardware, with fifos or DMA; or for half
- *	duplex (MicroWire) controllers.  Provide chipslect() and txrx_bufs(),
+ *	duplex (MicroWire) controllers.  Provide chipselect() and txrx_bufs(),
  *	and custom setup()/cleanup() methods.
  */
 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5e18c6ab2c6a..c403567f78c0 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -580,7 +580,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
  * @count:	length of buffer
  *
  * Takes input from sysfs interface for manually overriding the default
- * clocksource selction.
+ * clocksource selection.
  */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 					  struct sysdev_attribute *attr,
diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c
index 8ba306856d38..7b0446fa6009 100644
--- a/sound/pci/hda/patch_cirrus.c
+++ b/sound/pci/hda/patch_cirrus.c
@@ -947,7 +947,7 @@ static void init_input(struct hda_codec *codec)
 		coef |= 0x0500; /* DMIC2 enable 2 channels, disable GPIO1 */
 	if (is_active_pin(codec, CS_DMIC1_PIN_NID))
 		coef |= 0x1800; /* DMIC1 enable 2 channels, disable GPIO0 
-				 * No effect if SPDIF_OUT2 is slected in 
+				 * No effect if SPDIF_OUT2 is selected in 
 				 * IDX_SPDIF_CTL.
 				  */
 	cs_vendor_coef_set(codec, IDX_ADC_CFG, coef);
-- 
cgit v1.2.3-71-gd317


From f84d49b218b7d4c6cba2e0b41f24bd4045403962 Mon Sep 17 00:00:00 2001
From: Naohiro Ooiwa <nooiwa@miraclelinux.com>
Date: Mon, 9 Nov 2009 00:46:42 +0900
Subject: signal: Print warning message when dropping signals

When the system has too many timers or too many aggregate
queued signals, the EAGAIN error is returned to application
from kernel, including timer_create() [POSIX.1b].

It means that the app exceeded the limit of pending signals,
but in general application writers do not expect this
outcome and the current silent failure can cause rare app
failures under very high load.

This patch adds a new message when we reach the limit
and if print_fatal_signals is enabled:

    task/1234: reached RLIMIT_SIGPENDING, dropping signal

If you see this message and your system behaved unexpectedly,
you can run following command to lift the limit:

   # ulimit -i unlimited

With help from Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>.

Signed-off-by: Naohiro Ooiwa <nooiwa@miraclelinux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: oleg@redhat.com
LKML-Reference: <4AF6E7E2.9080406@miraclelinux.com>
[ Modified a few small details, gave surrounding code some love. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt | 11 +++++++--
 kernel/signal.c                     | 46 ++++++++++++++++++++++++++-----------
 2 files changed, 42 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9107b387e91f..3bbd92f805a6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2032,8 +2032,15 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	print-fatal-signals=
 			[KNL] debug: print fatal signals
-			print-fatal-signals=1: print segfault info to
-			the kernel console.
+
+			If enabled, warn about various signal handling
+			related application anomalies: too many signals,
+			too many POSIX.1 timers, fatal signals causing a
+			coredump - etc.
+
+			If you hit the warning due to signal overflow,
+			you might want to try "ulimit -i unlimited".
+
 			default: off.
 
 	printk.time=	Show timing data prefixed to each printk message line
diff --git a/kernel/signal.c b/kernel/signal.c
index 6705320784fd..fe08008133da 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/signalfd.h>
+#include <linux/ratelimit.h>
 #include <linux/tracehook.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
@@ -41,6 +42,8 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
+int print_fatal_signals __read_mostly;
+
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
 	return t->sighand->action[sig - 1].sa.sa_handler;
@@ -159,7 +162,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
 {
 	unsigned long i, *s, *m, x;
 	int sig = 0;
-	
+
 	s = pending->signal.sig;
 	m = mask->sig;
 	switch (_NSIG_WORDS) {
@@ -184,17 +187,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
 			sig = ffz(~x) + 1;
 		break;
 	}
-	
+
 	return sig;
 }
 
+static inline void print_dropped_signal(int sig)
+{
+	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+	if (!print_fatal_signals)
+		return;
+
+	if (!__ratelimit(&ratelimit_state))
+		return;
+
+	printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
+				current->comm, current->pid, sig);
+}
+
 /*
  * allocate a new signal queue record
  * - this may be called without locks if and only if t == current, otherwise an
  *   appopriate lock must be held to stop the target task from exiting
  */
-static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
-					 int override_rlimit)
+static struct sigqueue *
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
 {
 	struct sigqueue *q = NULL;
 	struct user_struct *user;
@@ -207,10 +224,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	 */
 	user = get_uid(__task_cred(t)->user);
 	atomic_inc(&user->sigpending);
+
 	if (override_rlimit ||
 	    atomic_read(&user->sigpending) <=
-			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
+			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
+	} else {
+		print_dropped_signal(sig);
+	}
+
 	if (unlikely(q == NULL)) {
 		atomic_dec(&user->sigpending);
 		free_uid(user);
@@ -869,7 +891,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	else
 		override_rlimit = 0;
 
-	q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
+	q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
 		override_rlimit);
 	if (q) {
 		list_add_tail(&q->list, &pending->list);
@@ -925,8 +947,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	return __send_signal(sig, info, t, group, from_ancestor_ns);
 }
 
-int print_fatal_signals;
-
 static void print_fatal_signal(struct pt_regs *regs, int signr)
 {
 	printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -1293,19 +1313,19 @@ EXPORT_SYMBOL(kill_pid);
  * These functions support sending signals using preallocated sigqueue
  * structures.  This is needed "because realtime applications cannot
  * afford to lose notifications of asynchronous events, like timer
- * expirations or I/O completions".  In the case of Posix Timers 
+ * expirations or I/O completions".  In the case of Posix Timers
  * we allocate the sigqueue structure from the timer_create.  If this
  * allocation fails we are able to report the failure to the application
  * with an EAGAIN error.
  */
- 
 struct sigqueue *sigqueue_alloc(void)
 {
-	struct sigqueue *q;
+	struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
 
-	if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0)))
+	if (q)
 		q->flags |= SIGQUEUE_PREALLOC;
-	return(q);
+
+	return q;
 }
 
 void sigqueue_free(struct sigqueue *q)
-- 
cgit v1.2.3-71-gd317


From dd8dbf2e6880e30c00b18600c962d0cb5a03c555 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Nov 2009 16:35:32 +1100
Subject: security: report the module name to security_module_request

For SELinux to do better filtering in userspace we send the name of the
module along with the AVC denial when a program is denied module_request.

Example output:

type=SYSCALL msg=audit(11/03/2009 10:59:43.510:9) : arch=x86_64 syscall=write success=yes exit=2 a0=3 a1=7fc28c0d56c0 a2=2 a3=7fffca0d7440 items=0 ppid=1727 pid=1729 auid=unset uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=(none) ses=unset comm=rpc.nfsd exe=/usr/sbin/rpc.nfsd subj=system_u:system_r:nfsd_t:s0 key=(null)
type=AVC msg=audit(11/03/2009 10:59:43.510:9) : avc:  denied  { module_request } for  pid=1729 comm=rpc.nfsd kmod="net-pf-10" scontext=system_u:system_r:nfsd_t:s0 tcontext=system_u:system_r:kernel_t:s0 tclass=system

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/lsm_audit.h | 18 ++++++++++--------
 include/linux/security.h  |  7 ++++---
 kernel/kmod.c             |  8 ++++----
 security/capability.c     |  2 +-
 security/lsm_audit.c      |  4 ++++
 security/security.c       |  4 ++--
 security/selinux/hooks.c  | 13 +++++++++++--
 7 files changed, 36 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 190c37854870..f78f83d7663f 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -26,14 +26,15 @@
 
 /* Auxiliary data to use in generating the audit record. */
 struct common_audit_data {
-	char    type;
-#define LSM_AUDIT_DATA_FS      1
-#define LSM_AUDIT_DATA_NET     2
-#define LSM_AUDIT_DATA_CAP     3
-#define LSM_AUDIT_DATA_IPC     4
-#define LSM_AUDIT_DATA_TASK    5
-#define LSM_AUDIT_DATA_KEY     6
-#define LSM_AUDIT_NO_AUDIT     7
+	char type;
+#define LSM_AUDIT_DATA_FS	1
+#define LSM_AUDIT_DATA_NET	2
+#define LSM_AUDIT_DATA_CAP	3
+#define LSM_AUDIT_DATA_IPC	4
+#define LSM_AUDIT_DATA_TASK	5
+#define LSM_AUDIT_DATA_KEY	6
+#define LSM_AUDIT_NO_AUDIT	7
+#define LSM_AUDIT_DATA_KMOD	8
 	struct task_struct *tsk;
 	union 	{
 		struct {
@@ -66,6 +67,7 @@ struct common_audit_data {
 			char *key_desc;
 		} key_struct;
 #endif
+		char *kmod_name;
 	} u;
 	/* this union contains LSM specific data */
 	union {
diff --git a/include/linux/security.h b/include/linux/security.h
index ed0faea60b82..466cbadbd1ef 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -706,6 +706,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  * @kernel_module_request:
  *	Ability to trigger the kernel to automatically upcall to userspace for
  *	userspace to load a kernel module with the given name.
+ *	@kmod_name name of the module requested by the kernel
  *	Return 0 if successful.
  * @task_setuid:
  *	Check permission before setting one or more of the user identity
@@ -1577,7 +1578,7 @@ struct security_operations {
 	void (*cred_transfer)(struct cred *new, const struct cred *old);
 	int (*kernel_act_as)(struct cred *new, u32 secid);
 	int (*kernel_create_files_as)(struct cred *new, struct inode *inode);
-	int (*kernel_module_request)(void);
+	int (*kernel_module_request)(char *kmod_name);
 	int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags);
 	int (*task_fix_setuid) (struct cred *new, const struct cred *old,
 				int flags);
@@ -1842,7 +1843,7 @@ void security_commit_creds(struct cred *new, const struct cred *old);
 void security_transfer_creds(struct cred *new, const struct cred *old);
 int security_kernel_act_as(struct cred *new, u32 secid);
 int security_kernel_create_files_as(struct cred *new, struct inode *inode);
-int security_kernel_module_request(void);
+int security_kernel_module_request(char *kmod_name);
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags);
 int security_task_fix_setuid(struct cred *new, const struct cred *old,
 			     int flags);
@@ -2407,7 +2408,7 @@ static inline int security_kernel_create_files_as(struct cred *cred,
 	return 0;
 }
 
-static inline int security_kernel_module_request(void)
+static inline int security_kernel_module_request(char *kmod_name)
 {
 	return 0;
 }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a11f87..25b103190364 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
 #define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
 	static int kmod_loop_msg;
 
-	ret = security_kernel_module_request();
-	if (ret)
-		return ret;
-
 	va_start(args, fmt);
 	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
 	va_end(args);
 	if (ret >= MODULE_NAME_LEN)
 		return -ENAMETOOLONG;
 
+	ret = security_kernel_module_request(module_name);
+	if (ret)
+		return ret;
+
 	/* If modprobe needs a service that is in a module, we get a recursive
 	 * loop.  Limit the number of running kmod threads to max_threads/2 or
 	 * MAX_KMOD_CONCURRENT, whichever is the smaller.  A cleaner method
diff --git a/security/capability.c b/security/capability.c
index 4f3ab476937f..5c700e1a4fd3 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -421,7 +421,7 @@ static int cap_kernel_create_files_as(struct cred *new, struct inode *inode)
 	return 0;
 }
 
-static int cap_kernel_module_request(void)
+static int cap_kernel_module_request(char *kmod_name)
 {
 	return 0;
 }
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 3bb90b6f1dd3..51bd0fd9c9f0 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -354,6 +354,10 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 		}
 		break;
 #endif
+	case LSM_AUDIT_DATA_KMOD:
+		audit_log_format(ab, " kmod=");
+		audit_log_untrustedstring(ab, a->u.kmod_name);
+		break;
 	} /* switch (a->type) */
 }
 
diff --git a/security/security.c b/security/security.c
index aad71b2ca195..24e060be9fa5 100644
--- a/security/security.c
+++ b/security/security.c
@@ -764,9 +764,9 @@ int security_kernel_create_files_as(struct cred *new, struct inode *inode)
 	return security_ops->kernel_create_files_as(new, inode);
 }
 
-int security_kernel_module_request(void)
+int security_kernel_module_request(char *kmod_name)
 {
-	return security_ops->kernel_module_request();
+	return security_ops->kernel_module_request(kmod_name);
 }
 
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a29d6612a328..c96d63ec4753 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3337,9 +3337,18 @@ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
 	return 0;
 }
 
-static int selinux_kernel_module_request(void)
+static int selinux_kernel_module_request(char *kmod_name)
 {
-	return task_has_system(current, SYSTEM__MODULE_REQUEST);
+	u32 sid;
+	struct common_audit_data ad;
+
+	sid = task_sid(current);
+
+	COMMON_AUDIT_DATA_INIT(&ad, KMOD);
+	ad.u.kmod_name = kmod_name;
+
+	return avc_has_perm(sid, SECINITSID_KERNEL, SECCLASS_SYSTEM,
+			    SYSTEM__MODULE_REQUEST, &ad);
 }
 
 static int selinux_task_setpgid(struct task_struct *p, pid_t pgid)
-- 
cgit v1.2.3-71-gd317


From 281d150c5f8892f158747594ab49ce2823fd8b8c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 2 Nov 2009 13:52:27 -0800
Subject: rcu: Prepare for synchronization fixes: clean up for non-NO_HZ
 handling of ->completed counter

Impose a clear locking design on non-NO_HZ handling of the
->completed counter.  This increases the distance between the
RCU and the CPU-hotplug mechanisms.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: <stable@kernel.org> # .32.x
LKML-Reference: <12571987491353-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 67 +++++++++++++++++++++++++-------------------------------
 kernel/rcutree.h |  8 +++----
 2 files changed, 34 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 69f4efec3449..26249abf24dc 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -178,8 +178,28 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 	return &rsp->node[0];
 }
 
+/*
+ * Record the specified "completed" value, which is later used to validate
+ * dynticks counter manipulations and CPU-offline checks.  Specify
+ * "rsp->completed - 1" to unconditionally invalidate any future dynticks
+ * manipulations and CPU-offline checks.  Such invalidation is useful at
+ * the beginning of a grace period.
+ */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+	rsp->dynticks_completed = comp;
+}
+
 #ifdef CONFIG_SMP
 
+/*
+ * Recall the previously recorded value of the completion for dynticks.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->dynticks_completed;
+}
+
 /*
  * If the specified CPU is offline, tell the caller that it is in
  * a quiescent state.  Otherwise, whack it with a reschedule IPI.
@@ -337,27 +357,8 @@ void rcu_irq_exit(void)
 		set_need_resched();
 }
 
-/*
- * Record the specified "completed" value, which is later used to validate
- * dynticks counter manipulations.  Specify "rsp->completed - 1" to
- * unconditionally invalidate any future dynticks manipulations (which is
- * useful at the beginning of a grace period).
- */
-static void dyntick_record_completed(struct rcu_state *rsp, long comp)
-{
-	rsp->dynticks_completed = comp;
-}
-
 #ifdef CONFIG_SMP
 
-/*
- * Recall the previously recorded value of the completion for dynticks.
- */
-static long dyntick_recall_completed(struct rcu_state *rsp)
-{
-	return rsp->dynticks_completed;
-}
-
 /*
  * Snapshot the specified CPU's dynticks counter so that we can later
  * credit them with an implicit quiescent state.  Return 1 if this CPU
@@ -421,24 +422,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 
 #else /* #ifdef CONFIG_NO_HZ */
 
-static void dyntick_record_completed(struct rcu_state *rsp, long comp)
-{
-}
-
 #ifdef CONFIG_SMP
 
-/*
- * If there are no dynticks, then the only way that a CPU can passively
- * be in a quiescent state is to be offline.  Unlike dynticks idle, which
- * is a point in time during the prior (already finished) grace period,
- * an offline CPU is always in a quiescent state, and thus can be
- * unconditionally applied.  So just return the current value of completed.
- */
-static long dyntick_recall_completed(struct rcu_state *rsp)
-{
-	return rsp->completed;
-}
-
 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
 	return 0;
@@ -1146,6 +1131,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 	long lastcomp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 	u8 signaled;
+	u8 forcenow;
 
 	if (!rcu_gp_in_progress(rsp))
 		return;  /* No grace period in progress, nothing to force. */
@@ -1182,16 +1168,23 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 		if (rcu_process_dyntick(rsp, lastcomp,
 					dyntick_save_progress_counter))
 			goto unlock_ret;
+		/* fall into next case. */
+
+	case RCU_SAVE_COMPLETED:
 
 		/* Update state, record completion counter. */
+		forcenow = 0;
 		spin_lock(&rnp->lock);
 		if (lastcomp == rsp->completed &&
-		    rsp->signaled == RCU_SAVE_DYNTICK) {
+		    rsp->signaled == signaled) {
 			rsp->signaled = RCU_FORCE_QS;
 			dyntick_record_completed(rsp, lastcomp);
+			forcenow = signaled == RCU_SAVE_COMPLETED;
 		}
 		spin_unlock(&rnp->lock);
-		break;
+		if (!forcenow)
+			break;
+		/* fall into next case. */
 
 	case RCU_FORCE_QS:
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1899023b0962..8a4c1650ad8d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -204,11 +204,12 @@ struct rcu_data {
 #define RCU_GP_IDLE		0	/* No grace period in progress. */
 #define RCU_GP_INIT		1	/* Grace period being initialized. */
 #define RCU_SAVE_DYNTICK	2	/* Need to scan dyntick state. */
-#define RCU_FORCE_QS		3	/* Need to force quiescent state. */
+#define RCU_SAVE_COMPLETED	3	/* Need to save rsp->completed. */
+#define RCU_FORCE_QS		4	/* Need to force quiescent state. */
 #ifdef CONFIG_NO_HZ
 #define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
 #else /* #ifdef CONFIG_NO_HZ */
-#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#define RCU_SIGNAL_INIT		RCU_SAVE_COMPLETED
 #endif /* #else #ifdef CONFIG_NO_HZ */
 
 #define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
@@ -274,9 +275,8 @@ struct rcu_state {
 	unsigned long jiffies_stall;		/* Time at which to check */
 						/*  for CPU stalls. */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-#ifdef CONFIG_NO_HZ
 	long dynticks_completed;		/* Value of completed @ snap. */
-#endif /* #ifdef CONFIG_NO_HZ */
+						/*  Protected by fqslock. */
 };
 
 #ifdef RCU_TREE_NONCORE
-- 
cgit v1.2.3-71-gd317


From d09b62dfa336447c52a5ec9bb88adbc479b0f3b8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 2 Nov 2009 13:52:28 -0800
Subject: rcu: Fix synchronization for rcu_process_gp_end() uses of ->completed
 counter

Impose a clear locking design on the rcu_process_gp_end()
function's use of the ->completed counter.  This is done by
creating a ->completed field in the rcu_node structure, which
can safely be accessed under the protection of that structure's
lock.  Performance and scalability are maintained by using a
form of double-checked locking, so that rcu_process_gp_end()
only acquires the leaf rcu_node structure's ->lock if a grace
period has recently ended.

This fix reduces rcutorture failure rate by at least two orders
of magnitude under heavy stress with force_quiescent_state()
being invoked artificially often.  Without this fix,
unsynchronized access to the ->completed field can cause
rcu_process_gp_end() to advance callbacks whose grace period has
not yet expired.  (Bad idea!)

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: <stable@kernel.org> # .32.x
LKML-Reference: <12571987494069-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 128 ++++++++++++++++++++++++++++++++++---------------------
 kernel/rcutree.h |   3 ++
 2 files changed, 83 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 26249abf24dc..9e068d112153 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -569,6 +569,76 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 	return ret;
 }
 
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.  In addition, the corresponding leaf rcu_node structure's
+ * ->lock must be held by the caller, with irqs disabled.
+ */
+static void
+__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+	/* Did another grace period end? */
+	if (rdp->completed != rnp->completed) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = rnp->completed;
+	}
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp;
+
+	local_irq_save(flags);
+	rnp = rdp->mynode;
+	if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
+	    !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
+		local_irq_restore(flags);
+		return;
+	}
+	__rcu_process_gp_end(rsp, rnp, rdp);
+	spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Do per-CPU grace-period initialization for running CPU.  The caller
+ * must hold the lock of the leaf rcu_node structure corresponding to
+ * this CPU.
+ */
+static void
+rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+	/* Prior grace period ended, so advance callbacks for current CPU. */
+	__rcu_process_gp_end(rsp, rnp, rdp);
+
+	/*
+	 * Because this CPU just now started the new grace period, we know
+	 * that all of its callbacks will be covered by this upcoming grace
+	 * period, even the ones that were registered arbitrarily recently.
+	 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
+	 *
+	 * Other CPUs cannot be sure exactly when the grace period started.
+	 * Therefore, their recently registered callbacks must pass through
+	 * an additional RCU_NEXT_READY stage, so that they will be handled
+	 * by the next RCU grace period.
+	 */
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+}
+
 /*
  * Start a new RCU grace period if warranted, re-initializing the hierarchy
  * in preparation for detecting the next grace period.  The caller must hold
@@ -596,26 +666,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	dyntick_record_completed(rsp, rsp->completed - 1);
 	note_new_gpnum(rsp, rdp);
 
-	/*
-	 * Because this CPU just now started the new grace period, we know
-	 * that all of its callbacks will be covered by this upcoming grace
-	 * period, even the ones that were registered arbitrarily recently.
-	 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
-	 *
-	 * Other CPUs cannot be sure exactly when the grace period started.
-	 * Therefore, their recently registered callbacks must pass through
-	 * an additional RCU_NEXT_READY stage, so that they will be handled
-	 * by the next RCU grace period.
-	 */
-	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
 	/* Special-case the common single-level case. */
 	if (NUM_RCU_NODES == 1) {
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
 		rnp->gpnum = rsp->gpnum;
+		rnp->completed = rsp->completed;
 		rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
+		rcu_start_gp_per_cpu(rsp, rnp, rdp);
 		spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
@@ -648,6 +706,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
 		rnp->gpnum = rsp->gpnum;
+		rnp->completed = rsp->completed;
+		if (rnp == rdp->mynode)
+			rcu_start_gp_per_cpu(rsp, rnp, rdp);
 		spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 	}
 
@@ -658,34 +719,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
-/*
- * Advance this CPU's callbacks, but only if the current grace period
- * has ended.  This may be called only from the CPU to whom the rdp
- * belongs.
- */
-static void
-rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
-{
-	long completed_snap;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
-
-	/* Did another grace period end? */
-	if (rdp->completed != completed_snap) {
-
-		/* Advance callbacks.  No harm if list empty. */
-		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
-		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
-		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
-		/* Remember that we saw this grace-period completion. */
-		rdp->completed = completed_snap;
-	}
-	local_irq_restore(flags);
-}
-
 /*
  * Clean up after the prior grace period and let rcu_start_gp() start up
  * the next grace period if one is needed.  Note that the caller must
@@ -697,7 +730,6 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	rsp->completed = rsp->gpnum;
 	rsp->signaled = RCU_GP_IDLE;
-	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
 	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
 }
 
@@ -1539,21 +1571,16 @@ static void __cpuinit
 rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 {
 	unsigned long flags;
-	long lastcomp;
 	unsigned long mask;
 	struct rcu_data *rdp = rsp->rda[cpu];
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Set up local state, ensuring consistent view of global state. */
 	spin_lock_irqsave(&rnp->lock, flags);
-	lastcomp = rsp->completed;
-	rdp->completed = lastcomp;
-	rdp->gpnum = lastcomp;
 	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
 	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
 	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->preemptable = preemptable;
-	rdp->passed_quiesc_completed = lastcomp - 1;
 	rdp->qlen_last_fqs_check = 0;
 	rdp->n_force_qs_snap = rsp->n_force_qs;
 	rdp->blimit = blimit;
@@ -1575,6 +1602,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 		spin_lock(&rnp->lock);	/* irqs already disabled. */
 		rnp->qsmaskinit |= mask;
 		mask = rnp->grpmask;
+		if (rnp == rdp->mynode) {
+			rdp->gpnum = rnp->completed; /* if GP in progress... */
+			rdp->completed = rnp->completed;
+			rdp->passed_quiesc_completed = rnp->completed - 1;
+		}
 		spin_unlock(&rnp->lock); /* irqs already disabled. */
 		rnp = rnp->parent;
 	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 8a4c1650ad8d..c1891c3cae63 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,9 @@ struct rcu_node {
 	long	gpnum;		/* Current grace period for this node. */
 				/*  This will either be equal to or one */
 				/*  behind the root rcu_node's gpnum. */
+	long	completed;	/* Last grace period completed for this node. */
+				/*  This will either be equal to or one */
+				/*  behind the root rcu_node's gpnum. */
 	unsigned long qsmask;	/* CPUs or groups that need to switch in */
 				/*  order for current grace period to proceed.*/
 				/*  In leaf rcu_node, each bit corresponds to */
-- 
cgit v1.2.3-71-gd317


From 9160306e6f5b68bb64630c9031c517ca1cf463db Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 2 Nov 2009 13:52:29 -0800
Subject: rcu: Fix note_new_gpnum() uses of ->gpnum

Impose a clear locking design on the note_new_gpnum()
function's use of the ->gpnum counter.  This is done by updating
rdp->gpnum only from the corresponding leaf rcu_node structure's
rnp->gpnum field, and even then only under the protection of
that same rcu_node structure's ->lock field.  Performance and
scalability are maintained using a form of double-checked
locking, and excessive spinning is avoided by use of the
spin_trylock() function.  The use of spin_trylock() is safe due
to the fact that CPUs who fail to acquire this lock will try
again later. The hierarchical nature of the rcu_node data
structure limits contention (which could be limited further if
need be using the RCU_FANOUT kernel parameter).

Without this patch, obscure but quite possible races could
result in a quiescent state that occurred during one grace
period to be accounted to the following grace period, causing
this following grace period to end prematurely.  Not good!

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: <stable@kernel.org> # .32.x
LKML-Reference: <12571987492350-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 9e068d112153..ec007dd22632 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -540,13 +540,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 /*
  * Update CPU-local rcu_data state to record the newly noticed grace period.
  * This is used both when we started the grace period and when we notice
- * that someone else started the grace period.
+ * that someone else started the grace period.  The caller must hold the
+ * ->lock of the leaf rcu_node structure corresponding to the current CPU,
+ *  and must have irqs disabled.
  */
+static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+	if (rdp->gpnum != rnp->gpnum) {
+		rdp->qs_pending = 1;
+		rdp->passed_quiesc = 0;
+		rdp->gpnum = rnp->gpnum;
+	}
+}
+
 static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	rdp->qs_pending = 1;
-	rdp->passed_quiesc = 0;
-	rdp->gpnum = rsp->gpnum;
+	unsigned long flags;
+	struct rcu_node *rnp;
+
+	local_irq_save(flags);
+	rnp = rdp->mynode;
+	if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
+	    !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
+		local_irq_restore(flags);
+		return;
+	}
+	__note_new_gpnum(rsp, rnp, rdp);
+	spin_unlock_irqrestore(&rnp->lock, flags);
 }
 
 /*
@@ -637,6 +657,9 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
 	 */
 	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Set state so that this CPU will detect the next quiescent state. */
+	__note_new_gpnum(rsp, rnp, rdp);
 }
 
 /*
@@ -664,7 +687,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
 	record_gp_stall_check_time(rsp);
 	dyntick_record_completed(rsp, rsp->completed - 1);
-	note_new_gpnum(rsp, rdp);
 
 	/* Special-case the common single-level case. */
 	if (NUM_RCU_NODES == 1) {
-- 
cgit v1.2.3-71-gd317


From eae0c9dfb534cb3449888b9601228efa6480fdb5 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 10 Nov 2009 03:50:02 +0100
Subject: sched: Fix and clean up rate-limit newidle code

Commit 1b9508f, "Rate-limit newidle" has been confirmed to fix
the netperf UDP loopback regression reported by Alex Shi.

This is a cleanup and a fix:

 - moved to a more out of the way spot

 - fix to ensure that balancing doesn't try to balance
   runqueues which haven't gone online yet, which can
   mess up CPU enumeration during boot.

Reported-by: Alex Shi <alex.shi@intel.com>
Reported-by: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org> # .32.x: a1f84a3: sched: Check for an idle shared cache
Cc: <stable@kernel.org> # .32.x: 1b9508f: sched: Rate-limit newidle
Cc: <stable@kernel.org> # .32.x: fd21073: sched: Fix affinity logic
Cc: <stable@kernel.org> # .32.x
LKML-Reference: <1257821402.5648.17.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 23e353568d8e..ad37776cc39b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2354,17 +2354,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	if (rq != orig_rq)
 		update_rq_clock(rq);
 
-	if (rq->idle_stamp) {
-		u64 delta = rq->clock - rq->idle_stamp;
-		u64 max = 2*sysctl_sched_migration_cost;
-
-		if (delta > max)
-			rq->avg_idle = max;
-		else
-			update_avg(&rq->avg_idle, delta);
-		rq->idle_stamp = 0;
-	}
-
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
@@ -2421,6 +2410,17 @@ out_running:
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
+
+	if (unlikely(rq->idle_stamp)) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
 #endif
 out:
 	task_rq_unlock(rq, &flags);
@@ -4098,7 +4098,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_setall(cpus);
+	cpumask_copy(cpus, cpu_online_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4261,7 +4261,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int all_pinned = 0;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_setall(cpus);
+	cpumask_copy(cpus, cpu_online_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -9522,6 +9522,8 @@ void __init sched_init(void)
 		rq->cpu = i;
 		rq->online = 0;
 		rq->migration_thread = NULL;
+		rq->idle_stamp = 0;
+		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq_attach_root(rq, &def_root_domain);
 #endif
-- 
cgit v1.2.3-71-gd317


From 676c0dbe6e514fdd8e434a9e623c781aa9b40b15 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 9 Nov 2009 17:37:34 +0900
Subject: ksym_tracer: Support read accesses independent of read/write.

All of the infrastructure already exists to support read accesses
for platforms that support a read access independently of read/write
(such as in the case of the SuperH UBC). This just trivially hooks
up the read case by itself.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jan Kiszka <jan.kiszka@web.de>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <20091109083733.GA25848@linux-sh.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_ksym.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index fea83eeeef09..11935b53a6cb 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -136,6 +136,7 @@ static int ksym_trace_get_access_type(char *str)
 		access |= HW_BREAKPOINT_X;
 
 	switch (access) {
+	case HW_BREAKPOINT_R:
 	case HW_BREAKPOINT_W:
 	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
 		return access;
@@ -239,7 +240,9 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
 
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
 		ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr);
-		if (entry->type == HW_BREAKPOINT_W)
+		if (entry->type == HW_BREAKPOINT_R)
+			ret = trace_seq_puts(s, "r--\n");
+		else if (entry->type == HW_BREAKPOINT_W)
 			ret = trace_seq_puts(s, "-w-\n");
 		else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
 			ret = trace_seq_puts(s, "rw-\n");
@@ -414,6 +417,9 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	switch (field->type) {
+	case HW_BREAKPOINT_R:
+		ret = trace_seq_printf(s, " R  ");
+		break;
 	case HW_BREAKPOINT_W:
 		ret = trace_seq_printf(s, " W  ");
 		break;
@@ -488,6 +494,9 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v)
 	access_type = entry->type;
 
 	switch (access_type) {
+	case HW_BREAKPOINT_R:
+		seq_puts(m, "  R           ");
+		break;
 	case HW_BREAKPOINT_W:
 		seq_puts(m, "  W           ");
 		break;
-- 
cgit v1.2.3-71-gd317


From f60d24d2ad04977b0bd9e3eb35dba2d2fa569af9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 10 Nov 2009 10:17:07 +0100
Subject: hw-breakpoints: Fix broken hw-breakpoint sample module

The hw-breakpoint sample module has been broken during the
hw-breakpoint internals refactoring. Propagate the changes
to it.

Reported-by: "K. Prasad" <prasad@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/hw_breakpoint.c                  |  3 ++-
 kernel/kallsyms.c                       |  1 +
 samples/hw_breakpoint/data_breakpoint.c | 43 ++++++++++++++++++---------------
 3 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index e662dc991c96..9ea9414e0e58 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -454,6 +454,7 @@ fail:
 	/* return the error if any */
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
 
 /**
  * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
@@ -470,7 +471,7 @@ void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
 	}
 	free_percpu(cpu_events);
 }
-
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
 
 static struct notifier_block hw_breakpoint_exceptions_nb = {
 	.notifier_call = hw_breakpoint_exceptions_notify,
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
 	}
 	return module_kallsyms_lookup_name(name);
 }
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
 
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 				      unsigned long),
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index 9cbdbb871b7a..5bc9819a819e 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -27,18 +27,19 @@
 #include <linux/module.h>	/* Needed by all modules */
 #include <linux/kernel.h>	/* Needed for KERN_INFO */
 #include <linux/init.h>		/* Needed for the macros */
+#include <linux/kallsyms.h>
 
-#include <asm/hw_breakpoint.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 
-struct hw_breakpoint sample_hbp;
+struct perf_event **sample_hbp;
 
 static char ksym_name[KSYM_NAME_LEN] = "pid_max";
 module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
 MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
 			" write operations on the kernel symbol");
 
-void sample_hbp_handler(struct hw_breakpoint *temp, struct pt_regs
-								*temp_regs)
+static void sample_hbp_handler(struct perf_event *temp, void *data)
 {
 	printk(KERN_INFO "%s value is changed\n", ksym_name);
 	dump_stack();
@@ -48,30 +49,34 @@ void sample_hbp_handler(struct hw_breakpoint *temp, struct pt_regs
 static int __init hw_break_module_init(void)
 {
 	int ret;
+	unsigned long addr;
 
-#ifdef CONFIG_X86
-	sample_hbp.info.name = ksym_name;
-	sample_hbp.info.type = HW_BREAKPOINT_WRITE;
-	sample_hbp.info.len = HW_BREAKPOINT_LEN_4;
-#endif /* CONFIG_X86 */
+	addr = kallsyms_lookup_name(ksym_name);
 
-	sample_hbp.triggered = (void *)sample_hbp_handler;
+	sample_hbp = register_wide_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4,
+						 HW_BREAKPOINT_W | HW_BREAKPOINT_R,
+						 sample_hbp_handler, true);
+	if (IS_ERR(sample_hbp)) {
+		ret = PTR_ERR(sample_hbp);
+		goto fail;
+	} else if (!sample_hbp) {
+		ret = -EINVAL;
+		goto fail;
+	}
 
-	ret = register_kernel_hw_breakpoint(&sample_hbp);
-
-	if (ret < 0) {
-		printk(KERN_INFO "Breakpoint registration failed\n");
-		return ret;
-	} else
-		printk(KERN_INFO "HW Breakpoint for %s write installed\n",
-								ksym_name);
+	printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
 
 	return 0;
+
+fail:
+	printk(KERN_INFO "Breakpoint registration failed\n");
+
+	return ret;
 }
 
 static void __exit hw_break_module_exit(void)
 {
-	unregister_kernel_hw_breakpoint(&sample_hbp);
+	unregister_wide_hw_breakpoint(sample_hbp);
 	printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
 }
 
-- 
cgit v1.2.3-71-gd317


From ffd44db5f02af32bcc25a8eb5981bf02a141cdab Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 10 Nov 2009 20:12:01 +0100
Subject: sched: Make sure task has correct sched_class after policy change

From the code in rt_mutex_setprio(), it is evident that the
intention is that task's with a RT 'prio' value as a consequence
of receiving a PI boost also have their 'sched_class' field set
to '&rt_sched_class'.

However, Peter noticed that the code in __setscheduler() could
result in this intention being frustrated. Fix it.

Reported-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1257880321.4108.457.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ad37776cc39b..43e61fa04dc7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6159,22 +6159,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	BUG_ON(p->se.on_rq);
 
 	p->policy = policy;
-	switch (p->policy) {
-	case SCHED_NORMAL:
-	case SCHED_BATCH:
-	case SCHED_IDLE:
-		p->sched_class = &fair_sched_class;
-		break;
-	case SCHED_FIFO:
-	case SCHED_RR:
-		p->sched_class = &rt_sched_class;
-		break;
-	}
-
 	p->rt_priority = prio;
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
+	if (rt_prio(p->prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
 	set_load_weight(p);
 }
 
-- 
cgit v1.2.3-71-gd317


From dbe01350fa8ce0c11948ab7d6be71a4d901be151 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 10 Nov 2009 13:37:19 -0800
Subject: rcu: Remove inline from forward-referenced functions

Some variants of gcc are reputed to dislike forward references
to functions declared "inline".  Remove the "inline" keyword
from such functions.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12578890422402-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.h        | 2 +-
 kernel/rcutree_plugin.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index c1891c3cae63..ddb79ece05eb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -301,7 +301,7 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
 #else /* #ifdef RCU_TREE_NONCORE */
 
 /* Forward declarations for rcutree_plugin.h */
-static inline void rcu_bootup_announce(void);
+static void rcu_bootup_announce(void);
 long rcu_batches_completed(void);
 static void rcu_preempt_note_context_switch(int cpu);
 static int rcu_preempted_readers(struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ef2a58c2b9d5..c03edf766357 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
 /*
  * Tell them what RCU they are running.
  */
-static inline void rcu_bootup_announce(void)
+static void rcu_bootup_announce(void)
 {
 	printk(KERN_INFO
 	       "Experimental preemptable hierarchical RCU implementation.\n");
@@ -481,7 +481,7 @@ void exit_rcu(void)
 /*
  * Tell them what RCU they are running.
  */
-static inline void rcu_bootup_announce(void)
+static void rcu_bootup_announce(void)
 {
 	printk(KERN_INFO "Hierarchical RCU implementation.\n");
 }
-- 
cgit v1.2.3-71-gd317


From 956539b75921f561c0956c22d37320780e8b4ba1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 10 Nov 2009 13:37:20 -0800
Subject: rcu: Enable synchronize_sched_expedited() fastpath

This patch adds a counter increment to enable tasks to actually
take the synchronize_sched_expedited() function's fastpath.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1257889042435-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 76c0e9691fc0..e69fee4544bd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -10865,6 +10865,7 @@ void synchronize_sched_expedited(void)
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
 	rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+	synchronize_sched_expedited_count++;
 	mutex_unlock(&rcu_sched_expedited_mutex);
 	put_online_cpus();
 	if (need_full_sync)
-- 
cgit v1.2.3-71-gd317


From 4bcfe055030d9e953945def3864f7e6997b27782 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 10 Nov 2009 13:37:21 -0800
Subject: rcu: Rename dynticks_completed to completed_fqs

This field is used whether or not CONFIG_NO_HZ is set, so the
old name of ->dynticks_completed is quite misleading.

Change to ->completed_fqs, given that it the value that
force_quiescent_state() is trying to drive the ->completed field
away from.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12578890423298-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 4 ++--
 kernel/rcutree.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ec007dd22632..26fc7807761d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -187,7 +187,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
  */
 static void dyntick_record_completed(struct rcu_state *rsp, long comp)
 {
-	rsp->dynticks_completed = comp;
+	rsp->completed_fqs = comp;
 }
 
 #ifdef CONFIG_SMP
@@ -197,7 +197,7 @@ static void dyntick_record_completed(struct rcu_state *rsp, long comp)
  */
 static long dyntick_recall_completed(struct rcu_state *rsp)
 {
-	return rsp->dynticks_completed;
+	return rsp->completed_fqs;
 }
 
 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index ddb79ece05eb..17a28a08b559 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -264,6 +264,8 @@ struct rcu_state {
 	long orphan_qlen;			/* Number of orphaned cbs. */
 	spinlock_t fqslock;			/* Only one task forcing */
 						/*  quiescent states. */
+	long	completed_fqs;			/* Value of completed @ snap. */
+						/*  Protected by fqslock. */
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
 						/*  force_quiescent_state(). */
 	unsigned long n_force_qs;		/* Number of calls to */
@@ -278,8 +280,6 @@ struct rcu_state {
 	unsigned long jiffies_stall;		/* Time at which to check */
 						/*  for CPU stalls. */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-	long dynticks_completed;		/* Value of completed @ snap. */
-						/*  Protected by fqslock. */
 };
 
 #ifdef RCU_TREE_NONCORE
-- 
cgit v1.2.3-71-gd317


From c64ac3ce06558e534aec62b1fadeb0a3f111dac1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 10 Nov 2009 13:37:22 -0800
Subject: rcu: Simplify association of quiescent states with grace periods

The rdp->passed_quiesc_completed fields are used to properly
associate the recorded quiescent state with a grace period.  It
is OK to wrongly associate a given quiescent state with a
preceding grace period, but it is fatal to associate a given
quiescent state with a grace period that begins after the
quiescent state occurred.  Grace periods are numbered, and the
following fields track them:

o	->gpnum is the number of the grace period currently in
	progress, or the number of the last grace period to
	complete if no grace period is currently in progress.

o	->completed is the number of the last grace period to
	have completed.

These two fields are equal if there is no grace period in
progress, otherwise ->gpnum is one greater than ->completed.
But the rdp->passed_quiesc_completed field compared against
->completed, and if equal, the quiescent state is presumed to
count against the current grace period.

The earlier code copied rdp->completed to
rdp->passed_quiesc_completed, which has been made to work, but
is error-prone.  In contrast, copying one less than rdp->gpnum
is guaranteed safe, because rdp->gpnum is not incremented until
after the start of the corresponding grace period. At the end of
the grace period, when ->completed has incremented, then any
quiescent periods recorded previously will be discarded.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12578890421011-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 4 ++--
 kernel/rcutree_plugin.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 26fc7807761d..d8024192c73b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -100,7 +100,7 @@ void rcu_sched_qs(int cpu)
 	struct rcu_data *rdp;
 
 	rdp = &per_cpu(rcu_sched_data, cpu);
-	rdp->passed_quiesc_completed = rdp->completed;
+	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
 	rdp->passed_quiesc = 1;
 	rcu_preempt_note_context_switch(cpu);
@@ -111,7 +111,7 @@ void rcu_bh_qs(int cpu)
 	struct rcu_data *rdp;
 
 	rdp = &per_cpu(rcu_bh_data, cpu);
-	rdp->passed_quiesc_completed = rdp->completed;
+	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
 	rdp->passed_quiesc = 1;
 }
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c03edf766357..52075da70549 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
 static void rcu_preempt_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-	rdp->passed_quiesc_completed = rdp->completed;
+	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
 	rdp->passed_quiesc = 1;
 }
-- 
cgit v1.2.3-71-gd317


From 26a7034b40ba80f82f64fa251a2cbf49f9971c6a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 5 Nov 2009 05:26:41 -0800
Subject: sysctl: Reduce sys_sysctl to a compatibility wrapper around /proc/sys

To simply maintenance and to be able to remove all of the binary
sysctl support from various subsystems I have rewritten the binary
sysctl code as a compatibility wrapper around proc/sys.

The code is built around a hard coded table based on the table
in sysctl_check.c that lists all of our current binary sysctls
and provides enough information to convert from the sysctl
binary input into into ascii and back again.  New in this
patch is the realization that the only dynamic entries
that need to be handled have ifname as the asscii string
and ifindex as their ctl_name.

When a sys_sysctl is called the code now looks in the
translation table converting the binary name to the
path under /proc where the value is to be found.  Opens
that file, and calls into a format conversion wrapper
that calls fop->read and then fop->write as appropriate.

Since in practice the practically no one uses or tests
sys_sysctl rewritting the code to be beautiful is a little
silly.  The redeeming merit of this work is it allows us to
rip out all of the binary sysctl syscall support from
everywhere else in the tree.  Allowing us to remove
a lot of dead (after this patch) and barely maintained code.

In addition it becomes much easier to optimize the sysctl
implementation for being the backing store of /proc/sys,
without having to worry about sys_sysctl.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 init/Kconfig           |    1 +
 kernel/sysctl_binary.c | 1485 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 1384 insertions(+), 102 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index f51586406d62..feb3b8f7e7ee 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -754,6 +754,7 @@ config UID16
 
 config SYSCTL_SYSCALL
 	bool "Sysctl syscall support" if EMBEDDED
+	depends on PROC_SYSCTL
 	default y
 	select SYSCTL
 	---help---
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 642019894299..471438bbece6 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -12,108 +12,1385 @@
 #include <linux/pid_namespace.h>
 #include <linux/file.h>
 #include <linux/ctype.h>
-#include <linux/smp_lock.h>
 
 #ifdef CONFIG_SYSCTL_SYSCALL
 
-/* Perform the actual read/write of a sysctl table entry. */
-static int do_sysctl_strategy(struct ctl_table_root *root,
-			struct ctl_table *table,
-			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen)
+struct bin_table;
+typedef ssize_t bin_convert_t(struct file *file,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
+
+static bin_convert_t bin_dir;
+static bin_convert_t bin_string;
+static bin_convert_t bin_intvec;
+static bin_convert_t bin_ulongvec;
+static bin_convert_t bin_uuid;
+static bin_convert_t bin_dn_node_address;
+
+#define CTL_DIR   bin_dir
+#define CTL_STR   bin_string
+#define CTL_INT   bin_intvec
+#define CTL_ULONG bin_ulongvec
+#define CTL_UUID  bin_uuid
+#define CTL_DNADR bin_dn_node_address
+
+#define BUFSZ 256
+
+struct bin_table {
+	bin_convert_t		*convert;
+	int			ctl_name;
+	const char		*procname;
+	const struct bin_table	*child;
+};
+
+static const struct bin_table bin_random_table[] = {
+	{ CTL_INT,	RANDOM_POOLSIZE,	"poolsize" },
+	{ CTL_INT,	RANDOM_ENTROPY_COUNT,	"entropy_avail" },
+	{ CTL_INT,	RANDOM_READ_THRESH,	"read_wakeup_threshold" },
+	{ CTL_INT,	RANDOM_WRITE_THRESH,	"write_wakeup_threshold" },
+	{ CTL_UUID,	RANDOM_BOOT_ID,		"boot_id" },
+	{ CTL_UUID,	RANDOM_UUID,		"uuid" },
+	{}
+};
+
+static const struct bin_table bin_pty_table[] = {
+	{ CTL_INT,	PTY_MAX,	"max" },
+	{ CTL_INT,	PTY_NR,		"nr" },
+	{}
+};
+
+static const struct bin_table bin_kern_table[] = {
+	{ CTL_STR,	KERN_OSTYPE,			"ostype" },
+	{ CTL_STR,	KERN_OSRELEASE,			"osrelease" },
+	/* KERN_OSREV not used */
+	{ CTL_STR,	KERN_VERSION,			"version" },
+	/* KERN_SECUREMASK not used */
+	/* KERN_PROF not used */
+	{ CTL_STR,	KERN_NODENAME,			"hostname" },
+	{ CTL_STR,	KERN_DOMAINNAME,		"domainname" },
+
+	{ CTL_INT,	KERN_PANIC,			"panic" },
+	{ CTL_INT,	KERN_REALROOTDEV,		"real-root-dev" },
+
+	{ CTL_STR,	KERN_SPARC_REBOOT,		"reboot-cmd" },
+	{ CTL_INT,	KERN_CTLALTDEL,			"ctrl-alt-del" },
+	{ CTL_INT,	KERN_PRINTK,			"printk" },
+
+	/* KERN_NAMETRANS not used */
+	/* KERN_PPC_HTABRECLAIM not used */
+	/* KERN_PPC_ZEROPAGED not used */
+	{ CTL_INT,	KERN_PPC_POWERSAVE_NAP,		"powersave-nap" },
+
+	{ CTL_STR,	KERN_MODPROBE,			"modprobe" },
+	{ CTL_INT,	KERN_SG_BIG_BUFF,		"sg-big-buff" },
+	{ CTL_INT,	KERN_ACCT,			"acct" },
+	/* KERN_PPC_L2CR "l2cr" no longer used */
+
+	/* KERN_RTSIGNR not used */
+	/* KERN_RTSIGMAX not used */
+
+	{ CTL_ULONG,	KERN_SHMMAX,			"shmmax" },
+	{ CTL_INT,	KERN_MSGMAX,			"msgmax" },
+	{ CTL_INT,	KERN_MSGMNB,			"msgmnb" },
+	/* KERN_MSGPOOL not used*/
+	{ CTL_INT,	KERN_SYSRQ,			"sysrq" },
+	{ CTL_INT,	KERN_MAX_THREADS,		"threads-max" },
+	{ CTL_DIR,	KERN_RANDOM,			"random",	bin_random_table },
+	{ CTL_ULONG,	KERN_SHMALL,			"shmall" },
+	{ CTL_INT,	KERN_MSGMNI,			"msgmni" },
+	{ CTL_INT,	KERN_SEM,			"sem" },
+	{ CTL_INT,	KERN_SPARC_STOP_A,		"stop-a" },
+	{ CTL_INT,	KERN_SHMMNI,			"shmmni" },
+
+	{ CTL_INT,	KERN_OVERFLOWUID,		"overflowuid" },
+	{ CTL_INT,	KERN_OVERFLOWGID,		"overflowgid" },
+
+	{ CTL_STR,	KERN_HOTPLUG,			"hotplug", },
+	{ CTL_INT,	KERN_IEEE_EMULATION_WARNINGS,	"ieee_emulation_warnings" },
+
+	{ CTL_INT,	KERN_S390_USER_DEBUG_LOGGING,	"userprocess_debug" },
+	{ CTL_INT,	KERN_CORE_USES_PID,		"core_uses_pid" },
+	/* KERN_TAINTED "tainted" no longer used */
+	{ CTL_INT,	KERN_CADPID,			"cad_pid" },
+	{ CTL_INT,	KERN_PIDMAX,			"pid_max" },
+	{ CTL_STR,	KERN_CORE_PATTERN,		"core_pattern" },
+	{ CTL_INT,	KERN_PANIC_ON_OOPS,		"panic_on_oops" },
+	{ CTL_INT,	KERN_HPPA_PWRSW,		"soft-power" },
+	{ CTL_INT,	KERN_HPPA_UNALIGNED,		"unaligned-trap" },
+
+	{ CTL_INT,	KERN_PRINTK_RATELIMIT,		"printk_ratelimit" },
+	{ CTL_INT,	KERN_PRINTK_RATELIMIT_BURST,	"printk_ratelimit_burst" },
+
+	{ CTL_DIR,	KERN_PTY,			"pty",		bin_pty_table },
+	{ CTL_INT,	KERN_NGROUPS_MAX,		"ngroups_max" },
+	{ CTL_INT,	KERN_SPARC_SCONS_PWROFF,	"scons-poweroff" },
+	/* KERN_HZ_TIMER "hz_timer" no longer used */
+	{ CTL_INT,	KERN_UNKNOWN_NMI_PANIC,		"unknown_nmi_panic" },
+	{ CTL_INT,	KERN_BOOTLOADER_TYPE,		"bootloader_type" },
+	{ CTL_INT,	KERN_RANDOMIZE,			"randomize_va_space" },
+
+	{ CTL_INT,	KERN_SPIN_RETRY,		"spin_retry" },
+	/* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
+	{ CTL_INT,	KERN_IA64_UNALIGNED,		"ignore-unaligned-usertrap" },
+	{ CTL_INT,	KERN_COMPAT_LOG,		"compat-log" },
+	{ CTL_INT,	KERN_MAX_LOCK_DEPTH,		"max_lock_depth" },
+	{ CTL_INT,	KERN_NMI_WATCHDOG,		"nmi_watchdog" },
+	{ CTL_INT,	KERN_PANIC_ON_NMI,		"panic_on_unrecovered_nmi" },
+	{}
+};
+
+static const struct bin_table bin_vm_table[] = {
+	{ CTL_INT,	VM_OVERCOMMIT_MEMORY,		"overcommit_memory" },
+	{ CTL_INT,	VM_PAGE_CLUSTER,		"page-cluster" },
+	{ CTL_INT,	VM_DIRTY_BACKGROUND,		"dirty_background_ratio" },
+	{ CTL_INT,	VM_DIRTY_RATIO,			"dirty_ratio" },
+	/* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
+	/* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
+	{ CTL_INT,	VM_NR_PDFLUSH_THREADS,		"nr_pdflush_threads" },
+	{ CTL_INT,	VM_OVERCOMMIT_RATIO,		"overcommit_ratio" },
+	/* VM_PAGEBUF unused */
+	/* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
+	{ CTL_INT,	VM_SWAPPINESS,			"swappiness" },
+	{ CTL_INT,	VM_LOWMEM_RESERVE_RATIO,	"lowmem_reserve_ratio" },
+	{ CTL_INT,	VM_MIN_FREE_KBYTES,		"min_free_kbytes" },
+	{ CTL_INT,	VM_MAX_MAP_COUNT,		"max_map_count" },
+	{ CTL_INT,	VM_LAPTOP_MODE,			"laptop_mode" },
+	{ CTL_INT,	VM_BLOCK_DUMP,			"block_dump" },
+	{ CTL_INT,	VM_HUGETLB_GROUP,		"hugetlb_shm_group" },
+	{ CTL_INT,	VM_VFS_CACHE_PRESSURE,	"vfs_cache_pressure" },
+	{ CTL_INT,	VM_LEGACY_VA_LAYOUT,		"legacy_va_layout" },
+	/* VM_SWAP_TOKEN_TIMEOUT unused */
+	{ CTL_INT,	VM_DROP_PAGECACHE,		"drop_caches" },
+	{ CTL_INT,	VM_PERCPU_PAGELIST_FRACTION,	"percpu_pagelist_fraction" },
+	{ CTL_INT,	VM_ZONE_RECLAIM_MODE,		"zone_reclaim_mode" },
+	{ CTL_INT,	VM_MIN_UNMAPPED,		"min_unmapped_ratio" },
+	{ CTL_INT,	VM_PANIC_ON_OOM,		"panic_on_oom" },
+	{ CTL_INT,	VM_VDSO_ENABLED,		"vdso_enabled" },
+	{ CTL_INT,	VM_MIN_SLAB,			"min_slab_ratio" },
+
+	{}
+};
+
+static const struct bin_table bin_net_core_table[] = {
+	{ CTL_INT,	NET_CORE_WMEM_MAX,	"wmem_max" },
+	{ CTL_INT,	NET_CORE_RMEM_MAX,	"rmem_max" },
+	{ CTL_INT,	NET_CORE_WMEM_DEFAULT,	"wmem_default" },
+	{ CTL_INT,	NET_CORE_RMEM_DEFAULT,	"rmem_default" },
+	/* NET_CORE_DESTROY_DELAY unused */
+	{ CTL_INT,	NET_CORE_MAX_BACKLOG,	"netdev_max_backlog" },
+	/* NET_CORE_FASTROUTE unused */
+	{ CTL_INT,	NET_CORE_MSG_COST,	"message_cost" },
+	{ CTL_INT,	NET_CORE_MSG_BURST,	"message_burst" },
+	{ CTL_INT,	NET_CORE_OPTMEM_MAX,	"optmem_max" },
+	/* NET_CORE_HOT_LIST_LENGTH unused */
+	/* NET_CORE_DIVERT_VERSION unused */
+	/* NET_CORE_NO_CONG_THRESH unused */
+	/* NET_CORE_NO_CONG unused */
+	/* NET_CORE_LO_CONG unused */
+	/* NET_CORE_MOD_CONG unused */
+	{ CTL_INT,	NET_CORE_DEV_WEIGHT,	"dev_weight" },
+	{ CTL_INT,	NET_CORE_SOMAXCONN,	"somaxconn" },
+	{ CTL_INT,	NET_CORE_BUDGET,	"netdev_budget" },
+	{ CTL_INT,	NET_CORE_AEVENT_ETIME,	"xfrm_aevent_etime" },
+	{ CTL_INT,	NET_CORE_AEVENT_RSEQTH,	"xfrm_aevent_rseqth" },
+	{ CTL_INT,	NET_CORE_WARNINGS,	"warnings" },
+	{},
+};
+
+static const struct bin_table bin_net_unix_table[] = {
+	/* NET_UNIX_DESTROY_DELAY unused */
+	/* NET_UNIX_DELETE_DELAY unused */
+	{ CTL_INT,	NET_UNIX_MAX_DGRAM_QLEN,	"max_dgram_qlen" },
+	{}
+};
+
+static const struct bin_table bin_net_ipv4_route_table[] = {
+	{ CTL_INT,	NET_IPV4_ROUTE_FLUSH,			"flush" },
+	/* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
+	/* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
+	{ CTL_INT,	NET_IPV4_ROUTE_GC_THRESH,		"gc_thresh" },
+	{ CTL_INT,	NET_IPV4_ROUTE_MAX_SIZE,		"max_size" },
+	{ CTL_INT,	NET_IPV4_ROUTE_GC_MIN_INTERVAL,		"gc_min_interval" },
+	{ CTL_INT,	NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,	"gc_min_interval_ms" },
+	{ CTL_INT,	NET_IPV4_ROUTE_GC_TIMEOUT,		"gc_timeout" },
+	{ CTL_INT,	NET_IPV4_ROUTE_GC_INTERVAL,		"gc_interval" },
+	{ CTL_INT,	NET_IPV4_ROUTE_REDIRECT_LOAD,		"redirect_load" },
+	{ CTL_INT,	NET_IPV4_ROUTE_REDIRECT_NUMBER,		"redirect_number" },
+	{ CTL_INT,	NET_IPV4_ROUTE_REDIRECT_SILENCE,	"redirect_silence" },
+	{ CTL_INT,	NET_IPV4_ROUTE_ERROR_COST,		"error_cost" },
+	{ CTL_INT,	NET_IPV4_ROUTE_ERROR_BURST,		"error_burst" },
+	{ CTL_INT,	NET_IPV4_ROUTE_GC_ELASTICITY,		"gc_elasticity" },
+	{ CTL_INT,	NET_IPV4_ROUTE_MTU_EXPIRES,		"mtu_expires" },
+	{ CTL_INT,	NET_IPV4_ROUTE_MIN_PMTU,		"min_pmtu" },
+	{ CTL_INT,	NET_IPV4_ROUTE_MIN_ADVMSS,		"min_adv_mss" },
+	{ CTL_INT,	NET_IPV4_ROUTE_SECRET_INTERVAL,		"secret_interval" },
+	{}
+};
+
+static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
+	{ CTL_INT,	NET_IPV4_CONF_FORWARDING,		"forwarding" },
+	{ CTL_INT,	NET_IPV4_CONF_MC_FORWARDING,		"mc_forwarding" },
+
+	{ CTL_INT,	NET_IPV4_CONF_ACCEPT_REDIRECTS,		"accept_redirects" },
+	{ CTL_INT,	NET_IPV4_CONF_SECURE_REDIRECTS,		"secure_redirects" },
+	{ CTL_INT,	NET_IPV4_CONF_SEND_REDIRECTS,		"send_redirects" },
+	{ CTL_INT,	NET_IPV4_CONF_SHARED_MEDIA,		"shared_media" },
+	{ CTL_INT,	NET_IPV4_CONF_RP_FILTER,		"rp_filter" },
+	{ CTL_INT,	NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,	"accept_source_route" },
+	{ CTL_INT,	NET_IPV4_CONF_PROXY_ARP,		"proxy_arp" },
+	{ CTL_INT,	NET_IPV4_CONF_MEDIUM_ID,		"medium_id" },
+	{ CTL_INT,	NET_IPV4_CONF_BOOTP_RELAY,		"bootp_relay" },
+	{ CTL_INT,	NET_IPV4_CONF_LOG_MARTIANS,		"log_martians" },
+	{ CTL_INT,	NET_IPV4_CONF_TAG,			"tag" },
+	{ CTL_INT,	NET_IPV4_CONF_ARPFILTER,		"arp_filter" },
+	{ CTL_INT,	NET_IPV4_CONF_ARP_ANNOUNCE,		"arp_announce" },
+	{ CTL_INT,	NET_IPV4_CONF_ARP_IGNORE,		"arp_ignore" },
+	{ CTL_INT,	NET_IPV4_CONF_ARP_ACCEPT,		"arp_accept" },
+	{ CTL_INT,	NET_IPV4_CONF_ARP_NOTIFY,		"arp_notify" },
+
+	{ CTL_INT,	NET_IPV4_CONF_NOXFRM,			"disable_xfrm" },
+	{ CTL_INT,	NET_IPV4_CONF_NOPOLICY,			"disable_policy" },
+	{ CTL_INT,	NET_IPV4_CONF_FORCE_IGMP_VERSION,	"force_igmp_version" },
+	{ CTL_INT,	NET_IPV4_CONF_PROMOTE_SECONDARIES,	"promote_secondaries" },
+	{}
+};
+
+static const struct bin_table bin_net_ipv4_conf_table[] = {
+	{ CTL_DIR,	NET_PROTO_CONF_ALL,	"all",		bin_net_ipv4_conf_vars_table },
+	{ CTL_DIR,	NET_PROTO_CONF_DEFAULT,	"default",	bin_net_ipv4_conf_vars_table },
+	{ CTL_DIR,	0, NULL, bin_net_ipv4_conf_vars_table },
+	{}
+};
+
+static const struct bin_table bin_net_neigh_vars_table[] = {
+	{ CTL_INT,	NET_NEIGH_MCAST_SOLICIT,	"mcast_solicit" },
+	{ CTL_INT,	NET_NEIGH_UCAST_SOLICIT,	"ucast_solicit" },
+	{ CTL_INT,	NET_NEIGH_APP_SOLICIT,		"app_solicit" },
+	/* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
+	{ CTL_INT,	NET_NEIGH_REACHABLE_TIME,	"base_reachable_time" },
+	{ CTL_INT,	NET_NEIGH_DELAY_PROBE_TIME,	"delay_first_probe_time" },
+	{ CTL_INT,	NET_NEIGH_GC_STALE_TIME,	"gc_stale_time" },
+	{ CTL_INT,	NET_NEIGH_UNRES_QLEN,		"unres_qlen" },
+	{ CTL_INT,	NET_NEIGH_PROXY_QLEN,		"proxy_qlen" },
+	/* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
+	/* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
+	/* NET_NEIGH_LOCKTIME "locktime" no longer used */
+	{ CTL_INT,	NET_NEIGH_GC_INTERVAL,		"gc_interval" },
+	{ CTL_INT,	NET_NEIGH_GC_THRESH1,		"gc_thresh1" },
+	{ CTL_INT,	NET_NEIGH_GC_THRESH2,		"gc_thresh2" },
+	{ CTL_INT,	NET_NEIGH_GC_THRESH3,		"gc_thresh3" },
+	{ CTL_INT,	NET_NEIGH_RETRANS_TIME_MS,	"retrans_time_ms" },
+	{ CTL_INT,	NET_NEIGH_REACHABLE_TIME_MS,	"base_reachable_time_ms" },
+	{}
+};
+
+static const struct bin_table bin_net_neigh_table[] = {
+	{ CTL_DIR,	NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
+	{ CTL_DIR,	0, NULL, bin_net_neigh_vars_table },
+	{}
+};
+
+static const struct bin_table bin_net_ipv4_netfilter_table[] = {
+	{ CTL_INT,	NET_IPV4_NF_CONNTRACK_MAX,		"ip_conntrack_max" },
+
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT	"ip_conntrack_tcp_timeout_close_wait" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
+
+	/* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
+
+	{ CTL_INT,	NET_IPV4_NF_CONNTRACK_BUCKETS,		"ip_conntrack_buckets" },
+	{ CTL_INT,	NET_IPV4_NF_CONNTRACK_LOG_INVALID,	"ip_conntrack_log_invalid" },
+	/* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
+	{ CTL_INT,	NET_IPV4_NF_CONNTRACK_TCP_LOOSE,	"ip_conntrack_tcp_loose" },
+	{ CTL_INT,	NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,	"ip_conntrack_tcp_be_liberal" },
+	{ CTL_INT,	NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,	"ip_conntrack_tcp_max_retrans" },
+
+	/* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
+	/* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
+
+	{ CTL_INT,	NET_IPV4_NF_CONNTRACK_COUNT,		"ip_conntrack_count" },
+	{ CTL_INT,	NET_IPV4_NF_CONNTRACK_CHECKSUM,		"ip_conntrack_checksum" },
+	{}
+};
+
+static const struct bin_table bin_net_ipv4_table[] = {
+	{CTL_INT,	NET_IPV4_FORWARD,			"ip_forward" },
+
+	{ CTL_DIR,	NET_IPV4_CONF,		"conf",		bin_net_ipv4_conf_table },
+	{ CTL_DIR,	NET_IPV4_NEIGH,		"neigh",	bin_net_neigh_table },
+	{ CTL_DIR,	NET_IPV4_ROUTE,		"route",	bin_net_ipv4_route_table },
+	/* NET_IPV4_FIB_HASH unused */
+	{ CTL_DIR,	NET_IPV4_NETFILTER,	"netfilter",	bin_net_ipv4_netfilter_table },
+
+	{ CTL_INT,	NET_IPV4_TCP_TIMESTAMPS,		"tcp_timestamps" },
+	{ CTL_INT,	NET_IPV4_TCP_WINDOW_SCALING,		"tcp_window_scaling" },
+	{ CTL_INT,	NET_IPV4_TCP_SACK,			"tcp_sack" },
+	{ CTL_INT,	NET_IPV4_TCP_RETRANS_COLLAPSE,		"tcp_retrans_collapse" },
+	{ CTL_INT,	NET_IPV4_DEFAULT_TTL,			"ip_default_ttl" },
+	/* NET_IPV4_AUTOCONFIG unused */
+	{ CTL_INT,	NET_IPV4_NO_PMTU_DISC,			"ip_no_pmtu_disc" },
+	{ CTL_INT,	NET_IPV4_NONLOCAL_BIND,			"ip_nonlocal_bind" },
+	{ CTL_INT,	NET_IPV4_TCP_SYN_RETRIES,		"tcp_syn_retries" },
+	{ CTL_INT,	NET_TCP_SYNACK_RETRIES,			"tcp_synack_retries" },
+	{ CTL_INT,	NET_TCP_MAX_ORPHANS,			"tcp_max_orphans" },
+	{ CTL_INT,	NET_TCP_MAX_TW_BUCKETS,			"tcp_max_tw_buckets" },
+	{ CTL_INT,	NET_IPV4_DYNADDR,			"ip_dynaddr" },
+	{ CTL_INT,	NET_IPV4_TCP_KEEPALIVE_TIME,		"tcp_keepalive_time" },
+	{ CTL_INT,	NET_IPV4_TCP_KEEPALIVE_PROBES,		"tcp_keepalive_probes" },
+	{ CTL_INT,	NET_IPV4_TCP_KEEPALIVE_INTVL,		"tcp_keepalive_intvl" },
+	{ CTL_INT,	NET_IPV4_TCP_RETRIES1,			"tcp_retries1" },
+	{ CTL_INT,	NET_IPV4_TCP_RETRIES2,			"tcp_retries2" },
+	{ CTL_INT,	NET_IPV4_TCP_FIN_TIMEOUT,		"tcp_fin_timeout" },
+	{ CTL_INT,	NET_TCP_SYNCOOKIES,			"tcp_syncookies" },
+	{ CTL_INT,	NET_TCP_TW_RECYCLE,			"tcp_tw_recycle" },
+	{ CTL_INT,	NET_TCP_ABORT_ON_OVERFLOW,		"tcp_abort_on_overflow" },
+	{ CTL_INT,	NET_TCP_STDURG,				"tcp_stdurg" },
+	{ CTL_INT,	NET_TCP_RFC1337,			"tcp_rfc1337" },
+	{ CTL_INT,	NET_TCP_MAX_SYN_BACKLOG,		"tcp_max_syn_backlog" },
+	{ CTL_INT,	NET_IPV4_LOCAL_PORT_RANGE,		"ip_local_port_range" },
+	{ CTL_INT,	NET_IPV4_IGMP_MAX_MEMBERSHIPS,		"igmp_max_memberships" },
+	{ CTL_INT,	NET_IPV4_IGMP_MAX_MSF,			"igmp_max_msf" },
+	{ CTL_INT,	NET_IPV4_INET_PEER_THRESHOLD,		"inet_peer_threshold" },
+	{ CTL_INT,	NET_IPV4_INET_PEER_MINTTL,		"inet_peer_minttl" },
+	{ CTL_INT,	NET_IPV4_INET_PEER_MAXTTL,		"inet_peer_maxttl" },
+	{ CTL_INT,	NET_IPV4_INET_PEER_GC_MINTIME,		"inet_peer_gc_mintime" },
+	{ CTL_INT,	NET_IPV4_INET_PEER_GC_MAXTIME,		"inet_peer_gc_maxtime" },
+	{ CTL_INT,	NET_TCP_ORPHAN_RETRIES,			"tcp_orphan_retries" },
+	{ CTL_INT,	NET_TCP_FACK,				"tcp_fack" },
+	{ CTL_INT,	NET_TCP_REORDERING,			"tcp_reordering" },
+	{ CTL_INT,	NET_TCP_ECN,				"tcp_ecn" },
+	{ CTL_INT,	NET_TCP_DSACK,				"tcp_dsack" },
+	{ CTL_INT,	NET_TCP_MEM,				"tcp_mem" },
+	{ CTL_INT,	NET_TCP_WMEM,				"tcp_wmem" },
+	{ CTL_INT,	NET_TCP_RMEM,				"tcp_rmem" },
+	{ CTL_INT,	NET_TCP_APP_WIN,			"tcp_app_win" },
+	{ CTL_INT,	NET_TCP_ADV_WIN_SCALE,			"tcp_adv_win_scale" },
+	{ CTL_INT,	NET_TCP_TW_REUSE,			"tcp_tw_reuse" },
+	{ CTL_INT,	NET_TCP_FRTO,				"tcp_frto" },
+	{ CTL_INT,	NET_TCP_FRTO_RESPONSE,			"tcp_frto_response" },
+	{ CTL_INT,	NET_TCP_LOW_LATENCY,			"tcp_low_latency" },
+	{ CTL_INT,	NET_TCP_NO_METRICS_SAVE,		"tcp_no_metrics_save" },
+	{ CTL_INT,	NET_TCP_MODERATE_RCVBUF,		"tcp_moderate_rcvbuf" },
+	{ CTL_INT,	NET_TCP_TSO_WIN_DIVISOR,		"tcp_tso_win_divisor" },
+	{ CTL_STR,	NET_TCP_CONG_CONTROL,			"tcp_congestion_control" },
+	{ CTL_INT,	NET_TCP_ABC,				"tcp_abc" },
+	{ CTL_INT,	NET_TCP_MTU_PROBING,			"tcp_mtu_probing" },
+	{ CTL_INT,	NET_TCP_BASE_MSS,			"tcp_base_mss" },
+	{ CTL_INT,	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,	"tcp_workaround_signed_windows" },
+	{ CTL_INT,	NET_TCP_DMA_COPYBREAK,			"tcp_dma_copybreak" },
+	{ CTL_INT,	NET_TCP_SLOW_START_AFTER_IDLE,		"tcp_slow_start_after_idle" },
+	{ CTL_INT,	NET_CIPSOV4_CACHE_ENABLE,		"cipso_cache_enable" },
+	{ CTL_INT,	NET_CIPSOV4_CACHE_BUCKET_SIZE,		"cipso_cache_bucket_size" },
+	{ CTL_INT,	NET_CIPSOV4_RBM_OPTFMT,			"cipso_rbm_optfmt" },
+	{ CTL_INT,	NET_CIPSOV4_RBM_STRICTVALID,		"cipso_rbm_strictvalid" },
+	/* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
+	{ CTL_STR,	NET_TCP_ALLOWED_CONG_CONTROL,		"tcp_allowed_congestion_control" },
+	{ CTL_INT,	NET_TCP_MAX_SSTHRESH,			"tcp_max_ssthresh" },
+
+	{ CTL_INT,	NET_IPV4_ICMP_ECHO_IGNORE_ALL,		"icmp_echo_ignore_all" },
+	{ CTL_INT,	NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,	"icmp_echo_ignore_broadcasts" },
+	{ CTL_INT,	NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,	"icmp_ignore_bogus_error_responses" },
+	{ CTL_INT,	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,	"icmp_errors_use_inbound_ifaddr" },
+	{ CTL_INT,	NET_IPV4_ICMP_RATELIMIT,		"icmp_ratelimit" },
+	{ CTL_INT,	NET_IPV4_ICMP_RATEMASK,			"icmp_ratemask" },
+
+	{ CTL_INT,	NET_IPV4_IPFRAG_HIGH_THRESH,		"ipfrag_high_thresh" },
+	{ CTL_INT,	NET_IPV4_IPFRAG_LOW_THRESH,		"ipfrag_low_thresh" },
+	{ CTL_INT,	NET_IPV4_IPFRAG_TIME,			"ipfrag_time" },
+
+	{ CTL_INT,	NET_IPV4_IPFRAG_SECRET_INTERVAL,	"ipfrag_secret_interval" },
+	/* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
+
+	{ CTL_INT,	2088 /* NET_IPQ_QMAX */,		"ip_queue_maxlen" },
+
+	/* NET_TCP_DEFAULT_WIN_SCALE unused */
+	/* NET_TCP_BIC_BETA unused */
+	/* NET_IPV4_TCP_MAX_KA_PROBES unused */
+	/* NET_IPV4_IP_MASQ_DEBUG unused */
+	/* NET_TCP_SYN_TAILDROP unused */
+	/* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
+	/* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
+	/* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
+	/* NET_IPV4_ICMP_PARAMPROB_RATE unused */
+	/* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
+	/* NET_IPV4_ALWAYS_DEFRAG unused */
+	{}
+};
+
+static const struct bin_table bin_net_ipx_table[] = {
+	{ CTL_INT,	NET_IPX_PPROP_BROADCASTING,	"ipx_pprop_broadcasting" },
+	/* NET_IPX_FORWARDING unused */
+	{}
+};
+
+static const struct bin_table bin_net_atalk_table[] = {
+	{ CTL_INT,	NET_ATALK_AARP_EXPIRY_TIME,		"aarp-expiry-time" },
+	{ CTL_INT,	NET_ATALK_AARP_TICK_TIME,		"aarp-tick-time" },
+	{ CTL_INT,	NET_ATALK_AARP_RETRANSMIT_LIMIT,	"aarp-retransmit-limit" },
+	{ CTL_INT,	NET_ATALK_AARP_RESOLVE_TIME,		"aarp-resolve-time" },
+	{},
+};
+
+static const struct bin_table bin_net_netrom_table[] = {
+	{ CTL_INT,	NET_NETROM_DEFAULT_PATH_QUALITY,		"default_path_quality" },
+	{ CTL_INT,	NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER,	"obsolescence_count_initialiser" },
+	{ CTL_INT,	NET_NETROM_NETWORK_TTL_INITIALISER,		"network_ttl_initialiser" },
+	{ CTL_INT,	NET_NETROM_TRANSPORT_TIMEOUT,			"transport_timeout" },
+	{ CTL_INT,	NET_NETROM_TRANSPORT_MAXIMUM_TRIES,		"transport_maximum_tries" },
+	{ CTL_INT,	NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY,		"transport_acknowledge_delay" },
+	{ CTL_INT,	NET_NETROM_TRANSPORT_BUSY_DELAY,		"transport_busy_delay" },
+	{ CTL_INT,	NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE,	"transport_requested_window_size" },
+	{ CTL_INT,	NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT,	"transport_no_activity_timeout" },
+	{ CTL_INT,	NET_NETROM_ROUTING_CONTROL,			"routing_control" },
+	{ CTL_INT,	NET_NETROM_LINK_FAILS_COUNT,			"link_fails_count" },
+	{ CTL_INT,	NET_NETROM_RESET,				"reset" },
+	{}
+};
+
+static const struct bin_table bin_net_ax25_param_table[] = {
+	{ CTL_INT,	NET_AX25_IP_DEFAULT_MODE,	"ip_default_mode" },
+	{ CTL_INT,	NET_AX25_DEFAULT_MODE,		"ax25_default_mode" },
+	{ CTL_INT,	NET_AX25_BACKOFF_TYPE,		"backoff_type" },
+	{ CTL_INT,	NET_AX25_CONNECT_MODE,		"connect_mode" },
+	{ CTL_INT,	NET_AX25_STANDARD_WINDOW,	"standard_window_size" },
+	{ CTL_INT,	NET_AX25_EXTENDED_WINDOW,	"extended_window_size" },
+	{ CTL_INT,	NET_AX25_T1_TIMEOUT,		"t1_timeout" },
+	{ CTL_INT,	NET_AX25_T2_TIMEOUT,		"t2_timeout" },
+	{ CTL_INT,	NET_AX25_T3_TIMEOUT,		"t3_timeout" },
+	{ CTL_INT,	NET_AX25_IDLE_TIMEOUT,		"idle_timeout" },
+	{ CTL_INT,	NET_AX25_N2,			"maximum_retry_count" },
+	{ CTL_INT,	NET_AX25_PACLEN,		"maximum_packet_length" },
+	{ CTL_INT,	NET_AX25_PROTOCOL,		"protocol" },
+	{ CTL_INT,	NET_AX25_DAMA_SLAVE_TIMEOUT,	"dama_slave_timeout" },
+	{}
+};
+
+static const struct bin_table bin_net_ax25_table[] = {
+	{ CTL_DIR,	0, NULL, bin_net_ax25_param_table },
+	{}
+};
+
+static const struct bin_table bin_net_rose_table[] = {
+	{ CTL_INT,	NET_ROSE_RESTART_REQUEST_TIMEOUT,	"restart_request_timeout" },
+	{ CTL_INT,	NET_ROSE_CALL_REQUEST_TIMEOUT,		"call_request_timeout" },
+	{ CTL_INT,	NET_ROSE_RESET_REQUEST_TIMEOUT,		"reset_request_timeout" },
+	{ CTL_INT,	NET_ROSE_CLEAR_REQUEST_TIMEOUT,		"clear_request_timeout" },
+	{ CTL_INT,	NET_ROSE_ACK_HOLD_BACK_TIMEOUT,		"acknowledge_hold_back_timeout" },
+	{ CTL_INT,	NET_ROSE_ROUTING_CONTROL,		"routing_control" },
+	{ CTL_INT,	NET_ROSE_LINK_FAIL_TIMEOUT,		"link_fail_timeout" },
+	{ CTL_INT,	NET_ROSE_MAX_VCS,			"maximum_virtual_circuits" },
+	{ CTL_INT,	NET_ROSE_WINDOW_SIZE,			"window_size" },
+	{ CTL_INT,	NET_ROSE_NO_ACTIVITY_TIMEOUT,		"no_activity_timeout" },
+	{}
+};
+
+static const struct bin_table bin_net_ipv6_conf_var_table[] = {
+	{ CTL_INT,	NET_IPV6_FORWARDING,			"forwarding" },
+	{ CTL_INT,	NET_IPV6_HOP_LIMIT,			"hop_limit" },
+	{ CTL_INT,	NET_IPV6_MTU,				"mtu" },
+	{ CTL_INT,	NET_IPV6_ACCEPT_RA,			"accept_ra" },
+	{ CTL_INT,	NET_IPV6_ACCEPT_REDIRECTS,		"accept_redirects" },
+	{ CTL_INT,	NET_IPV6_AUTOCONF,			"autoconf" },
+	{ CTL_INT,	NET_IPV6_DAD_TRANSMITS,			"dad_transmits" },
+	{ CTL_INT,	NET_IPV6_RTR_SOLICITS,			"router_solicitations" },
+	{ CTL_INT,	NET_IPV6_RTR_SOLICIT_INTERVAL,		"router_solicitation_interval" },
+	{ CTL_INT,	NET_IPV6_RTR_SOLICIT_DELAY,		"router_solicitation_delay" },
+	{ CTL_INT,	NET_IPV6_USE_TEMPADDR,			"use_tempaddr" },
+	{ CTL_INT,	NET_IPV6_TEMP_VALID_LFT,		"temp_valid_lft" },
+	{ CTL_INT,	NET_IPV6_TEMP_PREFERED_LFT,		"temp_prefered_lft" },
+	{ CTL_INT,	NET_IPV6_REGEN_MAX_RETRY,		"regen_max_retry" },
+	{ CTL_INT,	NET_IPV6_MAX_DESYNC_FACTOR,		"max_desync_factor" },
+	{ CTL_INT,	NET_IPV6_MAX_ADDRESSES,			"max_addresses" },
+	{ CTL_INT,	NET_IPV6_FORCE_MLD_VERSION,		"force_mld_version" },
+	{ CTL_INT,	NET_IPV6_ACCEPT_RA_DEFRTR,		"accept_ra_defrtr" },
+	{ CTL_INT,	NET_IPV6_ACCEPT_RA_PINFO,		"accept_ra_pinfo" },
+	{ CTL_INT,	NET_IPV6_ACCEPT_RA_RTR_PREF,		"accept_ra_rtr_pref" },
+	{ CTL_INT,	NET_IPV6_RTR_PROBE_INTERVAL,		"router_probe_interval" },
+	{ CTL_INT,	NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN,	"accept_ra_rt_info_max_plen" },
+	{ CTL_INT,	NET_IPV6_PROXY_NDP,			"proxy_ndp" },
+	{ CTL_INT,	NET_IPV6_ACCEPT_SOURCE_ROUTE,		"accept_source_route" },
+	{}
+};
+
+static const struct bin_table bin_net_ipv6_conf_table[] = {
+	{ CTL_DIR,	NET_PROTO_CONF_ALL,		"all",	bin_net_ipv6_conf_var_table },
+	{ CTL_DIR,	NET_PROTO_CONF_DEFAULT, 	"default", bin_net_ipv6_conf_var_table },
+	{ CTL_DIR,	0, NULL, bin_net_ipv6_conf_var_table },
+	{}
+};
+
+static const struct bin_table bin_net_ipv6_route_table[] = {
+	/* NET_IPV6_ROUTE_FLUSH	"flush"  no longer used */
+	{ CTL_INT,	NET_IPV6_ROUTE_GC_THRESH,		"gc_thresh" },
+	{ CTL_INT,	NET_IPV6_ROUTE_MAX_SIZE,		"max_size" },
+	{ CTL_INT,	NET_IPV6_ROUTE_GC_MIN_INTERVAL,		"gc_min_interval" },
+	{ CTL_INT,	NET_IPV6_ROUTE_GC_TIMEOUT,		"gc_timeout" },
+	{ CTL_INT,	NET_IPV6_ROUTE_GC_INTERVAL,		"gc_interval" },
+	{ CTL_INT,	NET_IPV6_ROUTE_GC_ELASTICITY,		"gc_elasticity" },
+	{ CTL_INT,	NET_IPV6_ROUTE_MTU_EXPIRES,		"mtu_expires" },
+	{ CTL_INT,	NET_IPV6_ROUTE_MIN_ADVMSS,		"min_adv_mss" },
+	{ CTL_INT,	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,	"gc_min_interval_ms" },
+	{}
+};
+
+static const struct bin_table bin_net_ipv6_icmp_table[] = {
+	{ CTL_INT,	NET_IPV6_ICMP_RATELIMIT,	"ratelimit" },
+	{}
+};
+
+static const struct bin_table bin_net_ipv6_table[] = {
+	{ CTL_DIR,	NET_IPV6_CONF,		"conf",		bin_net_ipv6_conf_table },
+	{ CTL_DIR,	NET_IPV6_NEIGH,		"neigh",	bin_net_neigh_table },
+	{ CTL_DIR,	NET_IPV6_ROUTE,		"route",	bin_net_ipv6_route_table },
+	{ CTL_DIR,	NET_IPV6_ICMP,		"icmp",		bin_net_ipv6_icmp_table },
+	{ CTL_INT,	NET_IPV6_BINDV6ONLY,		"bindv6only" },
+	{ CTL_INT,	NET_IPV6_IP6FRAG_HIGH_THRESH,	"ip6frag_high_thresh" },
+	{ CTL_INT,	NET_IPV6_IP6FRAG_LOW_THRESH,	"ip6frag_low_thresh" },
+	{ CTL_INT,	NET_IPV6_IP6FRAG_TIME,		"ip6frag_time" },
+	{ CTL_INT,	NET_IPV6_IP6FRAG_SECRET_INTERVAL,	"ip6frag_secret_interval" },
+	{ CTL_INT,	NET_IPV6_MLD_MAX_MSF,		"mld_max_msf" },
+	{ CTL_INT,	2088 /* IPQ_QMAX */,		"ip6_queue_maxlen" },
+	{}
+};
+
+static const struct bin_table bin_net_x25_table[] = {
+	{ CTL_INT,	NET_X25_RESTART_REQUEST_TIMEOUT,	"restart_request_timeout" },
+	{ CTL_INT,	NET_X25_CALL_REQUEST_TIMEOUT,		"call_request_timeout" },
+	{ CTL_INT,	NET_X25_RESET_REQUEST_TIMEOUT,	"reset_request_timeout" },
+	{ CTL_INT,	NET_X25_CLEAR_REQUEST_TIMEOUT,	"clear_request_timeout" },
+	{ CTL_INT,	NET_X25_ACK_HOLD_BACK_TIMEOUT,	"acknowledgement_hold_back_timeout" },
+	{ CTL_INT,	NET_X25_FORWARD,			"x25_forward" },
+	{}
+};
+
+static const struct bin_table bin_net_tr_table[] = {
+	{ CTL_INT,	NET_TR_RIF_TIMEOUT,	"rif_timeout" },
+	{}
+};
+
+
+static const struct bin_table bin_net_decnet_conf_vars[] = {
+	{ CTL_INT,	NET_DECNET_CONF_DEV_FORWARDING,	"forwarding" },
+	{ CTL_INT,	NET_DECNET_CONF_DEV_PRIORITY,	"priority" },
+	{ CTL_INT,	NET_DECNET_CONF_DEV_T2,		"t2" },
+	{ CTL_INT,	NET_DECNET_CONF_DEV_T3,		"t3" },
+	{}
+};
+
+static const struct bin_table bin_net_decnet_conf[] = {
+	{ CTL_DIR, NET_DECNET_CONF_ETHER,    "ethernet", bin_net_decnet_conf_vars },
+	{ CTL_DIR, NET_DECNET_CONF_GRE,	     "ipgre",    bin_net_decnet_conf_vars },
+	{ CTL_DIR, NET_DECNET_CONF_X25,	     "x25",      bin_net_decnet_conf_vars },
+	{ CTL_DIR, NET_DECNET_CONF_PPP,	     "ppp",      bin_net_decnet_conf_vars },
+	{ CTL_DIR, NET_DECNET_CONF_DDCMP,    "ddcmp",    bin_net_decnet_conf_vars },
+	{ CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
+	{ CTL_DIR, 0,			     NULL,	 bin_net_decnet_conf_vars },
+	{}
+};
+
+static const struct bin_table bin_net_decnet_table[] = {
+	{ CTL_DIR,	NET_DECNET_CONF,		"conf",	bin_net_decnet_conf },
+	{ CTL_DNADR,	NET_DECNET_NODE_ADDRESS,	"node_address" },
+	{ CTL_STR,	NET_DECNET_NODE_NAME,		"node_name" },
+	{ CTL_STR,	NET_DECNET_DEFAULT_DEVICE,	"default_device" },
+	{ CTL_INT,	NET_DECNET_TIME_WAIT,		"time_wait" },
+	{ CTL_INT,	NET_DECNET_DN_COUNT,		"dn_count" },
+	{ CTL_INT,	NET_DECNET_DI_COUNT,		"di_count" },
+	{ CTL_INT,	NET_DECNET_DR_COUNT,		"dr_count" },
+	{ CTL_INT,	NET_DECNET_DST_GC_INTERVAL,	"dst_gc_interval" },
+	{ CTL_INT,	NET_DECNET_NO_FC_MAX_CWND,	"no_fc_max_cwnd" },
+	{ CTL_INT,	NET_DECNET_MEM,		"decnet_mem" },
+	{ CTL_INT,	NET_DECNET_RMEM,		"decnet_rmem" },
+	{ CTL_INT,	NET_DECNET_WMEM,		"decnet_wmem" },
+	{ CTL_INT,	NET_DECNET_DEBUG_LEVEL,	"debug" },
+	{}
+};
+
+static const struct bin_table bin_net_sctp_table[] = {
+	{ CTL_INT,	NET_SCTP_RTO_INITIAL,		"rto_initial" },
+	{ CTL_INT,	NET_SCTP_RTO_MIN,		"rto_min" },
+	{ CTL_INT,	NET_SCTP_RTO_MAX,		"rto_max" },
+	{ CTL_INT,	NET_SCTP_RTO_ALPHA,		"rto_alpha_exp_divisor" },
+	{ CTL_INT,	NET_SCTP_RTO_BETA,		"rto_beta_exp_divisor" },
+	{ CTL_INT,	NET_SCTP_VALID_COOKIE_LIFE,	"valid_cookie_life" },
+	{ CTL_INT,	NET_SCTP_ASSOCIATION_MAX_RETRANS,	"association_max_retrans" },
+	{ CTL_INT,	NET_SCTP_PATH_MAX_RETRANS,	"path_max_retrans" },
+	{ CTL_INT,	NET_SCTP_MAX_INIT_RETRANSMITS,	"max_init_retransmits" },
+	{ CTL_INT,	NET_SCTP_HB_INTERVAL,		"hb_interval" },
+	{ CTL_INT,	NET_SCTP_PRESERVE_ENABLE,	"cookie_preserve_enable" },
+	{ CTL_INT,	NET_SCTP_MAX_BURST,		"max_burst" },
+	{ CTL_INT,	NET_SCTP_ADDIP_ENABLE,		"addip_enable" },
+	{ CTL_INT,	NET_SCTP_PRSCTP_ENABLE,		"prsctp_enable" },
+	{ CTL_INT,	NET_SCTP_SNDBUF_POLICY,		"sndbuf_policy" },
+	{ CTL_INT,	NET_SCTP_SACK_TIMEOUT,		"sack_timeout" },
+	{ CTL_INT,	NET_SCTP_RCVBUF_POLICY,		"rcvbuf_policy" },
+	{}
+};
+
+static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
+	{ CTL_INT,	NET_LLC2_ACK_TIMEOUT,	"ack" },
+	{ CTL_INT,	NET_LLC2_P_TIMEOUT,	"p" },
+	{ CTL_INT,	NET_LLC2_REJ_TIMEOUT,	"rej" },
+	{ CTL_INT,	NET_LLC2_BUSY_TIMEOUT,	"busy" },
+	{}
+};
+
+static const struct bin_table bin_net_llc_station_table[] = {
+	{ CTL_INT,	NET_LLC_STATION_ACK_TIMEOUT,	"ack_timeout" },
+	{}
+};
+
+static const struct bin_table bin_net_llc_llc2_table[] = {
+	{ CTL_DIR,	NET_LLC2,		"timeout",	bin_net_llc_llc2_timeout_table },
+	{}
+};
+
+static const struct bin_table bin_net_llc_table[] = {
+	{ CTL_DIR,	NET_LLC2,		"llc2",		bin_net_llc_llc2_table },
+	{ CTL_DIR,	NET_LLC_STATION,	"station",	bin_net_llc_station_table },
+	{}
+};
+
+static const struct bin_table bin_net_netfilter_table[] = {
+	{ CTL_INT,	NET_NF_CONNTRACK_MAX,			"nf_conntrack_max" },
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
+	/* NET_NF_CONNTRACK_UDP_TIMEOUT	"nf_conntrack_udp_timeout" no longer used */
+	/* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
+	/* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
+	/* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
+	{ CTL_INT,	NET_NF_CONNTRACK_BUCKETS,		"nf_conntrack_buckets" },
+	{ CTL_INT,	NET_NF_CONNTRACK_LOG_INVALID,		"nf_conntrack_log_invalid" },
+	/* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
+	{ CTL_INT,	NET_NF_CONNTRACK_TCP_LOOSE,		"nf_conntrack_tcp_loose" },
+	{ CTL_INT,	NET_NF_CONNTRACK_TCP_BE_LIBERAL,	"nf_conntrack_tcp_be_liberal" },
+	{ CTL_INT,	NET_NF_CONNTRACK_TCP_MAX_RETRANS,	"nf_conntrack_tcp_max_retrans" },
+	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
+	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
+	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
+	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
+	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
+	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
+	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
+	{ CTL_INT,	NET_NF_CONNTRACK_COUNT,			"nf_conntrack_count" },
+	/* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
+	/* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
+	{ CTL_INT,	NET_NF_CONNTRACK_FRAG6_LOW_THRESH,	"nf_conntrack_frag6_low_thresh" },
+	{ CTL_INT,	NET_NF_CONNTRACK_FRAG6_HIGH_THRESH,	"nf_conntrack_frag6_high_thresh" },
+	{ CTL_INT,	NET_NF_CONNTRACK_CHECKSUM,		"nf_conntrack_checksum" },
+
+	{}
+};
+
+static const struct bin_table bin_net_irda_table[] = {
+	{ CTL_INT,	NET_IRDA_DISCOVERY,		"discovery" },
+	{ CTL_STR,	NET_IRDA_DEVNAME,		"devname" },
+	{ CTL_INT,	NET_IRDA_DEBUG,			"debug" },
+	{ CTL_INT,	NET_IRDA_FAST_POLL,		"fast_poll_increase" },
+	{ CTL_INT,	NET_IRDA_DISCOVERY_SLOTS,	"discovery_slots" },
+	{ CTL_INT,	NET_IRDA_DISCOVERY_TIMEOUT,	"discovery_timeout" },
+	{ CTL_INT,	NET_IRDA_SLOT_TIMEOUT,		"slot_timeout" },
+	{ CTL_INT,	NET_IRDA_MAX_BAUD_RATE,		"max_baud_rate" },
+	{ CTL_INT,	NET_IRDA_MIN_TX_TURN_TIME,	"min_tx_turn_time" },
+	{ CTL_INT,	NET_IRDA_MAX_TX_DATA_SIZE,	"max_tx_data_size" },
+	{ CTL_INT,	NET_IRDA_MAX_TX_WINDOW,		"max_tx_window" },
+	{ CTL_INT,	NET_IRDA_MAX_NOREPLY_TIME,	"max_noreply_time" },
+	{ CTL_INT,	NET_IRDA_WARN_NOREPLY_TIME,	"warn_noreply_time" },
+	{ CTL_INT,	NET_IRDA_LAP_KEEPALIVE_TIME,	"lap_keepalive_time" },
+	{}
+};
+
+static const struct bin_table bin_net_table[] = {
+	{ CTL_DIR,	NET_CORE,		"core",		bin_net_core_table },
+	/* NET_ETHER not used */
+	/* NET_802 not used */
+	{ CTL_DIR,	NET_UNIX,		"unix",		bin_net_unix_table },
+	{ CTL_DIR,	NET_IPV4,		"ipv4",		bin_net_ipv4_table },
+	{ CTL_DIR,	NET_IPX,		"ipx",		bin_net_ipx_table },
+	{ CTL_DIR,	NET_ATALK,		"appletalk",	bin_net_atalk_table },
+	{ CTL_DIR,	NET_NETROM,		"netrom",	bin_net_netrom_table },
+	{ CTL_DIR,	NET_AX25,		"ax25",		bin_net_ax25_table },
+	/*  NET_BRIDGE "bridge" no longer used */
+	{ CTL_DIR,	NET_ROSE,		"rose",		bin_net_rose_table },
+	{ CTL_DIR,	NET_IPV6,		"ipv6",		bin_net_ipv6_table },
+	{ CTL_DIR,	NET_X25,		"x25",		bin_net_x25_table },
+	{ CTL_DIR,	NET_TR,			"token-ring",	bin_net_tr_table },
+	{ CTL_DIR,	NET_DECNET,		"decnet",	bin_net_decnet_table },
+	/*  NET_ECONET not used */
+	{ CTL_DIR,	NET_SCTP,		"sctp",		bin_net_sctp_table },
+	{ CTL_DIR,	NET_LLC,		"llc",		bin_net_llc_table },
+	{ CTL_DIR,	NET_NETFILTER,		"netfilter",	bin_net_netfilter_table },
+	/* NET_DCCP "dccp" no longer used */
+	{ CTL_DIR,	NET_IRDA,		"irda",		bin_net_irda_table },
+	{ CTL_INT,	2089,			"nf_conntrack_max" },
+	{}
+};
+
+static const struct bin_table bin_fs_quota_table[] = {
+	{ CTL_INT,	FS_DQ_LOOKUPS,		"lookups" },
+	{ CTL_INT,	FS_DQ_DROPS,		"drops" },
+	{ CTL_INT,	FS_DQ_READS,		"reads" },
+	{ CTL_INT,	FS_DQ_WRITES,		"writes" },
+	{ CTL_INT,	FS_DQ_CACHE_HITS,	"cache_hits" },
+	{ CTL_INT,	FS_DQ_ALLOCATED,	"allocated_dquots" },
+	{ CTL_INT,	FS_DQ_FREE,		"free_dquots" },
+	{ CTL_INT,	FS_DQ_SYNCS,		"syncs" },
+	{ CTL_INT,	FS_DQ_WARNINGS,		"warnings" },
+	{}
+};
+
+static const struct bin_table bin_fs_xfs_table[] = {
+	{ CTL_INT,	XFS_SGID_INHERIT,	"irix_sgid_inherit" },
+	{ CTL_INT,	XFS_SYMLINK_MODE,	"irix_symlink_mode" },
+	{ CTL_INT,	XFS_PANIC_MASK,		"panic_mask" },
+
+	{ CTL_INT,	XFS_ERRLEVEL,		"error_level" },
+	{ CTL_INT,	XFS_SYNCD_TIMER,	"xfssyncd_centisecs" },
+	{ CTL_INT,	XFS_INHERIT_SYNC,	"inherit_sync" },
+	{ CTL_INT,	XFS_INHERIT_NODUMP,	"inherit_nodump" },
+	{ CTL_INT,	XFS_INHERIT_NOATIME,	"inherit_noatime" },
+	{ CTL_INT,	XFS_BUF_TIMER,		"xfsbufd_centisecs" },
+	{ CTL_INT,	XFS_BUF_AGE,		"age_buffer_centisecs" },
+	{ CTL_INT,	XFS_INHERIT_NOSYM,	"inherit_nosymlinks" },
+	{ CTL_INT,	XFS_ROTORSTEP,	"rotorstep" },
+	{ CTL_INT,	XFS_INHERIT_NODFRG,	"inherit_nodefrag" },
+	{ CTL_INT,	XFS_FILESTREAM_TIMER,	"filestream_centisecs" },
+	{ CTL_INT,	XFS_STATS_CLEAR,	"stats_clear" },
+	{}
+};
+
+static const struct bin_table bin_fs_ocfs2_nm_table[] = {
+	{ CTL_STR,	1, "hb_ctl_path" },
+	{}
+};
+
+static const struct bin_table bin_fs_ocfs2_table[] = {
+	{ CTL_DIR,	1,	"nm",	bin_fs_ocfs2_nm_table },
+	{}
+};
+
+static const struct bin_table bin_inotify_table[] = {
+	{ CTL_INT,	INOTIFY_MAX_USER_INSTANCES,	"max_user_instances" },
+	{ CTL_INT,	INOTIFY_MAX_USER_WATCHES,	"max_user_watches" },
+	{ CTL_INT,	INOTIFY_MAX_QUEUED_EVENTS,	"max_queued_events" },
+	{}
+};
+
+static const struct bin_table bin_fs_table[] = {
+	{ CTL_INT,	FS_NRINODE,		"inode-nr" },
+	{ CTL_INT,	FS_STATINODE,		"inode-state" },
+	/* FS_MAXINODE unused */
+	/* FS_NRDQUOT unused */
+	/* FS_MAXDQUOT unused */
+	/* FS_NRFILE "file-nr" no longer used */
+	{ CTL_INT,	FS_MAXFILE,		"file-max" },
+	{ CTL_INT,	FS_DENTRY,		"dentry-state" },
+	/* FS_NRSUPER unused */
+	/* FS_MAXUPSER unused */
+	{ CTL_INT,	FS_OVERFLOWUID,		"overflowuid" },
+	{ CTL_INT,	FS_OVERFLOWGID,		"overflowgid" },
+	{ CTL_INT,	FS_LEASES,		"leases-enable" },
+	{ CTL_INT,	FS_DIR_NOTIFY,		"dir-notify-enable" },
+	{ CTL_INT,	FS_LEASE_TIME,		"lease-break-time" },
+	{ CTL_DIR,	FS_DQSTATS,		"quota",	bin_fs_quota_table },
+	{ CTL_DIR,	FS_XFS,			"xfs",		bin_fs_xfs_table },
+	{ CTL_ULONG,	FS_AIO_NR,		"aio-nr" },
+	{ CTL_ULONG,	FS_AIO_MAX_NR,		"aio-max-nr" },
+	{ CTL_DIR,	FS_INOTIFY,		"inotify",	bin_inotify_table },
+	{ CTL_DIR,	FS_OCFS2,		"ocfs2",	bin_fs_ocfs2_table },
+	{ CTL_INT,	KERN_SETUID_DUMPABLE,	"suid_dumpable" },
+	{}
+};
+
+static const struct bin_table bin_ipmi_table[] = {
+	{ CTL_INT,	DEV_IPMI_POWEROFF_POWERCYCLE,	"poweroff_powercycle" },
+	{}
+};
+
+static const struct bin_table bin_mac_hid_files[] = {
+	/* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
+	/* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
+	{ CTL_INT,	DEV_MAC_HID_MOUSE_BUTTON_EMULATION,	"mouse_button_emulation" },
+	{ CTL_INT,	DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE,	"mouse_button2_keycode" },
+	{ CTL_INT,	DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE,	"mouse_button3_keycode" },
+	/* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
+	{}
+};
+
+static const struct bin_table bin_raid_table[] = {
+	{ CTL_INT,	DEV_RAID_SPEED_LIMIT_MIN,	"speed_limit_min" },
+	{ CTL_INT,	DEV_RAID_SPEED_LIMIT_MAX,	"speed_limit_max" },
+	{}
+};
+
+static const struct bin_table bin_scsi_table[] = {
+	{ CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
+	{}
+};
+
+static const struct bin_table bin_dev_table[] = {
+	/* DEV_CDROM	"cdrom" no longer used */
+	/* DEV_HWMON unused */
+	/* DEV_PARPORT	"parport" no longer used */
+	{ CTL_DIR,	DEV_RAID,	"raid",		bin_raid_table },
+	{ CTL_DIR,	DEV_MAC_HID,	"mac_hid",	bin_mac_hid_files },
+	{ CTL_DIR,	DEV_SCSI,	"scsi",		bin_scsi_table },
+	{ CTL_DIR,	DEV_IPMI,	"ipmi",		bin_ipmi_table },
+	{}
+};
+
+static const struct bin_table bin_bus_isa_table[] = {
+	{ CTL_INT,	BUS_ISA_MEM_BASE,	"membase" },
+	{ CTL_INT,	BUS_ISA_PORT_BASE,	"portbase" },
+	{ CTL_INT,	BUS_ISA_PORT_SHIFT,	"portshift" },
+	{}
+};
+
+static const struct bin_table bin_bus_table[] = {
+	{ CTL_DIR,	CTL_BUS_ISA,	"isa",	bin_bus_isa_table },
+	{}
+};
+
+
+static const struct bin_table bin_s390dbf_table[] = {
+	{ CTL_INT,	5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
+	{ CTL_INT,	5679 /* CTL_S390DBF_ACTIVE */,	  "debug_active" },
+	{}
+};
+
+static const struct bin_table bin_sunrpc_table[] = {
+	/* CTL_RPCDEBUG	"rpc_debug"  no longer used */
+	/* CTL_NFSDEBUG "nfs_debug"  no longer used */
+	/* CTL_NFSDDEBUG "nfsd_debug" no longer used  */
+	/* CTL_NLMDEBUG "nlm_debug" no longer used */
+
+	{ CTL_INT,	CTL_SLOTTABLE_UDP,	"udp_slot_table_entries" },
+	{ CTL_INT,	CTL_SLOTTABLE_TCP,	"tcp_slot_table_entries" },
+	{ CTL_INT,	CTL_MIN_RESVPORT,	"min_resvport" },
+	{ CTL_INT,	CTL_MAX_RESVPORT,	"max_resvport" },
+	{}
+};
+
+static const struct bin_table bin_pm_table[] = {
+	/* frv specific */
+	/* 1 == CTL_PM_SUSPEND	"suspend"  no longer used" */
+	{ CTL_INT,	2 /* CTL_PM_CMODE */,		"cmode" },
+	{ CTL_INT,	3 /* CTL_PM_P0 */,		"p0" },
+	{ CTL_INT,	4 /* CTL_PM_CM */,		"cm" },
+	{}
+};
+
+static const struct bin_table bin_root_table[] = {
+	{ CTL_DIR,	CTL_KERN,	"kernel",	bin_kern_table },
+	{ CTL_DIR,	CTL_VM,		"vm",		bin_vm_table },
+	{ CTL_DIR,	CTL_NET,	"net",		bin_net_table },
+	/* CTL_PROC not used */
+	{ CTL_DIR,	CTL_FS,		"fs",		bin_fs_table },
+	/* CTL_DEBUG "debug" no longer used */
+	{ CTL_DIR,	CTL_DEV,	"dev",		bin_dev_table },
+	{ CTL_DIR,	CTL_BUS,	"bus",		bin_bus_table },
+	{ CTL_DIR,	CTL_ABI,	"abi" },
+	/* CTL_CPU not used */
+	/* CTL_ARLAN "arlan" no longer used */
+	{ CTL_DIR,	CTL_S390DBF,	"s390dbf",	bin_s390dbf_table },
+	{ CTL_DIR,	CTL_SUNRPC,	"sunrpc",	bin_sunrpc_table },
+	{ CTL_DIR,	CTL_PM,		"pm",		bin_pm_table },
+	{}
+};
+
+static ssize_t bin_dir(struct file *file,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+	return -ENOTDIR;
+}
+
+
+static ssize_t bin_string(struct file *file,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+	ssize_t result, copied = 0;
+
+	if (oldval && oldlen) {
+		char __user *lastp;
+		loff_t pos = 0;
+		int ch;
+
+		result = vfs_read(file, oldval, oldlen, &pos);
+		if (result < 0)
+			goto out;
+
+		copied = result;
+		lastp = oldval + copied - 1;
+
+		result = -EFAULT;
+		if (get_user(ch, lastp))
+			goto out;
+
+		/* Trim off the trailing newline */
+		if (ch == '\n') {
+			result = -EFAULT;
+			if (put_user('\0', lastp))
+				goto out;
+			copied -= 1;
+		}
+	}
+
+	if (newval && newlen) {
+		loff_t pos = 0;
+
+		result = vfs_write(file, newval, newlen, &pos);
+		if (result < 0)
+			goto out;
+	}
+
+	result = copied;
+out:
+	return result;
+}
+
+static ssize_t bin_intvec(struct file *file,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+	mm_segment_t old_fs = get_fs();
+	ssize_t copied = 0;
+	char *buffer;
+	ssize_t result;
+
+	result = -ENOMEM;
+	buffer = kmalloc(BUFSZ, GFP_KERNEL);
+	if (!buffer)
+		goto out;
+
+	if (oldval && oldlen) {
+		unsigned __user *vec = oldval;
+		size_t length = oldlen / sizeof(*vec);
+		loff_t pos = 0;
+		char *str, *end;
+		int i;
+
+		set_fs(KERNEL_DS);
+		result = vfs_read(file, buffer, BUFSZ - 1, &pos);
+		set_fs(old_fs);
+		if (result < 0)
+			goto out_kfree;
+
+		str = buffer;
+		end = str + result;
+		*end++ = '\0';
+		for (i = 0; i < length; i++) {
+			unsigned long value;
+
+			value = simple_strtoul(str, &str, 10);
+			while (isspace(*str))
+				str++;
+			
+			result = -EFAULT;
+			if (put_user(value, vec + i))
+				goto out_kfree;
+
+			copied += sizeof(*vec);
+			if (!isdigit(*str))
+				break;
+		}
+	}
+
+	if (newval && newlen) {
+		unsigned __user *vec = newval;
+		size_t length = newlen / sizeof(*vec);
+		loff_t pos = 0;
+		char *str, *end;
+		int i;
+
+		str = buffer;
+		end = str + BUFSZ;
+		for (i = 0; i < length; i++) {
+			unsigned long value;
+
+			result = -EFAULT;
+			if (get_user(value, vec + i))
+				goto out_kfree;
+
+			str += snprintf(str, end - str, "%lu\t", value);
+		}
+
+		set_fs(KERNEL_DS);
+		result = vfs_write(file, buffer, str - buffer, &pos);
+		set_fs(old_fs);
+		if (result < 0)
+			goto out_kfree;
+	}
+	result = copied;
+out_kfree:
+	kfree(buffer);
+out:
+	return result;
+}
+
+static ssize_t bin_ulongvec(struct file *file,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
 {
-	int op = 0, rc;
-
-	if (oldval)
-		op |= MAY_READ;
-	if (newval)
-		op |= MAY_WRITE;
-	if (sysctl_perm(root, table, op))
-		return -EPERM;
-
-	if (table->strategy) {
-		rc = table->strategy(table, oldval, oldlenp, newval, newlen);
-		if (rc < 0)
-			return rc;
-		if (rc > 0)
-			return 0;
+	mm_segment_t old_fs = get_fs();
+	ssize_t copied = 0;
+	char *buffer;
+	ssize_t result;
+
+	result = -ENOMEM;
+	buffer = kmalloc(BUFSZ, GFP_KERNEL);
+	if (!buffer)
+		goto out;
+
+	if (oldval && oldlen) {
+		unsigned long __user *vec = oldval;
+		size_t length = oldlen / sizeof(*vec);
+		loff_t pos = 0;
+		char *str, *end;
+		int i;
+
+		set_fs(KERNEL_DS);
+		result = vfs_read(file, buffer, BUFSZ - 1, &pos);
+		set_fs(old_fs);
+		if (result < 0)
+			goto out_kfree;
+
+		str = buffer;
+		end = str + result;
+		*end++ = '\0';
+		for (i = 0; i < length; i++) {
+			unsigned long value;
+
+			value = simple_strtoul(str, &str, 10);
+			while (isspace(*str))
+				str++;
+			
+			result = -EFAULT;
+			if (put_user(value, vec + i))
+				goto out_kfree;
+
+			copied += sizeof(*vec);
+			if (!isdigit(*str))
+				break;
+		}
 	}
 
-	/* If there is no strategy routine, or if the strategy returns
-	 * zero, proceed with automatic r/w */
-	if (table->data && table->maxlen) {
-		rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
-		if (rc < 0)
-			return rc;
+	if (newval && newlen) {
+		unsigned long __user *vec = newval;
+		size_t length = newlen / sizeof(*vec);
+		loff_t pos = 0;
+		char *str, *end;
+		int i;
+
+		str = buffer;
+		end = str + BUFSZ;
+		for (i = 0; i < length; i++) {
+			unsigned long value;
+
+			result = -EFAULT;
+			if (get_user(value, vec + i))
+				goto out_kfree;
+
+			str += snprintf(str, end - str, "%lu\t", value);
+		}
+
+		set_fs(KERNEL_DS);
+		result = vfs_write(file, buffer, str - buffer, &pos);
+		set_fs(old_fs);
+		if (result < 0)
+			goto out_kfree;
 	}
-	return 0;
+	result = copied;
+out_kfree:
+	kfree(buffer);
+out:
+	return result;
+}
+
+static unsigned hex_value(int ch)
+{
+	return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
 }
 
-static int parse_table(const int *name, int nlen,
-		       void __user *oldval, size_t __user *oldlenp,
-		       void __user *newval, size_t newlen,
-		       struct ctl_table_root *root,
-		       struct ctl_table *table)
+static ssize_t bin_uuid(struct file *file,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
 {
-	int n;
+	mm_segment_t old_fs = get_fs();
+	ssize_t result, copied = 0;
+
+	/* Only supports reads */
+	if (oldval && oldlen) {
+		loff_t pos = 0;
+		char buf[40], *str = buf;
+		unsigned char uuid[16];
+		int i;
+
+		set_fs(KERNEL_DS);
+		result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
+		set_fs(old_fs);
+		if (result < 0)
+			goto out;
+
+		buf[result] = '\0';
+
+		/* Convert the uuid to from a string to binary */
+		for (i = 0; i < 16; i++) {
+			result = -EIO;
+			if (!isxdigit(str[0]) || !isxdigit(str[1]))
+				goto out;
+
+			uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]);
+			str += 2;
+			if (*str == '-')
+				str++;
+		}
+
+		if (oldlen > 16)
+			oldlen = 16;
+
+		result = -EFAULT;
+		if (copy_to_user(oldval, uuid, oldlen))
+			goto out;
+
+		copied = oldlen;
+	}
+	result = copied;
+out:
+	return result;
+}
+
+static ssize_t bin_dn_node_address(struct file *file,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+	mm_segment_t old_fs = get_fs();
+	ssize_t result, copied = 0;
+
+	if (oldval && oldlen) {
+		loff_t pos = 0;
+		char buf[15], *nodep;
+		unsigned long area, node;
+		__le16 dnaddr;
+
+		set_fs(KERNEL_DS);
+		result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
+		set_fs(old_fs);
+		if (result < 0)
+			goto out;
+
+		buf[result] = '\0';
+
+		/* Convert the decnet addresss to binary */
+		result = -EIO;
+		nodep = strchr(buf, '.') + 1;
+		if (!nodep)
+			goto out;
+
+		area = simple_strtoul(buf, NULL, 10);
+		node = simple_strtoul(nodep, NULL, 10);
+
+		result = -EIO;
+		if ((area > 63)||(node > 1023))
+			goto out;
+
+		dnaddr = cpu_to_le16((area << 10) | node);
+
+		result = -EFAULT;
+		if (put_user(dnaddr, (__le16 __user *)oldval))
+			goto out;
+
+		copied = sizeof(dnaddr);
+	}
+
+	if (newval && newlen) {
+		loff_t pos = 0;
+		__le16 dnaddr;
+		char buf[15];
+		int len;
+
+		result = -EINVAL;
+		if (newlen != sizeof(dnaddr))
+			goto out;
+
+		result = -EFAULT;
+		if (get_user(dnaddr, (__le16 __user *)newval))
+			goto out;
+
+		len = snprintf(buf, sizeof(buf), "%hu.%hu",
+				le16_to_cpu(dnaddr) >> 10,
+				le16_to_cpu(dnaddr) & 0x3ff);
+
+		set_fs(KERNEL_DS);
+		result = vfs_write(file, buf, len, &pos);
+		set_fs(old_fs);
+		if (result < 0)
+			goto out;
+	}
+
+	result = copied;
+out:
+	return result;
+}
+
+static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
+{
+	const struct bin_table *table = &bin_root_table[0];
+	struct net *net = current->nsproxy->net_ns;
+	int ctl_name;
+
+	memcpy(path, "sys/", 4);
+	path += 4;
+
 repeat:
 	if (!nlen)
-		return -ENOTDIR;
-	n = *name;
-	for ( ; table->ctl_name || table->procname; table++) {
-		if (!table->ctl_name)
-			continue;
-		if (n == table->ctl_name) {
-			int error;
+		return ERR_PTR(-ENOTDIR);
+	ctl_name = *name;
+	name++;
+	nlen--;
+	for ( ; table->convert; table++) {
+		struct net_device *dev = NULL;
+		const char *procname = NULL;
+
+		/* Use the well known sysctl number to proc name mapping */
+		if (ctl_name == table->ctl_name)
+			procname = table->procname;
+
+		/*
+		 * For a wild card entry map from ifindex to network
+		 * device name.
+		 */
+		else if (!table->ctl_name) {
+			dev = dev_get_by_index(net, ctl_name);
+			if (dev)
+				procname = dev->name;
+		}
+		if (procname) {
+			int len;
+
+			len = strlen(procname);
+			memcpy(path, procname, len);
+			path += len;
+			if (dev)
+				dev_put(dev);
 			if (table->child) {
-				if (sysctl_perm(root, table, MAY_EXEC))
-					return -EPERM;
-				name++;
-				nlen--;
+				*path++ = '/';
 				table = table->child;
 				goto repeat;
 			}
-			error = do_sysctl_strategy(root, table,
-						   oldval, oldlenp,
-						   newval, newlen);
-			return error;
+			*path = '\0';
+			return table;
 		}
 	}
-	return -ENOTDIR;
+	return ERR_PTR(-ENOTDIR);
 }
 
-static ssize_t binary_sysctl(const int *name, int nlen,
-	void __user *oldval, size_t __user *oldlenp,
-	void __user *newval, size_t newlen)
-
+static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
 {
-	struct ctl_table_header *head;
-	ssize_t error = -ENOTDIR;
-
-	for (head = sysctl_head_next(NULL); head;
-			head = sysctl_head_next(head)) {
-		error = parse_table(name, nlen, oldval, oldlenp, 
-					newval, newlen,
-					head->root, head->ctl_table);
-		if (error != -ENOTDIR) {
-			sysctl_head_finish(head);
-			break;
+	char *tmp, *result;
+
+	result = ERR_PTR(-ENOMEM);
+	tmp = __getname();
+	if (tmp) {
+		const struct bin_table *table = get_sysctl(name, nlen, tmp);
+		result = tmp;
+		*tablep = table;
+		if (IS_ERR(table)) {
+			__putname(tmp);
+			result = ERR_CAST(table);
 		}
 	}
-	return error;
+	return result;
 }
 
+static ssize_t binary_sysctl(const int *name, int nlen,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+	const struct bin_table *table = NULL;
+	struct nameidata nd;
+	struct vfsmount *mnt;
+	struct file *file;
+	ssize_t result;
+	char *pathname;
+	int flags;
+	int acc_mode, fmode;
+
+	pathname = sysctl_getname(name, nlen, &table);
+	result = PTR_ERR(pathname);
+	if (IS_ERR(pathname))
+		goto out;
+
+	/* How should the sysctl be accessed? */
+	if (oldval && oldlen && newval && newlen) {
+		flags = O_RDWR;
+		acc_mode = MAY_READ | MAY_WRITE;
+		fmode = FMODE_READ | FMODE_WRITE;
+	} else if (newval && newlen) {
+		flags = O_WRONLY;
+		acc_mode = MAY_WRITE;
+		fmode = FMODE_WRITE;
+	} else if (oldval && oldlen) {
+		flags = O_RDONLY;
+		acc_mode = MAY_READ;
+		fmode = FMODE_READ;
+	} else {
+		result = 0;
+		goto out_putname;
+	}
+
+	mnt = current->nsproxy->pid_ns->proc_mnt;
+	result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
+	if (result)
+		goto out_putname;
+
+	result = may_open(&nd.path, acc_mode, fmode);
+	if (result)
+		goto out_putpath;
+
+	file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
+	result = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out_putname;
+
+	result = table->convert(file, oldval, oldlen, newval, newlen);
+
+	fput(file);
+out_putname:
+	putname(pathname);
+out:
+	return result;
+
+out_putpath:
+	path_put(&nd.path);
+	goto out_putname;
+}
+
+
 #else /* CONFIG_SYSCTL_SYSCALL */
 
-static ssize_t binary_sysctl(const int *ctl_name, int nlen,
-	void __user *oldval, size_t __user *oldlenp,
-	void __user *newval, size_t newlen)
+static ssize_t binary_sysctl(const int *name, int nlen,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 #endif /* CONFIG_SYSCTL_SYSCALL */
 
+
 static void deprecated_sysctl_warning(const int *name, int nlen)
 {
 	static int msg_count;
@@ -135,21 +1412,15 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
 	return;
 }
 
-static int do_sysctl(int __user *args_name, int nlen,
-	void __user *oldval, size_t __user *oldlenp,
-	void __user *newval, size_t newlen)
+static ssize_t do_sysctl(int __user *args_name, int nlen,
+	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
 {
 	int name[CTL_MAXNAME];
-	size_t oldlen = 0;
 	int i;
 
-	if (nlen <= 0 || nlen >= CTL_MAXNAME)
+	/* Check args->nlen. */
+	if (nlen < 0 || nlen > CTL_MAXNAME)
 		return -ENOTDIR;
-	if (oldval && !oldlenp)
-		return -EFAULT;
-	if (oldlenp && get_user(oldlen, oldlenp))
-		return -EFAULT;
-
 	/* Read in the sysctl name for simplicity */
 	for (i = 0; i < nlen; i++)
 		if (get_user(name[i], args_name + i))
@@ -157,26 +1428,39 @@ static int do_sysctl(int __user *args_name, int nlen,
 
 	deprecated_sysctl_warning(name, nlen);
 
-	return binary_sysctl(name, nlen, oldval, oldlenp, newval, newlen);
+	return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
 }
 
-
 SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
 {
 	struct __sysctl_args tmp;
-	int error;
+	size_t oldlen = 0;
+	ssize_t result;
 
 	if (copy_from_user(&tmp, args, sizeof(tmp)))
 		return -EFAULT;
 
-	lock_kernel();
-	error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
-			  tmp.newval, tmp.newlen);
-	unlock_kernel();
+	if (tmp.oldval && !tmp.oldlenp)
+		return -EFAULT;
+
+	if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
+		return -EFAULT;
+
+	result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
+			   tmp.newval, tmp.newlen);
+
+	if (result >= 0) {
+		oldlen = result;
+		result = 0;
+	}
 
-	return error;
+	if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
+		return -EFAULT;
+
+	return result;
 }
 
+
 #ifdef CONFIG_COMPAT
 #include <asm/compat.h>
 
@@ -194,34 +1478,31 @@ asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
 {
 	struct compat_sysctl_args tmp;
 	compat_size_t __user *compat_oldlenp;
-	size_t __user *oldlenp = NULL;
 	size_t oldlen = 0;
 	ssize_t result;
 
 	if (copy_from_user(&tmp, args, sizeof(tmp)))
 		return -EFAULT;
 
-	compat_oldlenp = compat_ptr(tmp.oldlenp);
-	if (compat_oldlenp) {
-		oldlenp = compat_alloc_user_space(sizeof(*compat_oldlenp));
+	if (tmp.oldval && !tmp.oldlenp)
+		return -EFAULT;
 
-		if (get_user(oldlen, compat_oldlenp) ||
-		    put_user(oldlen, oldlenp))
-			return -EFAULT;
-	}
+	compat_oldlenp = compat_ptr(tmp.oldlenp);
+	if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
+		return -EFAULT;
 
-	lock_kernel();
 	result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
-			   compat_ptr(tmp.oldval), oldlenp,
+			   compat_ptr(tmp.oldval), oldlen,
 			   compat_ptr(tmp.newval), tmp.newlen);
-	unlock_kernel();
 
-	if (oldlenp && !result) {
-		if (get_user(oldlen, oldlenp) ||
-		    put_user(oldlen, compat_oldlenp))
-			return -EFAULT;
+	if (result >= 0) {
+		oldlen = result;
+		result = 0;
 	}
 
+	if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
+		return -EFAULT;
+
 	return result;
 }
 
-- 
cgit v1.2.3-71-gd317


From a965cf946d38b0ff164a054477a91df70b0dd997 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 02:02:58 -0700
Subject: sysctl: Neuter the generic sysctl strategy routines.

Now that sys_sysctl is a compatibility layer on top of /proc/sys
these routines are never called but are still put in sysctl
tables so I have reduced them to stubs until they can be
removed entirely.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sysctl.c | 198 --------------------------------------------------------
 1 file changed, 198 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a642d7ffa85..f82e955875c9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2835,201 +2835,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 
 #endif /* CONFIG_PROC_FS */
 
-
-#ifdef CONFIG_SYSCTL_SYSCALL
-/*
- * General sysctl support routines 
- */
-
-/* The generic sysctl data routine (used if no strategy routine supplied) */
-int sysctl_data(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	size_t len;
-
-	/* Get out of I don't have a variable */
-	if (!table->data || !table->maxlen)
-		return -ENOTDIR;
-
-	if (oldval && oldlenp) {
-		if (get_user(len, oldlenp))
-			return -EFAULT;
-		if (len) {
-			if (len > table->maxlen)
-				len = table->maxlen;
-			if (copy_to_user(oldval, table->data, len))
-				return -EFAULT;
-			if (put_user(len, oldlenp))
-				return -EFAULT;
-		}
-	}
-
-	if (newval && newlen) {
-		if (newlen > table->maxlen)
-			newlen = table->maxlen;
-
-		if (copy_from_user(table->data, newval, newlen))
-			return -EFAULT;
-	}
-	return 1;
-}
-
-/* The generic string strategy routine: */
-int sysctl_string(struct ctl_table *table,
-		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen)
-{
-	if (!table->data || !table->maxlen) 
-		return -ENOTDIR;
-	
-	if (oldval && oldlenp) {
-		size_t bufsize;
-		if (get_user(bufsize, oldlenp))
-			return -EFAULT;
-		if (bufsize) {
-			size_t len = strlen(table->data), copied;
-
-			/* This shouldn't trigger for a well-formed sysctl */
-			if (len > table->maxlen)
-				len = table->maxlen;
-
-			/* Copy up to a max of bufsize-1 bytes of the string */
-			copied = (len >= bufsize) ? bufsize - 1 : len;
-
-			if (copy_to_user(oldval, table->data, copied) ||
-			    put_user(0, (char __user *)(oldval + copied)))
-				return -EFAULT;
-			if (put_user(len, oldlenp))
-				return -EFAULT;
-		}
-	}
-	if (newval && newlen) {
-		size_t len = newlen;
-		if (len > table->maxlen)
-			len = table->maxlen;
-		if(copy_from_user(table->data, newval, len))
-			return -EFAULT;
-		if (len == table->maxlen)
-			len--;
-		((char *) table->data)[len] = 0;
-	}
-	return 1;
-}
-
-/*
- * This function makes sure that all of the integers in the vector
- * are between the minimum and maximum values given in the arrays
- * table->extra1 and table->extra2, respectively.
- */
-int sysctl_intvec(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-
-	if (newval && newlen) {
-		int __user *vec = (int __user *) newval;
-		int *min = (int *) table->extra1;
-		int *max = (int *) table->extra2;
-		size_t length;
-		int i;
-
-		if (newlen % sizeof(int) != 0)
-			return -EINVAL;
-
-		if (!table->extra1 && !table->extra2)
-			return 0;
-
-		if (newlen > table->maxlen)
-			newlen = table->maxlen;
-		length = newlen / sizeof(int);
-
-		for (i = 0; i < length; i++) {
-			int value;
-			if (get_user(value, vec + i))
-				return -EFAULT;
-			if (min && value < min[i])
-				return -EINVAL;
-			if (max && value > max[i])
-				return -EINVAL;
-		}
-	}
-	return 0;
-}
-
-/* Strategy function to convert jiffies to seconds */ 
-int sysctl_jiffies(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	if (oldval && oldlenp) {
-		size_t olen;
-
-		if (get_user(olen, oldlenp))
-			return -EFAULT;
-		if (olen) {
-			int val;
-
-			if (olen < sizeof(int))
-				return -EINVAL;
-
-			val = *(int *)(table->data) / HZ;
-			if (put_user(val, (int __user *)oldval))
-				return -EFAULT;
-			if (put_user(sizeof(int), oldlenp))
-				return -EFAULT;
-		}
-	}
-	if (newval && newlen) { 
-		int new;
-		if (newlen != sizeof(int))
-			return -EINVAL; 
-		if (get_user(new, (int __user *)newval))
-			return -EFAULT;
-		*(int *)(table->data) = new*HZ; 
-	}
-	return 1;
-}
-
-/* Strategy function to convert jiffies to seconds */ 
-int sysctl_ms_jiffies(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	if (oldval && oldlenp) {
-		size_t olen;
-
-		if (get_user(olen, oldlenp))
-			return -EFAULT;
-		if (olen) {
-			int val;
-
-			if (olen < sizeof(int))
-				return -EINVAL;
-
-			val = jiffies_to_msecs(*(int *)(table->data));
-			if (put_user(val, (int __user *)oldval))
-				return -EFAULT;
-			if (put_user(sizeof(int), oldlenp))
-				return -EFAULT;
-		}
-	}
-	if (newval && newlen) { 
-		int new;
-		if (newlen != sizeof(int))
-			return -EINVAL; 
-		if (get_user(new, (int __user *)newval))
-			return -EFAULT;
-		*(int *)(table->data) = msecs_to_jiffies(new);
-	}
-	return 1;
-}
-
-
-
-#else /* CONFIG_SYSCTL_SYSCALL */
-
-
 int sysctl_data(struct ctl_table *table,
 		  void __user *oldval, size_t __user *oldlenp,
 		  void __user *newval, size_t newlen)
@@ -3065,9 +2870,6 @@ int sysctl_ms_jiffies(struct ctl_table *table,
 	return -ENOSYS;
 }
 
-#endif /* CONFIG_SYSCTL_SYSCALL */
-
-
 /*
  * No sense putting this after each symbol definition, twice,
  * exception granted :-)
-- 
cgit v1.2.3-71-gd317


From 83ac201b4f06eb8aeb7ac93cf162651ba30e0b28 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 02:22:26 -0700
Subject: sysctl: Remove dead code from sysctl_check

Now that the sys_sysctl is now a compatibility wrapper around
/proc/sys we can remove much of sysctl_check and reduce it
to a few remaining sanity checks.  This completely decouples
it from the binary sysctl system call.

Little things like ensuring that the sysctl has not already
been registered are all that remain.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sysctl_check.c | 1376 +------------------------------------------------
 lib/Kconfig.debug     |    2 +-
 2 files changed, 4 insertions(+), 1374 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b6e7aaea4604..04cdcf72c827 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -5,1239 +5,6 @@
 #include <linux/string.h>
 #include <net/ip_vs.h>
 
-struct trans_ctl_table {
-	int			ctl_name;
-	const char		*procname;
-	const struct trans_ctl_table *child;
-};
-
-static const struct trans_ctl_table trans_random_table[] = {
-	{ RANDOM_POOLSIZE,	"poolsize" },
-	{ RANDOM_ENTROPY_COUNT,	"entropy_avail" },
-	{ RANDOM_READ_THRESH,	"read_wakeup_threshold" },
-	{ RANDOM_WRITE_THRESH,	"write_wakeup_threshold" },
-	{ RANDOM_BOOT_ID,	"boot_id" },
-	{ RANDOM_UUID,		"uuid" },
-	{}
-};
-
-static const struct trans_ctl_table trans_pty_table[] = {
-	{ PTY_MAX,		"max" },
-	{ PTY_NR,		"nr" },
-	{}
-};
-
-static const struct trans_ctl_table trans_kern_table[] = {
-	{ KERN_OSTYPE,			"ostype" },
-	{ KERN_OSRELEASE,		"osrelease" },
-	/* KERN_OSREV not used */
-	{ KERN_VERSION,			"version" },
-	/* KERN_SECUREMASK not used */
-	/* KERN_PROF not used */
-	{ KERN_NODENAME,		"hostname" },
-	{ KERN_DOMAINNAME,		"domainname" },
-
-	{ KERN_PANIC,			"panic" },
-	{ KERN_REALROOTDEV,		"real-root-dev" },
-
-	{ KERN_SPARC_REBOOT,		"reboot-cmd" },
-	{ KERN_CTLALTDEL,		"ctrl-alt-del" },
-	{ KERN_PRINTK,			"printk" },
-
-	/* KERN_NAMETRANS not used */
-	/* KERN_PPC_HTABRECLAIM not used */
-	/* KERN_PPC_ZEROPAGED not used */
-	{ KERN_PPC_POWERSAVE_NAP,	"powersave-nap" },
-
-	{ KERN_MODPROBE,		"modprobe" },
-	{ KERN_SG_BIG_BUFF,		"sg-big-buff" },
-	{ KERN_ACCT,			"acct" },
-	{ KERN_PPC_L2CR,		"l2cr" },
-
-	/* KERN_RTSIGNR not used */
-	/* KERN_RTSIGMAX not used */
-
-	{ KERN_SHMMAX,			"shmmax" },
-	{ KERN_MSGMAX,			"msgmax" },
-	{ KERN_MSGMNB,			"msgmnb" },
-	/* KERN_MSGPOOL not used*/
-	{ KERN_SYSRQ,			"sysrq" },
-	{ KERN_MAX_THREADS,		"threads-max" },
-	{ KERN_RANDOM,			"random",	trans_random_table },
-	{ KERN_SHMALL,			"shmall" },
-	{ KERN_MSGMNI,			"msgmni" },
-	{ KERN_SEM,			"sem" },
-	{ KERN_SPARC_STOP_A,		"stop-a" },
-	{ KERN_SHMMNI,			"shmmni" },
-
-	{ KERN_OVERFLOWUID,		"overflowuid" },
-	{ KERN_OVERFLOWGID,		"overflowgid" },
-
-	{ KERN_HOTPLUG,			"hotplug", },
-	{ KERN_IEEE_EMULATION_WARNINGS,	"ieee_emulation_warnings" },
-
-	{ KERN_S390_USER_DEBUG_LOGGING,	"userprocess_debug" },
-	{ KERN_CORE_USES_PID,		"core_uses_pid" },
-	{ KERN_TAINTED,			"tainted" },
-	{ KERN_CADPID,			"cad_pid" },
-	{ KERN_PIDMAX,			"pid_max" },
-	{ KERN_CORE_PATTERN,		"core_pattern" },
-	{ KERN_PANIC_ON_OOPS,		"panic_on_oops" },
-	{ KERN_HPPA_PWRSW,		"soft-power" },
-	{ KERN_HPPA_UNALIGNED,		"unaligned-trap" },
-
-	{ KERN_PRINTK_RATELIMIT,	"printk_ratelimit" },
-	{ KERN_PRINTK_RATELIMIT_BURST,	"printk_ratelimit_burst" },
-
-	{ KERN_PTY,			"pty",		trans_pty_table },
-	{ KERN_NGROUPS_MAX,		"ngroups_max" },
-	{ KERN_SPARC_SCONS_PWROFF,	"scons-poweroff" },
-	{ KERN_HZ_TIMER,		"hz_timer" },
-	{ KERN_UNKNOWN_NMI_PANIC,	"unknown_nmi_panic" },
-	{ KERN_BOOTLOADER_TYPE,		"bootloader_type" },
-	{ KERN_RANDOMIZE,		"randomize_va_space" },
-
-	{ KERN_SPIN_RETRY,		"spin_retry" },
-	{ KERN_ACPI_VIDEO_FLAGS,	"acpi_video_flags" },
-	{ KERN_IA64_UNALIGNED,		"ignore-unaligned-usertrap" },
-	{ KERN_COMPAT_LOG,		"compat-log" },
-	{ KERN_MAX_LOCK_DEPTH,		"max_lock_depth" },
-	{ KERN_NMI_WATCHDOG,		"nmi_watchdog" },
-	{ KERN_PANIC_ON_NMI,		"panic_on_unrecovered_nmi" },
-	{}
-};
-
-static const struct trans_ctl_table trans_vm_table[] = {
-	{ VM_OVERCOMMIT_MEMORY,		"overcommit_memory" },
-	{ VM_PAGE_CLUSTER,		"page-cluster" },
-	{ VM_DIRTY_BACKGROUND,		"dirty_background_ratio" },
-	{ VM_DIRTY_RATIO,		"dirty_ratio" },
-	{ VM_DIRTY_WB_CS,		"dirty_writeback_centisecs" },
-	{ VM_DIRTY_EXPIRE_CS,		"dirty_expire_centisecs" },
-	{ VM_NR_PDFLUSH_THREADS,	"nr_pdflush_threads" },
-	{ VM_OVERCOMMIT_RATIO,		"overcommit_ratio" },
-	/* VM_PAGEBUF unused */
-	{ VM_HUGETLB_PAGES,		"nr_hugepages" },
-	{ VM_SWAPPINESS,		"swappiness" },
-	{ VM_LOWMEM_RESERVE_RATIO,	"lowmem_reserve_ratio" },
-	{ VM_MIN_FREE_KBYTES,		"min_free_kbytes" },
-	{ VM_MAX_MAP_COUNT,		"max_map_count" },
-	{ VM_LAPTOP_MODE,		"laptop_mode" },
-	{ VM_BLOCK_DUMP,		"block_dump" },
-	{ VM_HUGETLB_GROUP,		"hugetlb_shm_group" },
-	{ VM_VFS_CACHE_PRESSURE,	"vfs_cache_pressure" },
-	{ VM_LEGACY_VA_LAYOUT,		"legacy_va_layout" },
-	/* VM_SWAP_TOKEN_TIMEOUT unused */
-	{ VM_DROP_PAGECACHE,		"drop_caches" },
-	{ VM_PERCPU_PAGELIST_FRACTION,	"percpu_pagelist_fraction" },
-	{ VM_ZONE_RECLAIM_MODE,		"zone_reclaim_mode" },
-	{ VM_MIN_UNMAPPED,		"min_unmapped_ratio" },
-	{ VM_PANIC_ON_OOM,		"panic_on_oom" },
-	{ VM_VDSO_ENABLED,		"vdso_enabled" },
-	{ VM_MIN_SLAB,			"min_slab_ratio" },
-
-	{}
-};
-
-static const struct trans_ctl_table trans_net_core_table[] = {
-	{ NET_CORE_WMEM_MAX,		"wmem_max" },
-	{ NET_CORE_RMEM_MAX,		"rmem_max" },
-	{ NET_CORE_WMEM_DEFAULT,	"wmem_default" },
-	{ NET_CORE_RMEM_DEFAULT,	"rmem_default" },
-	/* NET_CORE_DESTROY_DELAY unused */
-	{ NET_CORE_MAX_BACKLOG,		"netdev_max_backlog" },
-	/* NET_CORE_FASTROUTE unused */
-	{ NET_CORE_MSG_COST,		"message_cost" },
-	{ NET_CORE_MSG_BURST,		"message_burst" },
-	{ NET_CORE_OPTMEM_MAX,		"optmem_max" },
-	/* NET_CORE_HOT_LIST_LENGTH unused */
-	/* NET_CORE_DIVERT_VERSION unused */
-	/* NET_CORE_NO_CONG_THRESH unused */
-	/* NET_CORE_NO_CONG unused */
-	/* NET_CORE_LO_CONG unused */
-	/* NET_CORE_MOD_CONG unused */
-	{ NET_CORE_DEV_WEIGHT,		"dev_weight" },
-	{ NET_CORE_SOMAXCONN,		"somaxconn" },
-	{ NET_CORE_BUDGET,		"netdev_budget" },
-	{ NET_CORE_AEVENT_ETIME,	"xfrm_aevent_etime" },
-	{ NET_CORE_AEVENT_RSEQTH,	"xfrm_aevent_rseqth" },
-	{ NET_CORE_WARNINGS,		"warnings" },
-	{},
-};
-
-static const struct trans_ctl_table trans_net_unix_table[] = {
-	/* NET_UNIX_DESTROY_DELAY unused */
-	/* NET_UNIX_DELETE_DELAY unused */
-	{ NET_UNIX_MAX_DGRAM_QLEN,	"max_dgram_qlen" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
-	{ NET_IPV4_ROUTE_FLUSH,			"flush" },
-	{ NET_IPV4_ROUTE_MIN_DELAY,		"min_delay" },
-	{ NET_IPV4_ROUTE_MAX_DELAY,		"max_delay" },
-	{ NET_IPV4_ROUTE_GC_THRESH,		"gc_thresh" },
-	{ NET_IPV4_ROUTE_MAX_SIZE,		"max_size" },
-	{ NET_IPV4_ROUTE_GC_MIN_INTERVAL,	"gc_min_interval" },
-	{ NET_IPV4_ROUTE_GC_TIMEOUT,		"gc_timeout" },
-	{ NET_IPV4_ROUTE_GC_INTERVAL,		"gc_interval" },
-	{ NET_IPV4_ROUTE_REDIRECT_LOAD,		"redirect_load" },
-	{ NET_IPV4_ROUTE_REDIRECT_NUMBER,	"redirect_number" },
-	{ NET_IPV4_ROUTE_REDIRECT_SILENCE,	"redirect_silence" },
-	{ NET_IPV4_ROUTE_ERROR_COST,		"error_cost" },
-	{ NET_IPV4_ROUTE_ERROR_BURST,		"error_burst" },
-	{ NET_IPV4_ROUTE_GC_ELASTICITY,		"gc_elasticity" },
-	{ NET_IPV4_ROUTE_MTU_EXPIRES,		"mtu_expires" },
-	{ NET_IPV4_ROUTE_MIN_PMTU,		"min_pmtu" },
-	{ NET_IPV4_ROUTE_MIN_ADVMSS,		"min_adv_mss" },
-	{ NET_IPV4_ROUTE_SECRET_INTERVAL,	"secret_interval" },
-	{ NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,	"gc_min_interval_ms" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
-	{ NET_IPV4_CONF_FORWARDING,		"forwarding" },
-	{ NET_IPV4_CONF_MC_FORWARDING,		"mc_forwarding" },
-
-	{ NET_IPV4_CONF_PROXY_ARP,		"proxy_arp" },
-	{ NET_IPV4_CONF_ACCEPT_REDIRECTS,	"accept_redirects" },
-	{ NET_IPV4_CONF_SECURE_REDIRECTS,	"secure_redirects" },
-	{ NET_IPV4_CONF_SEND_REDIRECTS,		"send_redirects" },
-	{ NET_IPV4_CONF_SHARED_MEDIA,		"shared_media" },
-	{ NET_IPV4_CONF_RP_FILTER,		"rp_filter" },
-	{ NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,	"accept_source_route" },
-	{ NET_IPV4_CONF_BOOTP_RELAY,		"bootp_relay" },
-	{ NET_IPV4_CONF_LOG_MARTIANS,		"log_martians" },
-	{ NET_IPV4_CONF_TAG,			"tag" },
-	{ NET_IPV4_CONF_ARPFILTER,		"arp_filter" },
-	{ NET_IPV4_CONF_MEDIUM_ID,		"medium_id" },
-	{ NET_IPV4_CONF_NOXFRM,			"disable_xfrm" },
-	{ NET_IPV4_CONF_NOPOLICY,		"disable_policy" },
-	{ NET_IPV4_CONF_FORCE_IGMP_VERSION,	"force_igmp_version" },
-
-	{ NET_IPV4_CONF_ARP_ANNOUNCE,		"arp_announce" },
-	{ NET_IPV4_CONF_ARP_IGNORE,		"arp_ignore" },
-	{ NET_IPV4_CONF_PROMOTE_SECONDARIES,	"promote_secondaries" },
-	{ NET_IPV4_CONF_ARP_ACCEPT,		"arp_accept" },
-	{ NET_IPV4_CONF_ARP_NOTIFY,		"arp_notify" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
-	{ NET_PROTO_CONF_ALL,		"all",		trans_net_ipv4_conf_vars_table },
-	{ NET_PROTO_CONF_DEFAULT,	"default",	trans_net_ipv4_conf_vars_table },
-	{ 0, NULL, trans_net_ipv4_conf_vars_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
-	{ NET_NEIGH_MCAST_SOLICIT,	"mcast_solicit" },
-	{ NET_NEIGH_UCAST_SOLICIT,	"ucast_solicit" },
-	{ NET_NEIGH_APP_SOLICIT,	"app_solicit" },
-	{ NET_NEIGH_RETRANS_TIME,	"retrans_time" },
-	{ NET_NEIGH_REACHABLE_TIME,	"base_reachable_time" },
-	{ NET_NEIGH_DELAY_PROBE_TIME,	"delay_first_probe_time" },
-	{ NET_NEIGH_GC_STALE_TIME,	"gc_stale_time" },
-	{ NET_NEIGH_UNRES_QLEN,		"unres_qlen" },
-	{ NET_NEIGH_PROXY_QLEN,		"proxy_qlen" },
-	{ NET_NEIGH_ANYCAST_DELAY,	"anycast_delay" },
-	{ NET_NEIGH_PROXY_DELAY,	"proxy_delay" },
-	{ NET_NEIGH_LOCKTIME,		"locktime" },
-	{ NET_NEIGH_GC_INTERVAL,	"gc_interval" },
-	{ NET_NEIGH_GC_THRESH1,		"gc_thresh1" },
-	{ NET_NEIGH_GC_THRESH2,		"gc_thresh2" },
-	{ NET_NEIGH_GC_THRESH3,		"gc_thresh3" },
-	{ NET_NEIGH_RETRANS_TIME_MS,	"retrans_time_ms" },
-	{ NET_NEIGH_REACHABLE_TIME_MS,	"base_reachable_time_ms" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_neigh_table[] = {
-	{ NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
-	{ 0, NULL, trans_net_neigh_vars_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
-	{ NET_IPV4_NF_CONNTRACK_MAX,				"ip_conntrack_max" },
-
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,		"ip_conntrack_tcp_timeout_syn_sent" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,		"ip_conntrack_tcp_timeout_syn_recv" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,	"ip_conntrack_tcp_timeout_established" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,		"ip_conntrack_tcp_timeout_fin_wait" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,		"ip_conntrack_tcp_timeout_close_wait" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,		"ip_conntrack_tcp_timeout_last_ack" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,		"ip_conntrack_tcp_timeout_time_wait" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,		"ip_conntrack_tcp_timeout_close" },
-
-	{ NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT,			"ip_conntrack_udp_timeout" },
-	{ NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM,		"ip_conntrack_udp_timeout_stream" },
-	{ NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,			"ip_conntrack_icmp_timeout" },
-	{ NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT,		"ip_conntrack_generic_timeout" },
-
-	{ NET_IPV4_NF_CONNTRACK_BUCKETS,			"ip_conntrack_buckets" },
-	{ NET_IPV4_NF_CONNTRACK_LOG_INVALID,			"ip_conntrack_log_invalid" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,	"ip_conntrack_tcp_timeout_max_retrans" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_LOOSE,			"ip_conntrack_tcp_loose" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,			"ip_conntrack_tcp_be_liberal" },
-	{ NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,		"ip_conntrack_tcp_max_retrans" },
-
-	{ NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,		"ip_conntrack_sctp_timeout_closed" },
-	{ NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,	"ip_conntrack_sctp_timeout_cookie_wait" },
-	{ NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,	"ip_conntrack_sctp_timeout_cookie_echoed" },
-	{ NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,	"ip_conntrack_sctp_timeout_established" },
-	{ NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,	"ip_conntrack_sctp_timeout_shutdown_sent" },
-	{ NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,	"ip_conntrack_sctp_timeout_shutdown_recd" },
-	{ NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,	"ip_conntrack_sctp_timeout_shutdown_ack_sent" },
-
-	{ NET_IPV4_NF_CONNTRACK_COUNT,		"ip_conntrack_count" },
-	{ NET_IPV4_NF_CONNTRACK_CHECKSUM,	"ip_conntrack_checksum" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv4_table[] = {
-	{ NET_IPV4_FORWARD,			"ip_forward" },
-	{ NET_IPV4_DYNADDR,			"ip_dynaddr" },
-
-	{ NET_IPV4_CONF,		"conf",		trans_net_ipv4_conf_table },
-	{ NET_IPV4_NEIGH,		"neigh",	trans_net_neigh_table },
-	{ NET_IPV4_ROUTE,		"route",	trans_net_ipv4_route_table },
-	/* NET_IPV4_FIB_HASH unused */
-	{ NET_IPV4_NETFILTER,		"netfilter",	trans_net_ipv4_netfilter_table },
-
-	{ NET_IPV4_TCP_TIMESTAMPS,		"tcp_timestamps" },
-	{ NET_IPV4_TCP_WINDOW_SCALING,		"tcp_window_scaling" },
-	{ NET_IPV4_TCP_SACK,			"tcp_sack" },
-	{ NET_IPV4_TCP_RETRANS_COLLAPSE,	"tcp_retrans_collapse" },
-	{ NET_IPV4_DEFAULT_TTL,			"ip_default_ttl" },
-	/* NET_IPV4_AUTOCONFIG unused */
-	{ NET_IPV4_NO_PMTU_DISC,		"ip_no_pmtu_disc" },
-	{ NET_IPV4_TCP_SYN_RETRIES,		"tcp_syn_retries" },
-	{ NET_IPV4_IPFRAG_HIGH_THRESH,		"ipfrag_high_thresh" },
-	{ NET_IPV4_IPFRAG_LOW_THRESH,		"ipfrag_low_thresh" },
-	{ NET_IPV4_IPFRAG_TIME,			"ipfrag_time" },
-	/* NET_IPV4_TCP_MAX_KA_PROBES unused */
-	{ NET_IPV4_TCP_KEEPALIVE_TIME,		"tcp_keepalive_time" },
-	{ NET_IPV4_TCP_KEEPALIVE_PROBES,	"tcp_keepalive_probes" },
-	{ NET_IPV4_TCP_RETRIES1,		"tcp_retries1" },
-	{ NET_IPV4_TCP_RETRIES2,		"tcp_retries2" },
-	{ NET_IPV4_TCP_FIN_TIMEOUT,		"tcp_fin_timeout" },
-	/* NET_IPV4_IP_MASQ_DEBUG unused */
-	{ NET_TCP_SYNCOOKIES,			"tcp_syncookies" },
-	{ NET_TCP_STDURG,			"tcp_stdurg" },
-	{ NET_TCP_RFC1337,			"tcp_rfc1337" },
-	/* NET_TCP_SYN_TAILDROP unused */
-	{ NET_TCP_MAX_SYN_BACKLOG,		"tcp_max_syn_backlog" },
-	{ NET_IPV4_LOCAL_PORT_RANGE,		"ip_local_port_range" },
-	{ NET_IPV4_ICMP_ECHO_IGNORE_ALL,	"icmp_echo_ignore_all" },
-	{ NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,	"icmp_echo_ignore_broadcasts" },
-	/* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
-	/* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
-	/* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
-	/* NET_IPV4_ICMP_PARAMPROB_RATE unused */
-	/* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
-	{ NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,	"icmp_ignore_bogus_error_responses" },
-	{ NET_IPV4_IGMP_MAX_MEMBERSHIPS,	"igmp_max_memberships" },
-	{ NET_TCP_TW_RECYCLE,			"tcp_tw_recycle" },
-	/* NET_IPV4_ALWAYS_DEFRAG unused */
-	{ NET_IPV4_TCP_KEEPALIVE_INTVL,		"tcp_keepalive_intvl" },
-	{ NET_IPV4_INET_PEER_THRESHOLD,		"inet_peer_threshold" },
-	{ NET_IPV4_INET_PEER_MINTTL,		"inet_peer_minttl" },
-	{ NET_IPV4_INET_PEER_MAXTTL,		"inet_peer_maxttl" },
-	{ NET_IPV4_INET_PEER_GC_MINTIME,	"inet_peer_gc_mintime" },
-	{ NET_IPV4_INET_PEER_GC_MAXTIME,	"inet_peer_gc_maxtime" },
-	{ NET_TCP_ORPHAN_RETRIES,		"tcp_orphan_retries" },
-	{ NET_TCP_ABORT_ON_OVERFLOW,		"tcp_abort_on_overflow" },
-	{ NET_TCP_SYNACK_RETRIES,		"tcp_synack_retries" },
-	{ NET_TCP_MAX_ORPHANS,			"tcp_max_orphans" },
-	{ NET_TCP_MAX_TW_BUCKETS,		"tcp_max_tw_buckets" },
-	{ NET_TCP_FACK,				"tcp_fack" },
-	{ NET_TCP_REORDERING,			"tcp_reordering" },
-	{ NET_TCP_ECN,				"tcp_ecn" },
-	{ NET_TCP_DSACK,			"tcp_dsack" },
-	{ NET_TCP_MEM,				"tcp_mem" },
-	{ NET_TCP_WMEM,				"tcp_wmem" },
-	{ NET_TCP_RMEM,				"tcp_rmem" },
-	{ NET_TCP_APP_WIN,			"tcp_app_win" },
-	{ NET_TCP_ADV_WIN_SCALE,		"tcp_adv_win_scale" },
-	{ NET_IPV4_NONLOCAL_BIND,		"ip_nonlocal_bind" },
-	{ NET_IPV4_ICMP_RATELIMIT,		"icmp_ratelimit" },
-	{ NET_IPV4_ICMP_RATEMASK,		"icmp_ratemask" },
-	{ NET_TCP_TW_REUSE,			"tcp_tw_reuse" },
-	{ NET_TCP_FRTO,				"tcp_frto" },
-	{ NET_TCP_LOW_LATENCY,			"tcp_low_latency" },
-	{ NET_IPV4_IPFRAG_SECRET_INTERVAL,	"ipfrag_secret_interval" },
-	{ NET_IPV4_IGMP_MAX_MSF,		"igmp_max_msf" },
-	{ NET_TCP_NO_METRICS_SAVE,		"tcp_no_metrics_save" },
-	/* NET_TCP_DEFAULT_WIN_SCALE unused */
-	{ NET_TCP_MODERATE_RCVBUF,		"tcp_moderate_rcvbuf" },
-	{ NET_TCP_TSO_WIN_DIVISOR,		"tcp_tso_win_divisor" },
-	/* NET_TCP_BIC_BETA unused */
-	{ NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,	"icmp_errors_use_inbound_ifaddr" },
-	{ NET_TCP_CONG_CONTROL,			"tcp_congestion_control" },
-	{ NET_TCP_ABC,				"tcp_abc" },
-	{ NET_IPV4_IPFRAG_MAX_DIST,		"ipfrag_max_dist" },
-	{ NET_TCP_MTU_PROBING,			"tcp_mtu_probing" },
-	{ NET_TCP_BASE_MSS,			"tcp_base_mss" },
-	{ NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,	"tcp_workaround_signed_windows" },
-	{ NET_TCP_DMA_COPYBREAK,		"tcp_dma_copybreak" },
-	{ NET_TCP_SLOW_START_AFTER_IDLE,	"tcp_slow_start_after_idle" },
-	{ NET_CIPSOV4_CACHE_ENABLE,		"cipso_cache_enable" },
-	{ NET_CIPSOV4_CACHE_BUCKET_SIZE,	"cipso_cache_bucket_size" },
-	{ NET_CIPSOV4_RBM_OPTFMT,		"cipso_rbm_optfmt" },
-	{ NET_CIPSOV4_RBM_STRICTVALID,		"cipso_rbm_strictvalid" },
-	{ NET_TCP_AVAIL_CONG_CONTROL,		"tcp_available_congestion_control" },
-	{ NET_TCP_ALLOWED_CONG_CONTROL,		"tcp_allowed_congestion_control" },
-	{ NET_TCP_MAX_SSTHRESH,			"tcp_max_ssthresh" },
-	{ NET_TCP_FRTO_RESPONSE,		"tcp_frto_response" },
-	{ 2088 /* NET_IPQ_QMAX */,		"ip_queue_maxlen" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipx_table[] = {
-	{ NET_IPX_PPROP_BROADCASTING,	"ipx_pprop_broadcasting" },
-	/* NET_IPX_FORWARDING unused */
-	{}
-};
-
-static const struct trans_ctl_table trans_net_atalk_table[] = {
-	{ NET_ATALK_AARP_EXPIRY_TIME,		"aarp-expiry-time" },
-	{ NET_ATALK_AARP_TICK_TIME,		"aarp-tick-time" },
-	{ NET_ATALK_AARP_RETRANSMIT_LIMIT,	"aarp-retransmit-limit" },
-	{ NET_ATALK_AARP_RESOLVE_TIME,		"aarp-resolve-time" },
-	{},
-};
-
-static const struct trans_ctl_table trans_net_netrom_table[] = {
-	{ NET_NETROM_DEFAULT_PATH_QUALITY,		"default_path_quality" },
-	{ NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER,	"obsolescence_count_initialiser" },
-	{ NET_NETROM_NETWORK_TTL_INITIALISER,		"network_ttl_initialiser" },
-	{ NET_NETROM_TRANSPORT_TIMEOUT,			"transport_timeout" },
-	{ NET_NETROM_TRANSPORT_MAXIMUM_TRIES,		"transport_maximum_tries" },
-	{ NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY,	"transport_acknowledge_delay" },
-	{ NET_NETROM_TRANSPORT_BUSY_DELAY,		"transport_busy_delay" },
-	{ NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE,	"transport_requested_window_size" },
-	{ NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT,	"transport_no_activity_timeout" },
-	{ NET_NETROM_ROUTING_CONTROL,			"routing_control" },
-	{ NET_NETROM_LINK_FAILS_COUNT,			"link_fails_count" },
-	{ NET_NETROM_RESET,				"reset" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ax25_param_table[] = {
-	{ NET_AX25_IP_DEFAULT_MODE,	"ip_default_mode" },
-	{ NET_AX25_DEFAULT_MODE,	"ax25_default_mode" },
-	{ NET_AX25_BACKOFF_TYPE,	"backoff_type" },
-	{ NET_AX25_CONNECT_MODE,	"connect_mode" },
-	{ NET_AX25_STANDARD_WINDOW,	"standard_window_size" },
-	{ NET_AX25_EXTENDED_WINDOW,	"extended_window_size" },
-	{ NET_AX25_T1_TIMEOUT,		"t1_timeout" },
-	{ NET_AX25_T2_TIMEOUT,		"t2_timeout" },
-	{ NET_AX25_T3_TIMEOUT,		"t3_timeout" },
-	{ NET_AX25_IDLE_TIMEOUT,	"idle_timeout" },
-	{ NET_AX25_N2,			"maximum_retry_count" },
-	{ NET_AX25_PACLEN,		"maximum_packet_length" },
-	{ NET_AX25_PROTOCOL,		"protocol" },
-	{ NET_AX25_DAMA_SLAVE_TIMEOUT,	"dama_slave_timeout" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ax25_table[] = {
-	{ 0, NULL, trans_net_ax25_param_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_bridge_table[] = {
-	{ NET_BRIDGE_NF_CALL_ARPTABLES,		"bridge-nf-call-arptables" },
-	{ NET_BRIDGE_NF_CALL_IPTABLES,		"bridge-nf-call-iptables" },
-	{ NET_BRIDGE_NF_CALL_IP6TABLES,		"bridge-nf-call-ip6tables" },
-	{ NET_BRIDGE_NF_FILTER_VLAN_TAGGED,	"bridge-nf-filter-vlan-tagged" },
-	{ NET_BRIDGE_NF_FILTER_PPPOE_TAGGED,	"bridge-nf-filter-pppoe-tagged" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_rose_table[] = {
-	{ NET_ROSE_RESTART_REQUEST_TIMEOUT,	"restart_request_timeout" },
-	{ NET_ROSE_CALL_REQUEST_TIMEOUT,	"call_request_timeout" },
-	{ NET_ROSE_RESET_REQUEST_TIMEOUT,	"reset_request_timeout" },
-	{ NET_ROSE_CLEAR_REQUEST_TIMEOUT,	"clear_request_timeout" },
-	{ NET_ROSE_ACK_HOLD_BACK_TIMEOUT,	"acknowledge_hold_back_timeout" },
-	{ NET_ROSE_ROUTING_CONTROL,		"routing_control" },
-	{ NET_ROSE_LINK_FAIL_TIMEOUT,		"link_fail_timeout" },
-	{ NET_ROSE_MAX_VCS,			"maximum_virtual_circuits" },
-	{ NET_ROSE_WINDOW_SIZE,			"window_size" },
-	{ NET_ROSE_NO_ACTIVITY_TIMEOUT,		"no_activity_timeout" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
-	{ NET_IPV6_FORWARDING,			"forwarding" },
-	{ NET_IPV6_HOP_LIMIT,			"hop_limit" },
-	{ NET_IPV6_MTU,				"mtu" },
-	{ NET_IPV6_ACCEPT_RA,			"accept_ra" },
-	{ NET_IPV6_ACCEPT_REDIRECTS,		"accept_redirects" },
-	{ NET_IPV6_AUTOCONF,			"autoconf" },
-	{ NET_IPV6_DAD_TRANSMITS,		"dad_transmits" },
-	{ NET_IPV6_RTR_SOLICITS,		"router_solicitations" },
-	{ NET_IPV6_RTR_SOLICIT_INTERVAL,	"router_solicitation_interval" },
-	{ NET_IPV6_RTR_SOLICIT_DELAY,		"router_solicitation_delay" },
-	{ NET_IPV6_USE_TEMPADDR,		"use_tempaddr" },
-	{ NET_IPV6_TEMP_VALID_LFT,		"temp_valid_lft" },
-	{ NET_IPV6_TEMP_PREFERED_LFT,		"temp_prefered_lft" },
-	{ NET_IPV6_REGEN_MAX_RETRY,		"regen_max_retry" },
-	{ NET_IPV6_MAX_DESYNC_FACTOR,		"max_desync_factor" },
-	{ NET_IPV6_MAX_ADDRESSES,		"max_addresses" },
-	{ NET_IPV6_FORCE_MLD_VERSION,		"force_mld_version" },
-	{ NET_IPV6_ACCEPT_RA_DEFRTR,		"accept_ra_defrtr" },
-	{ NET_IPV6_ACCEPT_RA_PINFO,		"accept_ra_pinfo" },
-	{ NET_IPV6_ACCEPT_RA_RTR_PREF,		"accept_ra_rtr_pref" },
-	{ NET_IPV6_RTR_PROBE_INTERVAL,		"router_probe_interval" },
-	{ NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN,	"accept_ra_rt_info_max_plen" },
-	{ NET_IPV6_PROXY_NDP,			"proxy_ndp" },
-	{ NET_IPV6_ACCEPT_SOURCE_ROUTE,		"accept_source_route" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
-	{ NET_PROTO_CONF_ALL,		"all",	trans_net_ipv6_conf_var_table },
-	{ NET_PROTO_CONF_DEFAULT, 	"default", trans_net_ipv6_conf_var_table },
-	{ 0, NULL, trans_net_ipv6_conf_var_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
-	{ NET_IPV6_ROUTE_FLUSH,			"flush" },
-	{ NET_IPV6_ROUTE_GC_THRESH,		"gc_thresh" },
-	{ NET_IPV6_ROUTE_MAX_SIZE,		"max_size" },
-	{ NET_IPV6_ROUTE_GC_MIN_INTERVAL,	"gc_min_interval" },
-	{ NET_IPV6_ROUTE_GC_TIMEOUT,		"gc_timeout" },
-	{ NET_IPV6_ROUTE_GC_INTERVAL,		"gc_interval" },
-	{ NET_IPV6_ROUTE_GC_ELASTICITY,		"gc_elasticity" },
-	{ NET_IPV6_ROUTE_MTU_EXPIRES,		"mtu_expires" },
-	{ NET_IPV6_ROUTE_MIN_ADVMSS,		"min_adv_mss" },
-	{ NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,	"gc_min_interval_ms" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
-	{ NET_IPV6_ICMP_RATELIMIT,	"ratelimit" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_ipv6_table[] = {
-	{ NET_IPV6_CONF,		"conf",		trans_net_ipv6_conf_table },
-	{ NET_IPV6_NEIGH,		"neigh",	trans_net_neigh_table },
-	{ NET_IPV6_ROUTE,		"route",	trans_net_ipv6_route_table },
-	{ NET_IPV6_ICMP,		"icmp",		trans_net_ipv6_icmp_table },
-	{ NET_IPV6_BINDV6ONLY,		"bindv6only" },
-	{ NET_IPV6_IP6FRAG_HIGH_THRESH,	"ip6frag_high_thresh" },
-	{ NET_IPV6_IP6FRAG_LOW_THRESH,	"ip6frag_low_thresh" },
-	{ NET_IPV6_IP6FRAG_TIME,	"ip6frag_time" },
-	{ NET_IPV6_IP6FRAG_SECRET_INTERVAL,	"ip6frag_secret_interval" },
-	{ NET_IPV6_MLD_MAX_MSF,		"mld_max_msf" },
-	{ 2088 /* IPQ_QMAX */,		"ip6_queue_maxlen" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_x25_table[] = {
-	{ NET_X25_RESTART_REQUEST_TIMEOUT,	"restart_request_timeout" },
-	{ NET_X25_CALL_REQUEST_TIMEOUT,		"call_request_timeout" },
-	{ NET_X25_RESET_REQUEST_TIMEOUT,	"reset_request_timeout" },
-	{ NET_X25_CLEAR_REQUEST_TIMEOUT,	"clear_request_timeout" },
-	{ NET_X25_ACK_HOLD_BACK_TIMEOUT,	"acknowledgement_hold_back_timeout" },
-	{ NET_X25_FORWARD,			"x25_forward" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_tr_table[] = {
-	{ NET_TR_RIF_TIMEOUT,	"rif_timeout" },
-	{}
-};
-
-
-static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
-	{ NET_DECNET_CONF_DEV_FORWARDING,	"forwarding" },
-	{ NET_DECNET_CONF_DEV_PRIORITY,		"priority" },
-	{ NET_DECNET_CONF_DEV_T2,		"t2" },
-	{ NET_DECNET_CONF_DEV_T3,		"t3" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_decnet_conf[] = {
-	{ 0, NULL, trans_net_decnet_conf_vars },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_decnet_table[] = {
-	{ NET_DECNET_CONF,		"conf",	trans_net_decnet_conf },
-	{ NET_DECNET_NODE_ADDRESS,	"node_address" },
-	{ NET_DECNET_NODE_NAME,		"node_name" },
-	{ NET_DECNET_DEFAULT_DEVICE,	"default_device" },
-	{ NET_DECNET_TIME_WAIT,		"time_wait" },
-	{ NET_DECNET_DN_COUNT,		"dn_count" },
-	{ NET_DECNET_DI_COUNT,		"di_count" },
-	{ NET_DECNET_DR_COUNT,		"dr_count" },
-	{ NET_DECNET_DST_GC_INTERVAL,	"dst_gc_interval" },
-	{ NET_DECNET_NO_FC_MAX_CWND,	"no_fc_max_cwnd" },
-	{ NET_DECNET_MEM,		"decnet_mem" },
-	{ NET_DECNET_RMEM,		"decnet_rmem" },
-	{ NET_DECNET_WMEM,		"decnet_wmem" },
-	{ NET_DECNET_DEBUG_LEVEL,	"debug" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_sctp_table[] = {
-	{ NET_SCTP_RTO_INITIAL,		"rto_initial" },
-	{ NET_SCTP_RTO_MIN,		"rto_min" },
-	{ NET_SCTP_RTO_MAX,		"rto_max" },
-	{ NET_SCTP_RTO_ALPHA,		"rto_alpha_exp_divisor" },
-	{ NET_SCTP_RTO_BETA,		"rto_beta_exp_divisor" },
-	{ NET_SCTP_VALID_COOKIE_LIFE,	"valid_cookie_life" },
-	{ NET_SCTP_ASSOCIATION_MAX_RETRANS,	"association_max_retrans" },
-	{ NET_SCTP_PATH_MAX_RETRANS,	"path_max_retrans" },
-	{ NET_SCTP_MAX_INIT_RETRANSMITS,	"max_init_retransmits" },
-	{ NET_SCTP_HB_INTERVAL,		"hb_interval" },
-	{ NET_SCTP_PRESERVE_ENABLE,	"cookie_preserve_enable" },
-	{ NET_SCTP_MAX_BURST,		"max_burst" },
-	{ NET_SCTP_ADDIP_ENABLE,	"addip_enable" },
-	{ NET_SCTP_PRSCTP_ENABLE,	"prsctp_enable" },
-	{ NET_SCTP_SNDBUF_POLICY,	"sndbuf_policy" },
-	{ NET_SCTP_SACK_TIMEOUT,	"sack_timeout" },
-	{ NET_SCTP_RCVBUF_POLICY,	"rcvbuf_policy" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
-	{ NET_LLC2_ACK_TIMEOUT,		"ack" },
-	{ NET_LLC2_P_TIMEOUT,		"p" },
-	{ NET_LLC2_REJ_TIMEOUT,		"rej" },
-	{ NET_LLC2_BUSY_TIMEOUT,	"busy" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_llc_station_table[] = {
-	{ NET_LLC_STATION_ACK_TIMEOUT,	"ack_timeout" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
-	{ NET_LLC2,		"timeout",	trans_net_llc_llc2_timeout_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_llc_table[] = {
-	{ NET_LLC2,		"llc2",		trans_net_llc_llc2_table },
-	{ NET_LLC_STATION,	"station",	trans_net_llc_station_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_netfilter_table[] = {
-	{ NET_NF_CONNTRACK_MAX,				"nf_conntrack_max" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,	"nf_conntrack_tcp_timeout_syn_sent" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,	"nf_conntrack_tcp_timeout_syn_recv" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,	"nf_conntrack_tcp_timeout_established" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,	"nf_conntrack_tcp_timeout_fin_wait" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,	"nf_conntrack_tcp_timeout_close_wait" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,	"nf_conntrack_tcp_timeout_last_ack" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,	"nf_conntrack_tcp_timeout_time_wait" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,		"nf_conntrack_tcp_timeout_close" },
-	{ NET_NF_CONNTRACK_UDP_TIMEOUT,			"nf_conntrack_udp_timeout" },
-	{ NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM,		"nf_conntrack_udp_timeout_stream" },
-	{ NET_NF_CONNTRACK_ICMP_TIMEOUT,	"nf_conntrack_icmp_timeout" },
-	{ NET_NF_CONNTRACK_GENERIC_TIMEOUT,		"nf_conntrack_generic_timeout" },
-	{ NET_NF_CONNTRACK_BUCKETS,			"nf_conntrack_buckets" },
-	{ NET_NF_CONNTRACK_LOG_INVALID,			"nf_conntrack_log_invalid" },
-	{ NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,	"nf_conntrack_tcp_timeout_max_retrans" },
-	{ NET_NF_CONNTRACK_TCP_LOOSE,			"nf_conntrack_tcp_loose" },
-	{ NET_NF_CONNTRACK_TCP_BE_LIBERAL,		"nf_conntrack_tcp_be_liberal" },
-	{ NET_NF_CONNTRACK_TCP_MAX_RETRANS,		"nf_conntrack_tcp_max_retrans" },
-	{ NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,		"nf_conntrack_sctp_timeout_closed" },
-	{ NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,	"nf_conntrack_sctp_timeout_cookie_wait" },
-	{ NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,	"nf_conntrack_sctp_timeout_cookie_echoed" },
-	{ NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,	"nf_conntrack_sctp_timeout_established" },
-	{ NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,	"nf_conntrack_sctp_timeout_shutdown_sent" },
-	{ NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,	"nf_conntrack_sctp_timeout_shutdown_recd" },
-	{ NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,	"nf_conntrack_sctp_timeout_shutdown_ack_sent" },
-	{ NET_NF_CONNTRACK_COUNT,			"nf_conntrack_count" },
-	{ NET_NF_CONNTRACK_ICMPV6_TIMEOUT,	"nf_conntrack_icmpv6_timeout" },
-	{ NET_NF_CONNTRACK_FRAG6_TIMEOUT,		"nf_conntrack_frag6_timeout" },
-	{ NET_NF_CONNTRACK_FRAG6_LOW_THRESH,		"nf_conntrack_frag6_low_thresh" },
-	{ NET_NF_CONNTRACK_FRAG6_HIGH_THRESH,		"nf_conntrack_frag6_high_thresh" },
-	{ NET_NF_CONNTRACK_CHECKSUM,			"nf_conntrack_checksum" },
-
-	{}
-};
-
-static const struct trans_ctl_table trans_net_dccp_table[] = {
-	{ NET_DCCP_DEFAULT,	"default" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_irda_table[] = {
-	{ NET_IRDA_DISCOVERY,		"discovery" },
-	{ NET_IRDA_DEVNAME,		"devname" },
-	{ NET_IRDA_DEBUG,		"debug" },
-	{ NET_IRDA_FAST_POLL,		"fast_poll_increase" },
-	{ NET_IRDA_DISCOVERY_SLOTS,	"discovery_slots" },
-	{ NET_IRDA_DISCOVERY_TIMEOUT,	"discovery_timeout" },
-	{ NET_IRDA_SLOT_TIMEOUT,	"slot_timeout" },
-	{ NET_IRDA_MAX_BAUD_RATE,	"max_baud_rate" },
-	{ NET_IRDA_MIN_TX_TURN_TIME,	"min_tx_turn_time" },
-	{ NET_IRDA_MAX_TX_DATA_SIZE,	"max_tx_data_size" },
-	{ NET_IRDA_MAX_TX_WINDOW,	"max_tx_window" },
-	{ NET_IRDA_MAX_NOREPLY_TIME,	"max_noreply_time" },
-	{ NET_IRDA_WARN_NOREPLY_TIME,	"warn_noreply_time" },
-	{ NET_IRDA_LAP_KEEPALIVE_TIME,	"lap_keepalive_time" },
-	{}
-};
-
-static const struct trans_ctl_table trans_net_table[] = {
-	{ NET_CORE,		"core",		trans_net_core_table },
-	/* NET_ETHER not used */
-	/* NET_802 not used */
-	{ NET_UNIX,		"unix",		trans_net_unix_table },
-	{ NET_IPV4,		"ipv4",		trans_net_ipv4_table },
-	{ NET_IPX,		"ipx",		trans_net_ipx_table },
-	{ NET_ATALK,		"appletalk",	trans_net_atalk_table },
-	{ NET_NETROM,		"netrom",	trans_net_netrom_table },
-	{ NET_AX25,		"ax25",		trans_net_ax25_table },
-	{ NET_BRIDGE,		"bridge",	trans_net_bridge_table },
-	{ NET_ROSE,		"rose",		trans_net_rose_table },
-	{ NET_IPV6,		"ipv6",		trans_net_ipv6_table },
-	{ NET_X25,		"x25",		trans_net_x25_table },
-	{ NET_TR,		"token-ring",	trans_net_tr_table },
-	{ NET_DECNET,		"decnet",	trans_net_decnet_table },
-	/*  NET_ECONET not used */
-	{ NET_SCTP,		"sctp",		trans_net_sctp_table },
-	{ NET_LLC,		"llc",		trans_net_llc_table },
-	{ NET_NETFILTER,	"netfilter",	trans_net_netfilter_table },
-	{ NET_DCCP,		"dccp",		trans_net_dccp_table },
-	{ NET_IRDA,		"irda",		trans_net_irda_table },
-	{ 2089,			"nf_conntrack_max" },
-	{}
-};
-
-static const struct trans_ctl_table trans_fs_quota_table[] = {
-	{ FS_DQ_LOOKUPS,	"lookups" },
-	{ FS_DQ_DROPS,		"drops" },
-	{ FS_DQ_READS,		"reads" },
-	{ FS_DQ_WRITES,		"writes" },
-	{ FS_DQ_CACHE_HITS,	"cache_hits" },
-	{ FS_DQ_ALLOCATED,	"allocated_dquots" },
-	{ FS_DQ_FREE,		"free_dquots" },
-	{ FS_DQ_SYNCS,		"syncs" },
-	{ FS_DQ_WARNINGS,	"warnings" },
-	{}
-};
-
-static const struct trans_ctl_table trans_fs_xfs_table[] = {
-	{ XFS_SGID_INHERIT,	"irix_sgid_inherit" },
-	{ XFS_SYMLINK_MODE,	"irix_symlink_mode" },
-	{ XFS_PANIC_MASK,	"panic_mask" },
-
-	{ XFS_ERRLEVEL,		"error_level" },
-	{ XFS_SYNCD_TIMER,	"xfssyncd_centisecs" },
-	{ XFS_INHERIT_SYNC,	"inherit_sync" },
-	{ XFS_INHERIT_NODUMP,	"inherit_nodump" },
-	{ XFS_INHERIT_NOATIME,	"inherit_noatime" },
-	{ XFS_BUF_TIMER,	"xfsbufd_centisecs" },
-	{ XFS_BUF_AGE,		"age_buffer_centisecs" },
-	{ XFS_INHERIT_NOSYM,	"inherit_nosymlinks" },
-	{ XFS_ROTORSTEP,	"rotorstep" },
-	{ XFS_INHERIT_NODFRG,	"inherit_nodefrag" },
-	{ XFS_FILESTREAM_TIMER,	"filestream_centisecs" },
-	{ XFS_STATS_CLEAR,	"stats_clear" },
-	{}
-};
-
-static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
-	{ 1, "hb_ctl_path" },
-	{}
-};
-
-static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
-	{ 1,	"nm",	trans_fs_ocfs2_nm_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_inotify_table[] = {
-	{ INOTIFY_MAX_USER_INSTANCES,	"max_user_instances" },
-	{ INOTIFY_MAX_USER_WATCHES,	"max_user_watches" },
-	{ INOTIFY_MAX_QUEUED_EVENTS,	"max_queued_events" },
-	{}
-};
-
-static const struct trans_ctl_table trans_fs_table[] = {
-	{ FS_NRINODE,		"inode-nr" },
-	{ FS_STATINODE,		"inode-state" },
-	/* FS_MAXINODE unused */
-	/* FS_NRDQUOT unused */
-	/* FS_MAXDQUOT unused */
-	{ FS_NRFILE,		"file-nr" },
-	{ FS_MAXFILE,		"file-max" },
-	{ FS_DENTRY,		"dentry-state" },
-	/* FS_NRSUPER unused */
-	/* FS_MAXUPSER unused */
-	{ FS_OVERFLOWUID,	"overflowuid" },
-	{ FS_OVERFLOWGID,	"overflowgid" },
-	{ FS_LEASES,		"leases-enable" },
-	{ FS_DIR_NOTIFY,	"dir-notify-enable" },
-	{ FS_LEASE_TIME,	"lease-break-time" },
-	{ FS_DQSTATS,		"quota",		trans_fs_quota_table },
-	{ FS_XFS,		"xfs",			trans_fs_xfs_table },
-	{ FS_AIO_NR,		"aio-nr" },
-	{ FS_AIO_MAX_NR,	"aio-max-nr" },
-	{ FS_INOTIFY,		"inotify",		trans_inotify_table },
-	{ FS_OCFS2,		"ocfs2",		trans_fs_ocfs2_table },
-	{ KERN_SETUID_DUMPABLE,	"suid_dumpable" },
-	{}
-};
-
-static const struct trans_ctl_table trans_debug_table[] = {
-	{}
-};
-
-static const struct trans_ctl_table trans_cdrom_table[] = {
-	{ DEV_CDROM_INFO,		"info" },
-	{ DEV_CDROM_AUTOCLOSE,		"autoclose" },
-	{ DEV_CDROM_AUTOEJECT,		"autoeject" },
-	{ DEV_CDROM_DEBUG,		"debug" },
-	{ DEV_CDROM_LOCK,		"lock" },
-	{ DEV_CDROM_CHECK_MEDIA,	"check_media" },
-	{}
-};
-
-static const struct trans_ctl_table trans_ipmi_table[] = {
-	{ DEV_IPMI_POWEROFF_POWERCYCLE,	"poweroff_powercycle" },
-	{}
-};
-
-static const struct trans_ctl_table trans_mac_hid_files[] = {
-	/* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
-	/* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
-	{ DEV_MAC_HID_MOUSE_BUTTON_EMULATION,	"mouse_button_emulation" },
-	{ DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE,	"mouse_button2_keycode" },
-	{ DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE,	"mouse_button3_keycode" },
-	/* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
-	{}
-};
-
-static const struct trans_ctl_table trans_raid_table[] = {
-	{ DEV_RAID_SPEED_LIMIT_MIN,	"speed_limit_min" },
-	{ DEV_RAID_SPEED_LIMIT_MAX,	"speed_limit_max" },
-	{}
-};
-
-static const struct trans_ctl_table trans_scsi_table[] = {
-	{ DEV_SCSI_LOGGING_LEVEL, "logging_level" },
-	{}
-};
-
-static const struct trans_ctl_table trans_parport_default_table[] = {
-	{ DEV_PARPORT_DEFAULT_TIMESLICE,	"timeslice" },
-	{ DEV_PARPORT_DEFAULT_SPINTIME,		"spintime" },
-	{}
-};
-
-static const struct trans_ctl_table trans_parport_device_table[] = {
-	{ DEV_PARPORT_DEVICE_TIMESLICE,		"timeslice" },
-	{}
-};
-
-static const struct trans_ctl_table trans_parport_devices_table[] = {
-	{ DEV_PARPORT_DEVICES_ACTIVE,		"active" },
-	{ 0, NULL, trans_parport_device_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_parport_parport_table[] = {
-	{ DEV_PARPORT_SPINTIME,		"spintime" },
-	{ DEV_PARPORT_BASE_ADDR,	"base-addr" },
-	{ DEV_PARPORT_IRQ,		"irq" },
-	{ DEV_PARPORT_DMA,		"dma" },
-	{ DEV_PARPORT_MODES,		"modes" },
-	{ DEV_PARPORT_DEVICES,		"devices",	trans_parport_devices_table },
-	{ DEV_PARPORT_AUTOPROBE,	"autoprobe" },
-	{ DEV_PARPORT_AUTOPROBE + 1,	"autoprobe0" },
-	{ DEV_PARPORT_AUTOPROBE + 2,	"autoprobe1" },
-	{ DEV_PARPORT_AUTOPROBE + 3,	"autoprobe2" },
-	{ DEV_PARPORT_AUTOPROBE + 4,	"autoprobe3" },
-	{}
-};
-static const struct trans_ctl_table trans_parport_table[] = {
-	{ DEV_PARPORT_DEFAULT,	"default",	trans_parport_default_table },
-	{ 0, NULL, trans_parport_parport_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_dev_table[] = {
-	{ DEV_CDROM,	"cdrom",	trans_cdrom_table },
-	/* DEV_HWMON unused */
-	{ DEV_PARPORT,	"parport",	trans_parport_table },
-	{ DEV_RAID,	"raid",		trans_raid_table },
-	{ DEV_MAC_HID,	"mac_hid",	trans_mac_hid_files },
-	{ DEV_SCSI,	"scsi",		trans_scsi_table },
-	{ DEV_IPMI,	"ipmi",		trans_ipmi_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_bus_isa_table[] = {
-	{ BUS_ISA_MEM_BASE,	"membase" },
-	{ BUS_ISA_PORT_BASE,	"portbase" },
-	{ BUS_ISA_PORT_SHIFT,	"portshift" },
-	{}
-};
-
-static const struct trans_ctl_table trans_bus_table[] = {
-	{ CTL_BUS_ISA,	"isa",	trans_bus_isa_table },
-	{}
-};
-
-static const struct trans_ctl_table trans_arlan_conf_table0[] = {
-	{ 1,	"spreadingCode" },
-	{ 2,	"channelNumber" },
-	{ 3,	"scramblingDisable" },
-	{ 4,	"txAttenuation" },
-	{ 5,	"systemId" },
-	{ 6,	"maxDatagramSize" },
-	{ 7,	"maxFrameSize" },
-	{ 8,	"maxRetries" },
-	{ 9,	"receiveMode" },
-	{ 10,	"priority" },
-	{ 11,	"rootOrRepeater" },
-	{ 12,	"SID" },
-	{ 13,	"registrationMode" },
-	{ 14,	"registrationFill" },
-	{ 15,	"localTalkAddress" },
-	{ 16,	"codeFormat" },
-	{ 17,	"numChannels" },
-	{ 18,	"channel1" },
-	{ 19,	"channel2" },
-	{ 20,	"channel3" },
-	{ 21,	"channel4" },
-	{ 22,	"txClear" },
-	{ 23,	"txRetries" },
-	{ 24,	"txRouting" },
-	{ 25,	"txScrambled" },
-	{ 26,	"rxParameter" },
-	{ 27,	"txTimeoutMs" },
-	{ 28,	"waitCardTimeout" },
-	{ 29,	"channelSet" },
-	{ 30,	"name" },
-	{ 31,	"waitTime" },
-	{ 32,	"lParameter" },
-	{ 33,	"_15" },
-	{ 34,	"headerSize" },
-	{ 36,	"tx_delay_ms" },
-	{ 37,	"retries" },
-	{ 38,	"ReTransmitPacketMaxSize" },
-	{ 39,	"waitReTransmitPacketMaxSize" },
-	{ 40,	"fastReTransCount" },
-	{ 41,	"driverRetransmissions" },
-	{ 42,	"txAckTimeoutMs" },
-	{ 43,	"registrationInterrupts" },
-	{ 44,	"hardwareType" },
-	{ 45,	"radioType" },
-	{ 46,	"writeEEPROM" },
-	{ 47,	"writeRadioType" },
-	{ 48,	"entry_exit_debug" },
-	{ 49,	"debug" },
-	{ 50,	"in_speed" },
-	{ 51,	"out_speed" },
-	{ 52,	"in_speed10" },
-	{ 53,	"out_speed10" },
-	{ 54,	"in_speed_max" },
-	{ 55,	"out_speed_max" },
-	{ 56,	"measure_rate" },
-	{ 57,	"pre_Command_Wait" },
-	{ 58,	"rx_tweak1" },
-	{ 59,	"rx_tweak2" },
-	{ 60,	"tx_queue_len" },
-
-	{ 150,	"arlan0-txRing" },
-	{ 151,	"arlan0-rxRing" },
-	{ 152,	"arlan0-18" },
-	{ 153,	"arlan0-ring" },
-	{ 154,	"arlan0-shm-cpy" },
-	{ 155,	"config0" },
-	{ 156,	"reset0" },
-	{}
-};
-
-static const struct trans_ctl_table trans_arlan_conf_table1[] = {
-	{ 1,	"spreadingCode" },
-	{ 2,	"channelNumber" },
-	{ 3,	"scramblingDisable" },
-	{ 4,	"txAttenuation" },
-	{ 5,	"systemId" },
-	{ 6,	"maxDatagramSize" },
-	{ 7,	"maxFrameSize" },
-	{ 8,	"maxRetries" },
-	{ 9,	"receiveMode" },
-	{ 10,	"priority" },
-	{ 11,	"rootOrRepeater" },
-	{ 12,	"SID" },
-	{ 13,	"registrationMode" },
-	{ 14,	"registrationFill" },
-	{ 15,	"localTalkAddress" },
-	{ 16,	"codeFormat" },
-	{ 17,	"numChannels" },
-	{ 18,	"channel1" },
-	{ 19,	"channel2" },
-	{ 20,	"channel3" },
-	{ 21,	"channel4" },
-	{ 22,	"txClear" },
-	{ 23,	"txRetries" },
-	{ 24,	"txRouting" },
-	{ 25,	"txScrambled" },
-	{ 26,	"rxParameter" },
-	{ 27,	"txTimeoutMs" },
-	{ 28,	"waitCardTimeout" },
-	{ 29,	"channelSet" },
-	{ 30,	"name" },
-	{ 31,	"waitTime" },
-	{ 32,	"lParameter" },
-	{ 33,	"_15" },
-	{ 34,	"headerSize" },
-	{ 36,	"tx_delay_ms" },
-	{ 37,	"retries" },
-	{ 38,	"ReTransmitPacketMaxSize" },
-	{ 39,	"waitReTransmitPacketMaxSize" },
-	{ 40,	"fastReTransCount" },
-	{ 41,	"driverRetransmissions" },
-	{ 42,	"txAckTimeoutMs" },
-	{ 43,	"registrationInterrupts" },
-	{ 44,	"hardwareType" },
-	{ 45,	"radioType" },
-	{ 46,	"writeEEPROM" },
-	{ 47,	"writeRadioType" },
-	{ 48,	"entry_exit_debug" },
-	{ 49,	"debug" },
-	{ 50,	"in_speed" },
-	{ 51,	"out_speed" },
-	{ 52,	"in_speed10" },
-	{ 53,	"out_speed10" },
-	{ 54,	"in_speed_max" },
-	{ 55,	"out_speed_max" },
-	{ 56,	"measure_rate" },
-	{ 57,	"pre_Command_Wait" },
-	{ 58,	"rx_tweak1" },
-	{ 59,	"rx_tweak2" },
-	{ 60,	"tx_queue_len" },
-
-	{ 150,	"arlan1-txRing" },
-	{ 151,	"arlan1-rxRing" },
-	{ 152,	"arlan1-18" },
-	{ 153,	"arlan1-ring" },
-	{ 154,	"arlan1-shm-cpy" },
-	{ 155,	"config1" },
-	{ 156,	"reset1" },
-	{}
-};
-
-static const struct trans_ctl_table trans_arlan_conf_table2[] = {
-	{ 1,	"spreadingCode" },
-	{ 2,	"channelNumber" },
-	{ 3,	"scramblingDisable" },
-	{ 4,	"txAttenuation" },
-	{ 5,	"systemId" },
-	{ 6,	"maxDatagramSize" },
-	{ 7,	"maxFrameSize" },
-	{ 8,	"maxRetries" },
-	{ 9,	"receiveMode" },
-	{ 10,	"priority" },
-	{ 11,	"rootOrRepeater" },
-	{ 12,	"SID" },
-	{ 13,	"registrationMode" },
-	{ 14,	"registrationFill" },
-	{ 15,	"localTalkAddress" },
-	{ 16,	"codeFormat" },
-	{ 17,	"numChannels" },
-	{ 18,	"channel1" },
-	{ 19,	"channel2" },
-	{ 20,	"channel3" },
-	{ 21,	"channel4" },
-	{ 22,	"txClear" },
-	{ 23,	"txRetries" },
-	{ 24,	"txRouting" },
-	{ 25,	"txScrambled" },
-	{ 26,	"rxParameter" },
-	{ 27,	"txTimeoutMs" },
-	{ 28,	"waitCardTimeout" },
-	{ 29,	"channelSet" },
-	{ 30,	"name" },
-	{ 31,	"waitTime" },
-	{ 32,	"lParameter" },
-	{ 33,	"_15" },
-	{ 34,	"headerSize" },
-	{ 36,	"tx_delay_ms" },
-	{ 37,	"retries" },
-	{ 38,	"ReTransmitPacketMaxSize" },
-	{ 39,	"waitReTransmitPacketMaxSize" },
-	{ 40,	"fastReTransCount" },
-	{ 41,	"driverRetransmissions" },
-	{ 42,	"txAckTimeoutMs" },
-	{ 43,	"registrationInterrupts" },
-	{ 44,	"hardwareType" },
-	{ 45,	"radioType" },
-	{ 46,	"writeEEPROM" },
-	{ 47,	"writeRadioType" },
-	{ 48,	"entry_exit_debug" },
-	{ 49,	"debug" },
-	{ 50,	"in_speed" },
-	{ 51,	"out_speed" },
-	{ 52,	"in_speed10" },
-	{ 53,	"out_speed10" },
-	{ 54,	"in_speed_max" },
-	{ 55,	"out_speed_max" },
-	{ 56,	"measure_rate" },
-	{ 57,	"pre_Command_Wait" },
-	{ 58,	"rx_tweak1" },
-	{ 59,	"rx_tweak2" },
-	{ 60,	"tx_queue_len" },
-
-	{ 150,	"arlan2-txRing" },
-	{ 151,	"arlan2-rxRing" },
-	{ 152,	"arlan2-18" },
-	{ 153,	"arlan2-ring" },
-	{ 154,	"arlan2-shm-cpy" },
-	{ 155,	"config2" },
-	{ 156,	"reset2" },
-	{}
-};
-
-static const struct trans_ctl_table trans_arlan_conf_table3[] = {
-	{ 1,	"spreadingCode" },
-	{ 2,	"channelNumber" },
-	{ 3,	"scramblingDisable" },
-	{ 4,	"txAttenuation" },
-	{ 5,	"systemId" },
-	{ 6,	"maxDatagramSize" },
-	{ 7,	"maxFrameSize" },
-	{ 8,	"maxRetries" },
-	{ 9,	"receiveMode" },
-	{ 10,	"priority" },
-	{ 11,	"rootOrRepeater" },
-	{ 12,	"SID" },
-	{ 13,	"registrationMode" },
-	{ 14,	"registrationFill" },
-	{ 15,	"localTalkAddress" },
-	{ 16,	"codeFormat" },
-	{ 17,	"numChannels" },
-	{ 18,	"channel1" },
-	{ 19,	"channel2" },
-	{ 20,	"channel3" },
-	{ 21,	"channel4" },
-	{ 22,	"txClear" },
-	{ 23,	"txRetries" },
-	{ 24,	"txRouting" },
-	{ 25,	"txScrambled" },
-	{ 26,	"rxParameter" },
-	{ 27,	"txTimeoutMs" },
-	{ 28,	"waitCardTimeout" },
-	{ 29,	"channelSet" },
-	{ 30,	"name" },
-	{ 31,	"waitTime" },
-	{ 32,	"lParameter" },
-	{ 33,	"_15" },
-	{ 34,	"headerSize" },
-	{ 36,	"tx_delay_ms" },
-	{ 37,	"retries" },
-	{ 38,	"ReTransmitPacketMaxSize" },
-	{ 39,	"waitReTransmitPacketMaxSize" },
-	{ 40,	"fastReTransCount" },
-	{ 41,	"driverRetransmissions" },
-	{ 42,	"txAckTimeoutMs" },
-	{ 43,	"registrationInterrupts" },
-	{ 44,	"hardwareType" },
-	{ 45,	"radioType" },
-	{ 46,	"writeEEPROM" },
-	{ 47,	"writeRadioType" },
-	{ 48,	"entry_exit_debug" },
-	{ 49,	"debug" },
-	{ 50,	"in_speed" },
-	{ 51,	"out_speed" },
-	{ 52,	"in_speed10" },
-	{ 53,	"out_speed10" },
-	{ 54,	"in_speed_max" },
-	{ 55,	"out_speed_max" },
-	{ 56,	"measure_rate" },
-	{ 57,	"pre_Command_Wait" },
-	{ 58,	"rx_tweak1" },
-	{ 59,	"rx_tweak2" },
-	{ 60,	"tx_queue_len" },
-
-	{ 150,	"arlan3-txRing" },
-	{ 151,	"arlan3-rxRing" },
-	{ 152,	"arlan3-18" },
-	{ 153,	"arlan3-ring" },
-	{ 154,	"arlan3-shm-cpy" },
-	{ 155,	"config3" },
-	{ 156,	"reset3" },
-	{}
-};
-
-static const struct trans_ctl_table trans_arlan_table[] = {
-	{ 1,		"arlan0",	trans_arlan_conf_table0 },
-	{ 2,		"arlan1",	trans_arlan_conf_table1 },
-	{ 3,		"arlan2",	trans_arlan_conf_table2 },
-	{ 4,		"arlan3",	trans_arlan_conf_table3 },
-	{}
-};
-
-static const struct trans_ctl_table trans_s390dbf_table[] = {
-	{ 5678 /* CTL_S390DBF_STOPPABLE */,	"debug_stoppable" },
-	{ 5679 /* CTL_S390DBF_ACTIVE */,	"debug_active" },
-	{}
-};
-
-static const struct trans_ctl_table trans_sunrpc_table[] = {
-	{ CTL_RPCDEBUG,		"rpc_debug" },
-	{ CTL_NFSDEBUG,		"nfs_debug" },
-	{ CTL_NFSDDEBUG,	"nfsd_debug" },
-	{ CTL_NLMDEBUG,		"nlm_debug" },
-	{ CTL_SLOTTABLE_UDP,	"udp_slot_table_entries" },
-	{ CTL_SLOTTABLE_TCP,	"tcp_slot_table_entries" },
-	{ CTL_MIN_RESVPORT,	"min_resvport" },
-	{ CTL_MAX_RESVPORT,	"max_resvport" },
-	{}
-};
-
-static const struct trans_ctl_table trans_pm_table[] = {
-	{ 1 /* CTL_PM_SUSPEND */,	"suspend" },
-	{ 2 /* CTL_PM_CMODE */,		"cmode" },
-	{ 3 /* CTL_PM_P0 */,		"p0" },
-	{ 4 /* CTL_PM_CM */,		"cm" },
-	{}
-};
-
-static const struct trans_ctl_table trans_frv_table[] = {
-	{ 1,	"cache-mode" },
-	{ 2,	"pin-cxnr" },
-	{}
-};
-
-static const struct trans_ctl_table trans_root_table[] = {
-	{ CTL_KERN,	"kernel",	trans_kern_table },
-	{ CTL_VM,	"vm",		trans_vm_table },
-	{ CTL_NET,	"net",		trans_net_table },
-	/* CTL_PROC not used */
-	{ CTL_FS,	"fs",		trans_fs_table },
-	{ CTL_DEBUG,	"debug",	trans_debug_table },
-	{ CTL_DEV,	"dev",		trans_dev_table },
-	{ CTL_BUS,	"bus",		trans_bus_table },
-	{ CTL_ABI,	"abi" },
-	/* CTL_CPU not used */
-	{ CTL_ARLAN,	"arlan",	trans_arlan_table },
-	{ CTL_S390DBF,	"s390dbf",	trans_s390dbf_table },
-	{ CTL_SUNRPC,	"sunrpc",	trans_sunrpc_table },
-	{ CTL_PM,	"pm",		trans_pm_table },
-	{ CTL_FRV,	"frv",		trans_frv_table },
-	{}
-};
-
-
-
 
 static int sysctl_depth(struct ctl_table *table)
 {
@@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
 	return table;
 }
 
-static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
-{
-	struct ctl_table *test;
-	const struct trans_ctl_table *ref;
-	int cur_depth;
-
-	cur_depth = sysctl_depth(table);
-
-	ref = trans_root_table;
-repeat:
-	test = sysctl_parent(table, cur_depth);
-	for (; ref->ctl_name || ref->procname || ref->child; ref++) {
-		int match = 0;
-
-		if (cur_depth && !ref->child)
-			continue;
-
-		if (test->procname && ref->procname &&
-			(strcmp(test->procname, ref->procname) == 0))
-			match++;
-
-		if (test->ctl_name && ref->ctl_name &&
-			(test->ctl_name == ref->ctl_name))
-			match++;
-
-		if (!ref->ctl_name && !ref->procname)
-			match++;
-
-		if (match) {
-			if (cur_depth != 0) {
-				cur_depth--;
-				ref = ref->child;
-				goto repeat;
-			}
-			goto out;
-		}
-	}
-	ref = NULL;
-out:
-	return ref;
-}
 
 static void sysctl_print_path(struct ctl_table *table)
 {
@@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table)
 		}
 	}
 	printk(" ");
-	if (table->ctl_name) {
-		for (i = depth; i >= 0; i--) {
-			tmp = sysctl_parent(table, i);
-			printk(".%d", tmp->ctl_name);
-		}
-	}
-}
-
-static void sysctl_repair_table(struct ctl_table *table)
-{
-	/* Don't complain about the classic default
-	 * sysctl strategy routine.  Maybe later we
-	 * can get the tables fixed and complain about
-	 * this.
-	 */
-	if (table->ctl_name && table->procname &&
-		(table->proc_handler == proc_dointvec) &&
-		(!table->strategy)) {
-		table->strategy = sysctl_data;
-	}
 }
 
 static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
@@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
 		ref = head->ctl_table;
 repeat:
 		test = sysctl_parent(table, cur_depth);
-		for (; ref->ctl_name || ref->procname; ref++) {
+		for (; ref->procname; ref++) {
 			int match = 0;
 			if (cur_depth && !ref->child)
 				continue;
@@ -1361,10 +67,6 @@ repeat:
 			    (strcmp(test->procname, ref->procname) == 0))
 					match++;
 
-			if (test->ctl_name && ref->ctl_name &&
-			    (test->ctl_name == ref->ctl_name))
-				match++;
-
 			if (match) {
 				if (cur_depth != 0) {
 					cur_depth--;
@@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
 	*fail = str;
 }
 
-static int sysctl_check_dir(struct nsproxy *namespaces,
-				struct ctl_table *table)
-{
-	struct ctl_table *ref;
-	int error;
-
-	error = 0;
-	ref = sysctl_check_lookup(namespaces, table);
-	if (ref) {
-		int match = 0;
-		if ((!table->procname && !ref->procname) ||
-		    (table->procname && ref->procname &&
-		     (strcmp(table->procname, ref->procname) == 0)))
-			match++;
-
-		if ((!table->ctl_name && !ref->ctl_name) ||
-		    (table->ctl_name && ref->ctl_name &&
-		     (table->ctl_name == ref->ctl_name)))
-			match++;
-
-		if (match != 2) {
-			printk(KERN_ERR "%s: failed: ", __func__);
-			sysctl_print_path(table);
-			printk(" ref: ");
-			sysctl_print_path(ref);
-			printk("\n");
-			error = -EINVAL;
-		}
-	}
-	return error;
-}
-
 static void sysctl_check_leaf(struct nsproxy *namespaces,
 				struct ctl_table *table, const char **fail)
 {
@@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
 		set_fail(fail, table, "Sysctl already exists");
 }
 
-static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
-{
-	const struct trans_ctl_table *ref;
-
-	ref = sysctl_binary_lookup(table);
-	if (table->ctl_name && !ref)
-		set_fail(fail, table, "Unknown sysctl binary path");
-	if (ref) {
-		if (ref->procname &&
-		    (!table->procname ||
-		     (strcmp(table->procname, ref->procname) != 0)))
-			set_fail(fail, table, "procname does not match binary path procname");
-
-		if (ref->ctl_name && table->ctl_name &&
-		    (table->ctl_name != ref->ctl_name))
-			set_fail(fail, table, "ctl_name does not match binary path ctl_name");
-	}
-}
-
 int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 {
 	int error = 0;
-	for (; table->ctl_name || table->procname; table++) {
+	for (; table->procname; table++) {
 		const char *fail = NULL;
 
-		sysctl_repair_table(table);
 		if (table->parent) {
 			if (table->procname && !table->parent->procname)
 				set_fail(&fail, table, "Parent without procname");
-			if (table->ctl_name && !table->parent->ctl_name)
-				set_fail(&fail, table, "Parent without ctl_name");
 		}
 		if (!table->procname)
 			set_fail(&fail, table, "No procname");
@@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 				set_fail(&fail, table, "Writable sysctl directory");
 			if (table->proc_handler)
 				set_fail(&fail, table, "Directory with proc_handler");
-			if (table->strategy)
-				set_fail(&fail, table, "Directory with strategy");
 			if (table->extra1)
 				set_fail(&fail, table, "Directory with extra1");
 			if (table->extra2)
 				set_fail(&fail, table, "Directory with extra2");
-			if (sysctl_check_dir(namespaces, table))
-				set_fail(&fail, table, "Inconsistent directory names");
 		} else {
-			if ((table->strategy == sysctl_data) ||
-			    (table->strategy == sysctl_string) ||
-			    (table->strategy == sysctl_intvec) ||
-			    (table->strategy == sysctl_jiffies) ||
-			    (table->strategy == sysctl_ms_jiffies) ||
-			    (table->proc_handler == proc_dostring) ||
+			if ((table->proc_handler == proc_dostring) ||
 			    (table->proc_handler == proc_dointvec) ||
 			    (table->proc_handler == proc_dointvec_minmax) ||
 			    (table->proc_handler == proc_dointvec_jiffies) ||
@@ -1513,14 +152,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 						set_fail(&fail, table, "No max");
 				}
 			}
-#ifdef CONFIG_SYSCTL_SYSCALL
-			if (table->ctl_name && !table->strategy)
-				set_fail(&fail, table, "Missing strategy");
-#endif
-#if 0
-			if (!table->ctl_name && table->strategy)
-				set_fail(&fail, table, "Strategy without ctl_name");
-#endif
 #ifdef CONFIG_PROC_SYSCTL
 			if (table->procname && !table->proc_handler)
 				set_fail(&fail, table, "No proc_handler");
@@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 #endif
 			sysctl_check_leaf(namespaces, table, &fail);
 		}
-		sysctl_check_bin_path(table, &fail);
 		if (table->mode > 0777)
 			set_fail(&fail, table, "bogus .mode");
 		if (fail) {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 234ceb10861f..01f2d1139e9a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -912,7 +912,7 @@ config LATENCYTOP
 
 config SYSCTL_SYSCALL_CHECK
 	bool "Sysctl checks"
-	depends on SYSCTL_SYSCALL
+	depends on SYSCTL
 	---help---
 	  sys_sysctl uses binary paths that have been found challenging
 	  to properly maintain and use. This enables checks that help
-- 
cgit v1.2.3-71-gd317


From 6fce56ec91b502ba6fcbbc2a6d25a8c2c7f77934 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 02:30:53 -0700
Subject: sysctl: Remove references to ctl_name and strategy from the generic
 sysctl table

Now that sys_sysctl is a generic wrapper around /proc/sys  .ctl_name
and .strategy members of sysctl tables are dead code.  Remove them.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sysctl.c | 195 ++------------------------------------------------------
 1 file changed, 7 insertions(+), 188 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f82e955875c9..f6dacc383c16 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -205,31 +205,26 @@ extern int lock_stat;
 
 static struct ctl_table root_table[] = {
 	{
-		.ctl_name	= CTL_KERN,
 		.procname	= "kernel",
 		.mode		= 0555,
 		.child		= kern_table,
 	},
 	{
-		.ctl_name	= CTL_VM,
 		.procname	= "vm",
 		.mode		= 0555,
 		.child		= vm_table,
 	},
 	{
-		.ctl_name	= CTL_FS,
 		.procname	= "fs",
 		.mode		= 0555,
 		.child		= fs_table,
 	},
 	{
-		.ctl_name	= CTL_DEBUG,
 		.procname	= "debug",
 		.mode		= 0555,
 		.child		= debug_table,
 	},
 	{
-		.ctl_name	= CTL_DEV,
 		.procname	= "dev",
 		.mode		= 0555,
 		.child		= dev_table,
@@ -238,7 +233,7 @@ static struct ctl_table root_table[] = {
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
  */
-	{ .ctl_name = 0 }
+	{ }
 };
 
 #ifdef CONFIG_SCHED_DEBUG
@@ -250,7 +245,6 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 
 static struct ctl_table kern_table[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
 		.maxlen		= sizeof(unsigned int),
@@ -259,40 +253,33 @@ static struct ctl_table kern_table[] = {
 	},
 #ifdef CONFIG_SCHED_DEBUG
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_min_granularity_ns",
 		.data		= &sysctl_sched_min_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &sched_nr_latency_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_latency_ns",
 		.data		= &sysctl_sched_latency,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &sched_nr_latency_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_wakeup_granularity_ns",
 		.data		= &sysctl_sched_wakeup_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_shares_ratelimit",
 		.data		= &sysctl_sched_shares_ratelimit,
 		.maxlen		= sizeof(unsigned int),
@@ -300,17 +287,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_shares_thresh",
 		.data		= &sysctl_sched_shares_thresh,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_features",
 		.data		= &sysctl_sched_features,
 		.maxlen		= sizeof(unsigned int),
@@ -318,7 +302,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_migration_cost",
 		.data		= &sysctl_sched_migration_cost,
 		.maxlen		= sizeof(unsigned int),
@@ -326,7 +309,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_nr_migrate",
 		.data		= &sysctl_sched_nr_migrate,
 		.maxlen		= sizeof(unsigned int),
@@ -334,7 +316,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_time_avg",
 		.data		= &sysctl_sched_time_avg,
 		.maxlen		= sizeof(unsigned int),
@@ -342,19 +323,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "timer_migration",
 		.data		= &sysctl_timer_migration,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
 #endif
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
 		.maxlen		= sizeof(unsigned int),
@@ -362,7 +340,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &sched_rt_handler,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_rt_runtime_us",
 		.data		= &sysctl_sched_rt_runtime,
 		.maxlen		= sizeof(int),
@@ -370,7 +347,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &sched_rt_handler,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_compat_yield",
 		.data		= &sysctl_sched_compat_yield,
 		.maxlen		= sizeof(unsigned int),
@@ -379,7 +355,6 @@ static struct ctl_table kern_table[] = {
 	},
 #ifdef CONFIG_PROVE_LOCKING
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "prove_locking",
 		.data		= &prove_locking,
 		.maxlen		= sizeof(int),
@@ -389,7 +364,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_LOCK_STAT
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "lock_stat",
 		.data		= &lock_stat,
 		.maxlen		= sizeof(int),
@@ -398,7 +372,6 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= KERN_PANIC,
 		.procname	= "panic",
 		.data		= &panic_timeout,
 		.maxlen		= sizeof(int),
@@ -406,7 +379,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= KERN_CORE_USES_PID,
 		.procname	= "core_uses_pid",
 		.data		= &core_uses_pid,
 		.maxlen		= sizeof(int),
@@ -414,16 +386,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= KERN_CORE_PATTERN,
 		.procname	= "core_pattern",
 		.data		= core_pattern,
 		.maxlen		= CORENAME_MAX_SIZE,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
-		.strategy	= &sysctl_string,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "core_pipe_limit",
 		.data		= &core_pipe_limit,
 		.maxlen		= sizeof(unsigned int),
@@ -449,7 +418,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_BLK_DEV_INITRD
 	{
-		.ctl_name	= KERN_REALROOTDEV,
 		.procname	= "real-root-dev",
 		.data		= &real_root_dev,
 		.maxlen		= sizeof(int),
@@ -458,7 +426,6 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "print-fatal-signals",
 		.data		= &print_fatal_signals,
 		.maxlen		= sizeof(int),
@@ -467,16 +434,13 @@ static struct ctl_table kern_table[] = {
 	},
 #ifdef CONFIG_SPARC
 	{
-		.ctl_name	= KERN_SPARC_REBOOT,
 		.procname	= "reboot-cmd",
 		.data		= reboot_command,
 		.maxlen		= 256,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
-		.strategy	= &sysctl_string,
 	},
 	{
-		.ctl_name	= KERN_SPARC_STOP_A,
 		.procname	= "stop-a",
 		.data		= &stop_a_enabled,
 		.maxlen		= sizeof (int),
@@ -484,7 +448,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= KERN_SPARC_SCONS_PWROFF,
 		.procname	= "scons-poweroff",
 		.data		= &scons_pwroff,
 		.maxlen		= sizeof (int),
@@ -494,7 +457,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_SPARC64
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "tsb-ratio",
 		.data		= &sysctl_tsb_ratio,
 		.maxlen		= sizeof (int),
@@ -504,7 +466,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef __hppa__
 	{
-		.ctl_name	= KERN_HPPA_PWRSW,
 		.procname	= "soft-power",
 		.data		= &pwrsw_enabled,
 		.maxlen		= sizeof (int),
@@ -512,7 +473,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= KERN_HPPA_UNALIGNED,
 		.procname	= "unaligned-trap",
 		.data		= &unaligned_enabled,
 		.maxlen		= sizeof (int),
@@ -521,7 +481,6 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= KERN_CTLALTDEL,
 		.procname	= "ctrl-alt-del",
 		.data		= &C_A_D,
 		.maxlen		= sizeof(int),
@@ -530,7 +489,6 @@ static struct ctl_table kern_table[] = {
 	},
 #ifdef CONFIG_FUNCTION_TRACER
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "ftrace_enabled",
 		.data		= &ftrace_enabled,
 		.maxlen		= sizeof(int),
@@ -540,7 +498,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_STACK_TRACER
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "stack_tracer_enabled",
 		.data		= &stack_tracer_enabled,
 		.maxlen		= sizeof(int),
@@ -550,7 +507,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_TRACING
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "ftrace_dump_on_oops",
 		.data		= &ftrace_dump_on_oops,
 		.maxlen		= sizeof(int),
@@ -560,16 +516,13 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_MODULES
 	{
-		.ctl_name	= KERN_MODPROBE,
 		.procname	= "modprobe",
 		.data		= &modprobe_path,
 		.maxlen		= KMOD_PATH_LEN,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
-		.strategy	= &sysctl_string,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "modules_disabled",
 		.data		= &modules_disabled,
 		.maxlen		= sizeof(int),
@@ -582,18 +535,15 @@ static struct ctl_table kern_table[] = {
 #endif
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 	{
-		.ctl_name	= KERN_HOTPLUG,
 		.procname	= "hotplug",
 		.data		= &uevent_helper,
 		.maxlen		= UEVENT_HELPER_PATH_LEN,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
-		.strategy	= &sysctl_string,
 	},
 #endif
 #ifdef CONFIG_CHR_DEV_SG
 	{
-		.ctl_name	= KERN_SG_BIG_BUFF,
 		.procname	= "sg-big-buff",
 		.data		= &sg_big_buff,
 		.maxlen		= sizeof (int),
@@ -603,7 +553,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	{
-		.ctl_name	= KERN_ACCT,
 		.procname	= "acct",
 		.data		= &acct_parm,
 		.maxlen		= 3*sizeof(int),
@@ -613,7 +562,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_MAGIC_SYSRQ
 	{
-		.ctl_name	= KERN_SYSRQ,
 		.procname	= "sysrq",
 		.data		= &__sysrq_enabled,
 		.maxlen		= sizeof (int),
@@ -631,7 +579,6 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= KERN_MAX_THREADS,
 		.procname	= "threads-max",
 		.data		= &max_threads,
 		.maxlen		= sizeof(int),
@@ -639,37 +586,31 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= KERN_RANDOM,
 		.procname	= "random",
 		.mode		= 0555,
 		.child		= random_table,
 	},
 	{
-		.ctl_name	= KERN_OVERFLOWUID,
 		.procname	= "overflowuid",
 		.data		= &overflowuid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
 	{
-		.ctl_name	= KERN_OVERFLOWGID,
 		.procname	= "overflowgid",
 		.data		= &overflowgid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
 #ifdef CONFIG_S390
 #ifdef CONFIG_MATHEMU
 	{
-		.ctl_name	= KERN_IEEE_EMULATION_WARNINGS,
 		.procname	= "ieee_emulation_warnings",
 		.data		= &sysctl_ieee_emulation_warnings,
 		.maxlen		= sizeof(int),
@@ -678,7 +619,6 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= KERN_S390_USER_DEBUG_LOGGING,
 		.procname	= "userprocess_debug",
 		.data		= &sysctl_userprocess_debug,
 		.maxlen		= sizeof(int),
@@ -687,18 +627,15 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= KERN_PIDMAX,
 		.procname	= "pid_max",
 		.data		= &pid_max,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= sysctl_intvec,
 		.extra1		= &pid_max_min,
 		.extra2		= &pid_max_max,
 	},
 	{
-		.ctl_name	= KERN_PANIC_ON_OOPS,
 		.procname	= "panic_on_oops",
 		.data		= &panic_on_oops,
 		.maxlen		= sizeof(int),
@@ -707,7 +644,6 @@ static struct ctl_table kern_table[] = {
 	},
 #if defined CONFIG_PRINTK
 	{
-		.ctl_name	= KERN_PRINTK,
 		.procname	= "printk",
 		.data		= &console_loglevel,
 		.maxlen		= 4*sizeof(int),
@@ -715,16 +651,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= KERN_PRINTK_RATELIMIT,
 		.procname	= "printk_ratelimit",
 		.data		= &printk_ratelimit_state.interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
 	},
 	{
-		.ctl_name	= KERN_PRINTK_RATELIMIT_BURST,
 		.procname	= "printk_ratelimit_burst",
 		.data		= &printk_ratelimit_state.burst,
 		.maxlen		= sizeof(int),
@@ -732,19 +665,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "printk_delay",
 		.data		= &printk_delay_msec,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &ten_thousand,
 	},
 #endif
 	{
-		.ctl_name	= KERN_NGROUPS_MAX,
 		.procname	= "ngroups_max",
 		.data		= &ngroups_max,
 		.maxlen		= sizeof (int),
@@ -753,7 +683,6 @@ static struct ctl_table kern_table[] = {
 	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
-		.ctl_name       = KERN_UNKNOWN_NMI_PANIC,
 		.procname       = "unknown_nmi_panic",
 		.data           = &unknown_nmi_panic,
 		.maxlen         = sizeof (int),
@@ -770,7 +699,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #if defined(CONFIG_X86)
 	{
-		.ctl_name	= KERN_PANIC_ON_NMI,
 		.procname	= "panic_on_unrecovered_nmi",
 		.data		= &panic_on_unrecovered_nmi,
 		.maxlen		= sizeof(int),
@@ -778,7 +706,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "panic_on_io_nmi",
 		.data		= &panic_on_io_nmi,
 		.maxlen		= sizeof(int),
@@ -786,7 +713,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= KERN_BOOTLOADER_TYPE,
 		.procname	= "bootloader_type",
 		.data		= &bootloader_type,
 		.maxlen		= sizeof (int),
@@ -794,7 +720,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "bootloader_version",
 		.data		= &bootloader_version,
 		.maxlen		= sizeof (int),
@@ -802,7 +727,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "kstack_depth_to_print",
 		.data		= &kstack_depth_to_print,
 		.maxlen		= sizeof(int),
@@ -810,7 +734,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "io_delay_type",
 		.data		= &io_delay_type,
 		.maxlen		= sizeof(int),
@@ -820,7 +743,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #if defined(CONFIG_MMU)
 	{
-		.ctl_name	= KERN_RANDOMIZE,
 		.procname	= "randomize_va_space",
 		.data		= &randomize_va_space,
 		.maxlen		= sizeof(int),
@@ -830,7 +752,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
-		.ctl_name	= KERN_SPIN_RETRY,
 		.procname	= "spin_retry",
 		.data		= &spin_retry,
 		.maxlen		= sizeof (int),
@@ -849,7 +770,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_IA64
 	{
-		.ctl_name	= KERN_IA64_UNALIGNED,
 		.procname	= "ignore-unaligned-usertrap",
 		.data		= &no_unaligned_warning,
 		.maxlen		= sizeof (int),
@@ -857,7 +777,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "unaligned-dump-stack",
 		.data		= &unaligned_dump_stack,
 		.maxlen		= sizeof (int),
@@ -867,71 +786,58 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "softlockup_panic",
 		.data		= &softlockup_panic,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "softlockup_thresh",
 		.data		= &softlockup_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dosoftlockup_thresh,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &neg_one,
 		.extra2		= &sixty,
 	},
 #endif
 #ifdef CONFIG_DETECT_HUNG_TASK
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_panic",
 		.data		= &sysctl_hung_task_panic,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_check_count",
 		.data		= &sysctl_hung_task_check_count,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &proc_doulongvec_minmax,
-		.strategy	= &sysctl_intvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_timeout_secs",
 		.data		= &sysctl_hung_task_timeout_secs,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &proc_dohung_task_timeout_secs,
-		.strategy	= &sysctl_intvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_warnings",
 		.data		= &sysctl_hung_task_warnings,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &proc_doulongvec_minmax,
-		.strategy	= &sysctl_intvec,
 	},
 #endif
 #ifdef CONFIG_COMPAT
 	{
-		.ctl_name	= KERN_COMPAT_LOG,
 		.procname	= "compat-log",
 		.data		= &compat_log,
 		.maxlen		= sizeof (int),
@@ -941,7 +847,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_RT_MUTEXES
 	{
-		.ctl_name	= KERN_MAX_LOCK_DEPTH,
 		.procname	= "max_lock_depth",
 		.data		= &max_lock_depth,
 		.maxlen		= sizeof(int),
@@ -950,17 +855,14 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "poweroff_cmd",
 		.data		= &poweroff_cmd,
 		.maxlen		= POWEROFF_CMD_PATH_LEN,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
-		.strategy	= &sysctl_string,
 	},
 #ifdef CONFIG_KEYS
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "keys",
 		.mode		= 0555,
 		.child		= key_sysctls,
@@ -968,7 +870,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_RCU_TORTURE_TEST
 	{
-		.ctl_name       = CTL_UNNUMBERED,
 		.procname       = "rcutorture_runnable",
 		.data           = &rcutorture_runnable,
 		.maxlen         = sizeof(int),
@@ -978,7 +879,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_SLOW_WORK
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "slow-work",
 		.mode		= 0555,
 		.child		= slow_work_sysctls,
@@ -986,7 +886,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "perf_event_paranoid",
 		.data		= &sysctl_perf_event_paranoid,
 		.maxlen		= sizeof(sysctl_perf_event_paranoid),
@@ -994,7 +893,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "perf_event_mlock_kb",
 		.data		= &sysctl_perf_event_mlock,
 		.maxlen		= sizeof(sysctl_perf_event_mlock),
@@ -1002,7 +900,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "perf_event_max_sample_rate",
 		.data		= &sysctl_perf_event_sample_rate,
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
@@ -1012,7 +909,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "kmemcheck",
 		.data		= &kmemcheck_enabled,
 		.maxlen		= sizeof(int),
@@ -1022,7 +918,6 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_BLOCK
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "blk_iopoll",
 		.data		= &blk_iopoll_enabled,
 		.maxlen		= sizeof(int),
@@ -1034,12 +929,11 @@ static struct ctl_table kern_table[] = {
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
  */
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static struct ctl_table vm_table[] = {
 	{
-		.ctl_name	= VM_OVERCOMMIT_MEMORY,
 		.procname	= "overcommit_memory",
 		.data		= &sysctl_overcommit_memory,
 		.maxlen		= sizeof(sysctl_overcommit_memory),
@@ -1047,7 +941,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= VM_PANIC_ON_OOM,
 		.procname	= "panic_on_oom",
 		.data		= &sysctl_panic_on_oom,
 		.maxlen		= sizeof(sysctl_panic_on_oom),
@@ -1055,7 +948,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "oom_kill_allocating_task",
 		.data		= &sysctl_oom_kill_allocating_task,
 		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
@@ -1063,7 +955,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "oom_dump_tasks",
 		.data		= &sysctl_oom_dump_tasks,
 		.maxlen		= sizeof(sysctl_oom_dump_tasks),
@@ -1071,7 +962,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= VM_OVERCOMMIT_RATIO,
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
 		.maxlen		= sizeof(sysctl_overcommit_ratio),
@@ -1079,7 +969,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= VM_PAGE_CLUSTER,
 		.procname	= "page-cluster", 
 		.data		= &page_cluster,
 		.maxlen		= sizeof(int),
@@ -1087,45 +976,37 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= VM_DIRTY_BACKGROUND,
 		.procname	= "dirty_background_ratio",
 		.data		= &dirty_background_ratio,
 		.maxlen		= sizeof(dirty_background_ratio),
 		.mode		= 0644,
 		.proc_handler	= &dirty_background_ratio_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "dirty_background_bytes",
 		.data		= &dirty_background_bytes,
 		.maxlen		= sizeof(dirty_background_bytes),
 		.mode		= 0644,
 		.proc_handler	= &dirty_background_bytes_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &one_ul,
 	},
 	{
-		.ctl_name	= VM_DIRTY_RATIO,
 		.procname	= "dirty_ratio",
 		.data		= &vm_dirty_ratio,
 		.maxlen		= sizeof(vm_dirty_ratio),
 		.mode		= 0644,
 		.proc_handler	= &dirty_ratio_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "dirty_bytes",
 		.data		= &vm_dirty_bytes,
 		.maxlen		= sizeof(vm_dirty_bytes),
 		.mode		= 0644,
 		.proc_handler	= &dirty_bytes_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &dirty_bytes_min,
 	},
 	{
@@ -1143,7 +1024,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= VM_NR_PDFLUSH_THREADS,
 		.procname	= "nr_pdflush_threads",
 		.data		= &nr_pdflush_threads,
 		.maxlen		= sizeof nr_pdflush_threads,
@@ -1151,13 +1031,11 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= VM_SWAPPINESS,
 		.procname	= "swappiness",
 		.data		= &vm_swappiness,
 		.maxlen		= sizeof(vm_swappiness),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
@@ -1172,7 +1050,6 @@ static struct ctl_table vm_table[] = {
 		.extra2		= (void *)&hugetlb_infinity,
 	 },
 	 {
-		.ctl_name	= VM_HUGETLB_GROUP,
 		.procname	= "hugetlb_shm_group",
 		.data		= &sysctl_hugetlb_shm_group,
 		.maxlen		= sizeof(gid_t),
@@ -1180,7 +1057,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	 },
 	 {
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hugepages_treat_as_movable",
 		.data		= &hugepages_treat_as_movable,
 		.maxlen		= sizeof(int),
@@ -1188,7 +1064,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &hugetlb_treat_movable_handler,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nr_overcommit_hugepages",
 		.data		= NULL,
 		.maxlen		= sizeof(unsigned long),
@@ -1199,46 +1074,37 @@ static struct ctl_table vm_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= VM_LOWMEM_RESERVE_RATIO,
 		.procname	= "lowmem_reserve_ratio",
 		.data		= &sysctl_lowmem_reserve_ratio,
 		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
 		.mode		= 0644,
 		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
-		.strategy	= &sysctl_intvec,
 	},
 	{
-		.ctl_name	= VM_DROP_PAGECACHE,
 		.procname	= "drop_caches",
 		.data		= &sysctl_drop_caches,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= drop_caches_sysctl_handler,
-		.strategy	= &sysctl_intvec,
 	},
 	{
-		.ctl_name	= VM_MIN_FREE_KBYTES,
 		.procname	= "min_free_kbytes",
 		.data		= &min_free_kbytes,
 		.maxlen		= sizeof(min_free_kbytes),
 		.mode		= 0644,
 		.proc_handler	= &min_free_kbytes_sysctl_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
 	{
-		.ctl_name	= VM_PERCPU_PAGELIST_FRACTION,
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
 		.maxlen		= sizeof(percpu_pagelist_fraction),
 		.mode		= 0644,
 		.proc_handler	= &percpu_pagelist_fraction_sysctl_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &min_percpu_pagelist_fract,
 	},
 #ifdef CONFIG_MMU
 	{
-		.ctl_name	= VM_MAX_MAP_COUNT,
 		.procname	= "max_map_count",
 		.data		= &sysctl_max_map_count,
 		.maxlen		= sizeof(sysctl_max_map_count),
@@ -1247,104 +1113,85 @@ static struct ctl_table vm_table[] = {
 	},
 #else
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nr_trim_pages",
 		.data		= &sysctl_nr_trim_pages,
 		.maxlen		= sizeof(sysctl_nr_trim_pages),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
 #endif
 	{
-		.ctl_name	= VM_LAPTOP_MODE,
 		.procname	= "laptop_mode",
 		.data		= &laptop_mode,
 		.maxlen		= sizeof(laptop_mode),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
 	},
 	{
-		.ctl_name	= VM_BLOCK_DUMP,
 		.procname	= "block_dump",
 		.data		= &block_dump,
 		.maxlen		= sizeof(block_dump),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
 	{
-		.ctl_name	= VM_VFS_CACHE_PRESSURE,
 		.procname	= "vfs_cache_pressure",
 		.data		= &sysctl_vfs_cache_pressure,
 		.maxlen		= sizeof(sysctl_vfs_cache_pressure),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
-		.ctl_name	= VM_LEGACY_VA_LAYOUT,
 		.procname	= "legacy_va_layout",
 		.data		= &sysctl_legacy_va_layout,
 		.maxlen		= sizeof(sysctl_legacy_va_layout),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
 #endif
 #ifdef CONFIG_NUMA
 	{
-		.ctl_name	= VM_ZONE_RECLAIM_MODE,
 		.procname	= "zone_reclaim_mode",
 		.data		= &zone_reclaim_mode,
 		.maxlen		= sizeof(zone_reclaim_mode),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
 	{
-		.ctl_name	= VM_MIN_UNMAPPED,
 		.procname	= "min_unmapped_ratio",
 		.data		= &sysctl_min_unmapped_ratio,
 		.maxlen		= sizeof(sysctl_min_unmapped_ratio),
 		.mode		= 0644,
 		.proc_handler	= &sysctl_min_unmapped_ratio_sysctl_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
 	{
-		.ctl_name	= VM_MIN_SLAB,
 		.procname	= "min_slab_ratio",
 		.data		= &sysctl_min_slab_ratio,
 		.maxlen		= sizeof(sysctl_min_slab_ratio),
 		.mode		= 0644,
 		.proc_handler	= &sysctl_min_slab_ratio_sysctl_handler,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
 #endif
 #ifdef CONFIG_SMP
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "stat_interval",
 		.data		= &sysctl_stat_interval,
 		.maxlen		= sizeof(sysctl_stat_interval),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
 	},
 #endif
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "mmap_min_addr",
 		.data		= &dac_mmap_min_addr,
 		.maxlen		= sizeof(unsigned long),
@@ -1353,43 +1200,36 @@ static struct ctl_table vm_table[] = {
 	},
 #ifdef CONFIG_NUMA
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "numa_zonelist_order",
 		.data		= &numa_zonelist_order,
 		.maxlen		= NUMA_ZONELIST_ORDER_LEN,
 		.mode		= 0644,
 		.proc_handler	= &numa_zonelist_order_handler,
-		.strategy	= &sysctl_string,
 	},
 #endif
 #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
    (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
 	{
-		.ctl_name	= VM_VDSO_ENABLED,
 		.procname	= "vdso_enabled",
 		.data		= &vdso_enabled,
 		.maxlen		= sizeof(vdso_enabled),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
 #endif
 #ifdef CONFIG_HIGHMEM
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "highmem_is_dirtyable",
 		.data		= &vm_highmem_is_dirtyable,
 		.maxlen		= sizeof(vm_highmem_is_dirtyable),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
 #endif
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "scan_unevictable_pages",
 		.data		= &scan_unevictable_pages,
 		.maxlen		= sizeof(scan_unevictable_pages),
@@ -1398,24 +1238,20 @@ static struct ctl_table vm_table[] = {
 	},
 #ifdef CONFIG_MEMORY_FAILURE
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "memory_failure_early_kill",
 		.data		= &sysctl_memory_failure_early_kill,
 		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "memory_failure_recovery",
 		.data		= &sysctl_memory_failure_recovery,
 		.maxlen		= sizeof(sysctl_memory_failure_recovery),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -1425,18 +1261,17 @@ static struct ctl_table vm_table[] = {
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
  */
-	{ .ctl_name = 0 }
+	{ }
 };
 
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
 static struct ctl_table binfmt_misc_table[] = {
-	{ .ctl_name = 0 }
+	{ }
 };
 #endif
 
 static struct ctl_table fs_table[] = {
 	{
-		.ctl_name	= FS_NRINODE,
 		.procname	= "inode-nr",
 		.data		= &inodes_stat,
 		.maxlen		= 2*sizeof(int),
@@ -1444,7 +1279,6 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_STATINODE,
 		.procname	= "inode-state",
 		.data		= &inodes_stat,
 		.maxlen		= 7*sizeof(int),
@@ -1459,7 +1293,6 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &proc_nr_files,
 	},
 	{
-		.ctl_name	= FS_MAXFILE,
 		.procname	= "file-max",
 		.data		= &files_stat.max_files,
 		.maxlen		= sizeof(int),
@@ -1467,7 +1300,6 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nr_open",
 		.data		= &sysctl_nr_open,
 		.maxlen		= sizeof(int),
@@ -1477,7 +1309,6 @@ static struct ctl_table fs_table[] = {
 		.extra2		= &sysctl_nr_open_max,
 	},
 	{
-		.ctl_name	= FS_DENTRY,
 		.procname	= "dentry-state",
 		.data		= &dentry_stat,
 		.maxlen		= 6*sizeof(int),
@@ -1485,30 +1316,25 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_OVERFLOWUID,
 		.procname	= "overflowuid",
 		.data		= &fs_overflowuid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
 	{
-		.ctl_name	= FS_OVERFLOWGID,
 		.procname	= "overflowgid",
 		.data		= &fs_overflowgid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
 #ifdef CONFIG_FILE_LOCKING
 	{
-		.ctl_name	= FS_LEASES,
 		.procname	= "leases-enable",
 		.data		= &leases_enable,
 		.maxlen		= sizeof(int),
@@ -1518,7 +1344,6 @@ static struct ctl_table fs_table[] = {
 #endif
 #ifdef CONFIG_DNOTIFY
 	{
-		.ctl_name	= FS_DIR_NOTIFY,
 		.procname	= "dir-notify-enable",
 		.data		= &dir_notify_enable,
 		.maxlen		= sizeof(int),
@@ -1529,7 +1354,6 @@ static struct ctl_table fs_table[] = {
 #ifdef CONFIG_MMU
 #ifdef CONFIG_FILE_LOCKING
 	{
-		.ctl_name	= FS_LEASE_TIME,
 		.procname	= "lease-break-time",
 		.data		= &lease_break_time,
 		.maxlen		= sizeof(int),
@@ -1555,7 +1379,6 @@ static struct ctl_table fs_table[] = {
 #endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
 	{
-		.ctl_name	= FS_INOTIFY,
 		.procname	= "inotify",
 		.mode		= 0555,
 		.child		= inotify_table,
@@ -1570,19 +1393,16 @@ static struct ctl_table fs_table[] = {
 #endif
 #endif
 	{
-		.ctl_name	= KERN_SETUID_DUMPABLE,
 		.procname	= "suid_dumpable",
 		.data		= &suid_dumpable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "binfmt_misc",
 		.mode		= 0555,
 		.child		= binfmt_misc_table,
@@ -1592,13 +1412,12 @@ static struct ctl_table fs_table[] = {
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
  */
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static struct ctl_table debug_table[] = {
 #if defined(CONFIG_X86) || defined(CONFIG_PPC)
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "exception-trace",
 		.data		= &show_unhandled_signals,
 		.maxlen		= sizeof(int),
@@ -1606,11 +1425,11 @@ static struct ctl_table debug_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 #endif
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static struct ctl_table dev_table[] = {
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static DEFINE_SPINLOCK(sysctl_lock);
-- 
cgit v1.2.3-71-gd317


From 2315ffa0a9f789c588c7139effa7404a387d8685 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 03:18:02 -0700
Subject: sysctl: Don't look at ctl_name and strategy in the generic code

The ctl_name and strategy fields are unused, now that sys_sysctl
is a compatibility wrapper around /proc/sys.  No longer looking
at them in the generic code is effectively what we are doing
now and provides the guarantee that during further cleanups
we can just remove references to those fields and everything
will work ok.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/proc_sysctl.c  |  4 ++--
 include/linux/sysctl.h | 17 ++---------------
 kernel/sysctl.c        | 29 +++++++----------------------
 3 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f667e8aeabdf..6ff9981f0a18 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -48,7 +48,7 @@ out:
 static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
 {
 	int len;
-	for ( ; p->ctl_name || p->procname; p++) {
+	for ( ; p->procname; p++) {
 
 		if (!p->procname)
 			continue;
@@ -218,7 +218,7 @@ static int scan(struct ctl_table_header *head, ctl_table *table,
 		void *dirent, filldir_t filldir)
 {
 
-	for (; table->ctl_name || table->procname; table++, (*pos)++) {
+	for (; table->procname; table++, (*pos)++) {
 		int res;
 
 		/* Can't do anything without a proc name */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 82c32b89932d..7c4aabc04673 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1005,8 +1005,8 @@ extern ctl_handler sysctl_ms_jiffies;
 
 /*
  * Register a set of sysctl names by calling register_sysctl_table
- * with an initialised array of struct ctl_table's.  An entry with zero
- * ctl_name and NULL procname terminates the table.  table->de will be
+ * with an initialised array of struct ctl_table's.  An entry with 
+ * NULL procname terminates the table.  table->de will be
  * set up by the registration and need not be initialised in advance.
  *
  * sysctl names can be mirrored automatically under /proc/sys.  The
@@ -1019,24 +1019,11 @@ extern ctl_handler sysctl_ms_jiffies;
  * under /proc; non-leaf nodes will be represented by directories.  A
  * null procname disables /proc mirroring at this node.
  *
- * sysctl entries with a zero ctl_name will not be available through
- * the binary sysctl interface.
- *
  * sysctl(2) can automatically manage read and write requests through
  * the sysctl table.  The data and maxlen fields of the ctl_table
  * struct enable minimal validation of the values being written to be
  * performed, and the mode field allows minimal authentication.
  * 
- * More sophisticated management can be enabled by the provision of a
- * strategy routine with the table entry.  This will be called before
- * any automatic read or write of the data is performed.
- * 
- * The strategy routine may return:
- * <0: Error occurred (error is passed to user process)
- * 0:  OK - proceed with automatic read or write.
- * >0: OK - read or write has been done by the strategy routine, so 
- *     return immediately.
- * 
  * There must be a proc_handler routine for any terminal nodes
  * mirrored under /proc/sys (non-terminals are handled by a built-in
  * directory handler).  Several default handlers are available to
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f6dacc383c16..b6b6eb1abebb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1618,7 +1618,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
 
 static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 {
-	for (; table->ctl_name || table->procname; table++) {
+	for (; table->procname; table++) {
 		table->parent = parent;
 		if (table->child)
 			sysctl_set_parent(table, table->child);
@@ -1650,11 +1650,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch,
 		return NULL;
 
 	/* ... and nothing else */
-	if (branch[1].procname || branch[1].ctl_name)
+	if (branch[1].procname)
 		return NULL;
 
 	/* table should contain subdirectory with the same name */
-	for (p = table; p->procname || p->ctl_name; p++) {
+	for (p = table; p->procname; p++) {
 		if (!p->child)
 			continue;
 		if (p->procname && strcmp(p->procname, s) == 0)
@@ -1699,8 +1699,7 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  *
  * The members of the &struct ctl_table structure are used as follows:
  *
- * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
- *            must be unique within that level of sysctl
+ * ctl_name - Dead
  *
  * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
  *            enter a sysctl file
@@ -1716,7 +1715,7 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  *
  * proc_handler - the text handler routine (described below)
  *
- * strategy - the strategy routine (described below)
+ * strategy - Dead
  *
  * de - for internal use by the sysctl routines
  *
@@ -1730,19 +1729,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  * struct enable minimal validation of the values being written to be
  * performed, and the mode field allows minimal authentication.
  *
- * More sophisticated management can be enabled by the provision of a
- * strategy routine with the table entry.  This will be called before
- * any automatic read or write of the data is performed.
- *
- * The strategy routine may return
- *
- * < 0 - Error occurred (error is passed to user process)
- *
- * 0   - OK - proceed with automatic read or write.
- *
- * > 0 - OK - read or write has been done by the strategy routine, so
- *       return immediately.
- *
  * There must be a proc_handler routine for any terminal nodes
  * mirrored under /proc/sys (non-terminals are handled by a built-in
  * directory handler).  Several default handlers are available to
@@ -1769,13 +1755,13 @@ struct ctl_table_header *__register_sysctl_paths(
 	struct ctl_table_set *set;
 
 	/* Count the path components */
-	for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
+	for (npath = 0; path[npath].procname; ++npath)
 		;
 
 	/*
 	 * For each path component, allocate a 2-element ctl_table array.
 	 * The first array element will be filled with the sysctl entry
-	 * for this, the second will be the sentinel (ctl_name == 0).
+	 * for this, the second will be the sentinel (procname == 0).
 	 *
 	 * We allocate everything in one go so that we don't have to
 	 * worry about freeing additional memory in unregister_sysctl_table.
@@ -1792,7 +1778,6 @@ struct ctl_table_header *__register_sysctl_paths(
 	for (n = 0; n < npath; ++n, ++path) {
 		/* Copy the procname */
 		new->procname = path->procname;
-		new->ctl_name = path->ctl_name;
 		new->mode     = 0555;
 
 		*prevp = new;
-- 
cgit v1.2.3-71-gd317


From 0e0fc1c23e04c15e814763f2b366e92d87d8b95d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 11 Nov 2009 11:28:06 -0800
Subject: rcu: Mark init-time-only rcu_bootup_announce() as __init

Because rcu_bootup_announce() is used only at boot time, mark it
as __init, presumably so that its memory can be reclaimed.

Suggested-by: Joe Perches <joe@perches.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <20091111192806.GA10073@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree_plugin.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 52075da70549..5ca2d26c5971 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
 /*
  * Tell them what RCU they are running.
  */
-static void rcu_bootup_announce(void)
+static void __init rcu_bootup_announce(void)
 {
 	printk(KERN_INFO
 	       "Experimental preemptable hierarchical RCU implementation.\n");
@@ -481,7 +481,7 @@ void exit_rcu(void)
 /*
  * Tell them what RCU they are running.
  */
-static void rcu_bootup_announce(void)
+static void __init rcu_bootup_announce(void)
 {
 	printk(KERN_INFO "Hierarchical RCU implementation.\n");
 }
-- 
cgit v1.2.3-71-gd317


From a646365cc330b5aaf4452c91f61b1e0d1acf68d0 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 11 Nov 2009 22:26:35 +0100
Subject: tracing: Fix return value of tracing_stats_read()

The function tracing_stats_read() mistakenly returns ENOMEM instead
of the negative value -ENOMEM.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
LKML-Reference: <4AFB2C0B.50605@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b20d3ec75de9..03c7fd55c5f9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3730,7 +3730,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
-		return ENOMEM;
+		return -ENOMEM;
 
 	trace_seq_init(s);
 
-- 
cgit v1.2.3-71-gd317


From 3586e0a9a4a5f19110627d6ba81ada32a358467d Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Nov 2009 19:06:30 -0800
Subject: clocksource/timecompare: Fix symbol exports to be GPL'd.

Noticed by Thomas GLeixner.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/time/clocksource.c | 6 +++---
 kernel/time/timecompare.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5e18c6ab2c6a..4a310906b3e8 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc,
 	tc->cycle_last = cc->read(cc);
 	tc->nsec = start_tstamp;
 }
-EXPORT_SYMBOL(timecounter_init);
+EXPORT_SYMBOL_GPL(timecounter_init);
 
 /**
  * timecounter_read_delta - get nanoseconds since last call of this function
@@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc)
 
 	return nsec;
 }
-EXPORT_SYMBOL(timecounter_read);
+EXPORT_SYMBOL_GPL(timecounter_read);
 
 u64 timecounter_cyc2time(struct timecounter *tc,
 			 cycle_t cycle_tstamp)
@@ -105,7 +105,7 @@ u64 timecounter_cyc2time(struct timecounter *tc,
 
 	return nsec;
 }
-EXPORT_SYMBOL(timecounter_cyc2time);
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
 
 /*[Clocksource internal variables]---------
  * curr_clocksource:
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 71e7f1a19156..96ff643a5a59 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -40,7 +40,7 @@ ktime_t timecompare_transform(struct timecompare *sync,
 
 	return ns_to_ktime(nsec);
 }
-EXPORT_SYMBOL(timecompare_transform);
+EXPORT_SYMBOL_GPL(timecompare_transform);
 
 int timecompare_offset(struct timecompare *sync,
 		       s64 *offset,
@@ -131,7 +131,7 @@ int timecompare_offset(struct timecompare *sync,
 
 	return used;
 }
-EXPORT_SYMBOL(timecompare_offset);
+EXPORT_SYMBOL_GPL(timecompare_offset);
 
 void __timecompare_update(struct timecompare *sync,
 			  u64 source_tstamp)
@@ -188,4 +188,4 @@ void __timecompare_update(struct timecompare *sync,
 		}
 	}
 }
-EXPORT_SYMBOL(__timecompare_update);
+EXPORT_SYMBOL_GPL(__timecompare_update);
-- 
cgit v1.2.3-71-gd317


From a6f0eb6adc42e5eed3f35af99c61c0e411b16f8e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Nov 2009 17:14:07 -0500
Subject: ring-buffer: Add multiple iterations between benchmark timestamps

The ring_buffer_benchmark does a gettimeofday after every write to the
ring buffer in its measurements. This adds the overhead of the call
to gettimeofday to the measurements and does not give an accurate picture
of the length of time it takes to record a trace.

This was first noticed with perf top:

------------------------------------------------------------------------------
   PerfTop:     679 irqs/sec  kernel:99.9% [1000Hz cpu-clock-msecs],  (all, 4 CPUs)
------------------------------------------------------------------------------

             samples    pcnt   kernel function
             _______   _____   _______________

             1673.00 - 27.8% : trace_clock_local
              806.00 - 13.4% : do_gettimeofday
              590.00 -  9.8% : rb_reserve_next_event
              554.00 -  9.2% : native_read_tsc
              431.00 -  7.2% : ring_buffer_lock_reserve
              365.00 -  6.1% : __rb_reserve_next
              355.00 -  5.9% : rb_end_commit
              322.00 -  5.4% : getnstimeofday
              268.00 -  4.5% : ring_buffer_unlock_commit
              262.00 -  4.4% : ring_buffer_producer_thread 	[ring_buffer_benchmark]
              113.00 -  1.9% : read_tsc
               91.00 -  1.5% : debug_smp_processor_id
               69.00 -  1.1% : trace_recursive_unlock
               66.00 -  1.1% : ring_buffer_event_data
               25.00 -  0.4% : _spin_unlock_irq

And the length of each write to the ring buffer measured at 310ns.

This patch adds a new module parameter called "write_interval" which is
defaulted to 50. This is the number of writes performed between
timestamps. After this patch perf top shows:

------------------------------------------------------------------------------
   PerfTop:     244 irqs/sec  kernel:100.0% [1000Hz cpu-clock-msecs],  (all, 4 CPUs)
------------------------------------------------------------------------------

             samples    pcnt   kernel function
             _______   _____   _______________

             2842.00 - 40.4% : trace_clock_local
             1043.00 - 14.8% : rb_reserve_next_event
              784.00 - 11.1% : ring_buffer_lock_reserve
              600.00 -  8.5% : __rb_reserve_next
              579.00 -  8.2% : rb_end_commit
              440.00 -  6.3% : ring_buffer_unlock_commit
              290.00 -  4.1% : ring_buffer_producer_thread 	[ring_buffer_benchmark]
              155.00 -  2.2% : debug_smp_processor_id
              117.00 -  1.7% : trace_recursive_unlock
              103.00 -  1.5% : ring_buffer_event_data
               28.00 -  0.4% : do_gettimeofday
               22.00 -  0.3% : _spin_unlock_irq
               14.00 -  0.2% : native_read_tsc
               11.00 -  0.2% : getnstimeofday

do_gettimeofday dropped from 13% usage to a mere 0.4%! (using the default
50 interval)  The measurement for each timestamp went from 310ns to 210ns.
That's 100ns (1/3rd) overhead that the gettimeofday call was introducing.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c3..70df73e4ff21 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -35,6 +35,10 @@ static int disable_reader;
 module_param(disable_reader, uint, 0644);
 MODULE_PARM_DESC(disable_reader, "only run producer");
 
+static int write_iteration = 50;
+module_param(write_iteration, uint, 0644);
+MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
+
 static int read_events;
 
 static int kill_test;
@@ -208,15 +212,18 @@ static void ring_buffer_producer(void)
 	do {
 		struct ring_buffer_event *event;
 		int *entry;
-
-		event = ring_buffer_lock_reserve(buffer, 10);
-		if (!event) {
-			missed++;
-		} else {
-			hit++;
-			entry = ring_buffer_event_data(event);
-			*entry = smp_processor_id();
-			ring_buffer_unlock_commit(buffer, event);
+		int i;
+
+		for (i = 0; i < write_iteration; i++) {
+			event = ring_buffer_lock_reserve(buffer, 10);
+			if (!event) {
+				missed++;
+			} else {
+				hit++;
+				entry = ring_buffer_event_data(event);
+				*entry = smp_processor_id();
+				ring_buffer_unlock_commit(buffer, event);
+			}
 		}
 		do_gettimeofday(&end_tv);
 
-- 
cgit v1.2.3-71-gd317


From 2fb10732c3b3c9671b1a391996ce7e551876c25e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 11 Nov 2009 19:32:48 -0800
Subject: sysctl:  Warn about all uses of sys_sysctl.

Now that the glibc pthread implemenation no longers uses sysctl() users
of sysctl are as rare as hen's teeth.  So remove the glibc exception
from the warning, and use the standard printk_ratelimit instead of
rolling our own.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sysctl_binary.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 471438bbece6..bf0a4b06782a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1393,15 +1393,9 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 
 static void deprecated_sysctl_warning(const int *name, int nlen)
 {
-	static int msg_count;
 	int i;
 
-	/* Ignore accesses to kernel.version */
-	if ((nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
-		return;
-
-	if (msg_count < 5) {
-		msg_count++;
+	if (printk_ratelimit()) {
 		printk(KERN_INFO
 			"warning: process `%s' used the deprecated sysctl "
 			"system call with ", current->comm);
-- 
cgit v1.2.3-71-gd317


From 8b2a5dac7859dd1954095fce8b6445c3ceb36ef6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Nov 2009 19:36:03 -0500
Subject: tracing: do not disable interrupts for trace_clock_local

Disabling interrupts in trace_clock_local takes quite a performance
hit to the recording of traces. Using perf top we see:

------------------------------------------------------------------------------
   PerfTop:     244 irqs/sec  kernel:100.0% [1000Hz cpu-clock-msecs],  (all, 4 CPUs)
------------------------------------------------------------------------------

             samples    pcnt   kernel function
             _______   _____   _______________

             2842.00 - 40.4% : trace_clock_local
             1043.00 - 14.8% : rb_reserve_next_event
              784.00 - 11.1% : ring_buffer_lock_reserve
              600.00 -  8.5% : __rb_reserve_next
              579.00 -  8.2% : rb_end_commit
              440.00 -  6.3% : ring_buffer_unlock_commit
              290.00 -  4.1% : ring_buffer_producer_thread 	[ring_buffer_benchmark]
              155.00 -  2.2% : debug_smp_processor_id
              117.00 -  1.7% : trace_recursive_unlock
              103.00 -  1.5% : ring_buffer_event_data
               28.00 -  0.4% : do_gettimeofday
               22.00 -  0.3% : _spin_unlock_irq
               14.00 -  0.2% : native_read_tsc
               11.00 -  0.2% : getnstimeofday

Where trace_clock_local is 40% of the tracing, and the time for recording
a trace according to ring_buffer_benchmark is 210ns. After converting
the interrupts to preemption disabling we have from perf top:

------------------------------------------------------------------------------
   PerfTop:    1084 irqs/sec  kernel:99.9% [1000Hz cpu-clock-msecs],  (all, 4 CPUs)
------------------------------------------------------------------------------

             samples    pcnt   kernel function
             _______   _____   _______________

             1277.00 - 16.8% : native_read_tsc
             1148.00 - 15.1% : rb_reserve_next_event
              896.00 - 11.8% : ring_buffer_lock_reserve
              688.00 -  9.1% : __rb_reserve_next
              664.00 -  8.8% : rb_end_commit
              563.00 -  7.4% : ring_buffer_unlock_commit
              508.00 -  6.7% : _spin_unlock_irq
              365.00 -  4.8% : debug_smp_processor_id
              321.00 -  4.2% : trace_clock_local
              303.00 -  4.0% : ring_buffer_producer_thread 	[ring_buffer_benchmark]
              273.00 -  3.6% : native_sched_clock
              122.00 -  1.6% : trace_recursive_unlock
              113.00 -  1.5% : sched_clock
              101.00 -  1.3% : ring_buffer_event_data
               53.00 -  0.7% : tick_nohz_stop_sched_tick

Where trace_clock_local drops from 40% to only taking 4% of the total time.
The trace time also goes from 210ns down to 179ns (31ns).

I talked with Peter Zijlstra about the impact that sched_clock may have
without having interrupts disabled, and he told me that if a timer interrupt
comes in, sched_clock may report a wrong time.

Balancing a seldom incorrect timestamp with a 15% performance boost, I'll
take the performance boost.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_clock.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 20c5f92e28a8..878c03f386ba 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -20,6 +20,8 @@
 #include <linux/ktime.h>
 #include <linux/trace_clock.h>
 
+#include "trace.h"
+
 /*
  * trace_clock_local(): the simplest and least coherent tracing clock.
  *
@@ -28,17 +30,17 @@
  */
 u64 notrace trace_clock_local(void)
 {
-	unsigned long flags;
 	u64 clock;
+	int resched;
 
 	/*
 	 * sched_clock() is an architecture implemented, fast, scalable,
 	 * lockless clock. It is not guaranteed to be coherent across
 	 * CPUs, nor across CPU idle events.
 	 */
-	raw_local_irq_save(flags);
+	resched = ftrace_preempt_disable();
 	clock = sched_clock();
-	raw_local_irq_restore(flags);
+	ftrace_preempt_enable(resched);
 
 	return clock;
 }
-- 
cgit v1.2.3-71-gd317


From 63395b65972c07edce595c9cc8a983016738cdac Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 12 Nov 2009 00:35:55 -0800
Subject: sysctl: sysctl_binary.c Fix compilation when !CONFIG_NET

dev_get_by_index does not exist when the network stack is not
compiled in, so only include the code to follow wild card paths
when the network stack is present.

I have shuffled the code around a little to make it clear
that dev_put is called after dev_get_by_index showing that
there is no leak.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sysctl_binary.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index bf0a4b06782a..0cf60400542d 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -12,6 +12,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/file.h>
 #include <linux/ctype.h>
+#include <linux/netdevice.h>
 
 #ifdef CONFIG_SYSCTL_SYSCALL
 
@@ -1250,9 +1251,12 @@ out:
 static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
 {
 	const struct bin_table *table = &bin_root_table[0];
-	struct net *net = current->nsproxy->net_ns;
 	int ctl_name;
 
+	/* The binary sysctl tables have a small maximum depth so
+	 * there is no danger of overflowing our path as it PATH_MAX
+	 * bytes long.
+	 */
 	memcpy(path, "sys/", 4);
 	path += 4;
 
@@ -1263,30 +1267,31 @@ repeat:
 	name++;
 	nlen--;
 	for ( ; table->convert; table++) {
-		struct net_device *dev = NULL;
-		const char *procname = NULL;
+		int len = 0;
 
 		/* Use the well known sysctl number to proc name mapping */
-		if (ctl_name == table->ctl_name)
-			procname = table->procname;
-
+		if (ctl_name == table->ctl_name) {
+			len = strlen(table->procname);
+			memcpy(path, table->procname, len);
+		}
+#ifdef CONFIG_NET
 		/*
 		 * For a wild card entry map from ifindex to network
 		 * device name.
 		 */
 		else if (!table->ctl_name) {
+			struct net *net = current->nsproxy->net_ns;
+			struct net_device *dev;
 			dev = dev_get_by_index(net, ctl_name);
-			if (dev)
-				procname = dev->name;
+			if (dev) {
+				len = strlen(dev->name);
+				memcpy(path, dev->name, len);
+				dev_put(dev);
+			}
 		}
-		if (procname) {
-			int len;
-
-			len = strlen(procname);
-			memcpy(path, procname, len);
+#endif
+		if (len) {
 			path += len;
-			if (dev)
-				dev_put(dev);
 			if (table->child) {
 				*path++ = '/';
 				table = table->child;
-- 
cgit v1.2.3-71-gd317


From 757010f026ab3044c594003e216d00a33ed95c56 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 12 Nov 2009 01:39:06 -0800
Subject: sysctl binary: Reorder the tests to process wild card entries first.

A malicious user could have passed in a ctl_name of 0 and triggered
the well know ctl_name to procname mapping code, instead of the wild
card matching code.  This is a slight problem as wild card entries don't
have procnames, and because in some alternate universe a network device
might have ifindex 0.  So test for and handle wild card entries first.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sysctl_binary.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 0cf60400542d..b75dbf40f573 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1269,17 +1269,12 @@ repeat:
 	for ( ; table->convert; table++) {
 		int len = 0;
 
-		/* Use the well known sysctl number to proc name mapping */
-		if (ctl_name == table->ctl_name) {
-			len = strlen(table->procname);
-			memcpy(path, table->procname, len);
-		}
-#ifdef CONFIG_NET
 		/*
 		 * For a wild card entry map from ifindex to network
 		 * device name.
 		 */
-		else if (!table->ctl_name) {
+		if (!table->ctl_name) {
+#ifdef CONFIG_NET
 			struct net *net = current->nsproxy->net_ns;
 			struct net_device *dev;
 			dev = dev_get_by_index(net, ctl_name);
@@ -1288,8 +1283,12 @@ repeat:
 				memcpy(path, dev->name, len);
 				dev_put(dev);
 			}
-		}
 #endif
+		/* Use the well known sysctl number to proc name mapping */
+		} else if (ctl_name == table->ctl_name) {
+			len = strlen(table->procname);
+			memcpy(path, table->procname, len);
+		}
 		if (len) {
 			path += len;
 			if (table->child) {
-- 
cgit v1.2.3-71-gd317


From 56992309ccbe71f4321ddd50ee2f76f91b412c1a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 5 Nov 2009 15:38:40 -0800
Subject: sysctl kernel: Remove binary sysctl logic

Now that sys_sysctl is a generic wrapper around /proc/sys  .ctl_name
and .strategy members of sysctl tables are dead code.  Remove them.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sched.c          |  5 ++---
 kernel/slow-work.c      |  5 +----
 kernel/utsname_sysctl.c | 31 -------------------------------
 3 files changed, 3 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a455dca884a6..dbb99d787a41 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7373,17 +7373,16 @@ static struct ctl_table sd_ctl_dir[] = {
 		.procname	= "sched_domain",
 		.mode		= 0555,
 	},
-	{0, },
+	{}
 };
 
 static struct ctl_table sd_ctl_root[] = {
 	{
-		.ctl_name	= CTL_KERN,
 		.procname	= "kernel",
 		.mode		= 0555,
 		.child		= sd_ctl_dir,
 	},
-	{0, },
+	{}
 };
 
 static struct ctl_table *sd_alloc_ctl_entry(int n)
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 0d31135efbf4..0134b15b38d8 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -52,7 +52,6 @@ static const int slow_work_max_vslow = 99;
 
 ctl_table slow_work_sysctls[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "min-threads",
 		.data		= &slow_work_min_threads,
 		.maxlen		= sizeof(unsigned),
@@ -62,7 +61,6 @@ ctl_table slow_work_sysctls[] = {
 		.extra2		= &slow_work_max_threads,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "max-threads",
 		.data		= &slow_work_max_threads,
 		.maxlen		= sizeof(unsigned),
@@ -72,7 +70,6 @@ ctl_table slow_work_sysctls[] = {
 		.extra2		= (void *) &slow_work_max_max_threads,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "vslow-percentage",
 		.data		= &vslow_work_proportion,
 		.maxlen		= sizeof(unsigned),
@@ -81,7 +78,7 @@ ctl_table slow_work_sysctls[] = {
 		.extra1		= (void *) &slow_work_min_vslow,
 		.extra2		= (void *) &slow_work_max_vslow,
 	},
-	{ .ctl_name = 0 }
+	{}
 };
 #endif
 
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 69eae358a726..a2cd77e70d4d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write,
 #define proc_do_uts_string NULL
 #endif
 
-
-#ifdef CONFIG_SYSCTL_SYSCALL
-/* The generic string strategy routine: */
-static int sysctl_uts_string(ctl_table *table,
-		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen)
-{
-	struct ctl_table uts_table;
-	int r, write;
-	write = newval && newlen;
-	memcpy(&uts_table, table, sizeof(uts_table));
-	uts_table.data = get_uts(table, write);
-	r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
-	put_uts(table, write, uts_table.data);
-	return r;
-}
-#else
-#define sysctl_uts_string NULL
-#endif
-
 static struct ctl_table uts_kern_table[] = {
 	{
-		.ctl_name	= KERN_OSTYPE,
 		.procname	= "ostype",
 		.data		= init_uts_ns.name.sysname,
 		.maxlen		= sizeof(init_uts_ns.name.sysname),
 		.mode		= 0444,
 		.proc_handler	= proc_do_uts_string,
-		.strategy	= sysctl_uts_string,
 	},
 	{
-		.ctl_name	= KERN_OSRELEASE,
 		.procname	= "osrelease",
 		.data		= init_uts_ns.name.release,
 		.maxlen		= sizeof(init_uts_ns.name.release),
 		.mode		= 0444,
 		.proc_handler	= proc_do_uts_string,
-		.strategy	= sysctl_uts_string,
 	},
 	{
-		.ctl_name	= KERN_VERSION,
 		.procname	= "version",
 		.data		= init_uts_ns.name.version,
 		.maxlen		= sizeof(init_uts_ns.name.version),
 		.mode		= 0444,
 		.proc_handler	= proc_do_uts_string,
-		.strategy	= sysctl_uts_string,
 	},
 	{
-		.ctl_name	= KERN_NODENAME,
 		.procname	= "hostname",
 		.data		= init_uts_ns.name.nodename,
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
-		.strategy	= sysctl_uts_string,
 	},
 	{
-		.ctl_name	= KERN_DOMAINNAME,
 		.procname	= "domainname",
 		.data		= init_uts_ns.name.domainname,
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
-		.strategy	= sysctl_uts_string,
 	},
 	{}
 };
 
 static struct ctl_table uts_root_table[] = {
 	{
-		.ctl_name	= CTL_KERN,
 		.procname	= "kernel",
 		.mode		= 0555,
 		.child		= uts_kern_table,
-- 
cgit v1.2.3-71-gd317


From 4739a9748e1bd7459f22f7e94e7d85710ca83954 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 05:36:01 -0700
Subject: sysctl: Remove the last of the generic binary sysctl support

Now that all of the users stopped using ctl_name and strategy it
is safe to remove the fields from struct ctl_table, and it is safe
to remove the stub strategy routines as well.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sysctl.h |  3 ---
 kernel/sysctl.c        | 44 --------------------------------------------
 2 files changed, 47 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7c4aabc04673..4e40442777cf 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1033,7 +1033,6 @@ extern ctl_handler sysctl_ms_jiffies;
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table 
 {
-	int ctl_name;			/* Binary ID */
 	const char *procname;		/* Text ID for /proc/sys, or zero */
 	void *data;
 	int maxlen;
@@ -1041,7 +1040,6 @@ struct ctl_table
 	struct ctl_table *child;
 	struct ctl_table *parent;	/* Automatically set */
 	proc_handler *proc_handler;	/* Callback for text formatting */
-	ctl_handler *strategy;		/* Callback function for all r/w */
 	void *extra1;
 	void *extra2;
 };
@@ -1075,7 +1073,6 @@ struct ctl_table_header
 /* struct ctl_path describes where in the hierarchy a table is added */
 struct ctl_path {
 	const char *procname;
-	int ctl_name;
 };
 
 void register_sysctl_root(struct ctl_table_root *root);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b6b6eb1abebb..b4a5763d6dc8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1699,8 +1699,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  *
  * The members of the &struct ctl_table structure are used as follows:
  *
- * ctl_name - Dead
- *
  * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
  *            enter a sysctl file
  *
@@ -1715,8 +1713,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  *
  * proc_handler - the text handler routine (described below)
  *
- * strategy - Dead
- *
  * de - for internal use by the sysctl routines
  *
  * extra1, extra2 - extra pointers usable by the proc handler routines
@@ -2639,41 +2635,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 
 #endif /* CONFIG_PROC_FS */
 
-int sysctl_data(struct ctl_table *table,
-		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-int sysctl_string(struct ctl_table *table,
-		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-int sysctl_intvec(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-int sysctl_jiffies(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-int sysctl_ms_jiffies(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
 /*
  * No sense putting this after each symbol definition, twice,
  * exception granted :-)
@@ -2688,9 +2649,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax);
 EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
 EXPORT_SYMBOL(register_sysctl_table);
 EXPORT_SYMBOL(register_sysctl_paths);
-EXPORT_SYMBOL(sysctl_intvec);
-EXPORT_SYMBOL(sysctl_jiffies);
-EXPORT_SYMBOL(sysctl_ms_jiffies);
-EXPORT_SYMBOL(sysctl_string);
-EXPORT_SYMBOL(sysctl_data);
 EXPORT_SYMBOL(unregister_sysctl_table);
-- 
cgit v1.2.3-71-gd317


From 055a00865dcfc8e61f3cbefbb879c9577bd36ae5 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 12 Nov 2009 11:07:44 +0100
Subject: sched: Fix/add missing update_rq_clock() calls

kthread_bind(), migrate_task() and sched_fork were missing
updates, and try_to_wake_up() was updating after having already
used the stale clock.

Aside from preventing potential latency hits, there' a side
benefit in that early boot printk time stamps become monotonic.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1258020464.6491.2.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>
---
 kernel/sched.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3c11ae0a948d..701eca4958a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2017,6 +2017,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
 	}
 
 	spin_lock_irqsave(&rq->lock, flags);
+	update_rq_clock(rq);
 	set_task_cpu(p, cpu);
 	p->cpus_allowed = cpumask_of_cpu(cpu);
 	p->rt.nr_cpus_allowed = 1;
@@ -2115,6 +2116,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 	 * it is sufficient to simply update the task's cpu field.
 	 */
 	if (!p->se.on_rq && !task_running(rq, p)) {
+		update_rq_clock(rq);
 		set_task_cpu(p, dest_cpu);
 		return 0;
 	}
@@ -2376,14 +2378,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	task_rq_unlock(rq, &flags);
 
 	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu)
+	if (cpu != orig_cpu) {
+		local_irq_save(flags);
+		rq = cpu_rq(cpu);
+		update_rq_clock(rq);
 		set_task_cpu(p, cpu);
-
+		local_irq_restore(flags);
+	}
 	rq = task_rq_lock(p, &flags);
 
-	if (rq != orig_rq)
-		update_rq_clock(rq);
-
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
@@ -2545,6 +2548,7 @@ static void __sched_fork(struct task_struct *p)
 void sched_fork(struct task_struct *p, int clone_flags)
 {
 	int cpu = get_cpu();
+	unsigned long flags;
 
 	__sched_fork(p);
 
@@ -2581,7 +2585,10 @@ void sched_fork(struct task_struct *p, int clone_flags)
 #ifdef CONFIG_SMP
 	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
 #endif
+	local_irq_save(flags);
+	update_rq_clock(cpu_rq(cpu));
 	set_task_cpu(p, cpu);
+	local_irq_restore(flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
-- 
cgit v1.2.3-71-gd317


From 761b1d26df542fd5eb348837351e4d2f3bc7bffe Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 12 Nov 2009 13:33:45 +0900
Subject: sched: Fix granularity of task_u/stime()

Originally task_s/utime() were designed to return clock_t but
later changed to return cputime_t by following commit:

  commit efe567fc8281661524ffa75477a7c4ca9b466c63
  Author: Christian Borntraeger <borntraeger@de.ibm.com>
  Date:   Thu Aug 23 15:18:02 2007 +0200

It only changed the type of return value, but not the
implementation. As the result the granularity of task_s/utime()
is still that of clock_t, not that of cputime_t.

So using task_s/utime() in __exit_signal() makes values
accumulated to the signal struct to be rounded and coarse
grained.

This patch removes casts to clock_t in task_u/stime(), to keep
granularity of cputime_t over the calculation.

v2:
  Use div_u64() to avoid error "undefined reference to `__udivdi3`"
  on some 32bit systems.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: xiyou.wangcong@gmail.com
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4AFB9029.9000208@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 43e61fa04dc7..ab9a034c4a17 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5156,41 +5156,45 @@ cputime_t task_stime(struct task_struct *p)
 	return p->stime;
 }
 #else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) \
+	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+#endif
+
 cputime_t task_utime(struct task_struct *p)
 {
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
+	cputime_t utime = p->utime, total = utime + p->stime;
 	u64 temp;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
 		temp *= utime;
 		do_div(temp, total);
 	}
-	utime = (clock_t)temp;
+	utime = (cputime_t)temp;
 
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	p->prev_utime = max(p->prev_utime, utime);
 	return p->prev_utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
-	clock_t stime;
+	cputime_t stime;
 
 	/*
 	 * Use CFS's precise accounting. (we subtract utime from
 	 * the total, to make sure the total observed by userspace
 	 * grows monotonically - apps rely on that):
 	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
+	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
 
 	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+		p->prev_stime = max(p->prev_stime, stime);
 
 	return p->prev_stime;
 }
-- 
cgit v1.2.3-71-gd317


From a50bde5130f65733142b32975616427d0ea50856 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 12 Nov 2009 15:55:28 +0100
Subject: sched: Cleanup select_task_rq_fair()

Clean up the new affine to idle sibling bits while trying to
grok them. Should not have any function differences.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20091112145610.832503781@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 73 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 51 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e4d4483fd617..a32df1524746 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1318,6 +1318,41 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	return idlest;
 }
 
+/*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int i;
+
+	/*
+	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
+	 * always a better target than the current cpu.
+	 */
+	if (target == cpu) {
+		if (!cpu_rq(prev_cpu)->cfs.nr_running)
+			target = prev_cpu;
+	}
+
+	/*
+	 * Otherwise, iterate the domain and find an elegible idle cpu.
+	 */
+	if (target == -1 || target == cpu) {
+		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+			if (!cpu_rq(i)->cfs.nr_running) {
+				target = i;
+				break;
+			}
+		}
+	}
+
+	return target;
+}
+
 /*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -1373,36 +1408,30 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		}
 
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
-			int candidate = -1, i;
+			int target = -1;
 
+			/*
+			 * If both cpu and prev_cpu are part of this domain,
+			 * cpu is a valid SD_WAKE_AFFINE target.
+			 */
 			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
-				candidate = cpu;
+				target = cpu;
 
 			/*
-			 * Check for an idle shared cache.
+			 * If there's an idle sibling in this domain, make that
+			 * the wake_affine target instead of the current cpu.
+			 *
+			 * XXX: should we possibly do this outside of
+			 * WAKE_AFFINE, in case the shared cache domain is
+			 * smaller than the WAKE_AFFINE domain?
 			 */
-			if (tmp->flags & SD_PREFER_SIBLING) {
-				if (candidate == cpu) {
-					if (!cpu_rq(prev_cpu)->cfs.nr_running)
-						candidate = prev_cpu;
-				}
-
-				if (candidate == -1 || candidate == cpu) {
-					for_each_cpu(i, sched_domain_span(tmp)) {
-						if (!cpumask_test_cpu(i, &p->cpus_allowed))
-							continue;
-						if (!cpu_rq(i)->cfs.nr_running) {
-							candidate = i;
-							break;
-						}
-					}
-				}
-			}
+			if (tmp->flags & SD_PREFER_SIBLING)
+				target = select_idle_sibling(p, tmp, target);
 
-			if (candidate >= 0) {
+			if (target >= 0) {
 				affine_sd = tmp;
 				want_affine = 0;
-				cpu = candidate;
+				cpu = target;
 			}
 		}
 
-- 
cgit v1.2.3-71-gd317


From fe3bcfe1f6c1fc4ea7706ac2d05e579fd9092682 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 12 Nov 2009 15:55:29 +0100
Subject: sched: More generic WAKE_AFFINE vs select_idle_sibling()

Instead of only considering SD_WAKE_AFFINE | SD_PREFER_SIBLING
domains also allow all SD_PREFER_SIBLING domains below a
SD_WAKE_AFFINE domain to change the affinity target.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20091112145610.909723612@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a32df1524746..f28a2671a1a6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1333,20 +1333,16 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
 	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
 	 * always a better target than the current cpu.
 	 */
-	if (target == cpu) {
-		if (!cpu_rq(prev_cpu)->cfs.nr_running)
-			target = prev_cpu;
-	}
+	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+		return prev_cpu;
 
 	/*
 	 * Otherwise, iterate the domain and find an elegible idle cpu.
 	 */
-	if (target == -1 || target == cpu) {
-		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
-			if (!cpu_rq(i)->cfs.nr_running) {
-				target = i;
-				break;
-			}
+	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		if (!cpu_rq(i)->cfs.nr_running) {
+			target = i;
+			break;
 		}
 	}
 
@@ -1407,7 +1403,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 				want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
+		/*
+		 * While iterating the domains looking for a spanning
+		 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+		 * in cache sharing domains along the way.
+		 */
+		if (want_affine) {
 			int target = -1;
 
 			/*
@@ -1420,17 +1421,15 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			/*
 			 * If there's an idle sibling in this domain, make that
 			 * the wake_affine target instead of the current cpu.
-			 *
-			 * XXX: should we possibly do this outside of
-			 * WAKE_AFFINE, in case the shared cache domain is
-			 * smaller than the WAKE_AFFINE domain?
 			 */
 			if (tmp->flags & SD_PREFER_SIBLING)
 				target = select_idle_sibling(p, tmp, target);
 
 			if (target >= 0) {
-				affine_sd = tmp;
-				want_affine = 0;
+				if (tmp->flags & SD_WAKE_AFFINE) {
+					affine_sd = tmp;
+					want_affine = 0;
+				}
 				cpu = target;
 			}
 		}
-- 
cgit v1.2.3-71-gd317


From b32e9eb6ad29572b4451847d0e8227c9be2b6d69 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 12 Nov 2009 22:35:03 -0800
Subject: rcu: Accelerate callback processing on CPUs not detecting GP end

An earlier fix for a race resulted in a situation where the CPUs
other than the CPU that detected the end of the grace period would
not process their callbacks until the next grace period started.

This means that these other CPUs would unnecessarily demand that an
extra grace period be started.

This patch eliminates this extra grace period and speeds callback
processing by propagating rsp->completed to the rcu_node structures
in the case where the CPU detecting the end of the grace period
sees no reason to start a new grace period.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1258094104417-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d8024192c73b..b4efb9e36680 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -676,7 +676,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	if (!cpu_needs_another_gp(rsp, rdp)) {
-		spin_unlock_irqrestore(&rnp->lock, flags);
+		if (rnp->completed == rsp->completed) {
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
+
+		/*
+		 * Propagate new ->completed value to rcu_node structures
+		 * so that other CPUs don't have to wait until the start
+		 * of the next grace period to process their callbacks.
+		 */
+		rcu_for_each_node_breadth_first(rsp, rnp) {
+			spin_lock(&rnp->lock);	 /* irqs already disabled. */
+			rnp->completed = rsp->completed;
+			spin_unlock(&rnp->lock); /* irqs remain disabled. */
+		}
+		local_irq_restore(flags);
 		return;
 	}
 
-- 
cgit v1.2.3-71-gd317


From 8e9aa8f067d2dcd9457980ced618e1cffbcfba46 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 12 Nov 2009 22:35:04 -0800
Subject: rcu: Simplify association of forced quiescent states with grace
 periods

The force_quiescent_state() function also took a snapshot
of the ->completed field, which was as obnoxious as it was in
rcu_sched_qs() and friends.  So snapshot ->gpnum-1.

Also, since the dyntick_record_completed() and
dyntick_recall_completed() functions are now simple assignments
that are independent of CONFIG_NO_HZ, and since their names are
now misleading, get rid of them.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12580941042308-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 27 +++------------------------
 1 file changed, 3 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b4efb9e36680..3df04381ea3e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -178,28 +178,8 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 	return &rsp->node[0];
 }
 
-/*
- * Record the specified "completed" value, which is later used to validate
- * dynticks counter manipulations and CPU-offline checks.  Specify
- * "rsp->completed - 1" to unconditionally invalidate any future dynticks
- * manipulations and CPU-offline checks.  Such invalidation is useful at
- * the beginning of a grace period.
- */
-static void dyntick_record_completed(struct rcu_state *rsp, long comp)
-{
-	rsp->completed_fqs = comp;
-}
-
 #ifdef CONFIG_SMP
 
-/*
- * Recall the previously recorded value of the completion for dynticks.
- */
-static long dyntick_recall_completed(struct rcu_state *rsp)
-{
-	return rsp->completed_fqs;
-}
-
 /*
  * If the specified CPU is offline, tell the caller that it is in
  * a quiescent state.  Otherwise, whack it with a reschedule IPI.
@@ -702,7 +682,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
 	record_gp_stall_check_time(rsp);
-	dyntick_record_completed(rsp, rsp->completed - 1);
 
 	/* Special-case the common single-level case. */
 	if (NUM_RCU_NODES == 1) {
@@ -1214,7 +1193,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 		goto unlock_ret; /* no emergency and done recently. */
 	rsp->n_force_qs++;
 	spin_lock(&rnp->lock);
-	lastcomp = rsp->completed;
+	lastcomp = rsp->gpnum - 1;
 	signaled = rsp->signaled;
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
 	if (lastcomp == rsp->gpnum) {
@@ -1248,7 +1227,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 		if (lastcomp == rsp->completed &&
 		    rsp->signaled == signaled) {
 			rsp->signaled = RCU_FORCE_QS;
-			dyntick_record_completed(rsp, lastcomp);
+			rsp->completed_fqs = lastcomp;
 			forcenow = signaled == RCU_SAVE_COMPLETED;
 		}
 		spin_unlock(&rnp->lock);
@@ -1259,7 +1238,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 	case RCU_FORCE_QS:
 
 		/* Check dyntick-idle state, send IPI to laggarts. */
-		if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
+		if (rcu_process_dyntick(rsp, rsp->completed_fqs,
 					rcu_implicit_dynticks_qs))
 			goto unlock_ret;
 
-- 
cgit v1.2.3-71-gd317


From 67178767b936fb47a3a5e88097cff41ccbda7acb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 13 Nov 2009 10:06:34 +0100
Subject: tracing: Rename 'lockdep' event subsystem into 'lock'

Lockdep events subsystem gathers various locking related events
such as a request, release, contention or acquisition of a lock.

The name of this event subsystem is a bit of a misnomer since
these events are not quite related to lockdep but more generally
to locking, ie: these events are not reporting lock dependencies
or possible deadlock scenario but pure locking events.

Hence this rename.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1258103194-843-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/events/lock.h    | 96 ++++++++++++++++++++++++++++++++++++++++++
 include/trace/events/lockdep.h | 96 ------------------------------------------
 kernel/lockdep.c               |  2 +-
 3 files changed, 97 insertions(+), 97 deletions(-)
 create mode 100644 include/trace/events/lock.h
 delete mode 100644 include/trace/events/lockdep.h

(limited to 'kernel')

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
new file mode 100644
index 000000000000..a870ba125aa8
--- /dev/null
+++ b/include/trace/events/lock.h
@@ -0,0 +1,96 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lock
+
+#if !defined(_TRACE_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCK_H
+
+#include <linux/lockdep.h>
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_LOCKDEP
+
+TRACE_EVENT(lock_acquire,
+
+	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+		int trylock, int read, int check,
+		struct lockdep_map *next_lock, unsigned long ip),
+
+	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, flags)
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "",
+		  (__entry->flags & 2) ? "read " : "",
+		  __get_str(name))
+);
+
+TRACE_EVENT(lock_release,
+
+	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+
+	TP_ARGS(lock, nested, ip),
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s", __get_str(name))
+);
+
+#ifdef CONFIG_LOCK_STAT
+
+TRACE_EVENT(lock_contended,
+
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+	TP_ARGS(lock, ip),
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s", __get_str(name))
+);
+
+TRACE_EVENT(lock_acquired,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+	TP_ARGS(lock, ip, waittime),
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+		__field(unsigned long, wait_usec)
+		__field(unsigned long, wait_nsec_rem)
+	),
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+		__entry->wait_usec = (unsigned long) waittime;
+	),
+	TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec,
+				       __entry->wait_nsec_rem)
+);
+
+#endif
+#endif
+
+#endif /* _TRACE_LOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h
deleted file mode 100644
index bcf1d209a00d..000000000000
--- a/include/trace/events/lockdep.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM lockdep
-
-#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_LOCKDEP_H
-
-#include <linux/lockdep.h>
-#include <linux/tracepoint.h>
-
-#ifdef CONFIG_LOCKDEP
-
-TRACE_EVENT(lock_acquire,
-
-	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
-		int trylock, int read, int check,
-		struct lockdep_map *next_lock, unsigned long ip),
-
-	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-
-	TP_STRUCT__entry(
-		__field(unsigned int, flags)
-		__string(name, lock->name)
-	),
-
-	TP_fast_assign(
-		__entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
-		__assign_str(name, lock->name);
-	),
-
-	TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "",
-		  (__entry->flags & 2) ? "read " : "",
-		  __get_str(name))
-);
-
-TRACE_EVENT(lock_release,
-
-	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
-
-	TP_ARGS(lock, nested, ip),
-
-	TP_STRUCT__entry(
-		__string(name, lock->name)
-	),
-
-	TP_fast_assign(
-		__assign_str(name, lock->name);
-	),
-
-	TP_printk("%s", __get_str(name))
-);
-
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_EVENT(lock_contended,
-
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-
-	TP_ARGS(lock, ip),
-
-	TP_STRUCT__entry(
-		__string(name, lock->name)
-	),
-
-	TP_fast_assign(
-		__assign_str(name, lock->name);
-	),
-
-	TP_printk("%s", __get_str(name))
-);
-
-TRACE_EVENT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
-
-	TP_ARGS(lock, ip, waittime),
-
-	TP_STRUCT__entry(
-		__string(name, lock->name)
-		__field(unsigned long, wait_usec)
-		__field(unsigned long, wait_nsec_rem)
-	),
-	TP_fast_assign(
-		__assign_str(name, lock->name);
-		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
-		__entry->wait_usec = (unsigned long) waittime;
-	),
-	TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec,
-				       __entry->wait_nsec_rem)
-);
-
-#endif
-#endif
-
-#endif /* _TRACE_LOCKDEP_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c096..f5dcd36d3151 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
 #include "lockdep_internals.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/lockdep.h>
+#include <trace/events/lock.h>
 
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
-- 
cgit v1.2.3-71-gd317


From 23af368e9a904f59256c27d371ce223d6cee0430 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 11 Nov 2009 14:05:25 +0000
Subject: clockevents: Use u32 for mult and shift factors

The mult and shift factors of clock events differ in their data type
from those of clock sources for no reason. u32 is sufficient for
both. shift is always <= 32 and mult is limited to 2^32-1 to avoid
64bit multiplication overflows in the conversion.

Preparatory patch for a generic mult/shift factor calculation
function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Mikael Pettersson <mikpe@it.uu.se>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Linus Walleij <linus.walleij@stericsson.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20091111134229.725664788@linutronix.de>
---
 include/linux/clockchips.h | 4 ++--
 kernel/time/timer_list.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 3a1dbba4d3ae..3b5841016276 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -79,8 +79,8 @@ struct clock_event_device {
 	unsigned int		features;
 	unsigned long		max_delta_ns;
 	unsigned long		min_delta_ns;
-	unsigned long		mult;
-	int			shift;
+	u32			mult;
+	u32			shift;
 	int			rating;
 	int			irq;
 	const struct cpumask	*cpumask;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdfd..fa00da108a14 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -206,8 +206,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 	SEQ_printf(m, "%s\n", dev->name);
 	SEQ_printf(m, " max_delta_ns:   %lu\n", dev->max_delta_ns);
 	SEQ_printf(m, " min_delta_ns:   %lu\n", dev->min_delta_ns);
-	SEQ_printf(m, " mult:           %lu\n", dev->mult);
-	SEQ_printf(m, " shift:          %d\n", dev->shift);
+	SEQ_printf(m, " mult:           %u\n", dev->mult);
+	SEQ_printf(m, " shift:          %u\n", dev->shift);
 	SEQ_printf(m, " mode:           %d\n", dev->mode);
 	SEQ_printf(m, " next_event:     %Ld nsecs\n",
 		   (unsigned long long) ktime_to_ns(dev->next_event));
-- 
cgit v1.2.3-71-gd317


From 7d2f944a2b836c69a9d260a0a5f0d1720d57fdff Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 11 Nov 2009 14:05:29 +0000
Subject: clocksource: Provide a generic mult/shift factor calculation

MIPS has two functions to calculcate the mult/shift factors for clock
sources and clock events at run time. ARM needs such functions as
well.

Implement a function which calculates the mult/shift factors based on
the frequencies to which and from which is converted. The function
also has a parameter to specify the minimum conversion range in
seconds. This range is guaranteed not to produce a 64bit overflow when
a value is multiplied with the calculated mult factor. The larger the
conversion range the less becomes the conversion accuracy.

Provide two inline wrappers which handle clock events and clock
sources. For clock events the "from" frequency is nano seconds per
second which corresponds to 1GHz and "to" is the device frequency. For
clock sources "from" is the device frequency and "to" is nano seconds
per second.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Mikael Pettersson <mikpe@it.uu.se>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Linus Walleij <linus.walleij@stericsson.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20091111134229.766673305@linutronix.de>
---
 include/linux/clockchips.h  |  7 ++++++
 include/linux/clocksource.h | 10 +++++++++
 kernel/time/clocksource.c   | 53 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 3b5841016276..4d438b0bc10a 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -130,6 +130,13 @@ extern int clockevents_program_event(struct clock_event_device *dev,
 
 extern void clockevents_handle_noop(struct clock_event_device *dev);
 
+static inline void
+clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec)
+{
+	return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC,
+				      freq, minsec);
+}
+
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 extern void clockevents_notify(unsigned long reason, void *arg);
 #else
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 83d2fbd81b93..f57f88250526 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -279,6 +279,16 @@ extern void clocksource_resume(void);
 extern struct clocksource * __init __weak clocksource_default_clock(void);
 extern void clocksource_mark_unstable(struct clocksource *cs);
 
+extern void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec);
+
+static inline void
+clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec)
+{
+	return clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+				      NSEC_PER_SEC, minsec);
+}
+
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
 extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
 extern void update_vsyscall_tz(void);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5e18c6ab2c6a..407c0894ef37 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,6 +107,59 @@ u64 timecounter_cyc2time(struct timecounter *tc,
 }
 EXPORT_SYMBOL(timecounter_cyc2time);
 
+/**
+ * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
+ * @mult:	pointer to mult variable
+ * @shift:	pointer to shift variable
+ * @from:	frequency to convert from
+ * @to:		frequency to convert to
+ * @minsec:	guaranteed runtime conversion range in seconds
+ *
+ * The function evaluates the shift/mult pair for the scaled math
+ * operations of clocksources and clockevents.
+ *
+ * @to and @from are frequency values in HZ. For clock sources @to is
+ * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
+ * event @to is the counter frequency and @from is NSEC_PER_SEC.
+ *
+ * The @minsec conversion range argument controls the time frame in
+ * seconds which must be covered by the runtime conversion with the
+ * calculated mult and shift factors. This guarantees that no 64bit
+ * overflow happens when the input value of the conversion is
+ * multiplied with the calculated mult factor. Larger ranges may
+ * reduce the conversion accuracy by chosing smaller mult and shift
+ * factors.
+ */
+void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+{
+	u64 tmp;
+	u32 sft, sftacc= 32;
+
+	/*
+	 * Calculate the shift factor which is limiting the conversion
+	 * range:
+	 */
+	tmp = ((u64)minsec * from) >> 32;
+	while (tmp) {
+		tmp >>=1;
+		sftacc--;
+	}
+
+	/*
+	 * Find the conversion shift/mult pair which has the best
+	 * accuracy and fits the maxsec conversion range:
+	 */
+	for (sft = 32; sft > 0; sft--) {
+		tmp = (u64) to << sft;
+		do_div(tmp, from);
+		if ((tmp >> sftacc) == 0)
+			break;
+	}
+	*mult = tmp;
+	*shift = sft;
+}
+
 /*[Clocksource internal variables]---------
  * curr_clocksource:
  *	currently selected clocksource.
-- 
cgit v1.2.3-71-gd317


From 529eaccd900a59724619b4a6ef6579fd518d5218 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 13 Nov 2009 14:32:19 +0100
Subject: nohz: Type cast printk argument

On some archs local_softirq_pending() has a data type of unsigned long
on others its unsigned int. Type cast it to (unsigned int) in the
printk to avoid the compiler warning.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3840f6dff7eb..c65ba0faa98f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -250,7 +250,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 
 		if (ratelimit < 10) {
 			printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-			       local_softirq_pending());
+			       (unsigned int) local_softirq_pending());
 			ratelimit++;
 		}
 		goto end;
-- 
cgit v1.2.3-71-gd317


From 98962465ed9e6ea99c38e0af63fe1dcb5a79dc25 Mon Sep 17 00:00:00 2001
From: Jon Hunter <jon-hunter@ti.com>
Date: Tue, 18 Aug 2009 12:45:10 -0500
Subject: nohz: Prevent clocksource wrapping during idle

The dynamic tick allows the kernel to sleep for periods longer than a
single tick, but it does not limit the sleep time currently. In the
worst case the kernel could sleep longer than the wrap around time of
the time keeping clock source which would result in losing track of
time.

Prevent this by limiting it to the safe maximum sleep time of the
current time keeping clock source. The value is calculated when the
clock source is registered.

[ tglx: simplified the code a bit and massaged the commit msg ]

Signed-off-by: Jon Hunter <jon-hunter@ti.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1250617512-23567-2-git-send-email-jon-hunter@ti.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h |  2 ++
 include/linux/time.h        |  1 +
 kernel/time/clocksource.c   | 44 ++++++++++++++++++++++++++++++++++++++
 kernel/time/tick-sched.c    | 52 +++++++++++++++++++++++++++++++++------------
 kernel/time/timekeeping.c   | 11 ++++++++++
 5 files changed, 96 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f57f88250526..279c5478e8a6 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -151,6 +151,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
  *			subtraction of non 64 bit counters
  * @mult:		cycle to nanosecond multiplier
  * @shift:		cycle to nanosecond divisor (power of two)
+ * @max_idle_ns:	max idle time permitted by the clocksource (nsecs)
  * @flags:		flags describing special properties
  * @vread:		vsyscall based read
  * @resume:		resume function for the clocksource, if necessary
@@ -168,6 +169,7 @@ struct clocksource {
 	cycle_t mask;
 	u32 mult;
 	u32 shift;
+	u64 max_idle_ns;
 	unsigned long flags;
 	cycle_t (*vread)(void);
 	void (*resume)(void);
diff --git a/include/linux/time.h b/include/linux/time.h
index fe04e5ef6a59..6e026e45a179 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -148,6 +148,7 @@ extern void monotonic_to_bootbased(struct timespec *ts);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 extern int timekeeping_valid_for_hres(void);
+extern u64 timekeeping_max_deferment(void);
 extern void update_wall_time(void);
 extern void update_xtime_cache(u64 nsec);
 extern void timekeeping_leap_insert(int leapsecond);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 407c0894ef37..b65b242f04dd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -468,6 +468,47 @@ void clocksource_touch_watchdog(void)
 
 #ifdef CONFIG_GENERIC_TIME
 
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
+ *
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+	u64 max_nsecs, max_cycles;
+
+	/*
+	 * Calculate the maximum number of cycles that we can pass to the
+	 * cyc2ns function without overflowing a 64-bit signed result. The
+	 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
+	 * is equivalent to the below.
+	 * max_cycles < (2^63)/cs->mult
+	 * max_cycles < 2^(log2((2^63)/cs->mult))
+	 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
+	 * max_cycles < 2^(63 - log2(cs->mult))
+	 * max_cycles < 1 << (63 - log2(cs->mult))
+	 * Please note that we add 1 to the result of the log2 to account for
+	 * any rounding errors, ensure the above inequality is satisfied and
+	 * no overflow will occur.
+	 */
+	max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
+
+	/*
+	 * The actual maximum number of cycles we can defer the clocksource is
+	 * determined by the minimum of max_cycles and cs->mask.
+	 */
+	max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
+	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
+
+	/*
+	 * To ensure that the clocksource does not wrap whilst we are idle,
+	 * limit the time the clocksource can be deferred by 12.5%. Please
+	 * note a margin of 12.5% is used because this can be computed with
+	 * a shift, versus say 10% which would require division.
+	 */
+	return max_nsecs - (max_nsecs >> 5);
+}
+
 /**
  * clocksource_select - Select the best clocksource available
  *
@@ -564,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs)
  */
 int clocksource_register(struct clocksource *cs)
 {
+	/* calculate max idle time permitted for this clocksource */
+	cs->max_idle_ns = clocksource_max_deferment(cs);
+
 	mutex_lock(&clocksource_mutex);
 	clocksource_enqueue(cs);
 	clocksource_select();
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c65ba0faa98f..a80b4644fe6b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -208,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+	u64 time_delta;
 	int cpu;
 
 	local_irq_save(flags);
@@ -262,6 +263,17 @@ void tick_nohz_stop_sched_tick(int inidle)
 		seq = read_seqbegin(&xtime_lock);
 		last_update = last_jiffies_update;
 		last_jiffies = jiffies;
+
+		/*
+		 * On SMP we really should only care for the CPU which
+		 * has the do_timer duty assigned. All other CPUs can
+		 * sleep as long as they want.
+		 */
+		if (cpu == tick_do_timer_cpu ||
+		    tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+			time_delta = timekeeping_max_deferment();
+		else
+			time_delta = KTIME_MAX;
 	} while (read_seqretry(&xtime_lock, seq));
 
 	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
@@ -284,11 +296,26 @@ void tick_nohz_stop_sched_tick(int inidle)
 	if ((long)delta_jiffies >= 1) {
 
 		/*
-		* calculate the expiry time for the next timer wheel
-		* timer
-		*/
-		expires = ktime_add_ns(last_update, tick_period.tv64 *
-				   delta_jiffies);
+		 * calculate the expiry time for the next timer wheel
+		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
+		 * that there is no timer pending or at least extremely
+		 * far into the future (12 days for HZ=1000). In this
+		 * case we set the expiry to the end of time.
+		 */
+		if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
+			/*
+			 * Calculate the time delta for the next timer event.
+			 * If the time delta exceeds the maximum time delta
+			 * permitted by the current clocksource then adjust
+			 * the time delta accordingly to ensure the
+			 * clocksource does not wrap.
+			 */
+			time_delta = min_t(u64, time_delta,
+					   tick_period.tv64 * delta_jiffies);
+			expires = ktime_add_ns(last_update, time_delta);
+		} else {
+			expires.tv64 = KTIME_MAX;
+		}
 
 		/*
 		 * If this cpu is the one which updates jiffies, then
@@ -332,22 +359,19 @@ void tick_nohz_stop_sched_tick(int inidle)
 
 		ts->idle_sleeps++;
 
+		/* Mark expires */
+		ts->idle_expires = expires;
+
 		/*
-		 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
-		 * there is no timer pending or at least extremly far
-		 * into the future (12 days for HZ=1000). In this case
-		 * we simply stop the tick timer:
+		 * If the expiration time == KTIME_MAX, then
+		 * in this case we simply stop the tick timer.
 		 */
-		if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
-			ts->idle_expires.tv64 = KTIME_MAX;
+		 if (unlikely(expires.tv64 == KTIME_MAX)) {
 			if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 				hrtimer_cancel(&ts->sched_timer);
 			goto out;
 		}
 
-		/* Mark expiries */
-		ts->idle_expires = expires;
-
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 			hrtimer_start(&ts->sched_timer, expires,
 				      HRTIMER_MODE_ABS_PINNED);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 96b3f0dfa5dc..5d4d4239a0aa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -477,6 +477,17 @@ int timekeeping_valid_for_hres(void)
 	return ret;
 }
 
+/**
+ * timekeeping_max_deferment - Returns max time the clocksource can be deferred
+ *
+ * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
+ * ensure that the clocksource does not change!
+ */
+u64 timekeeping_max_deferment(void)
+{
+	return timekeeper.clock->max_idle_ns;
+}
+
 /**
  * read_persistent_clock -  Return time from the persistent clock.
  *
-- 
cgit v1.2.3-71-gd317


From 27185016b806d5a1181ff501cae120582b2b27dd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Nov 2009 22:12:06 +0100
Subject: nohz: Track last do_timer() cpu

The previous patch which limits the sleep time to the maximum
deferment time of the time keeping clocksource has some limitations on
SMP machines: if all CPUs are idle then for all CPUs the maximum sleep
time is limited.

Solve this by keeping track of which cpu had the do_timer() duty
assigned last and limit the sleep time only for this cpu.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Cc: Jon Hunter <jon-hunter@ti.com>
Cc: John Stultz <johnstul@us.ibm.com>
---
 include/linux/tick.h     |  2 ++
 kernel/time/tick-sched.c | 52 ++++++++++++++++++++++++++----------------------
 2 files changed, 30 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 8dc082194b22..d2ae79e21be3 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -43,6 +43,7 @@ enum tick_nohz_mode {
  * @idle_exittime:	Time when the idle state was left
  * @idle_sleeptime:	Sum of the time slept in idle with sched tick stopped
  * @sleep_length:	Duration of the current idle sleep
+ * @do_timer_lst:	CPU was the last one doing do_timer before going idle
  */
 struct tick_sched {
 	struct hrtimer			sched_timer;
@@ -64,6 +65,7 @@ struct tick_sched {
 	unsigned long			last_jiffies;
 	unsigned long			next_jiffies;
 	ktime_t				idle_expires;
+	int				do_timer_last;
 };
 
 extern void __init tick_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a80b4644fe6b..df133bc29f89 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -263,17 +263,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 		seq = read_seqbegin(&xtime_lock);
 		last_update = last_jiffies_update;
 		last_jiffies = jiffies;
-
-		/*
-		 * On SMP we really should only care for the CPU which
-		 * has the do_timer duty assigned. All other CPUs can
-		 * sleep as long as they want.
-		 */
-		if (cpu == tick_do_timer_cpu ||
-		    tick_do_timer_cpu == TICK_DO_TIMER_NONE)
-			time_delta = timekeeping_max_deferment();
-		else
-			time_delta = KTIME_MAX;
+		time_delta = timekeeping_max_deferment();
 	} while (read_seqretry(&xtime_lock, seq));
 
 	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
@@ -295,6 +285,29 @@ void tick_nohz_stop_sched_tick(int inidle)
 	/* Schedule the tick, if we are at least one jiffie off */
 	if ((long)delta_jiffies >= 1) {
 
+		/*
+		 * If this cpu is the one which updates jiffies, then
+		 * give up the assignment and let it be taken by the
+		 * cpu which runs the tick timer next, which might be
+		 * this cpu as well. If we don't drop this here the
+		 * jiffies might be stale and do_timer() never
+		 * invoked. Keep track of the fact that it was the one
+		 * which had the do_timer() duty last. If this cpu is
+		 * the one which had the do_timer() duty last, we
+		 * limit the sleep time to the timekeeping
+		 * max_deferement value which we retrieved
+		 * above. Otherwise we can sleep as long as we want.
+		 */
+		if (cpu == tick_do_timer_cpu) {
+			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+			ts->do_timer_last = 1;
+		} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+			time_delta = KTIME_MAX;
+			ts->do_timer_last = 0;
+		} else if (!ts->do_timer_last) {
+			time_delta = KTIME_MAX;
+		}
+
 		/*
 		 * calculate the expiry time for the next timer wheel
 		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
@@ -312,21 +325,12 @@ void tick_nohz_stop_sched_tick(int inidle)
 			 */
 			time_delta = min_t(u64, time_delta,
 					   tick_period.tv64 * delta_jiffies);
-			expires = ktime_add_ns(last_update, time_delta);
-		} else {
-			expires.tv64 = KTIME_MAX;
 		}
 
-		/*
-		 * If this cpu is the one which updates jiffies, then
-		 * give up the assignment and let it be taken by the
-		 * cpu which runs the tick timer next, which might be
-		 * this cpu as well. If we don't drop this here the
-		 * jiffies might be stale and do_timer() never
-		 * invoked.
-		 */
-		if (cpu == tick_do_timer_cpu)
-			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+		if (time_delta < KTIME_MAX)
+			expires = ktime_add_ns(last_update, time_delta);
+		else
+			expires.tv64 = KTIME_MAX;
 
 		if (delta_jiffies > 1)
 			cpumask_set_cpu(cpu, nohz_cpu_mask);
-- 
cgit v1.2.3-71-gd317


From 97813f2fe77804a4464564c75ba8d8826377feea Mon Sep 17 00:00:00 2001
From: Jon Hunter <jon-hunter@ti.com>
Date: Tue, 18 Aug 2009 12:45:11 -0500
Subject: nohz: Allow 32-bit machines to sleep for more than 2.15 seconds

In the dynamic tick code, "max_delta_ns" (member of the
"clock_event_device" structure) represents the maximum sleep time
that can occur between timer events in nanoseconds.

The variable, "max_delta_ns", is defined as an unsigned long
which is a 32-bit integer for 32-bit machines and a 64-bit
integer for 64-bit machines (if -m64 option is used for gcc).
The value of max_delta_ns is set by calling the function
"clockevent_delta2ns()" which returns a maximum value of LONG_MAX.
For a 32-bit machine LONG_MAX is equal to 0x7fffffff and in
nanoseconds this equates to ~2.15 seconds. Hence, the maximum
sleep time for a 32-bit machine is ~2.15 seconds, where as for
a 64-bit machine it will be many years.

This patch changes the type of max_delta_ns to be "u64" instead of
"unsigned long" so that this variable is a 64-bit type for both 32-bit
and 64-bit machines. It also changes the maximum value returned by
clockevent_delta2ns() to KTIME_MAX.  Hence this allows a 32-bit
machine to sleep for longer than ~2.15 seconds. Please note that this
patch also changes "min_delta_ns" to be "u64" too and although this is
unnecessary, it makes the patch simpler as it avoids to fixup all
callers of clockevent_delta2ns().

[ tglx: changed "unsigned long long" to u64 as we use this data type
  	through out the time code ]

Signed-off-by: Jon Hunter <jon-hunter@ti.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1250617512-23567-3-git-send-email-jon-hunter@ti.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clockchips.h |  8 ++++----
 kernel/hrtimer.c           |  3 ++-
 kernel/time/clockevents.c  | 11 +++++------
 kernel/time/tick-oneshot.c |  4 ++--
 kernel/time/timer_list.c   |  6 ++++--
 5 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 4d438b0bc10a..0cf725bdd2a1 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -77,8 +77,8 @@ enum clock_event_nofitiers {
 struct clock_event_device {
 	const char		*name;
 	unsigned int		features;
-	unsigned long		max_delta_ns;
-	unsigned long		min_delta_ns;
+	u64			max_delta_ns;
+	u64			min_delta_ns;
 	u32			mult;
 	u32			shift;
 	int			rating;
@@ -116,8 +116,8 @@ static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec,
 }
 
 /* Clock event layer functions */
-extern unsigned long clockevent_delta2ns(unsigned long latch,
-					 struct clock_event_device *evt);
+extern u64 clockevent_delta2ns(unsigned long latch,
+			       struct clock_event_device *evt);
 extern void clockevents_register_device(struct clock_event_device *dev);
 
 extern void clockevents_exchange_device(struct clock_event_device *old,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6d7020490f94..c215b74cd953 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1240,7 +1240,8 @@ hrtimer_interrupt_hanging(struct clock_event_device *dev,
 	force_clock_reprogram = 1;
 	dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
 	printk(KERN_WARNING "hrtimer: interrupt too slow, "
-		"forcing clock min delta to %lu ns\n", dev->min_delta_ns);
+	       "forcing clock min delta to %llu ns\n",
+	       (unsigned long long) dev->min_delta_ns);
 }
 /*
  * High resolution timer interrupt
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc32..05e8aeedcdf3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -37,10 +37,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
  *
  * Math helper, returns latch value converted to nanoseconds (bound checked)
  */
-unsigned long clockevent_delta2ns(unsigned long latch,
-				  struct clock_event_device *evt)
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
 {
-	u64 clc = ((u64) latch << evt->shift);
+	u64 clc = (u64) latch << evt->shift;
 
 	if (unlikely(!evt->mult)) {
 		evt->mult = 1;
@@ -50,10 +49,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
 	do_div(clc, evt->mult);
 	if (clc < 1000)
 		clc = 1000;
-	if (clc > LONG_MAX)
-		clc = LONG_MAX;
+	if (clc > KTIME_MAX)
+		clc = KTIME_MAX;
 
-	return (unsigned long) clc;
+	return clc;
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89cf..0a8a213016f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
 				dev->min_delta_ns += dev->min_delta_ns >> 1;
 
 			printk(KERN_WARNING
-			       "CE: %s increasing min_delta_ns to %lu nsec\n",
+			       "CE: %s increasing min_delta_ns to %llu nsec\n",
 			       dev->name ? dev->name : "?",
-			       dev->min_delta_ns << 1);
+			       (unsigned long long) dev->min_delta_ns << 1);
 
 			i = 0;
 		}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fa00da108a14..665c76edbf17 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -204,8 +204,10 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 		return;
 	}
 	SEQ_printf(m, "%s\n", dev->name);
-	SEQ_printf(m, " max_delta_ns:   %lu\n", dev->max_delta_ns);
-	SEQ_printf(m, " min_delta_ns:   %lu\n", dev->min_delta_ns);
+	SEQ_printf(m, " max_delta_ns:   %llu\n",
+		   (unsigned long long) dev->max_delta_ns);
+	SEQ_printf(m, " min_delta_ns:   %llu\n",
+		   (unsigned long long) dev->min_delta_ns);
 	SEQ_printf(m, " mult:           %u\n", dev->mult);
 	SEQ_printf(m, " shift:          %u\n", dev->shift);
 	SEQ_printf(m, " mode:           %d\n", dev->mode);
-- 
cgit v1.2.3-71-gd317


From 6beb000923882f6204ea2cfcd932e568e900803f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Nov 2009 15:21:34 +0000
Subject: locking: Make inlining decision Kconfig based

commit 892a7c67 (locking: Allow arch-inlined spinlocks) implements the
selection of which lock functions are inlined based on defines in
arch/.../spinlock.h: #define __always_inline__LOCK_FUNCTION

Despite of the name __always_inline__* the lock functions can be built
out of line depending on config options. Also if the arch does not set
some inline defines the generic code might set them; again depending on
config options.

This makes it unnecessary hard to figure out when and which lock
functions are inlined. Aside of that it makes it way harder and
messier for -rt to manipulate the lock functions.

Convert the inlining decision to CONFIG switches. Each lock function
is inlined depending on CONFIG_INLINE_*. The configs implement the
existing dependencies. The architecture code can select ARCH_INLINE_*
to signal that it wants the corresponding lock function inlined.
ARCH_INLINE_* is necessary as Kconfig ignores "depends on"
restrictions when a config element is selected.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <20091109151428.504477141@linutronix.de>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/s390/Kconfig                |  28 ++++++
 arch/s390/include/asm/spinlock.h |  29 ------
 include/linux/spinlock_api_smp.h |  75 ++++++---------
 init/Kconfig                     |   1 +
 kernel/Kconfig.locks             | 199 +++++++++++++++++++++++++++++++++++++++
 kernel/spinlock.c                |  56 +++++------
 6 files changed, 284 insertions(+), 104 deletions(-)
 create mode 100644 kernel/Kconfig.locks

(limited to 'kernel')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 43c0acad7160..16c673096a22 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -95,6 +95,34 @@ config S390
 	select HAVE_ARCH_TRACEHOOK
 	select INIT_ALL_POSSIBLE
 	select HAVE_PERF_EVENTS
+	select ARCH_INLINE_SPIN_TRYLOCK
+	select ARCH_INLINE_SPIN_TRYLOCK_BH
+	select ARCH_INLINE_SPIN_LOCK
+	select ARCH_INLINE_SPIN_LOCK_BH
+	select ARCH_INLINE_SPIN_LOCK_IRQ
+	select ARCH_INLINE_SPIN_LOCK_IRQSAVE
+	select ARCH_INLINE_SPIN_UNLOCK
+	select ARCH_INLINE_SPIN_UNLOCK_BH
+	select ARCH_INLINE_SPIN_UNLOCK_IRQ
+	select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+	select ARCH_INLINE_READ_TRYLOCK
+	select ARCH_INLINE_READ_LOCK
+	select ARCH_INLINE_READ_LOCK_BH
+	select ARCH_INLINE_READ_LOCK_IRQ
+	select ARCH_INLINE_READ_LOCK_IRQSAVE
+	select ARCH_INLINE_READ_UNLOCK
+	select ARCH_INLINE_READ_UNLOCK_BH
+	select ARCH_INLINE_READ_UNLOCK_IRQ
+	select ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+	select ARCH_INLINE_WRITE_TRYLOCK
+	select ARCH_INLINE_WRITE_LOCK
+	select ARCH_INLINE_WRITE_LOCK_BH
+	select ARCH_INLINE_WRITE_LOCK_IRQ
+	select ARCH_INLINE_WRITE_LOCK_IRQSAVE
+	select ARCH_INLINE_WRITE_UNLOCK
+	select ARCH_INLINE_WRITE_UNLOCK_BH
+	select ARCH_INLINE_WRITE_UNLOCK_IRQ
+	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 
 config SCHED_OMIT_FRAME_POINTER
 	bool
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 41ce6861174e..c9af0d19c7ab 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -191,33 +191,4 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
 
-#define __always_inline__spin_lock
-#define __always_inline__read_lock
-#define __always_inline__write_lock
-#define __always_inline__spin_lock_bh
-#define __always_inline__read_lock_bh
-#define __always_inline__write_lock_bh
-#define __always_inline__spin_lock_irq
-#define __always_inline__read_lock_irq
-#define __always_inline__write_lock_irq
-#define __always_inline__spin_lock_irqsave
-#define __always_inline__read_lock_irqsave
-#define __always_inline__write_lock_irqsave
-#define __always_inline__spin_trylock
-#define __always_inline__read_trylock
-#define __always_inline__write_trylock
-#define __always_inline__spin_trylock_bh
-#define __always_inline__spin_unlock
-#define __always_inline__read_unlock
-#define __always_inline__write_unlock
-#define __always_inline__spin_unlock_bh
-#define __always_inline__read_unlock_bh
-#define __always_inline__write_unlock_bh
-#define __always_inline__spin_unlock_irq
-#define __always_inline__read_unlock_irq
-#define __always_inline__write_unlock_irq
-#define __always_inline__spin_unlock_irqrestore
-#define __always_inline__read_unlock_irqrestore
-#define __always_inline__write_unlock_irqrestore
-
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 7a7e18fc2415..8264a7f459bc 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -60,137 +60,118 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 							__releases(lock);
 
-/*
- * We inline the unlock functions in the nondebug case:
- */
-#if !defined(CONFIG_DEBUG_SPINLOCK) && !defined(CONFIG_PREEMPT)
-#define __always_inline__spin_unlock
-#define __always_inline__read_unlock
-#define __always_inline__write_unlock
-#define __always_inline__spin_unlock_irq
-#define __always_inline__read_unlock_irq
-#define __always_inline__write_unlock_irq
-#endif
-
-#ifndef CONFIG_DEBUG_SPINLOCK
-#ifndef CONFIG_GENERIC_LOCKBREAK
-
-#ifdef __always_inline__spin_lock
+#ifdef CONFIG_INLINE_SPIN_LOCK
 #define _spin_lock(lock) __spin_lock(lock)
 #endif
 
-#ifdef __always_inline__read_lock
+#ifdef CONFIG_INLINE_READ_LOCK
 #define _read_lock(lock) __read_lock(lock)
 #endif
 
-#ifdef __always_inline__write_lock
+#ifdef CONFIG_INLINE_WRITE_LOCK
 #define _write_lock(lock) __write_lock(lock)
 #endif
 
-#ifdef __always_inline__spin_lock_bh
+#ifdef CONFIG_INLINE_SPIN_LOCK_BH
 #define _spin_lock_bh(lock) __spin_lock_bh(lock)
 #endif
 
-#ifdef __always_inline__read_lock_bh
+#ifdef CONFIG_INLINE_READ_LOCK_BH
 #define _read_lock_bh(lock) __read_lock_bh(lock)
 #endif
 
-#ifdef __always_inline__write_lock_bh
+#ifdef CONFIG_INLINE_WRITE_LOCK_BH
 #define _write_lock_bh(lock) __write_lock_bh(lock)
 #endif
 
-#ifdef __always_inline__spin_lock_irq
+#ifdef CONFIG_INLINE_SPIN_LOCK_IRQ
 #define _spin_lock_irq(lock) __spin_lock_irq(lock)
 #endif
 
-#ifdef __always_inline__read_lock_irq
+#ifdef CONFIG_INLINE_READ_LOCK_IRQ
 #define _read_lock_irq(lock) __read_lock_irq(lock)
 #endif
 
-#ifdef __always_inline__write_lock_irq
+#ifdef CONFIG_INLINE_WRITE_LOCK_IRQ
 #define _write_lock_irq(lock) __write_lock_irq(lock)
 #endif
 
-#ifdef __always_inline__spin_lock_irqsave
+#ifdef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
 #define _spin_lock_irqsave(lock) __spin_lock_irqsave(lock)
 #endif
 
-#ifdef __always_inline__read_lock_irqsave
+#ifdef CONFIG_INLINE_READ_LOCK_IRQSAVE
 #define _read_lock_irqsave(lock) __read_lock_irqsave(lock)
 #endif
 
-#ifdef __always_inline__write_lock_irqsave
+#ifdef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
 #define _write_lock_irqsave(lock) __write_lock_irqsave(lock)
 #endif
 
-#endif /* !CONFIG_GENERIC_LOCKBREAK */
-
-#ifdef __always_inline__spin_trylock
+#ifdef CONFIG_INLINE_SPIN_TRYLOCK
 #define _spin_trylock(lock) __spin_trylock(lock)
 #endif
 
-#ifdef __always_inline__read_trylock
+#ifdef CONFIG_INLINE_READ_TRYLOCK
 #define _read_trylock(lock) __read_trylock(lock)
 #endif
 
-#ifdef __always_inline__write_trylock
+#ifdef CONFIG_INLINE_WRITE_TRYLOCK
 #define _write_trylock(lock) __write_trylock(lock)
 #endif
 
-#ifdef __always_inline__spin_trylock_bh
+#ifdef CONFIG_INLINE_SPIN_TRYLOCK_BH
 #define _spin_trylock_bh(lock) __spin_trylock_bh(lock)
 #endif
 
-#ifdef __always_inline__spin_unlock
+#ifdef CONFIG_INLINE_SPIN_UNLOCK
 #define _spin_unlock(lock) __spin_unlock(lock)
 #endif
 
-#ifdef __always_inline__read_unlock
+#ifdef CONFIG_INLINE_READ_UNLOCK
 #define _read_unlock(lock) __read_unlock(lock)
 #endif
 
-#ifdef __always_inline__write_unlock
+#ifdef CONFIG_INLINE_WRITE_UNLOCK
 #define _write_unlock(lock) __write_unlock(lock)
 #endif
 
-#ifdef __always_inline__spin_unlock_bh
+#ifdef CONFIG_INLINE_SPIN_UNLOCK_BH
 #define _spin_unlock_bh(lock) __spin_unlock_bh(lock)
 #endif
 
-#ifdef __always_inline__read_unlock_bh
+#ifdef CONFIG_INLINE_READ_UNLOCK_BH
 #define _read_unlock_bh(lock) __read_unlock_bh(lock)
 #endif
 
-#ifdef __always_inline__write_unlock_bh
+#ifdef CONFIG_INLINE_WRITE_UNLOCK_BH
 #define _write_unlock_bh(lock) __write_unlock_bh(lock)
 #endif
 
-#ifdef __always_inline__spin_unlock_irq
+#ifdef CONFIG_INLINE_SPIN_UNLOCK_IRQ
 #define _spin_unlock_irq(lock) __spin_unlock_irq(lock)
 #endif
 
-#ifdef __always_inline__read_unlock_irq
+#ifdef CONFIG_INLINE_READ_UNLOCK_IRQ
 #define _read_unlock_irq(lock) __read_unlock_irq(lock)
 #endif
 
-#ifdef __always_inline__write_unlock_irq
+#ifdef CONFIG_INLINE_WRITE_UNLOCK_IRQ
 #define _write_unlock_irq(lock) __write_unlock_irq(lock)
 #endif
 
-#ifdef __always_inline__spin_unlock_irqrestore
+#ifdef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
 #define _spin_unlock_irqrestore(lock, flags) __spin_unlock_irqrestore(lock, flags)
 #endif
 
-#ifdef __always_inline__read_unlock_irqrestore
+#ifdef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
 #define _read_unlock_irqrestore(lock, flags) __read_unlock_irqrestore(lock, flags)
 #endif
 
-#ifdef __always_inline__write_unlock_irqrestore
+#ifdef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
 #define _write_unlock_irqrestore(lock, flags) __write_unlock_irqrestore(lock, flags)
 #endif
 
-#endif /* CONFIG_DEBUG_SPINLOCK */
-
 static inline int __spin_trylock(spinlock_t *lock)
 {
 	preempt_disable();
diff --git a/init/Kconfig b/init/Kconfig
index 9e03ef8b311e..06863dd7bf49 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1210,3 +1210,4 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
+source "kernel/Kconfig.locks"
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..d1f86db71451
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,199 @@
+#
+# The ARCH_INLINE foo is necessary because select ignores "depends on"
+#
+config ARCH_INLINE_SPIN_TRYLOCK
+	bool
+
+config ARCH_INLINE_SPIN_TRYLOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_LOCK
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+	bool
+
+
+config ARCH_INLINE_READ_TRYLOCK
+	bool
+
+config ARCH_INLINE_READ_LOCK
+	bool
+
+config ARCH_INLINE_READ_LOCK_BH
+	bool
+
+config ARCH_INLINE_READ_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_READ_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_READ_UNLOCK
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+	bool
+
+
+config ARCH_INLINE_WRITE_TRYLOCK
+	bool
+
+config ARCH_INLINE_WRITE_LOCK
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_BH
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+	bool
+
+#
+# lock_* functions are inlined when:
+#   - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
+#
+# trylock_* functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+# unlock and unlock_irq functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#  or
+#   - DEBUG_SPINLOCK=n and PREEMPT=n
+#
+# unlock_bh and unlock_irqrestore functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+
+config INLINE_SPIN_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+
+config INLINE_SPIN_TRYLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+
+config INLINE_SPIN_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+
+config INLINE_SPIN_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_BH
+
+config INLINE_SPIN_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_IRQ
+
+config INLINE_SPIN_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
+
+config INLINE_SPIN_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+
+config INLINE_SPIN_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+
+config INLINE_SPIN_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
+
+config INLINE_SPIN_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+
+
+config INLINE_READ_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+
+config INLINE_READ_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+
+config INLINE_READ_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_BH
+
+config INLINE_READ_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_IRQ
+
+config INLINE_READ_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_IRQSAVE
+
+config INLINE_READ_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+
+config INLINE_READ_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+
+config INLINE_READ_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
+
+config INLINE_READ_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+
+
+config INLINE_WRITE_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+
+config INLINE_WRITE_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+
+config INLINE_WRITE_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_BH
+
+config INLINE_WRITE_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_IRQ
+
+config INLINE_WRITE_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_IRQSAVE
+
+config INLINE_WRITE_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+
+config INLINE_WRITE_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+
+config INLINE_WRITE_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
+
+config INLINE_WRITE_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..235a9579a875 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,7 +21,7 @@
 #include <linux/debug_locks.h>
 #include <linux/module.h>
 
-#ifndef _spin_trylock
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
 	return __spin_trylock(lock);
@@ -29,7 +29,7 @@ int __lockfunc _spin_trylock(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_trylock);
 #endif
 
-#ifndef _read_trylock
+#ifndef CONFIG_INLINE_READ_TRYLOCK
 int __lockfunc _read_trylock(rwlock_t *lock)
 {
 	return __read_trylock(lock);
@@ -37,7 +37,7 @@ int __lockfunc _read_trylock(rwlock_t *lock)
 EXPORT_SYMBOL(_read_trylock);
 #endif
 
-#ifndef _write_trylock
+#ifndef CONFIG_INLINE_WRITE_TRYLOCK
 int __lockfunc _write_trylock(rwlock_t *lock)
 {
 	return __write_trylock(lock);
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(_write_trylock);
  */
 #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
 
-#ifndef _read_lock
+#ifndef CONFIG_INLINE_READ_LOCK
 void __lockfunc _read_lock(rwlock_t *lock)
 {
 	__read_lock(lock);
@@ -60,7 +60,7 @@ void __lockfunc _read_lock(rwlock_t *lock)
 EXPORT_SYMBOL(_read_lock);
 #endif
 
-#ifndef _spin_lock_irqsave
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
 	return __spin_lock_irqsave(lock);
@@ -68,7 +68,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_lock_irqsave);
 #endif
 
-#ifndef _spin_lock_irq
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
 void __lockfunc _spin_lock_irq(spinlock_t *lock)
 {
 	__spin_lock_irq(lock);
@@ -76,7 +76,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_lock_irq);
 #endif
 
-#ifndef _spin_lock_bh
+#ifndef CONFIG_INLINE_SPIN_LOCK_BH
 void __lockfunc _spin_lock_bh(spinlock_t *lock)
 {
 	__spin_lock_bh(lock);
@@ -84,7 +84,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_lock_bh);
 #endif
 
-#ifndef _read_lock_irqsave
+#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
 unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 {
 	return __read_lock_irqsave(lock);
@@ -92,7 +92,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 EXPORT_SYMBOL(_read_lock_irqsave);
 #endif
 
-#ifndef _read_lock_irq
+#ifndef CONFIG_INLINE_READ_LOCK_IRQ
 void __lockfunc _read_lock_irq(rwlock_t *lock)
 {
 	__read_lock_irq(lock);
@@ -100,7 +100,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
 EXPORT_SYMBOL(_read_lock_irq);
 #endif
 
-#ifndef _read_lock_bh
+#ifndef CONFIG_INLINE_READ_LOCK_BH
 void __lockfunc _read_lock_bh(rwlock_t *lock)
 {
 	__read_lock_bh(lock);
@@ -108,7 +108,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
 EXPORT_SYMBOL(_read_lock_bh);
 #endif
 
-#ifndef _write_lock_irqsave
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
 unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 {
 	return __write_lock_irqsave(lock);
@@ -116,7 +116,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 EXPORT_SYMBOL(_write_lock_irqsave);
 #endif
 
-#ifndef _write_lock_irq
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
 void __lockfunc _write_lock_irq(rwlock_t *lock)
 {
 	__write_lock_irq(lock);
@@ -124,7 +124,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
 EXPORT_SYMBOL(_write_lock_irq);
 #endif
 
-#ifndef _write_lock_bh
+#ifndef CONFIG_INLINE_WRITE_LOCK_BH
 void __lockfunc _write_lock_bh(rwlock_t *lock)
 {
 	__write_lock_bh(lock);
@@ -132,7 +132,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
 EXPORT_SYMBOL(_write_lock_bh);
 #endif
 
-#ifndef _spin_lock
+#ifndef CONFIG_INLINE_SPIN_LOCK
 void __lockfunc _spin_lock(spinlock_t *lock)
 {
 	__spin_lock(lock);
@@ -140,7 +140,7 @@ void __lockfunc _spin_lock(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_lock);
 #endif
 
-#ifndef _write_lock
+#ifndef CONFIG_INLINE_WRITE_LOCK
 void __lockfunc _write_lock(rwlock_t *lock)
 {
 	__write_lock(lock);
@@ -272,7 +272,7 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
 
 #endif
 
-#ifndef _spin_unlock
+#ifndef CONFIG_INLINE_SPIN_UNLOCK
 void __lockfunc _spin_unlock(spinlock_t *lock)
 {
 	__spin_unlock(lock);
@@ -280,7 +280,7 @@ void __lockfunc _spin_unlock(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_unlock);
 #endif
 
-#ifndef _write_unlock
+#ifndef CONFIG_INLINE_WRITE_UNLOCK
 void __lockfunc _write_unlock(rwlock_t *lock)
 {
 	__write_unlock(lock);
@@ -288,7 +288,7 @@ void __lockfunc _write_unlock(rwlock_t *lock)
 EXPORT_SYMBOL(_write_unlock);
 #endif
 
-#ifndef _read_unlock
+#ifndef CONFIG_INLINE_READ_UNLOCK
 void __lockfunc _read_unlock(rwlock_t *lock)
 {
 	__read_unlock(lock);
@@ -296,7 +296,7 @@ void __lockfunc _read_unlock(rwlock_t *lock)
 EXPORT_SYMBOL(_read_unlock);
 #endif
 
-#ifndef _spin_unlock_irqrestore
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	__spin_unlock_irqrestore(lock, flags);
@@ -304,7 +304,7 @@ void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 EXPORT_SYMBOL(_spin_unlock_irqrestore);
 #endif
 
-#ifndef _spin_unlock_irq
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
 void __lockfunc _spin_unlock_irq(spinlock_t *lock)
 {
 	__spin_unlock_irq(lock);
@@ -312,7 +312,7 @@ void __lockfunc _spin_unlock_irq(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_unlock_irq);
 #endif
 
-#ifndef _spin_unlock_bh
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
 	__spin_unlock_bh(lock);
@@ -320,7 +320,7 @@ void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_unlock_bh);
 #endif
 
-#ifndef _read_unlock_irqrestore
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
 void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
 	__read_unlock_irqrestore(lock, flags);
@@ -328,7 +328,7 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 EXPORT_SYMBOL(_read_unlock_irqrestore);
 #endif
 
-#ifndef _read_unlock_irq
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
 void __lockfunc _read_unlock_irq(rwlock_t *lock)
 {
 	__read_unlock_irq(lock);
@@ -336,7 +336,7 @@ void __lockfunc _read_unlock_irq(rwlock_t *lock)
 EXPORT_SYMBOL(_read_unlock_irq);
 #endif
 
-#ifndef _read_unlock_bh
+#ifndef CONFIG_INLINE_READ_UNLOCK_BH
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
 	__read_unlock_bh(lock);
@@ -344,7 +344,7 @@ void __lockfunc _read_unlock_bh(rwlock_t *lock)
 EXPORT_SYMBOL(_read_unlock_bh);
 #endif
 
-#ifndef _write_unlock_irqrestore
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
 	__write_unlock_irqrestore(lock, flags);
@@ -352,7 +352,7 @@ void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 EXPORT_SYMBOL(_write_unlock_irqrestore);
 #endif
 
-#ifndef _write_unlock_irq
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
 void __lockfunc _write_unlock_irq(rwlock_t *lock)
 {
 	__write_unlock_irq(lock);
@@ -360,7 +360,7 @@ void __lockfunc _write_unlock_irq(rwlock_t *lock)
 EXPORT_SYMBOL(_write_unlock_irq);
 #endif
 
-#ifndef _write_unlock_bh
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
 	__write_unlock_bh(lock);
@@ -368,7 +368,7 @@ void __lockfunc _write_unlock_bh(rwlock_t *lock)
 EXPORT_SYMBOL(_write_unlock_bh);
 #endif
 
-#ifndef _spin_trylock_bh
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
 int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 {
 	return __spin_trylock_bh(lock);
-- 
cgit v1.2.3-71-gd317


From 8e13c7b772387f55dc05c6a0e5b30010c3c46ff9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Nov 2009 15:21:41 +0000
Subject: locking: Reduce ifdefs in kernel/spinlock.c

With the Kconfig based inline decisions we can remove extra ifdefs in
kernel/spinlock.c by creating the complex lockbreak functions as
inlines which are inserted into the non inlined lock functions.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <20091109151428.548614772@linutronix.de>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/spinlock.c | 258 +++++++++++++++++++++++++++---------------------------
 1 file changed, 127 insertions(+), 131 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 235a9579a875..41e042219ff6 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,6 +21,133 @@
 #include <linux/debug_locks.h>
 #include <linux/module.h>
 
+/*
+ * If lockdep is enabled then we use the non-preemption spin-ops
+ * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
+ * not re-enabled during lock-acquire (which the preempt-spin-ops do):
+ */
+#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
+/*
+ * The __lock_function inlines are taken from
+ * include/linux/spinlock_api_smp.h
+ */
+#else
+/*
+ * We build the __lock_function inlines here. They are too large for
+ * inlining all over the place, but here is only one user per function
+ * which embedds them into the calling _lock_function below.
+ *
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ */
+#define BUILD_LOCK_OPS(op, locktype)					\
+void __lockfunc __##op##_lock(locktype##_t *lock)			\
+{									\
+	for (;;) {							\
+		preempt_disable();					\
+		if (likely(_raw_##op##_trylock(lock)))			\
+			break;						\
+		preempt_enable();					\
+									\
+		if (!(lock)->break_lock)				\
+			(lock)->break_lock = 1;				\
+		while (!op##_can_lock(lock) && (lock)->break_lock)	\
+			_raw_##op##_relax(&lock->raw_lock);		\
+	}								\
+	(lock)->break_lock = 0;						\
+}									\
+									\
+unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock)	\
+{									\
+	unsigned long flags;						\
+									\
+	for (;;) {							\
+		preempt_disable();					\
+		local_irq_save(flags);					\
+		if (likely(_raw_##op##_trylock(lock)))			\
+			break;						\
+		local_irq_restore(flags);				\
+		preempt_enable();					\
+									\
+		if (!(lock)->break_lock)				\
+			(lock)->break_lock = 1;				\
+		while (!op##_can_lock(lock) && (lock)->break_lock)	\
+			_raw_##op##_relax(&lock->raw_lock);		\
+	}								\
+	(lock)->break_lock = 0;						\
+	return flags;							\
+}									\
+									\
+void __lockfunc __##op##_lock_irq(locktype##_t *lock)			\
+{									\
+	_##op##_lock_irqsave(lock);					\
+}									\
+									\
+void __lockfunc __##op##_lock_bh(locktype##_t *lock)			\
+{									\
+	unsigned long flags;						\
+									\
+	/*							*/	\
+	/* Careful: we must exclude softirqs too, hence the	*/	\
+	/* irq-disabling. We use the generic preemption-aware	*/	\
+	/* function:						*/	\
+	/**/								\
+	flags = _##op##_lock_irqsave(lock);				\
+	local_bh_disable();						\
+	local_irq_restore(flags);					\
+}									\
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ *         __[spin|read|write]_lock()
+ *         __[spin|read|write]_lock_irq()
+ *         __[spin|read|write]_lock_irqsave()
+ *         __[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, spinlock);
+BUILD_LOCK_OPS(read, rwlock);
+BUILD_LOCK_OPS(write, rwlock);
+
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
+{
+	preempt_disable();
+	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+EXPORT_SYMBOL(_spin_lock_nested);
+
+unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock,
+						   int subclass)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
+				_raw_spin_lock_flags, &flags);
+	return flags;
+}
+EXPORT_SYMBOL(_spin_lock_irqsave_nested);
+
+void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
+				     struct lockdep_map *nest_lock)
+{
+	preempt_disable();
+	spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+EXPORT_SYMBOL(_spin_lock_nest_lock);
+
+#endif
+
 #ifndef CONFIG_INLINE_SPIN_TRYLOCK
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
@@ -45,13 +172,6 @@ int __lockfunc _write_trylock(rwlock_t *lock)
 EXPORT_SYMBOL(_write_trylock);
 #endif
 
-/*
- * If lockdep is enabled then we use the non-preemption spin-ops
- * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
- * not re-enabled during lock-acquire (which the preempt-spin-ops do):
- */
-#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
-
 #ifndef CONFIG_INLINE_READ_LOCK
 void __lockfunc _read_lock(rwlock_t *lock)
 {
@@ -148,130 +268,6 @@ void __lockfunc _write_lock(rwlock_t *lock)
 EXPORT_SYMBOL(_write_lock);
 #endif
 
-#else /* CONFIG_PREEMPT: */
-
-/*
- * This could be a long-held lock. We both prepare to spin for a long
- * time (making _this_ CPU preemptable if possible), and we also signal
- * towards that other CPU that it should break the lock ASAP.
- *
- * (We do this in a function because inlining it would be excessive.)
- */
-
-#define BUILD_LOCK_OPS(op, locktype)					\
-void __lockfunc _##op##_lock(locktype##_t *lock)			\
-{									\
-	for (;;) {							\
-		preempt_disable();					\
-		if (likely(_raw_##op##_trylock(lock)))			\
-			break;						\
-		preempt_enable();					\
-									\
-		if (!(lock)->break_lock)				\
-			(lock)->break_lock = 1;				\
-		while (!op##_can_lock(lock) && (lock)->break_lock)	\
-			_raw_##op##_relax(&lock->raw_lock);		\
-	}								\
-	(lock)->break_lock = 0;						\
-}									\
-									\
-EXPORT_SYMBOL(_##op##_lock);						\
-									\
-unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)	\
-{									\
-	unsigned long flags;						\
-									\
-	for (;;) {							\
-		preempt_disable();					\
-		local_irq_save(flags);					\
-		if (likely(_raw_##op##_trylock(lock)))			\
-			break;						\
-		local_irq_restore(flags);				\
-		preempt_enable();					\
-									\
-		if (!(lock)->break_lock)				\
-			(lock)->break_lock = 1;				\
-		while (!op##_can_lock(lock) && (lock)->break_lock)	\
-			_raw_##op##_relax(&lock->raw_lock);		\
-	}								\
-	(lock)->break_lock = 0;						\
-	return flags;							\
-}									\
-									\
-EXPORT_SYMBOL(_##op##_lock_irqsave);					\
-									\
-void __lockfunc _##op##_lock_irq(locktype##_t *lock)			\
-{									\
-	_##op##_lock_irqsave(lock);					\
-}									\
-									\
-EXPORT_SYMBOL(_##op##_lock_irq);					\
-									\
-void __lockfunc _##op##_lock_bh(locktype##_t *lock)			\
-{									\
-	unsigned long flags;						\
-									\
-	/*							*/	\
-	/* Careful: we must exclude softirqs too, hence the	*/	\
-	/* irq-disabling. We use the generic preemption-aware	*/	\
-	/* function:						*/	\
-	/**/								\
-	flags = _##op##_lock_irqsave(lock);				\
-	local_bh_disable();						\
-	local_irq_restore(flags);					\
-}									\
-									\
-EXPORT_SYMBOL(_##op##_lock_bh)
-
-/*
- * Build preemption-friendly versions of the following
- * lock-spinning functions:
- *
- *         _[spin|read|write]_lock()
- *         _[spin|read|write]_lock_irq()
- *         _[spin|read|write]_lock_irqsave()
- *         _[spin|read|write]_lock_bh()
- */
-BUILD_LOCK_OPS(spin, spinlock);
-BUILD_LOCK_OPS(read, rwlock);
-BUILD_LOCK_OPS(write, rwlock);
-
-#endif /* CONFIG_PREEMPT */
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
-{
-	preempt_disable();
-	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
-}
-EXPORT_SYMBOL(_spin_lock_nested);
-
-unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	preempt_disable();
-	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-	LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
-				_raw_spin_lock_flags, &flags);
-	return flags;
-}
-EXPORT_SYMBOL(_spin_lock_irqsave_nested);
-
-void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
-				     struct lockdep_map *nest_lock)
-{
-	preempt_disable();
-	spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
-}
-EXPORT_SYMBOL(_spin_lock_nest_lock);
-
-#endif
-
 #ifndef CONFIG_INLINE_SPIN_UNLOCK
 void __lockfunc _spin_unlock(spinlock_t *lock)
 {
-- 
cgit v1.2.3-71-gd317


From a362c638bdf052bf424bce7645d39b101090f6ba Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 14 Nov 2009 00:26:34 +0100
Subject: clocksource/events: Fix fallout of generic code changes

powerpc grew a new warning due to the type change of clockevent->mult.

The architectures which use parts of the generic time keeping
infrastructure tripped over my wrong assumption that
clocksource_register is only used when GENERIC_TIME=y.

I should have looked and also I should have known better. These
renitent Gaul villages are racking my nerves. Some serious deprecating
is due.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/powerpc/kernel/time.c | 2 +-
 kernel/time/clocksource.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 92dc844299b6..60ceb2708948 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -905,7 +905,7 @@ static void register_decrementer_clockevent(int cpu)
 	*dec = decrementer_clockevent;
 	dec->cpumask = cpumask_of(cpu);
 
-	printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
+	printk(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
 	       dec->name, dec->mult, dec->shift, cpu);
 
 	clockevents_register_device(dec);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b65b242f04dd..72a2dcbd8b48 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -466,8 +466,6 @@ void clocksource_touch_watchdog(void)
 	clocksource_resume_watchdog();
 }
 
-#ifdef CONFIG_GENERIC_TIME
-
 /**
  * clocksource_max_deferment - Returns max time the clocksource can be deferred
  * @cs:         Pointer to clocksource
@@ -509,6 +507,8 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
 	return max_nsecs - (max_nsecs >> 5);
 }
 
+#ifdef CONFIG_GENERIC_TIME
+
 /**
  * clocksource_select - Select the best clocksource available
  *
-- 
cgit v1.2.3-71-gd317


From 560d4bc0df9a5e63b980432282d8c2bd3559ec74 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 13 Nov 2009 19:51:38 -0800
Subject: rcu: Further cleanups of use of lastcomp

Now that a copy of the rsp->completed flag is available in all
rcu_node structures, make full use of it.  It is still
legitimate to access rsp->completed while holding the root
rcu_node structure's lock, however.

Also, tighten up force_quiescent_state()'s checks for end of
current grace period.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1258170699933-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3df04381ea3e..24bbf2ce0605 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -817,7 +817,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
 
 	rnp = rdp->mynode;
 	spin_lock_irqsave(&rnp->lock, flags);
-	if (lastcomp != ACCESS_ONCE(rsp->completed)) {
+	if (lastcomp != rnp->completed) {
 
 		/*
 		 * Someone beat us to it for this grace period, so leave.
@@ -935,7 +935,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
 static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	long lastcomp;
 	unsigned long mask;
 	struct rcu_data *rdp = rsp->rda[cpu];
 	struct rcu_node *rnp;
@@ -971,7 +970,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 		spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 		rnp = rnp->parent;
 	} while (rnp != NULL);
-	lastcomp = rsp->completed;
 
 	spin_unlock_irqrestore(&rsp->onofflock, flags);
 
@@ -1145,7 +1143,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
 	rcu_for_each_leaf_node(rsp, rnp) {
 		mask = 0;
 		spin_lock_irqsave(&rnp->lock, flags);
-		if (rsp->completed != lastcomp) {
+		if (rnp->completed != lastcomp) {
 			spin_unlock_irqrestore(&rnp->lock, flags);
 			return 1;
 		}
@@ -1159,7 +1157,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
 			if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
 				mask |= bit;
 		}
-		if (mask != 0 && rsp->completed == lastcomp) {
+		if (mask != 0 && rnp->completed == lastcomp) {
 
 			/* cpu_quiet_msk() releases rnp->lock. */
 			cpu_quiet_msk(mask, rsp, rnp, flags);
@@ -1196,7 +1194,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 	lastcomp = rsp->gpnum - 1;
 	signaled = rsp->signaled;
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
-	if (lastcomp == rsp->gpnum) {
+	if(!rcu_gp_in_progress(rsp)) {
 		rsp->n_force_qs_ngp++;
 		spin_unlock(&rnp->lock);
 		goto unlock_ret;  /* no GP in progress, time updated. */
@@ -1224,7 +1222,8 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 		/* Update state, record completion counter. */
 		forcenow = 0;
 		spin_lock(&rnp->lock);
-		if (lastcomp == rsp->completed &&
+		if (lastcomp + 1 == rsp->gpnum &&
+		    lastcomp == rsp->completed &&
 		    rsp->signaled == signaled) {
 			rsp->signaled = RCU_FORCE_QS;
 			rsp->completed_fqs = lastcomp;
-- 
cgit v1.2.3-71-gd317


From 2f51f9884f6a36b0fe9636d5a1937e5cbd25723b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 13 Nov 2009 19:51:39 -0800
Subject: rcu: Eliminate __rcu_pending() false positives

Now that there are both ->gpnum and ->completed fields in the
rcu_node structure, __rcu_pending() should check rdp->gpnum and
rdp->completed against rnp->gpnum and rdp->completed, respectively,
instead of the prior comparison against the rcu_state fields
rsp->gpnum and rsp->completed.

Given the old comparison, __rcu_pending() could return 1, resulting
in a needless raise_softirq(RCU_SOFTIRQ).  This useless work would
happen if RCU responded to a scheduling-clock interrupt after the
rcu_state fields had been updated, but before the rcu_node fields
had been updated.

Changing the comparison from the rcu_state fields to the rcu_node
fields prevents this useless work from happening.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12581706991966-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 24bbf2ce0605..9b36d6d7fb97 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1403,6 +1403,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
  */
 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 {
+	struct rcu_node *rnp = rdp->mynode;
+
 	rdp->n_rcu_pending++;
 
 	/* Check for CPU stalls, if enabled. */
@@ -1427,13 +1429,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* Has another RCU grace period completed?  */
-	if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
+	if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
 		rdp->n_rp_gp_completed++;
 		return 1;
 	}
 
 	/* Has a new RCU grace period started? */
-	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
+	if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
 		rdp->n_rp_gp_started++;
 		return 1;
 	}
-- 
cgit v1.2.3-71-gd317


From 498657a478c60be092208422fefa9c7b248729c2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Nov 2009 18:33:53 +0900
Subject: sched, kvm: Fix race condition involving sched_in_preempt_notifers

In finish_task_switch(), fire_sched_in_preempt_notifiers() is
called after finish_lock_switch().

However, depending on architecture, preemption can be enabled after
finish_lock_switch() which breaks the semantics of preempt
notifiers.

So move it before finish_arch_switch(). This also makes the in-
notifiers symmetric to out- notifiers in terms of locking - now
both are called under rq lock.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AFD2801.7020900@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 701eca4958a2..cea2beac7909 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2758,9 +2758,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	perf_event_task_sched_in(current, cpu_of(rq));
+	fire_sched_in_preempt_notifiers(current);
 	finish_lock_switch(rq, prev);
 
-	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
-- 
cgit v1.2.3-71-gd317


From 047106adcc85e3023da210143a6ab8a55df9e0fc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 16 Nov 2009 10:28:09 +0100
Subject: sched: Sched_rt_periodic_timer vs cpu hotplug

Heiko reported a case where a timer interrupt managed to
reference a root_domain structure that was already freed by a
concurrent hot-un-plug operation.

Solve this like the regular sched_domain stuff is also
synchronized, by adding a synchronize_sched() stmt to the free
path, this ensures that a root_domain stays present for any
atomic section that could have observed it.

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Gregory Haskins <ghaskins@novell.com>
Cc: Siddha Suresh B <suresh.b.siddha@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
LKML-Reference: <1258363873.26714.83.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cea2beac7909..3c91f110fc62 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7912,6 +7912,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 
 static void free_rootdomain(struct root_domain *rd)
 {
+	synchronize_sched();
+
 	cpupri_cleanup(&rd->cpupri);
 
 	free_cpumask_var(rd->rto_mask);
-- 
cgit v1.2.3-71-gd317


From 559fdc3c1b624edb1933a875022fe7e27934d11c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 16 Nov 2009 12:45:14 +0100
Subject: perf_event: Optimize perf_output_lock()

The purpose of perf_output_{un,}lock() is to:

 1) avoid publishing incomplete data
    [ possible when publishing a head that is ahead of an entry
      that is still being written ]

 2) guarantee fwd progress
    [ a simple refcount on pending writers doesn't need to drop to
      0, making it so would end up implementing something like forced
      quiecent states of RCU ]

To satisfy the above without undue complexity it serializes
between CPUs, this means that a pending writer can only be the
same cpu in a nested context, and since (under normal operation)
a cpu always makes progress we're good -- if the head is only
published when the bottom  most writer completes.

Now we don't need to disable IRQs in order to serialize between
CPUs, disabling preemption ought to be sufficient, esp since we
already deal with nesting due to NMIs.

This avoids potentially expensive (and needless) local IRQ
disable/enable ops.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1258373161.26714.254.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  1 -
 kernel/perf_event.c        | 21 +++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index df4e73e33774..7f87563c8485 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -714,7 +714,6 @@ struct perf_output_handle {
 	int				nmi;
 	int				sample;
 	int				locked;
-	unsigned long			flags;
 };
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6f4ed3b4cd73..3256e36ad251 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2674,20 +2674,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 static void perf_output_lock(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int cpu;
+	int cur, cpu = get_cpu();
 
 	handle->locked = 0;
 
-	local_irq_save(handle->flags);
-	cpu = smp_processor_id();
-
-	if (in_nmi() && atomic_read(&data->lock) == cpu)
-		return;
+	for (;;) {
+		cur = atomic_cmpxchg(&data->lock, -1, cpu);
+		if (cur == -1) {
+			handle->locked = 1;
+			break;
+		}
+		if (cur == cpu)
+			break;
 
-	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 		cpu_relax();
-
-	handle->locked = 1;
+	}
 }
 
 static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2733,7 +2734,7 @@ again:
 	if (atomic_xchg(&data->wakeup, 0))
 		perf_output_wakeup(handle);
 out:
-	local_irq_restore(handle->flags);
+	put_cpu();
 }
 
 void perf_output_copy(struct perf_output_handle *handle,
-- 
cgit v1.2.3-71-gd317


From 0696b711e4be45fa104c12329f617beb29c03f78 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Tue, 17 Nov 2009 13:49:50 +0800
Subject: timekeeping: Fix clock_gettime vsyscall time warp

Since commit 0a544198 "timekeeping: Move NTP adjusted clock multiplier
to struct timekeeper" the clock multiplier of vsyscall is updated with
the unmodified clock multiplier of the clock source and not with the
NTP adjusted multiplier of the timekeeper.

This causes user space observerable time warps:
new CLOCK-warp maximum: 120 nsecs,  00000025c337c537 -> 00000025c337c4bf

Add a new argument "mult" to update_vsyscall() and hand in the
timekeeping internal NTP adjusted multiplier.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: "Zhang Yanmin" <yanmin_zhang@linux.intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Tony Luck <tony.luck@intel.com>
LKML-Reference: <1258436990.17765.83.camel@minggr.sh.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/ia64/kernel/time.c       | 4 ++--
 arch/powerpc/kernel/time.c    | 5 +++--
 arch/s390/kernel/time.c       | 3 ++-
 arch/x86/kernel/vsyscall_64.c | 5 +++--
 include/linux/clocksource.h   | 6 ++++--
 kernel/time/timekeeping.c     | 6 +++---
 6 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 4990495d7531..a35c661e5e89 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -473,7 +473,7 @@ void update_vsyscall_tz(void)
 {
 }
 
-void update_vsyscall(struct timespec *wall, struct clocksource *c)
+void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult)
 {
         unsigned long flags;
 
@@ -481,7 +481,7 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c)
 
         /* copy fsyscall clock data */
         fsyscall_gtod_data.clk_mask = c->mask;
-        fsyscall_gtod_data.clk_mult = c->mult;
+        fsyscall_gtod_data.clk_mult = mult;
         fsyscall_gtod_data.clk_shift = c->shift;
         fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
         fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index a136a11c490d..39713312fbc7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -828,7 +828,8 @@ static cycle_t timebase_read(struct clocksource *cs)
 	return (cycle_t)get_tb();
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+		     u32 mult)
 {
 	u64 t2x, stamp_xsec;
 
@@ -841,7 +842,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 
 	/* XXX this assumes clock->shift == 22 */
 	/* 4611686018 ~= 2^(20+64-22) / 1e9 */
-	t2x = (u64) clock->mult * 4611686018ULL;
+	t2x = (u64) mult * 4611686018ULL;
 	stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
 	do_div(stamp_xsec, 1000000000);
 	stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 34162a0b2caa..68e1ecf5ebab 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -214,7 +214,8 @@ struct clocksource * __init clocksource_default_clock(void)
 	return &clocksource_tod;
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+		     u32 mult)
 {
 	if (clock != &clocksource_tod)
 		return;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..62f39d79b775 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+		     u32 mult)
 {
 	unsigned long flags;
 
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 	vsyscall_gtod_data.clock.vread = clock->vread;
 	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
 	vsyscall_gtod_data.clock.mask = clock->mask;
-	vsyscall_gtod_data.clock.mult = clock->mult;
+	vsyscall_gtod_data.clock.mult = mult;
 	vsyscall_gtod_data.clock.shift = clock->shift;
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 83d2fbd81b93..95e4995d9987 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -280,10 +280,12 @@ extern struct clocksource * __init __weak clocksource_default_clock(void);
 extern void clocksource_mark_unstable(struct clocksource *cs);
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
-extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
+extern void
+update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult);
 extern void update_vsyscall_tz(void);
 #else
-static inline void update_vsyscall(struct timespec *ts, struct clocksource *c)
+static inline void
+update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult)
 {
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907eaa..2a6d3e3e2c3e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsecond)
 {
 	xtime.tv_sec += leapsecond;
 	wall_to_monotonic.tv_sec -= leapsecond;
-	update_vsyscall(&xtime, timekeeper.clock);
+	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 
 #ifdef CONFIG_GENERIC_TIME
@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
 	timekeeper.ntp_error = 0;
 	ntp_clear();
 
-	update_vsyscall(&xtime, timekeeper.clock);
+	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
@@ -811,7 +811,7 @@ void update_wall_time(void)
 	update_xtime_cache(nsecs);
 
 	/* check to see if there is a new clocksource to use */
-	update_vsyscall(&xtime, timekeeper.clock);
+	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From 5a50e33cc916f6a81cb96f0f24f6a88c9ab78b79 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Nov 2009 08:43:01 -0500
Subject: ring-buffer: Move access to commit_page up into function used

With the change of the way we process commits. Where a commit only happens
at the outer most level, and that we don't need to worry about
a commit ending after the rb_start_commit() has been called, the code
use to grab the commit page before the tail page to prevent a possible
race. But this race no longer exists with the rb_start_commit()
rb_end_commit() interface.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3ffa502fb243..4b8293fa545e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1785,9 +1785,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 static struct ring_buffer_event *
 rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	     unsigned long length, unsigned long tail,
-	     struct buffer_page *commit_page,
 	     struct buffer_page *tail_page, u64 *ts)
 {
+	struct buffer_page *commit_page = cpu_buffer->commit_page;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct buffer_page *next_page;
 	int ret;
@@ -1890,13 +1890,10 @@ static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		  unsigned type, unsigned long length, u64 *ts)
 {
-	struct buffer_page *tail_page, *commit_page;
+	struct buffer_page *tail_page;
 	struct ring_buffer_event *event;
 	unsigned long tail, write;
 
-	commit_page = cpu_buffer->commit_page;
-	/* we just need to protect against interrupts */
-	barrier();
 	tail_page = cpu_buffer->tail_page;
 	write = local_add_return(length, &tail_page->write);
 
@@ -1907,7 +1904,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	/* See if we shot pass the end of this buffer page */
 	if (write > BUF_PAGE_SIZE)
 		return rb_move_tail(cpu_buffer, length, tail,
-				    commit_page, tail_page, ts);
+				    tail_page, ts);
 
 	/* We reserved something on the buffer */
 
-- 
cgit v1.2.3-71-gd317


From c13d2f7c3231e873f30db92b96c8caa48f100f33 Mon Sep 17 00:00:00 2001
From: Carsten Emde <Carsten.Emde@osadl.org>
Date: Mon, 16 Nov 2009 20:56:13 +0100
Subject: tracing: Fix trace_marker output

When a string was written to <debugfs>/tracing/trace_marker, some
strange characters appeared in the trace output instead of the
string, since a vprint function erroneously called a vararg print
function with a va_list argument. This patch fixes the problem and
simplifies the related code.

Signed-off-by: Carsten Emde <C.Emde@osadl.org>
LKML-Reference: <4B01AE5D.1010801@osadl.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 39 ++++++++++++++-------------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 03c7fd55c5f9..12b49caedf82 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1361,10 +1361,11 @@ int trace_array_vprintk(struct trace_array *tr,
 	pause_graph_tracing();
 	raw_local_irq_save(irq_flags);
 	__raw_spin_lock(&trace_buf_lock);
-	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
-
-	len = min(len, TRACE_BUF_SIZE-1);
-	trace_buf[len] = 0;
+	if (args == NULL) {
+		strncpy(trace_buf, fmt, TRACE_BUF_SIZE);
+		len = strlen(trace_buf);
+	} else
+		len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
 	size = sizeof(*entry) + len + 1;
 	buffer = tr->buffer;
@@ -1373,10 +1374,10 @@ int trace_array_vprintk(struct trace_array *tr,
 	if (!event)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
-	entry->ip			= ip;
+	entry->ip = ip;
 
 	memcpy(&entry->buf, trace_buf, len);
-	entry->buf[len] = 0;
+	entry->buf[len] = '\0';
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);
 
@@ -3319,22 +3320,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static int mark_printk(const char *fmt, ...)
-{
-	int ret;
-	va_list args;
-	va_start(args, fmt);
-	ret = trace_vprintk(0, fmt, args);
-	va_end(args);
-	return ret;
-}
-
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
 					size_t cnt, loff_t *fpos)
 {
 	char *buf;
-	char *end;
 
 	if (tracing_disabled)
 		return -EINVAL;
@@ -3342,7 +3332,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (cnt > TRACE_BUF_SIZE)
 		cnt = TRACE_BUF_SIZE;
 
-	buf = kmalloc(cnt + 1, GFP_KERNEL);
+	buf = kmalloc(cnt + 2, GFP_KERNEL);
 	if (buf == NULL)
 		return -ENOMEM;
 
@@ -3350,14 +3340,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 		kfree(buf);
 		return -EFAULT;
 	}
+	if (buf[cnt-1] != '\n') {
+		buf[cnt] = '\n';
+		buf[cnt+1] = '\0';
+	} else
+		buf[cnt] = '\0';
 
-	/* Cut from the first nil or newline. */
-	buf[cnt] = '\0';
-	end = strchr(buf, '\n');
-	if (end)
-		*end = '\0';
-
-	cnt = mark_printk("%s\n", buf);
+	cnt = trace_vprintk(0, buf, NULL);
 	kfree(buf);
 	*fpos += cnt;
 
-- 
cgit v1.2.3-71-gd317


From f6060f46819f313d34a8c8151390cda509c23389 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 5 Nov 2009 11:16:17 +0800
Subject: tracing: Prevent build warning: 'ftrace_graph_buf' defined but not
 used

Prevent build warning when CONFIG_FUNCTION_GRAPH_TRACER is not set.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4AF24381.5060307@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1ed514fe3a30..7f9b51e8184b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2274,7 +2274,6 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
 #define FTRACE_FILTER_SIZE		COMMAND_LINE_SIZE
 static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
 static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
-static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
 
 static int __init set_ftrace_notrace(char *str)
 {
@@ -2291,6 +2290,7 @@ static int __init set_ftrace_filter(char *str)
 __setup("ftrace_filter=", set_ftrace_filter);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
 static int __init set_graph_function(char *str)
 {
 	strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
-- 
cgit v1.2.3-71-gd317


From 8e1a928a2ed7e8d5cad97c8e985294b4caedd168 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Fri, 16 Oct 2009 18:19:01 -0400
Subject: clockevents: Add missing include to pacify sparse

Include "tick-internal.h" in order to pick up the extern function
prototype for clockevents_shutdown(). This quiets the following sparse
build noise:

  warning: symbol 'clockevents_shutdown' was not declared. Should it be static?

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
LKML-Reference: <BD79186B4FD85F4B8E60E381CAEE190901E24550@mi8nycmail19.Mi8.com>
Reviewed-by: Yong Zhang <yong.zhang0@gmail.com>
Cc: johnstul@us.ibm.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 05e8aeedcdf3..20a8920029ee 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
 #include <linux/sysdev.h>
 #include <linux/tick.h>
 
+#include "tick-internal.h"
+
 /* The registered clock event devices */
 static LIST_HEAD(clockevent_devices);
 static LIST_HEAD(clockevents_released);
-- 
cgit v1.2.3-71-gd317


From ba5ea951d0b5e5896180e21fe07f228d2b3b0e63 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 17 Nov 2009 14:14:13 -0800
Subject: posix-cpu-timers: optimize and document timer_create callback

We have already new_timer initialized to all-zeros hence in function
initializations are not needed. Document function expectation about
new_timer argument as well.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: johnstul@us.ibm.com
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/posix-cpu-timers.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747b..438ff4523513 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 
 /*
  * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
- * This is called from sys_timer_create with the new timer already locked.
+ * This is called from sys_timer_create() and do_cpu_nanosleep() with the
+ * new timer already all-zeros initialized.
  */
 int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 		return -EINVAL;
 
 	INIT_LIST_HEAD(&new_timer->it.cpu.entry);
-	new_timer->it.cpu.incr.sched = 0;
-	new_timer->it.cpu.expires.sched = 0;
 
 	read_lock(&tasklist_lock);
 	if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
-- 
cgit v1.2.3-71-gd317


From a1afb6371bb5341057056194d1168753f6d77242 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 28 Aug 2009 22:19:33 +0400
Subject: genirq: switch /proc/irq/*/spurious to seq_file

[ tglx: compacted it a bit ]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
LKML-Reference: <20090828181743.GA14050@x200.localdomain>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/proc.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index dfef5b9f3845..4e47f11da6a7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = {
 };
 #endif
 
-static int irq_spurious_read(char *page, char **start, off_t off,
-				  int count, int *eof, void *data)
+static int irq_spurious_proc_show(struct seq_file *m, void *v)
 {
-	struct irq_desc *desc = irq_to_desc((long) data);
-	return sprintf(page, "count %u\n"
-			     "unhandled %u\n"
-			     "last_unhandled %u ms\n",
-			desc->irq_count,
-			desc->irqs_unhandled,
-			jiffies_to_msecs(desc->last_unhandled));
+	struct irq_desc *desc = irq_to_desc((long) m->private);
+
+	seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
+		   desc->irq_count, desc->irqs_unhandled,
+		   jiffies_to_msecs(desc->last_unhandled));
+	return 0;
+}
+
+static int irq_spurious_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irq_spurious_proc_show, NULL);
 }
 
+static const struct file_operations irq_spurious_proc_fops = {
+	.open		= irq_spurious_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 #define MAX_NAMELEN 128
 
 static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 {
 	char name [MAX_NAMELEN];
-	struct proc_dir_entry *entry;
 
 	if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
 		return;
@@ -223,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 			 &irq_affinity_proc_fops, (void *)(long)irq);
 #endif
 
-	entry = create_proc_entry("spurious", 0444, desc->dir);
-	if (entry) {
-		entry->data = (void *)(long)irq;
-		entry->read_proc = irq_spurious_read;
-	}
+	proc_create_data("spurious", 0444, desc->dir,
+			 &irq_spurious_proc_fops, (void *)(long)irq);
 }
 
 #undef MAX_NAMELEN
-- 
cgit v1.2.3-71-gd317


From 2ea6dec4a22a6f66f6633876212fd4d195cf8277 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 17 Nov 2009 14:27:27 -0800
Subject: generic-ipi: Add smp_call_function_any()

Andrew points out that acpi-cpufreq uses cpumask_any, when it really
would prefer to use the same CPU if possible (to avoid an IPI).  In
general, this seems a good idea to offer.

[ tglx: Documented selection preference and Inlined the UP case to
  	avoid the copy of smp_call_function_single() and the extra
  	EXPORT ]

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Zhao Yakui <yakui.zhao@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/smp.h | 11 ++++++++++-
 kernel/smp.c        | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 39c64bae776d..7a0570e6a596 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -76,6 +76,9 @@ void smp_call_function_many(const struct cpumask *mask,
 void __smp_call_function_single(int cpuid, struct call_single_data *data,
 				int wait);
 
+int smp_call_function_any(const struct cpumask *mask,
+			  void (*func)(void *info), void *info, int wait);
+
 /*
  * Generic and arch helpers
  */
@@ -137,9 +140,15 @@ static inline void smp_send_reschedule(int cpu) { }
 #define smp_prepare_boot_cpu()			do {} while (0)
 #define smp_call_function_many(mask, func, info, wait) \
 			(up_smp_call_function(func, info))
-static inline void init_call_single_data(void)
+static inline void init_call_single_data(void) { }
+
+static inline int
+smp_call_function_any(const struct cpumask *mask, void (*func)(void *info),
+		      void *info, int wait)
 {
+	return smp_call_function_single(0, func, info, wait);
 }
+
 #endif /* !SMP */
 
 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index 8bd618f0364d..a8c76069cf50 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -319,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
+/*
+ * smp_call_function_any - Run a function on any of the given cpus
+ * @mask: The mask of cpus it can run on.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait until function has completed.
+ *
+ * Returns 0 on success, else a negative status code (if no cpus were online).
+ * Note that @wait will be implicitly turned on in case of allocation failures,
+ * since we fall back to on-stack allocation.
+ *
+ * Selection preference:
+ *	1) current cpu if in @mask
+ *	2) any cpu of current node if in @mask
+ *	3) any other online cpu in @mask
+ */
+int smp_call_function_any(const struct cpumask *mask,
+			  void (*func)(void *info), void *info, int wait)
+{
+	unsigned int cpu;
+	const struct cpumask *nodemask;
+	int ret;
+
+	/* Try for same CPU (cheapest) */
+	cpu = get_cpu();
+	if (cpumask_test_cpu(cpu, mask))
+		goto call;
+
+	/* Try for same node. */
+	nodemask = cpumask_of_node(cpu);
+	for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
+	     cpu = cpumask_next_and(cpu, nodemask, mask)) {
+		if (cpu_online(cpu))
+			goto call;
+	}
+
+	/* Any online will do: smp_call_function_single handles nr_cpu_ids. */
+	cpu = cpumask_any_and(mask, cpu_online_mask);
+call:
+	ret = smp_call_function_single(cpu, func, info, wait);
+	put_cpu();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_function_any);
+
 /**
  * __smp_call_function_single(): Run a function on another CPU
  * @cpu: The CPU to run on.
-- 
cgit v1.2.3-71-gd317


From 8747d793fc5c4d3e4decd41d55f6dc24498dd5f5 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 17 Nov 2009 14:14:12 -0800
Subject: itimers: Fix racy writes to cpu_itimer fields

incr_error and error fields of struct cpu_itimer are used when calculating
next timer tick in check_cpu_itimers() and should not be modified without
tsk->sighand->siglock taken.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <1253802903-979-1-git-send-email-sgruszka@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/itimer.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede528..d802883153da 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 {
 	cputime_t cval, nval, cinterval, ninterval;
 	s64 ns_ninterval, ns_nval;
+	u32 error, incr_error;
 	struct cpu_itimer *it = &tsk->signal->it[clock_id];
 
 	nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 	ninterval = timeval_to_cputime(&value->it_interval);
 	ns_ninterval = timeval_to_ns(&value->it_interval);
 
-	it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
-	it->error = cputime_sub_ns(nval, ns_nval);
+	error = cputime_sub_ns(nval, ns_nval);
+	incr_error = cputime_sub_ns(ninterval, ns_ninterval);
 
 	spin_lock_irq(&tsk->sighand->siglock);
 
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 	}
 	it->expires = nval;
 	it->incr = ninterval;
+	it->error = error;
+	it->incr_error = incr_error;
 	trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
 			   ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
 
-- 
cgit v1.2.3-71-gd317


From 6d4561110a3e9fa742aeec6717248a491dfb1878 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 16 Nov 2009 03:11:48 -0800
Subject: sysctl: Drop & in front of every proc_handler.

For consistency drop & in front of every proc_handler.  Explicity
taking the address is unnecessary and it prevents optimizations
like stubbing the proc_handlers to NULL.

Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/arm/kernel/isa.c              |   6 +-
 arch/arm/mach-bcmring/arch.c       |   2 +-
 arch/frv/kernel/pm.c               |   8 +-
 arch/frv/kernel/sysctl.c           |   4 +-
 arch/ia64/kernel/crash.c           |   4 +-
 arch/ia64/kernel/perfmon.c         |   8 +-
 arch/mips/lasat/sysctl.c           |  22 +--
 arch/powerpc/kernel/idle.c         |   2 +-
 arch/s390/appldata/appldata_base.c |   4 +-
 arch/s390/kernel/debug.c           |   4 +-
 arch/s390/mm/cmm.c                 |   6 +-
 arch/sh/kernel/traps_64.c          |   6 +-
 crypto/proc.c                      |   2 +-
 drivers/cdrom/cdrom.c              |  12 +-
 drivers/char/hpet.c                |   2 +-
 drivers/char/ipmi/ipmi_poweroff.c  |   2 +-
 drivers/char/pty.c                 |   4 +-
 drivers/char/random.c              |  12 +-
 drivers/char/rtc.c                 |   2 +-
 drivers/macintosh/mac_hid.c        |   6 +-
 drivers/md/md.c                    |   4 +-
 drivers/misc/sgi-xp/xpc_main.c     |   6 +-
 drivers/net/wireless/arlan-proc.c  |  64 ++++-----
 drivers/parport/procfs.c           |  28 ++--
 drivers/scsi/scsi_sysctl.c         |   2 +-
 fs/coda/sysctl.c                   |   6 +-
 fs/eventpoll.c                     |   2 +-
 fs/lockd/svc.c                     |  12 +-
 fs/nfs/sysctl.c                    |   8 +-
 fs/notify/inotify/inotify_user.c   |   6 +-
 fs/ntfs/sysctl.c                   |   2 +-
 fs/ocfs2/stackglue.c               |   2 +-
 fs/quota/dquot.c                   |  18 +--
 fs/xfs/linux-2.6/xfs_sysctl.c      |  30 ++---
 kernel/slow-work.c                 |   2 +-
 kernel/sysctl.c                    | 266 ++++++++++++++++++-------------------
 net/rds/ib_sysctl.c                |  12 +-
 net/rds/iw_sysctl.c                |  12 +-
 net/rds/sysctl.c                   |  10 +-
 net/sctp/sysctl.c                  |   2 +-
 net/sunrpc/sysctl.c                |  10 +-
 net/sunrpc/xprtrdma/svc_rdma.c     |  24 ++--
 net/sunrpc/xprtrdma/transport.c    |  12 +-
 net/sunrpc/xprtsock.c              |  10 +-
 security/keys/sysctl.c             |  10 +-
 45 files changed, 339 insertions(+), 339 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/isa.c b/arch/arm/kernel/isa.c
index 738dfcc658ca..346485910732 100644
--- a/arch/arm/kernel/isa.c
+++ b/arch/arm/kernel/isa.c
@@ -26,19 +26,19 @@ static ctl_table ctl_isa_vars[4] = {
 		.data		= &isa_membase, 
 		.maxlen		= sizeof(isa_membase),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	}, {
 		.procname	= "portbase",
 		.data		= &isa_portbase, 
 		.maxlen		= sizeof(isa_portbase),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	}, {
 		.procname	= "portshift",
 		.data		= &isa_portshift, 
 		.maxlen		= sizeof(isa_portshift),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	}, {}
 };
 
diff --git a/arch/arm/mach-bcmring/arch.c b/arch/arm/mach-bcmring/arch.c
index f3c11199707a..fbe6fa02c882 100644
--- a/arch/arm/mach-bcmring/arch.c
+++ b/arch/arm/mach-bcmring/arch.c
@@ -58,7 +58,7 @@ static struct ctl_table bcmring_sysctl_warm_reboot[] = {
 	 .data = &bcmring_arch_warm_reboot,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec},
+	 .proc_handler = proc_dointvec},
 	{}
 };
 
diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c
index 940d8bb486f8..5fa3889d858b 100644
--- a/arch/frv/kernel/pm.c
+++ b/arch/frv/kernel/pm.c
@@ -303,28 +303,28 @@ static struct ctl_table pm_table[] =
 		.data		= NULL,
 		.maxlen		= 0,
 		.mode		= 0200,
-		.proc_handler	= &sysctl_pm_do_suspend,
+		.proc_handler	= sysctl_pm_do_suspend,
 	},
 	{
 		.procname	= "cmode",
 		.data		= &clock_cmode_current,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cmode_procctl,
+		.proc_handler	= cmode_procctl,
 	},
 	{
 		.procname	= "p0",
 		.data		= &clock_p0_current,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &p0_procctl,
+		.proc_handler	= p0_procctl,
 	},
 	{
 		.procname	= "cm",
 		.data		= &clock_cm_current,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cm_procctl,
+		.proc_handler	= cm_procctl,
 	},
 	{ }
 };
diff --git a/arch/frv/kernel/sysctl.c b/arch/frv/kernel/sysctl.c
index b30a4f2cda3e..035516cb7a97 100644
--- a/arch/frv/kernel/sysctl.c
+++ b/arch/frv/kernel/sysctl.c
@@ -180,7 +180,7 @@ static struct ctl_table frv_table[] =
 		.data		= NULL,
 		.maxlen		= 0,
 		.mode		= 0644,
-		.proc_handler	= &procctl_frv_cachemode,
+		.proc_handler	= procctl_frv_cachemode,
 	},
 #ifdef CONFIG_MMU
 	{
@@ -188,7 +188,7 @@ static struct ctl_table frv_table[] =
 		.data		= NULL,
 		.maxlen		= 0,
 		.mode		= 0644,
-		.proc_handler	= &procctl_frv_pin_cxnr
+		.proc_handler	= procctl_frv_pin_cxnr
 	},
 #endif
 	{}
diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c
index 7c7d6a6dc0f7..b942f4032d7a 100644
--- a/arch/ia64/kernel/crash.c
+++ b/arch/ia64/kernel/crash.c
@@ -243,14 +243,14 @@ static ctl_table kdump_ctl_table[] = {
 		.data = &kdump_on_init,
 		.maxlen = sizeof(int),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec,
+		.proc_handler = proc_dointvec,
 	},
 	{
 		.procname = "kdump_on_fatal_mca",
 		.data = &kdump_on_fatal_mca,
 		.maxlen = sizeof(int),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec,
+		.proc_handler = proc_dointvec,
 	},
 	{ }
 };
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index ca30b3646405..402698b6689f 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -526,28 +526,28 @@ static ctl_table pfm_ctl_table[]={
 		.data		= &pfm_sysctl.debug,
 		.maxlen		= sizeof(int),
 		.mode		= 0666,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "debug_ovfl",
 		.data		= &pfm_sysctl.debug_ovfl,
 		.maxlen		= sizeof(int),
 		.mode		= 0666,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "fastctxsw",
 		.data		= &pfm_sysctl.fastctxsw,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	=  &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "expert_mode",
 		.data		= &pfm_sysctl.expert_mode,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{}
 };
diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c
index 1dbdd76a8c04..14b9a28a4aec 100644
--- a/arch/mips/lasat/sysctl.c
+++ b/arch/mips/lasat/sysctl.c
@@ -182,28 +182,28 @@ static ctl_table lasat_table[] = {
 		.data		= &lasat_board_info.li_cpu_hz,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "bus-hz",
 		.data		= &lasat_board_info.li_bus_hz,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "bmid",
 		.data		= &lasat_board_info.li_bmid,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "prid",
 		.data		= &lasat_board_info.li_prid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_lasat_prid,
+		.proc_handler	= proc_lasat_prid,
 .	},
 #ifdef CONFIG_INET
 	{
@@ -211,14 +211,14 @@ static ctl_table lasat_table[] = {
 		.data		= &lasat_board_info.li_eeprom_info.ipaddr,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_lasat_ip,
+		.proc_handler	= proc_lasat_ip,
 	},
 	{
 		.procname	= "netmask",
 		.data		= &lasat_board_info.li_eeprom_info.netmask,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_lasat_ip,
+		.proc_handler	= proc_lasat_ip,
 	},
 #endif
 	{
@@ -227,14 +227,14 @@ static ctl_table lasat_table[] = {
 		.maxlen		=
 			sizeof(lasat_board_info.li_eeprom_info.passwd_hash),
 		.mode		= 0600,
-		.proc_handler	= &proc_dolasatstring,
+		.proc_handler	= proc_dolasatstring,
 	},
 	{
 		.procname	= "boot-service",
 		.data		= &lasat_boot_to_service,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #ifdef CONFIG_DS1603
 	{
@@ -242,7 +242,7 @@ static ctl_table lasat_table[] = {
 		.data		= &rtctmp,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dolasatrtc,
+		.proc_handler	= proc_dolasatrtc,
 	},
 #endif
 	{
@@ -250,14 +250,14 @@ static ctl_table lasat_table[] = {
 		.data		= &lasat_board_info.li_namestr,
 		.maxlen		= sizeof(lasat_board_info.li_namestr),
 		.mode		= 0444,
-		.proc_handler	= &proc_dostring,
+		.proc_handler	= proc_dostring,
 	},
 	{
 		.procname	= "typestr",
 		.data		= &lasat_board_info.li_typestr,
 		.maxlen		= sizeof(lasat_board_info.li_typestr),
 		.mode		= 0444,
-		.proc_handler	= &proc_dostring,
+		.proc_handler	= proc_dostring,
 	},
 	{}
 };
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index cece9e2cc5e4..049dda60e475 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -114,7 +114,7 @@ static ctl_table powersave_nap_ctl_table[]={
 		.data		= &powersave_nap,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{}
 };
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index b55fd7ed1c31..495589950dc7 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -61,12 +61,12 @@ static struct ctl_table appldata_table[] = {
 	{
 		.procname	= "timer",
 		.mode		= S_IRUGO | S_IWUSR,
-		.proc_handler	= &appldata_timer_handler,
+		.proc_handler	= appldata_timer_handler,
 	},
 	{
 		.procname	= "interval",
 		.mode		= S_IRUGO | S_IWUSR,
-		.proc_handler	= &appldata_interval_handler,
+		.proc_handler	= appldata_interval_handler,
 	},
 	{ },
 };
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index adf11260260a..071c81f179ef 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -897,14 +897,14 @@ static struct ctl_table s390dbf_table[] = {
 		.data		= &debug_stoppable,
 		.maxlen		= sizeof(int),
 		.mode           = S_IRUGO | S_IWUSR,
-		.proc_handler   = &proc_dointvec,
+		.proc_handler   = proc_dointvec,
 	},
 	 {
 		.procname       = "debug_active",
 		.data		= &debug_active,
 		.maxlen		= sizeof(int),
 		.mode           = S_IRUGO | S_IWUSR,
-		.proc_handler   = &s390dbf_procactive,
+		.proc_handler   = s390dbf_procactive,
 	},
 	{ }
 };
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index dab3e4a7582e..ff58779bf7e9 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -343,17 +343,17 @@ static struct ctl_table cmm_table[] = {
 	{
 		.procname	= "cmm_pages",
 		.mode		= 0644,
-		.proc_handler	= &cmm_pages_handler,
+		.proc_handler	= cmm_pages_handler,
 	},
 	{
 		.procname	= "cmm_timed_pages",
 		.mode		= 0644,
-		.proc_handler	= &cmm_pages_handler,
+		.proc_handler	= cmm_pages_handler,
 	},
 	{
 		.procname	= "cmm_timeout",
 		.mode		= 0644,
-		.proc_handler	= &cmm_timeout_handler,
+		.proc_handler	= cmm_timeout_handler,
 	},
 	{ }
 };
diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c
index 080c8ee2d862..75c0cbe2eda0 100644
--- a/arch/sh/kernel/traps_64.c
+++ b/arch/sh/kernel/traps_64.c
@@ -881,21 +881,21 @@ static ctl_table unaligned_table[] = {
 		.data		= &kernel_mode_unaligned_fixup_count,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{
 		.procname	= "user_reports",
 		.data		= &user_mode_unaligned_fixup_count,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{
 		.procname	= "user_enable",
 		.data		= &user_mode_unaligned_fixup_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec},
+		.proc_handler	= proc_dointvec},
 	{}
 };
 
diff --git a/crypto/proc.c b/crypto/proc.c
index fe95975fc533..1c38733c224d 100644
--- a/crypto/proc.c
+++ b/crypto/proc.c
@@ -29,7 +29,7 @@ static struct ctl_table crypto_sysctl_table[] = {
 		.data           = &fips_enabled,
 		.maxlen         = sizeof(int),
 		.mode           = 0444,
-		.proc_handler   = &proc_dointvec
+		.proc_handler   = proc_dointvec
 	},
 	{}
 };
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 1872b6dc168a..e3749d0ba68b 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3557,42 +3557,42 @@ static ctl_table cdrom_table[] = {
 		.data		= &cdrom_sysctl_settings.info, 
 		.maxlen		= CDROM_STR_SIZE,
 		.mode		= 0444,
-		.proc_handler	= &cdrom_sysctl_info,
+		.proc_handler	= cdrom_sysctl_info,
 	},
 	{
 		.procname	= "autoclose",
 		.data		= &cdrom_sysctl_settings.autoclose,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler,
+		.proc_handler	= cdrom_sysctl_handler,
 	},
 	{
 		.procname	= "autoeject",
 		.data		= &cdrom_sysctl_settings.autoeject,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler,
+		.proc_handler	= cdrom_sysctl_handler,
 	},
 	{
 		.procname	= "debug",
 		.data		= &cdrom_sysctl_settings.debug,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler,
+		.proc_handler	= cdrom_sysctl_handler,
 	},
 	{
 		.procname	= "lock",
 		.data		= &cdrom_sysctl_settings.lock,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler,
+		.proc_handler	= cdrom_sysctl_handler,
 	},
 	{
 		.procname	= "check_media",
 		.data		= &cdrom_sysctl_settings.check,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler
+		.proc_handler	= cdrom_sysctl_handler
 	},
 	{ }
 };
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index a05a6112240b..e481c5938bad 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -679,7 +679,7 @@ static ctl_table hpet_table[] = {
 	 .data = &hpet_max_freq,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec,
+	 .proc_handler = proc_dointvec,
 	 },
 	{}
 };
diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index aa39722696dd..0dec5da000ef 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -664,7 +664,7 @@ static ctl_table ipmi_table[] = {
 	  .data		= &poweroff_powercycle,
 	  .maxlen	= sizeof(poweroff_powercycle),
 	  .mode		= 0644,
-	  .proc_handler	= &proc_dointvec },
+	  .proc_handler	= proc_dointvec },
 	{ }
 };
 
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index d516e9ced3c2..d86c0bc05c1c 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -435,7 +435,7 @@ static struct ctl_table pty_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.data		= &pty_limit,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &pty_limit_min,
 		.extra2		= &pty_limit_max,
 	}, {
@@ -443,7 +443,7 @@ static struct ctl_table pty_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
 		.data		= &pty_count,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	}, 
 	{}
 };
diff --git a/drivers/char/random.c b/drivers/char/random.c
index bcf680f9ff58..dcd08635cf1b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1264,13 +1264,13 @@ ctl_table random_table[] = {
 		.data		= &sysctl_poolsize,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "entropy_avail",
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 		.data		= &input_pool.entropy_count,
 	},
 	{
@@ -1278,7 +1278,7 @@ ctl_table random_table[] = {
 		.data		= &random_read_wakeup_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_read_thresh,
 		.extra2		= &max_read_thresh,
 	},
@@ -1287,7 +1287,7 @@ ctl_table random_table[] = {
 		.data		= &random_write_wakeup_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_write_thresh,
 		.extra2		= &max_write_thresh,
 	},
@@ -1296,13 +1296,13 @@ ctl_table random_table[] = {
 		.data		= &sysctl_bootid,
 		.maxlen		= 16,
 		.mode		= 0444,
-		.proc_handler	= &proc_do_uuid,
+		.proc_handler	= proc_do_uuid,
 	},
 	{
 		.procname	= "uuid",
 		.maxlen		= 16,
 		.mode		= 0444,
-		.proc_handler	= &proc_do_uuid,
+		.proc_handler	= proc_do_uuid,
 	},
 	{ }
 };
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 37bfe23c218e..95acb8c880f4 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -286,7 +286,7 @@ static ctl_table rtc_table[] = {
 		.data		= &rtc_max_user_freq,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{ }
 };
diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c
index 2dd29b42a49e..7b4ef5bb556b 100644
--- a/drivers/macintosh/mac_hid.c
+++ b/drivers/macintosh/mac_hid.c
@@ -31,21 +31,21 @@ static ctl_table mac_hid_files[] = {
 		.data		= &mouse_emulate_buttons,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "mouse_button2_keycode",
 		.data		= &mouse_button2_keycode,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "mouse_button3_keycode",
 		.data		= &mouse_button3_keycode,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{ }
 };
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1d529551e944..1d7b9a23c765 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -102,14 +102,14 @@ static ctl_table raid_table[] = {
 		.data		= &sysctl_speed_limit_min,
 		.maxlen		= sizeof(int),
 		.mode		= S_IRUGO|S_IWUSR,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "speed_limit_max",
 		.data		= &sysctl_speed_limit_max,
 		.maxlen		= sizeof(int),
 		.mode		= S_IRUGO|S_IWUSR,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{ }
 };
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index ce98b9373be7..832ed4c88cf7 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -93,7 +93,7 @@ static ctl_table xpc_sys_xpc_hb_dir[] = {
 	 .data = &xpc_hb_interval,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec_minmax,
+	 .proc_handler = proc_dointvec_minmax,
 	 .extra1 = &xpc_hb_min_interval,
 	 .extra2 = &xpc_hb_max_interval},
 	{
@@ -101,7 +101,7 @@ static ctl_table xpc_sys_xpc_hb_dir[] = {
 	 .data = &xpc_hb_check_interval,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec_minmax,
+	 .proc_handler = proc_dointvec_minmax,
 	 .extra1 = &xpc_hb_check_min_interval,
 	 .extra2 = &xpc_hb_check_max_interval},
 	{}
@@ -116,7 +116,7 @@ static ctl_table xpc_sys_xpc_dir[] = {
 	 .data = &xpc_disengage_timelimit,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec_minmax,
+	 .proc_handler = proc_dointvec_minmax,
 	 .extra1 = &xpc_disengage_min_timelimit,
 	 .extra2 = &xpc_disengage_max_timelimit},
 	{}
diff --git a/drivers/net/wireless/arlan-proc.c b/drivers/net/wireless/arlan-proc.c
index 66a0b330b756..b22983e6c0cf 100644
--- a/drivers/net/wireless/arlan-proc.c
+++ b/drivers/net/wireless/arlan-proc.c
@@ -819,15 +819,15 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write,
 #define CTBLN(card,nam) \
         { .procname = #nam,\
           .data = &(arlan_conf[card].nam),\
-          .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec}
+          .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec}
 #ifdef ARLAN_DEBUGGING
 
 #define ARLAN_PROC_DEBUG_ENTRIES \
         { .procname = "entry_exit_debug",\
           .data = &arlan_entry_and_exit_debug,\
-          .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec},\
+          .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec},\
 	{ .procname = "debug", .data = &arlan_debug,\
-          .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec},
+          .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec},
 #else 
 #define ARLAN_PROC_DEBUG_ENTRIES
 #endif
@@ -864,7 +864,7 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write,
 	CTBLN(cardNo, channelSet), \
 	{ .procname = "name",\
 	 .data = arlan_conf[cardNo].siteName,\
-	 .maxlen = 16, .mode = 0600, .proc_handler = &proc_dostring},\
+	 .maxlen = 16, .mode = 0600, .proc_handler = proc_dostring},\
 	CTBLN(cardNo,waitTime),\
 	CTBLN(cardNo,lParameter),\
 	CTBLN(cardNo,_15),\
@@ -906,35 +906,35 @@ static ctl_table arlan_conf_table0[] =
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_infotxRing,
+		.proc_handler	= arlan_sysctl_infotxRing,
 	},
 	{
 		.procname	= "arlan0-rxRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_inforxRing,
+		.proc_handler	= arlan_sysctl_inforxRing,
 	},
 	{
 		.procname	= "arlan0-18",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info18,
+		.proc_handler	= arlan_sysctl_info18,
 	},
 	{
 		.procname	= "arlan0-ring",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info161719,
+		.proc_handler	= arlan_sysctl_info161719,
 	},
 	{
 		.procname	= "arlan0-shm-cpy",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info,
+		.proc_handler	= arlan_sysctl_info,
 	},
 #endif
 	{
@@ -942,14 +942,14 @@ static ctl_table arlan_conf_table0[] =
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_configure
+		.proc_handler	= arlan_configure
 	},
 	{
 		.procname	= "reset0",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_reset,
+		.proc_handler	= arlan_sysctl_reset,
 	},
 	{  }
 };
@@ -965,35 +965,35 @@ static ctl_table arlan_conf_table1[] =
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_infotxRing,
+		.proc_handler	= arlan_sysctl_infotxRing,
 	},
 	{
 		.procname	= "arlan1-rxRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_inforxRing,
+		.proc_handler	= arlan_sysctl_inforxRing,
 	},
 	{
 		.procname	= "arlan1-18",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info18,
+		.proc_handler	= arlan_sysctl_info18,
 	},
 	{
 		.procname	= "arlan1-ring",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info161719,
+		.proc_handler	= arlan_sysctl_info161719,
 	},
 	{
 		.procname	= "arlan1-shm-cpy",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info,
+		.proc_handler	= arlan_sysctl_info,
 	},
 #endif
 	{
@@ -1001,14 +1001,14 @@ static ctl_table arlan_conf_table1[] =
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_configure,
+		.proc_handler	= arlan_configure,
 	},
 	{
 		.procname	= "reset1",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_reset,
+		.proc_handler	= arlan_sysctl_reset,
 	},
 	{ }
 };
@@ -1024,35 +1024,35 @@ static ctl_table arlan_conf_table2[] =
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_infotxRing,
+		.proc_handler	= arlan_sysctl_infotxRing,
 	},
 	{
 		.procname	= "arlan2-rxRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_inforxRing,
+		.proc_handler	= arlan_sysctl_inforxRing,
 	},
 	{
 		.procname	= "arlan2-18",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info18,
+		.proc_handler	= arlan_sysctl_info18,
 	},
 	{
 		.procname	= "arlan2-ring",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info161719,
+		.proc_handler	= arlan_sysctl_info161719,
 	},
 	{
 		.procname	= "arlan2-shm-cpy",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info,
+		.proc_handler	= arlan_sysctl_info,
 	},
 #endif
 	{
@@ -1060,14 +1060,14 @@ static ctl_table arlan_conf_table2[] =
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_configure,
+		.proc_handler	= arlan_configure,
 	},
 	{
 		.procname	= "reset2",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_reset,
+		.proc_handler	= arlan_sysctl_reset,
 	},
 	{ }
 };
@@ -1083,35 +1083,35 @@ static ctl_table arlan_conf_table3[] =
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_infotxRing,
+		.proc_handler	= arlan_sysctl_infotxRing,
 	},
 	{
 		.procname	= "arlan3-rxRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_inforxRing,
+		.proc_handler	= arlan_sysctl_inforxRing,
 	},
 	{
 		.procname	= "arlan3-18",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info18,
+		.proc_handler	= arlan_sysctl_info18,
 	},
 	{
 		.procname	= "arlan3-ring",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info161719,
+		.proc_handler	= arlan_sysctl_info161719,
 	},
 	{
 		.procname	= "arlan3-shm-cpy",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info,
+		.proc_handler	= arlan_sysctl_info,
 	},
 #endif
 	{
@@ -1119,14 +1119,14 @@ static ctl_table arlan_conf_table3[] =
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_configure,
+		.proc_handler	= arlan_configure,
 	},
 	{
 		.procname	= "reset3",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_reset,
+		.proc_handler	= arlan_sysctl_reset,
 	},
 	{ }
 };
diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index f808bdbf1772..3f56bc086cb5 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -270,7 +270,7 @@ static const struct parport_sysctl_table parport_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.proc_handler	= &proc_dointvec_minmax,
+			.proc_handler	= proc_dointvec_minmax,
 			.extra1		= (void*) &parport_min_spintime_value,
 			.extra2		= (void*) &parport_max_spintime_value
 		},
@@ -279,28 +279,28 @@ static const struct parport_sysctl_table parport_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_hardware_base_addr
+			.proc_handler	= do_hardware_base_addr
 		},
 		{
 			.procname	= "irq",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_hardware_irq
+			.proc_handler	= do_hardware_irq
 		},
 		{
 			.procname	= "dma",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_hardware_dma
+			.proc_handler	= do_hardware_dma
 		},
 		{
 			.procname	= "modes",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_hardware_modes
+			.proc_handler	= do_hardware_modes
 		},
 		PARPORT_DEVICES_ROOT_DIR,
 #ifdef CONFIG_PARPORT_1284
@@ -309,35 +309,35 @@ static const struct parport_sysctl_table parport_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 		{
 			.procname	= "autoprobe0",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	=  &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 		{
 			.procname	= "autoprobe1",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 		{
 			.procname	= "autoprobe2",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 		{
 			.procname	= "autoprobe3",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 #endif /* IEEE 1284 support */
 		{}
@@ -348,7 +348,7 @@ static const struct parport_sysctl_table parport_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_active_device
+			.proc_handler	= do_active_device
 		},
 		{}
 	},
@@ -386,7 +386,7 @@ parport_device_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= sizeof(unsigned long),
 			.mode		= 0644,
-			.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
+			.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
 			.extra1		= (void*) &parport_min_timeslice_value,
 			.extra2		= (void*) &parport_max_timeslice_value
 		},
@@ -437,7 +437,7 @@ parport_default_sysctl_table = {
 			.data		= &parport_default_timeslice,
 			.maxlen		= sizeof(parport_default_timeslice),
 			.mode		= 0644,
-			.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
+			.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
 			.extra1		= (void*) &parport_min_timeslice_value,
 			.extra2		= (void*) &parport_max_timeslice_value
 		},
@@ -446,7 +446,7 @@ parport_default_sysctl_table = {
 			.data		= &parport_default_spintime,
 			.maxlen		= sizeof(parport_default_spintime),
 			.mode		= 0644,
-			.proc_handler	= &proc_dointvec_minmax,
+			.proc_handler	= proc_dointvec_minmax,
 			.extra1		= (void*) &parport_min_spintime_value,
 			.extra2		= (void*) &parport_max_spintime_value
 		},
diff --git a/drivers/scsi/scsi_sysctl.c b/drivers/scsi/scsi_sysctl.c
index 42c31bee7113..2b6b93f7d8ef 100644
--- a/drivers/scsi/scsi_sysctl.c
+++ b/drivers/scsi/scsi_sysctl.c
@@ -17,7 +17,7 @@ static ctl_table scsi_table[] = {
 	  .data		= &scsi_logging_level,
 	  .maxlen	= sizeof(scsi_logging_level),
 	  .mode		= 0644,
-	  .proc_handler	= &proc_dointvec },
+	  .proc_handler	= proc_dointvec },
 	{ }
 };
 
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 354c050d4263..c6405ce3c50e 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -21,21 +21,21 @@ static ctl_table coda_table[] = {
 		.data		= &coda_timeout,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{
 		.procname	= "hard",
 		.data		= &coda_hard,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{
 		.procname	= "fake_statfs",
 		.data		= &coda_fake_statfs,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{}
 };
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 70aa66c96c51..366c503f9657 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -251,7 +251,7 @@ ctl_table epoll_table[] = {
 		.data		= &max_user_watches,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
 	{ }
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 307ed4c3e1f5..e50cfa3d9654 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -375,7 +375,7 @@ static ctl_table nlm_sysctls[] = {
 		.data		= &nlm_grace_period,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= (unsigned long *) &nlm_grace_period_min,
 		.extra2		= (unsigned long *) &nlm_grace_period_max,
 	},
@@ -384,7 +384,7 @@ static ctl_table nlm_sysctls[] = {
 		.data		= &nlm_timeout,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= (unsigned long *) &nlm_timeout_min,
 		.extra2		= (unsigned long *) &nlm_timeout_max,
 	},
@@ -393,7 +393,7 @@ static ctl_table nlm_sysctls[] = {
 		.data		= &nlm_udpport,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (int *) &nlm_port_min,
 		.extra2		= (int *) &nlm_port_max,
 	},
@@ -402,7 +402,7 @@ static ctl_table nlm_sysctls[] = {
 		.data		= &nlm_tcpport,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (int *) &nlm_port_min,
 		.extra2		= (int *) &nlm_port_max,
 	},
@@ -411,14 +411,14 @@ static ctl_table nlm_sysctls[] = {
 		.data		= &nsm_use_hostnames,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "nsm_local_state",
 		.data		= &nsm_local_state,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{ }
 };
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index af51e6af2072..70e1fbbaaeab 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -26,7 +26,7 @@ static ctl_table nfs_cb_sysctls[] = {
 		.data = &nfs_callback_set_tcpport,
 		.maxlen = sizeof(int),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_minmax,
+		.proc_handler = proc_dointvec_minmax,
 		.extra1 = (int *)&nfs_set_port_min,
 		.extra2 = (int *)&nfs_set_port_max,
 	},
@@ -35,7 +35,7 @@ static ctl_table nfs_cb_sysctls[] = {
 		.data = &nfs_idmap_cache_timeout,
 		.maxlen = sizeof(int),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_jiffies,
+		.proc_handler = proc_dointvec_jiffies,
 	},
 #endif
 	{
@@ -43,14 +43,14 @@ static ctl_table nfs_cb_sysctls[] = {
 		.data		= &nfs_mountpoint_expiry_timeout,
 		.maxlen		= sizeof(nfs_mountpoint_expiry_timeout),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
 		.procname	= "nfs_congestion_kb",
 		.data		= &nfs_congestion_kb,
 		.maxlen		= sizeof(nfs_congestion_kb),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{ }
 };
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5275921ed1ce..1d1d1a2765dd 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -73,7 +73,7 @@ ctl_table inotify_table[] = {
 		.data		= &inotify_max_user_instances,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
 	{
@@ -81,7 +81,7 @@ ctl_table inotify_table[] = {
 		.data		= &inotify_max_user_watches,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
 	{
@@ -89,7 +89,7 @@ ctl_table inotify_table[] = {
 		.data		= &inotify_max_queued_events,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero
 	},
 	{ }
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 99612ea690c2..79a89184cb5e 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -40,7 +40,7 @@ static ctl_table ntfs_sysctls[] = {
 		.data		= &debug_msgs,		/* Data pointer and size. */
 		.maxlen		= sizeof(debug_msgs),
 		.mode		= 0644,			/* Mode, proc handler. */
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{}
 };
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index ed12c1161479..f3df0baa9a48 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -624,7 +624,7 @@ static ctl_table ocfs2_nm_table[] = {
 		.data		= ocfs2_hb_ctl_path,
 		.maxlen		= OCFS2_MAX_HB_CTL_PATH,
 		.mode		= 0644,
-		.proc_handler	= &proc_dostring,
+		.proc_handler	= proc_dostring,
 	},
 	{ }
 };
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 60940f8709d6..f0eb200d8f8e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2477,56 +2477,56 @@ static ctl_table fs_dqstats_table[] = {
 		.data		= &dqstats.lookups,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "drops",
 		.data		= &dqstats.drops,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "reads",
 		.data		= &dqstats.reads,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "writes",
 		.data		= &dqstats.writes,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "cache_hits",
 		.data		= &dqstats.cache_hits,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "allocated_dquots",
 		.data		= &dqstats.allocated_dquots,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "free_dquots",
 		.data		= &dqstats.free_dquots,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "syncs",
 		.data		= &dqstats.syncs,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 	{
@@ -2534,7 +2534,7 @@ static ctl_table fs_dqstats_table[] = {
 		.data		= &flag_print_warnings,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 	{ },
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 6880147cafa8..7bb5092d6ae4 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -59,7 +59,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.sgid_inherit.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.sgid_inherit.min,
 		.extra2		= &xfs_params.sgid_inherit.max
 	},
@@ -68,7 +68,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.symlink_mode.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.symlink_mode.min,
 		.extra2		= &xfs_params.symlink_mode.max
 	},
@@ -77,7 +77,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.panic_mask.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.panic_mask.min,
 		.extra2		= &xfs_params.panic_mask.max
 	},
@@ -87,7 +87,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.error_level.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.error_level.min,
 		.extra2		= &xfs_params.error_level.max
 	},
@@ -96,7 +96,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.syncd_timer.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.syncd_timer.min,
 		.extra2		= &xfs_params.syncd_timer.max
 	},
@@ -105,7 +105,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.inherit_sync.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_sync.min,
 		.extra2		= &xfs_params.inherit_sync.max
 	},
@@ -114,7 +114,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.inherit_nodump.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_nodump.min,
 		.extra2		= &xfs_params.inherit_nodump.max
 	},
@@ -123,7 +123,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.inherit_noatim.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_noatim.min,
 		.extra2		= &xfs_params.inherit_noatim.max
 	},
@@ -132,7 +132,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.xfs_buf_timer.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.xfs_buf_timer.min,
 		.extra2		= &xfs_params.xfs_buf_timer.max
 	},
@@ -141,7 +141,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.xfs_buf_age.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.xfs_buf_age.min,
 		.extra2		= &xfs_params.xfs_buf_age.max
 	},
@@ -150,7 +150,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.inherit_nosym.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_nosym.min,
 		.extra2		= &xfs_params.inherit_nosym.max
 	},
@@ -159,7 +159,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.rotorstep.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.rotorstep.min,
 		.extra2		= &xfs_params.rotorstep.max
 	},
@@ -168,7 +168,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.inherit_nodfrg.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_nodfrg.min,
 		.extra2		= &xfs_params.inherit_nodfrg.max
 	},
@@ -177,7 +177,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.fstrm_timer.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.fstrm_timer.min,
 		.extra2		= &xfs_params.fstrm_timer.max,
 	},
@@ -188,7 +188,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.stats_clear.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &xfs_stats_clear_proc_handler,
+		.proc_handler	= xfs_stats_clear_proc_handler,
 		.extra1		= &xfs_params.stats_clear.min,
 		.extra2		= &xfs_params.stats_clear.max
 	},
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 0134b15b38d8..ebd9a8317a76 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -74,7 +74,7 @@ ctl_table slow_work_sysctls[] = {
 		.data		= &vslow_work_proportion,
 		.maxlen		= sizeof(unsigned),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (void *) &slow_work_min_vslow,
 		.extra2		= (void *) &slow_work_max_vslow,
 	},
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b4a5763d6dc8..e2ccc89382cb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -249,7 +249,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_child_runs_first,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #ifdef CONFIG_SCHED_DEBUG
 	{
@@ -257,7 +257,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_min_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &sched_nr_latency_handler,
+		.proc_handler	= sched_nr_latency_handler,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
 	},
@@ -266,7 +266,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_latency,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &sched_nr_latency_handler,
+		.proc_handler	= sched_nr_latency_handler,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
 	},
@@ -275,7 +275,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_wakeup_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
@@ -284,14 +284,14 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_shares_ratelimit,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "sched_shares_thresh",
 		.data		= &sysctl_sched_shares_thresh,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
 	{
@@ -299,35 +299,35 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_features,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "sched_migration_cost",
 		.data		= &sysctl_sched_migration_cost,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "sched_nr_migrate",
 		.data		= &sysctl_sched_nr_migrate,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "sched_time_avg",
 		.data		= &sysctl_sched_time_avg,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "timer_migration",
 		.data		= &sysctl_timer_migration,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -337,21 +337,21 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_rt_period,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &sched_rt_handler,
+		.proc_handler	= sched_rt_handler,
 	},
 	{
 		.procname	= "sched_rt_runtime_us",
 		.data		= &sysctl_sched_rt_runtime,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &sched_rt_handler,
+		.proc_handler	= sched_rt_handler,
 	},
 	{
 		.procname	= "sched_compat_yield",
 		.data		= &sysctl_sched_compat_yield,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #ifdef CONFIG_PROVE_LOCKING
 	{
@@ -359,7 +359,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &prove_locking,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_LOCK_STAT
@@ -368,7 +368,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &lock_stat,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 	{
@@ -376,35 +376,35 @@ static struct ctl_table kern_table[] = {
 		.data		= &panic_timeout,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "core_uses_pid",
 		.data		= &core_uses_pid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "core_pattern",
 		.data		= core_pattern,
 		.maxlen		= CORENAME_MAX_SIZE,
 		.mode		= 0644,
-		.proc_handler	= &proc_dostring,
+		.proc_handler	= proc_dostring,
 	},
 	{
 		.procname	= "core_pipe_limit",
 		.data		= &core_pipe_limit,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "tainted",
 		.maxlen 	= sizeof(long),
 		.mode		= 0644,
-		.proc_handler	= &proc_taint,
+		.proc_handler	= proc_taint,
 	},
 #endif
 #ifdef CONFIG_LATENCYTOP
@@ -413,7 +413,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &latencytop_enabled,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -422,7 +422,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &real_root_dev,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 	{
@@ -430,7 +430,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &print_fatal_signals,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #ifdef CONFIG_SPARC
 	{
@@ -438,21 +438,21 @@ static struct ctl_table kern_table[] = {
 		.data		= reboot_command,
 		.maxlen		= 256,
 		.mode		= 0644,
-		.proc_handler	= &proc_dostring,
+		.proc_handler	= proc_dostring,
 	},
 	{
 		.procname	= "stop-a",
 		.data		= &stop_a_enabled,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "scons-poweroff",
 		.data		= &scons_pwroff,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_SPARC64
@@ -461,7 +461,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_tsb_ratio,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef __hppa__
@@ -470,14 +470,14 @@ static struct ctl_table kern_table[] = {
 		.data		= &pwrsw_enabled,
 		.maxlen		= sizeof (int),
 	 	.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "unaligned-trap",
 		.data		= &unaligned_enabled,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 	{
@@ -485,7 +485,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &C_A_D,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #ifdef CONFIG_FUNCTION_TRACER
 	{
@@ -493,7 +493,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &ftrace_enabled,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &ftrace_enable_sysctl,
+		.proc_handler	= ftrace_enable_sysctl,
 	},
 #endif
 #ifdef CONFIG_STACK_TRACER
@@ -502,7 +502,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &stack_tracer_enabled,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &stack_trace_sysctl,
+		.proc_handler	= stack_trace_sysctl,
 	},
 #endif
 #ifdef CONFIG_TRACING
@@ -511,7 +511,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &ftrace_dump_on_oops,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_MODULES
@@ -520,7 +520,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &modprobe_path,
 		.maxlen		= KMOD_PATH_LEN,
 		.mode		= 0644,
-		.proc_handler	= &proc_dostring,
+		.proc_handler	= proc_dostring,
 	},
 	{
 		.procname	= "modules_disabled",
@@ -528,7 +528,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		/* only handle a transition from default "0" to "1" */
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 		.extra2		= &one,
 	},
@@ -539,7 +539,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &uevent_helper,
 		.maxlen		= UEVENT_HELPER_PATH_LEN,
 		.mode		= 0644,
-		.proc_handler	= &proc_dostring,
+		.proc_handler	= proc_dostring,
 	},
 #endif
 #ifdef CONFIG_CHR_DEV_SG
@@ -548,7 +548,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sg_big_buff,
 		.maxlen		= sizeof (int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
@@ -557,7 +557,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &acct_parm,
 		.maxlen		= 3*sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -566,7 +566,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &__sysrq_enabled,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_PROC_SYSCTL
@@ -575,7 +575,7 @@ static struct ctl_table kern_table[] = {
 		.data		= NULL,
 		.maxlen		= sizeof (int),
 		.mode		= 0600,
-		.proc_handler	= &proc_do_cad_pid,
+		.proc_handler	= proc_do_cad_pid,
 	},
 #endif
 	{
@@ -583,7 +583,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &max_threads,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "random",
@@ -595,7 +595,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &overflowuid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
@@ -604,7 +604,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &overflowgid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
@@ -615,7 +615,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_ieee_emulation_warnings,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 	{
@@ -623,7 +623,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_userprocess_debug,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 	{
@@ -631,7 +631,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &pid_max,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &pid_max_min,
 		.extra2		= &pid_max_max,
 	},
@@ -640,7 +640,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &panic_on_oops,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #if defined CONFIG_PRINTK
 	{
@@ -648,28 +648,28 @@ static struct ctl_table kern_table[] = {
 		.data		= &console_loglevel,
 		.maxlen		= 4*sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "printk_ratelimit",
 		.data		= &printk_ratelimit_state.interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
 		.procname	= "printk_ratelimit_burst",
 		.data		= &printk_ratelimit_state.burst,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "printk_delay",
 		.data		= &printk_delay_msec,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &ten_thousand,
 	},
@@ -679,7 +679,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &ngroups_max,
 		.maxlen		= sizeof (int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
@@ -687,14 +687,14 @@ static struct ctl_table kern_table[] = {
 		.data           = &unknown_nmi_panic,
 		.maxlen         = sizeof (int),
 		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
+		.proc_handler   = proc_dointvec,
 	},
 	{
 		.procname       = "nmi_watchdog",
 		.data           = &nmi_watchdog_enabled,
 		.maxlen         = sizeof (int),
 		.mode           = 0644,
-		.proc_handler   = &proc_nmi_enabled,
+		.proc_handler   = proc_nmi_enabled,
 	},
 #endif
 #if defined(CONFIG_X86)
@@ -703,42 +703,42 @@ static struct ctl_table kern_table[] = {
 		.data		= &panic_on_unrecovered_nmi,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "panic_on_io_nmi",
 		.data		= &panic_on_io_nmi,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "bootloader_type",
 		.data		= &bootloader_type,
 		.maxlen		= sizeof (int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "bootloader_version",
 		.data		= &bootloader_version,
 		.maxlen		= sizeof (int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "kstack_depth_to_print",
 		.data		= &kstack_depth_to_print,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "io_delay_type",
 		.data		= &io_delay_type,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #if defined(CONFIG_MMU)
@@ -747,7 +747,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &randomize_va_space,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
@@ -756,7 +756,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &spin_retry,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #if	defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
@@ -765,7 +765,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &acpi_realmode_flags,
 		.maxlen		= sizeof (unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 #endif
 #ifdef CONFIG_IA64
@@ -774,14 +774,14 @@ static struct ctl_table kern_table[] = {
 		.data		= &no_unaligned_warning,
 		.maxlen		= sizeof (int),
 	 	.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "unaligned-dump-stack",
 		.data		= &unaligned_dump_stack,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -790,7 +790,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &softlockup_panic,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -799,7 +799,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &softlockup_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dosoftlockup_thresh,
+		.proc_handler	= proc_dosoftlockup_thresh,
 		.extra1		= &neg_one,
 		.extra2		= &sixty,
 	},
@@ -810,7 +810,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_hung_task_panic,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -819,21 +819,21 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_hung_task_check_count,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "hung_task_timeout_secs",
 		.data		= &sysctl_hung_task_timeout_secs,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_dohung_task_timeout_secs,
+		.proc_handler	= proc_dohung_task_timeout_secs,
 	},
 	{
 		.procname	= "hung_task_warnings",
 		.data		= &sysctl_hung_task_warnings,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 #endif
 #ifdef CONFIG_COMPAT
@@ -842,7 +842,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &compat_log,
 		.maxlen		= sizeof (int),
 	 	.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_RT_MUTEXES
@@ -851,7 +851,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &max_lock_depth,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 	{
@@ -859,7 +859,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &poweroff_cmd,
 		.maxlen		= POWEROFF_CMD_PATH_LEN,
 		.mode		= 0644,
-		.proc_handler	= &proc_dostring,
+		.proc_handler	= proc_dostring,
 	},
 #ifdef CONFIG_KEYS
 	{
@@ -874,7 +874,7 @@ static struct ctl_table kern_table[] = {
 		.data           = &rcutorture_runnable,
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_SLOW_WORK
@@ -890,21 +890,21 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_perf_event_paranoid,
 		.maxlen		= sizeof(sysctl_perf_event_paranoid),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "perf_event_mlock_kb",
 		.data		= &sysctl_perf_event_mlock,
 		.maxlen		= sizeof(sysctl_perf_event_mlock),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "perf_event_max_sample_rate",
 		.data		= &sysctl_perf_event_sample_rate,
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_KMEMCHECK
@@ -913,7 +913,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &kmemcheck_enabled,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_BLOCK
@@ -922,7 +922,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &blk_iopoll_enabled,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 /*
@@ -938,49 +938,49 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_overcommit_memory,
 		.maxlen		= sizeof(sysctl_overcommit_memory),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "panic_on_oom",
 		.data		= &sysctl_panic_on_oom,
 		.maxlen		= sizeof(sysctl_panic_on_oom),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "oom_kill_allocating_task",
 		.data		= &sysctl_oom_kill_allocating_task,
 		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "oom_dump_tasks",
 		.data		= &sysctl_oom_dump_tasks,
 		.maxlen		= sizeof(sysctl_oom_dump_tasks),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
 		.maxlen		= sizeof(sysctl_overcommit_ratio),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "page-cluster", 
 		.data		= &page_cluster,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "dirty_background_ratio",
 		.data		= &dirty_background_ratio,
 		.maxlen		= sizeof(dirty_background_ratio),
 		.mode		= 0644,
-		.proc_handler	= &dirty_background_ratio_handler,
+		.proc_handler	= dirty_background_ratio_handler,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
@@ -989,7 +989,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &dirty_background_bytes,
 		.maxlen		= sizeof(dirty_background_bytes),
 		.mode		= 0644,
-		.proc_handler	= &dirty_background_bytes_handler,
+		.proc_handler	= dirty_background_bytes_handler,
 		.extra1		= &one_ul,
 	},
 	{
@@ -997,7 +997,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &vm_dirty_ratio,
 		.maxlen		= sizeof(vm_dirty_ratio),
 		.mode		= 0644,
-		.proc_handler	= &dirty_ratio_handler,
+		.proc_handler	= dirty_ratio_handler,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
@@ -1006,7 +1006,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &vm_dirty_bytes,
 		.maxlen		= sizeof(vm_dirty_bytes),
 		.mode		= 0644,
-		.proc_handler	= &dirty_bytes_handler,
+		.proc_handler	= dirty_bytes_handler,
 		.extra1		= &dirty_bytes_min,
 	},
 	{
@@ -1014,28 +1014,28 @@ static struct ctl_table vm_table[] = {
 		.data		= &dirty_writeback_interval,
 		.maxlen		= sizeof(dirty_writeback_interval),
 		.mode		= 0644,
-		.proc_handler	= &dirty_writeback_centisecs_handler,
+		.proc_handler	= dirty_writeback_centisecs_handler,
 	},
 	{
 		.procname	= "dirty_expire_centisecs",
 		.data		= &dirty_expire_interval,
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "nr_pdflush_threads",
 		.data		= &nr_pdflush_threads,
 		.maxlen		= sizeof nr_pdflush_threads,
 		.mode		= 0444 /* read-only*/,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "swappiness",
 		.data		= &vm_swappiness,
 		.maxlen		= sizeof(vm_swappiness),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
@@ -1045,7 +1045,7 @@ static struct ctl_table vm_table[] = {
 		.data		= NULL,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &hugetlb_sysctl_handler,
+		.proc_handler	= hugetlb_sysctl_handler,
 		.extra1		= (void *)&hugetlb_zero,
 		.extra2		= (void *)&hugetlb_infinity,
 	 },
@@ -1054,21 +1054,21 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_hugetlb_shm_group,
 		.maxlen		= sizeof(gid_t),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	 },
 	 {
 		.procname	= "hugepages_treat_as_movable",
 		.data		= &hugepages_treat_as_movable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &hugetlb_treat_movable_handler,
+		.proc_handler	= hugetlb_treat_movable_handler,
 	},
 	{
 		.procname	= "nr_overcommit_hugepages",
 		.data		= NULL,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &hugetlb_overcommit_handler,
+		.proc_handler	= hugetlb_overcommit_handler,
 		.extra1		= (void *)&hugetlb_zero,
 		.extra2		= (void *)&hugetlb_infinity,
 	},
@@ -1078,7 +1078,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_lowmem_reserve_ratio,
 		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
 		.mode		= 0644,
-		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
+		.proc_handler	= lowmem_reserve_ratio_sysctl_handler,
 	},
 	{
 		.procname	= "drop_caches",
@@ -1092,7 +1092,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &min_free_kbytes,
 		.maxlen		= sizeof(min_free_kbytes),
 		.mode		= 0644,
-		.proc_handler	= &min_free_kbytes_sysctl_handler,
+		.proc_handler	= min_free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
 	{
@@ -1100,7 +1100,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &percpu_pagelist_fraction,
 		.maxlen		= sizeof(percpu_pagelist_fraction),
 		.mode		= 0644,
-		.proc_handler	= &percpu_pagelist_fraction_sysctl_handler,
+		.proc_handler	= percpu_pagelist_fraction_sysctl_handler,
 		.extra1		= &min_percpu_pagelist_fract,
 	},
 #ifdef CONFIG_MMU
@@ -1109,7 +1109,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_max_map_count,
 		.maxlen		= sizeof(sysctl_max_map_count),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 #else
 	{
@@ -1117,7 +1117,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_nr_trim_pages,
 		.maxlen		= sizeof(sysctl_nr_trim_pages),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
 #endif
@@ -1126,14 +1126,14 @@ static struct ctl_table vm_table[] = {
 		.data		= &laptop_mode,
 		.maxlen		= sizeof(laptop_mode),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
 		.procname	= "block_dump",
 		.data		= &block_dump,
 		.maxlen		= sizeof(block_dump),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
 	},
 	{
@@ -1141,7 +1141,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_vfs_cache_pressure,
 		.maxlen		= sizeof(sysctl_vfs_cache_pressure),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
 	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
@@ -1150,7 +1150,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_legacy_va_layout,
 		.maxlen		= sizeof(sysctl_legacy_va_layout),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
 	},
 #endif
@@ -1160,7 +1160,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &zone_reclaim_mode,
 		.maxlen		= sizeof(zone_reclaim_mode),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
 	},
 	{
@@ -1168,7 +1168,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_min_unmapped_ratio,
 		.maxlen		= sizeof(sysctl_min_unmapped_ratio),
 		.mode		= 0644,
-		.proc_handler	= &sysctl_min_unmapped_ratio_sysctl_handler,
+		.proc_handler	= sysctl_min_unmapped_ratio_sysctl_handler,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
@@ -1177,7 +1177,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_min_slab_ratio,
 		.maxlen		= sizeof(sysctl_min_slab_ratio),
 		.mode		= 0644,
-		.proc_handler	= &sysctl_min_slab_ratio_sysctl_handler,
+		.proc_handler	= sysctl_min_slab_ratio_sysctl_handler,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
@@ -1188,7 +1188,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_stat_interval,
 		.maxlen		= sizeof(sysctl_stat_interval),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 #endif
 	{
@@ -1196,7 +1196,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &dac_mmap_min_addr,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &mmap_min_addr_handler,
+		.proc_handler	= mmap_min_addr_handler,
 	},
 #ifdef CONFIG_NUMA
 	{
@@ -1204,7 +1204,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &numa_zonelist_order,
 		.maxlen		= NUMA_ZONELIST_ORDER_LEN,
 		.mode		= 0644,
-		.proc_handler	= &numa_zonelist_order_handler,
+		.proc_handler	= numa_zonelist_order_handler,
 	},
 #endif
 #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
@@ -1214,7 +1214,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &vdso_enabled,
 		.maxlen		= sizeof(vdso_enabled),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
 	},
 #endif
@@ -1224,7 +1224,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &vm_highmem_is_dirtyable,
 		.maxlen		= sizeof(vm_highmem_is_dirtyable),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -1234,7 +1234,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &scan_unevictable_pages,
 		.maxlen		= sizeof(scan_unevictable_pages),
 		.mode		= 0644,
-		.proc_handler	= &scan_unevictable_handler,
+		.proc_handler	= scan_unevictable_handler,
 	},
 #ifdef CONFIG_MEMORY_FAILURE
 	{
@@ -1242,7 +1242,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_memory_failure_early_kill,
 		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -1251,7 +1251,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_memory_failure_recovery,
 		.maxlen		= sizeof(sysctl_memory_failure_recovery),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -1276,35 +1276,35 @@ static struct ctl_table fs_table[] = {
 		.data		= &inodes_stat,
 		.maxlen		= 2*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "inode-state",
 		.data		= &inodes_stat,
 		.maxlen		= 7*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "file-nr",
 		.data		= &files_stat,
 		.maxlen		= 3*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_nr_files,
+		.proc_handler	= proc_nr_files,
 	},
 	{
 		.procname	= "file-max",
 		.data		= &files_stat.max_files,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "nr_open",
 		.data		= &sysctl_nr_open,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &sysctl_nr_open_min,
 		.extra2		= &sysctl_nr_open_max,
 	},
@@ -1313,14 +1313,14 @@ static struct ctl_table fs_table[] = {
 		.data		= &dentry_stat,
 		.maxlen		= 6*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "overflowuid",
 		.data		= &fs_overflowuid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
@@ -1329,7 +1329,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &fs_overflowgid,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
@@ -1339,7 +1339,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &leases_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_DNOTIFY
@@ -1348,7 +1348,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &dir_notify_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_MMU
@@ -1358,7 +1358,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &lease_break_time,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_AIO
@@ -1367,14 +1367,14 @@ static struct ctl_table fs_table[] = {
 		.data		= &aio_nr,
 		.maxlen		= sizeof(aio_nr),
 		.mode		= 0444,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "aio-max-nr",
 		.data		= &aio_max_nr,
 		.maxlen		= sizeof(aio_max_nr),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 #endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
@@ -1397,7 +1397,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &suid_dumpable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index 517c6c9987ba..03f01cb4e0fe 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -71,7 +71,7 @@ ctl_table rds_ib_sysctl_table[] = {
 		.data		= &rds_ib_sysctl_max_send_wr,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 		.extra1		= &rds_ib_sysctl_max_wr_min,
 		.extra2		= &rds_ib_sysctl_max_wr_max,
 	},
@@ -80,7 +80,7 @@ ctl_table rds_ib_sysctl_table[] = {
 		.data		= &rds_ib_sysctl_max_recv_wr,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 		.extra1		= &rds_ib_sysctl_max_wr_min,
 		.extra2		= &rds_ib_sysctl_max_wr_max,
 	},
@@ -89,7 +89,7 @@ ctl_table rds_ib_sysctl_table[] = {
 		.data		= &rds_ib_sysctl_max_unsig_wrs,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 		.extra1		= &rds_ib_sysctl_max_unsig_wr_min,
 		.extra2		= &rds_ib_sysctl_max_unsig_wr_max,
 	},
@@ -98,7 +98,7 @@ ctl_table rds_ib_sysctl_table[] = {
 		.data		= &rds_ib_sysctl_max_unsig_bytes,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 		.extra1		= &rds_ib_sysctl_max_unsig_bytes_min,
 		.extra2		= &rds_ib_sysctl_max_unsig_bytes_max,
 	},
@@ -107,14 +107,14 @@ ctl_table rds_ib_sysctl_table[] = {
 		.data		= &rds_ib_sysctl_max_recv_allocation,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "flow_control",
 		.data		= &rds_ib_sysctl_flow_control,
 		.maxlen		= sizeof(rds_ib_sysctl_flow_control),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{ }
 };
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
index 3e00b01559f2..1c4428a61a02 100644
--- a/net/rds/iw_sysctl.c
+++ b/net/rds/iw_sysctl.c
@@ -61,7 +61,7 @@ ctl_table rds_iw_sysctl_table[] = {
 		.data		= &rds_iw_sysctl_max_send_wr,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 		.extra1		= &rds_iw_sysctl_max_wr_min,
 		.extra2		= &rds_iw_sysctl_max_wr_max,
 	},
@@ -70,7 +70,7 @@ ctl_table rds_iw_sysctl_table[] = {
 		.data		= &rds_iw_sysctl_max_recv_wr,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 		.extra1		= &rds_iw_sysctl_max_wr_min,
 		.extra2		= &rds_iw_sysctl_max_wr_max,
 	},
@@ -79,7 +79,7 @@ ctl_table rds_iw_sysctl_table[] = {
 		.data		= &rds_iw_sysctl_max_unsig_wrs,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 		.extra1		= &rds_iw_sysctl_max_unsig_wr_min,
 		.extra2		= &rds_iw_sysctl_max_unsig_wr_max,
 	},
@@ -88,7 +88,7 @@ ctl_table rds_iw_sysctl_table[] = {
 		.data		= &rds_iw_sysctl_max_unsig_bytes,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 		.extra1		= &rds_iw_sysctl_max_unsig_bytes_min,
 		.extra2		= &rds_iw_sysctl_max_unsig_bytes_max,
 	},
@@ -97,14 +97,14 @@ ctl_table rds_iw_sysctl_table[] = {
 		.data		= &rds_iw_sysctl_max_recv_allocation,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "flow_control",
 		.data		= &rds_iw_sysctl_flow_control,
 		.maxlen		= sizeof(rds_iw_sysctl_flow_control),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{ }
 };
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 8fb499ee3687..7829a20325d3 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -55,7 +55,7 @@ static ctl_table rds_sysctl_rds_table[] = {
 		.data		= &rds_sysctl_reconnect_min_jiffies,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_ms_jiffies_minmax,
+		.proc_handler   = proc_doulongvec_ms_jiffies_minmax,
 		.extra1		= &rds_sysctl_reconnect_min,
 		.extra2		= &rds_sysctl_reconnect_max_jiffies,
 	},
@@ -64,7 +64,7 @@ static ctl_table rds_sysctl_rds_table[] = {
 		.data		= &rds_sysctl_reconnect_max_jiffies,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_doulongvec_ms_jiffies_minmax,
+		.proc_handler   = proc_doulongvec_ms_jiffies_minmax,
 		.extra1		= &rds_sysctl_reconnect_min_jiffies,
 		.extra2		= &rds_sysctl_reconnect_max,
 	},
@@ -73,21 +73,21 @@ static ctl_table rds_sysctl_rds_table[] = {
 		.data		= &rds_sysctl_max_unacked_packets,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
+		.proc_handler   = proc_dointvec,
 	},
 	{
 		.procname	= "max_unacked_bytes",
 		.data		= &rds_sysctl_max_unacked_bytes,
 		.maxlen         = sizeof(unsigned long),
 		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
+		.proc_handler   = proc_dointvec,
 	},
 	{
 		.procname	= "ping_enable",
 		.data		= &rds_sysctl_ping_enable,
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
+		.proc_handler   = proc_dointvec,
 	},
 	{ }
 };
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index c4ece9829541..d50a042f9552 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -237,7 +237,7 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_scope_policy,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &addr_scope_max,
 	},
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index f0ce326d0178..e65dcc613339 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -139,34 +139,34 @@ static ctl_table debug_table[] = {
 		.data		= &rpc_debug,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dodebug
+		.proc_handler	= proc_dodebug
 	},
 	{
 		.procname	= "nfs_debug",
 		.data		= &nfs_debug,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dodebug
+		.proc_handler	= proc_dodebug
 	},
 	{
 		.procname	= "nfsd_debug",
 		.data		= &nfsd_debug,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dodebug
+		.proc_handler	= proc_dodebug
 	},
 	{
 		.procname	= "nlm_debug",
 		.data		= &nlm_debug,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dodebug
+		.proc_handler	= proc_dodebug
 	},
 	{
 		.procname	= "transports",
 		.maxlen		= 256,
 		.mode		= 0444,
-		.proc_handler	= &proc_do_xprt,
+		.proc_handler	= proc_do_xprt,
 	},
 	{ }
 };
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 678cee22013f..5b8a8ff93a25 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -120,7 +120,7 @@ static ctl_table svcrdma_parm_table[] = {
 		.data		= &svcrdma_max_requests,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_max_requests,
 		.extra2		= &max_max_requests
 	},
@@ -129,7 +129,7 @@ static ctl_table svcrdma_parm_table[] = {
 		.data		= &svcrdma_max_req_size,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_max_inline,
 		.extra2		= &max_max_inline
 	},
@@ -138,7 +138,7 @@ static ctl_table svcrdma_parm_table[] = {
 		.data		= &svcrdma_ord,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_ord,
 		.extra2		= &max_ord,
 	},
@@ -148,63 +148,63 @@ static ctl_table svcrdma_parm_table[] = {
 		.data		= &rdma_stat_read,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{
 		.procname	= "rdma_stat_recv",
 		.data		= &rdma_stat_recv,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{
 		.procname	= "rdma_stat_write",
 		.data		= &rdma_stat_write,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{
 		.procname	= "rdma_stat_sq_starve",
 		.data		= &rdma_stat_sq_starve,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{
 		.procname	= "rdma_stat_rq_starve",
 		.data		= &rdma_stat_rq_starve,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{
 		.procname	= "rdma_stat_rq_poll",
 		.data		= &rdma_stat_rq_poll,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{
 		.procname	= "rdma_stat_rq_prod",
 		.data		= &rdma_stat_rq_prod,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{
 		.procname	= "rdma_stat_sq_poll",
 		.data		= &rdma_stat_sq_poll,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{
 		.procname	= "rdma_stat_sq_prod",
 		.data		= &rdma_stat_sq_prod,
 		.maxlen		= sizeof(atomic_t),
 		.mode		= 0644,
-		.proc_handler	= &read_reset_stat,
+		.proc_handler	= read_reset_stat,
 	},
 	{ },
 };
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 476816062243..7018eef1dcdd 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -90,7 +90,7 @@ static ctl_table xr_tunables_table[] = {
 		.data		= &xprt_rdma_slot_table_entries,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_slot_table_size,
 		.extra2		= &max_slot_table_size
 	},
@@ -99,21 +99,21 @@ static ctl_table xr_tunables_table[] = {
 		.data		= &xprt_rdma_max_inline_read,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "rdma_max_inline_write",
 		.data		= &xprt_rdma_max_inline_write,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "rdma_inline_write_padding",
 		.data		= &xprt_rdma_inline_write_padding,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &max_padding,
 	},
@@ -122,7 +122,7 @@ static ctl_table xr_tunables_table[] = {
 		.data		= &xprt_rdma_memreg_strategy,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_memreg,
 		.extra2		= &max_memreg,
 	},
@@ -131,7 +131,7 @@ static ctl_table xr_tunables_table[] = {
 		.data		= &xprt_rdma_pad_optimize,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{ },
 };
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8b9a2079f2e3..04732d09013e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -85,7 +85,7 @@ static ctl_table xs_tunables_table[] = {
 		.data		= &xprt_udp_slot_table_entries,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_slot_table_size,
 		.extra2		= &max_slot_table_size
 	},
@@ -94,7 +94,7 @@ static ctl_table xs_tunables_table[] = {
 		.data		= &xprt_tcp_slot_table_entries,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_slot_table_size,
 		.extra2		= &max_slot_table_size
 	},
@@ -103,7 +103,7 @@ static ctl_table xs_tunables_table[] = {
 		.data		= &xprt_min_resvport,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xprt_min_resvport_limit,
 		.extra2		= &xprt_max_resvport_limit
 	},
@@ -112,7 +112,7 @@ static ctl_table xs_tunables_table[] = {
 		.data		= &xprt_max_resvport,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xprt_min_resvport_limit,
 		.extra2		= &xprt_max_resvport_limit
 	},
@@ -121,7 +121,7 @@ static ctl_table xs_tunables_table[] = {
 		.data		= &xs_tcp_fin_timeout,
 		.maxlen		= sizeof(xs_tcp_fin_timeout),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{ },
 };
diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c
index 3565e2fc10c9..ee32d181764a 100644
--- a/security/keys/sysctl.c
+++ b/security/keys/sysctl.c
@@ -21,7 +21,7 @@ ctl_table key_sysctls[] = {
 		.data = &key_quota_maxkeys,
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_minmax,
+		.proc_handler = proc_dointvec_minmax,
 		.extra1 = (void *) &one,
 		.extra2 = (void *) &max,
 	},
@@ -30,7 +30,7 @@ ctl_table key_sysctls[] = {
 		.data = &key_quota_maxbytes,
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_minmax,
+		.proc_handler = proc_dointvec_minmax,
 		.extra1 = (void *) &one,
 		.extra2 = (void *) &max,
 	},
@@ -39,7 +39,7 @@ ctl_table key_sysctls[] = {
 		.data = &key_quota_root_maxkeys,
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_minmax,
+		.proc_handler = proc_dointvec_minmax,
 		.extra1 = (void *) &one,
 		.extra2 = (void *) &max,
 	},
@@ -48,7 +48,7 @@ ctl_table key_sysctls[] = {
 		.data = &key_quota_root_maxbytes,
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_minmax,
+		.proc_handler = proc_dointvec_minmax,
 		.extra1 = (void *) &one,
 		.extra2 = (void *) &max,
 	},
@@ -57,7 +57,7 @@ ctl_table key_sysctls[] = {
 		.data = &key_gc_delay,
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_minmax,
+		.proc_handler = proc_dointvec_minmax,
 		.extra1 = (void *) &zero,
 		.extra2 = (void *) &max,
 	},
-- 
cgit v1.2.3-71-gd317


From 3d7a641e544e428191667e8b1f83f96fa46dbd65 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:23 +0000
Subject: SLOW_WORK: Wait for outstanding work items belonging to a module to
 clear

Wait for outstanding slow work items belonging to a module to clear when
unregistering that module as a user of the facility.  This prevents the put_ref
code of a work item from being taken away before it returns.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  13 ++++-
 fs/fscache/main.c           |   6 +-
 fs/fscache/object.c         |   1 +
 fs/fscache/operation.c      |   1 +
 fs/gfs2/main.c              |   4 +-
 fs/gfs2/recovery.c          |   1 +
 include/linux/slow-work.h   |   8 ++-
 kernel/slow-work.c          | 132 ++++++++++++++++++++++++++++++++++++++++++--
 8 files changed, 150 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index ebc50f808ea4..f12fda31dcdc 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -64,9 +64,11 @@ USING SLOW WORK ITEMS
 Firstly, a module or subsystem wanting to make use of slow work items must
 register its interest:
 
-	 int ret = slow_work_register_user();
+	 int ret = slow_work_register_user(struct module *module);
 
-This will return 0 if successful, or a -ve error upon failure.
+This will return 0 if successful, or a -ve error upon failure.  The module
+pointer should be the module interested in using this facility (almost
+certainly THIS_MODULE).
 
 
 Slow work items may then be set up by:
@@ -110,7 +112,12 @@ operation.  When all a module's slow work items have been processed, and the
 module has no further interest in the facility, it should unregister its
 interest:
 
-	slow_work_unregister_user();
+	slow_work_unregister_user(struct module *module);
+
+The module pointer is used to wait for all outstanding work items for that
+module before completing the unregistration.  This prevents the put_ref() code
+from being taken away before it completes.  module should almost certainly be
+THIS_MODULE.
 
 
 ===============
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 4de41b597499..add6bdb53f04 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -48,7 +48,7 @@ static int __init fscache_init(void)
 {
 	int ret;
 
-	ret = slow_work_register_user();
+	ret = slow_work_register_user(THIS_MODULE);
 	if (ret < 0)
 		goto error_slow_work;
 
@@ -80,7 +80,7 @@ error_kobj:
 error_cookie_jar:
 	fscache_proc_cleanup();
 error_proc:
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 error_slow_work:
 	return ret;
 }
@@ -97,7 +97,7 @@ static void __exit fscache_exit(void)
 	kobject_put(fscache_root);
 	kmem_cache_destroy(fscache_cookie_jar);
 	fscache_proc_cleanup();
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
 
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 392a41b1b79d..d236eb1d6f37 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -45,6 +45,7 @@ static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 
 const struct slow_work_ops fscache_object_slow_work_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= fscache_object_slow_work_get_ref,
 	.put_ref	= fscache_object_slow_work_put_ref,
 	.execute	= fscache_object_slow_work_execute,
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7f8d53b8b6b..f1a2857b2ff5 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -453,6 +453,7 @@ static void fscache_op_execute(struct slow_work *work)
 }
 
 const struct slow_work_ops fscache_op_slow_work_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= fscache_op_get_ref,
 	.put_ref	= fscache_op_put_ref,
 	.execute	= fscache_op_execute,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index eacd78a5d082..5b31f7741a8f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
-	error = slow_work_register_user();
+	error = slow_work_register_user(THIS_MODULE);
 	if (error)
 		goto fail_slow;
 
@@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void)
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 59d2695509d3..b2bb779f09ed 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -593,6 +593,7 @@ fail:
 }
 
 struct slow_work_ops gfs2_recover_ops = {
+	.owner	 = THIS_MODULE,
 	.get_ref = gfs2_recover_get_ref,
 	.put_ref = gfs2_recover_put_ref,
 	.execute = gfs2_recover_work,
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index b65c8881f07a..9adb2b30754f 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -24,6 +24,9 @@ struct slow_work;
  * The operations used to support slow work items
  */
 struct slow_work_ops {
+	/* owner */
+	struct module *owner;
+
 	/* get a ref on a work item
 	 * - return 0 if successful, -ve if not
 	 */
@@ -42,6 +45,7 @@ struct slow_work_ops {
  *   queued
  */
 struct slow_work {
+	struct module		*owner;	/* the owning module */
 	unsigned long		flags;
 #define SLOW_WORK_PENDING	0	/* item pending (further) execution */
 #define SLOW_WORK_EXECUTING	1	/* item currently executing */
@@ -84,8 +88,8 @@ static inline void vslow_work_init(struct slow_work *work,
 }
 
 extern int slow_work_enqueue(struct slow_work *work);
-extern int slow_work_register_user(void);
-extern void slow_work_unregister_user(void);
+extern int slow_work_register_user(struct module *owner);
+extern void slow_work_unregister_user(struct module *owner);
 
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 0d31135efbf4..dd08f376e406 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -22,6 +22,8 @@
 #define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
 					 * OOM */
 
+#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+
 static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
 
@@ -46,7 +48,7 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
 
 #ifdef CONFIG_SYSCTL
 static const int slow_work_min_min_threads = 2;
-static int slow_work_max_max_threads = 255;
+static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
 static const int slow_work_min_vslow = 1;
 static const int slow_work_max_vslow = 99;
 
@@ -97,6 +99,23 @@ static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
 static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
 static struct slow_work slow_work_new_thread; /* new thread starter */
 
+/*
+ * slow work ID allocation (use slow_work_queue_lock)
+ */
+static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
+
+/*
+ * Unregistration tracking to prevent put_ref() from disappearing during module
+ * unload
+ */
+#ifdef CONFIG_MODULES
+static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
+static struct module *slow_work_unreg_module;
+static struct slow_work *slow_work_unreg_work_item;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
+static DEFINE_MUTEX(slow_work_unreg_sync_lock);
+#endif
+
 /*
  * The queues of work items and the lock governing access to them.  These are
  * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
@@ -149,8 +168,11 @@ static unsigned slow_work_calc_vsmax(void)
  * Attempt to execute stuff queued on a slow thread.  Return true if we managed
  * it, false if there was nothing to do.
  */
-static bool slow_work_execute(void)
+static bool slow_work_execute(int id)
 {
+#ifdef CONFIG_MODULES
+	struct module *module;
+#endif
 	struct slow_work *work = NULL;
 	unsigned vsmax;
 	bool very_slow;
@@ -186,6 +208,12 @@ static bool slow_work_execute(void)
 	} else {
 		very_slow = false; /* avoid the compiler warning */
 	}
+
+#ifdef CONFIG_MODULES
+	if (work)
+		slow_work_thread_processing[id] = work->owner;
+#endif
+
 	spin_unlock_irq(&slow_work_queue_lock);
 
 	if (!work)
@@ -219,7 +247,18 @@ static bool slow_work_execute(void)
 		spin_unlock_irq(&slow_work_queue_lock);
 	}
 
+	/* sort out the race between module unloading and put_ref() */
 	work->ops->put_ref(work);
+
+#ifdef CONFIG_MODULES
+	module = slow_work_thread_processing[id];
+	slow_work_thread_processing[id] = NULL;
+	smp_mb();
+	if (slow_work_unreg_work_item == work ||
+	    slow_work_unreg_module == module)
+		wake_up_all(&slow_work_unreg_wq);
+#endif
+
 	return true;
 
 auto_requeue:
@@ -232,6 +271,7 @@ auto_requeue:
 	else
 		list_add_tail(&work->link, &slow_work_queue);
 	spin_unlock_irq(&slow_work_queue_lock);
+	slow_work_thread_processing[id] = NULL;
 	return true;
 }
 
@@ -368,13 +408,22 @@ static inline bool slow_work_available(int vsmax)
  */
 static int slow_work_thread(void *_data)
 {
-	int vsmax;
+	int vsmax, id;
 
 	DEFINE_WAIT(wait);
 
 	set_freezable();
 	set_user_nice(current, -5);
 
+	/* allocate ourselves an ID */
+	spin_lock_irq(&slow_work_queue_lock);
+	id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
+	BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
+	__set_bit(id, slow_work_ids);
+	spin_unlock_irq(&slow_work_queue_lock);
+
+	sprintf(current->comm, "kslowd%03u", id);
+
 	for (;;) {
 		vsmax = vslow_work_proportion;
 		vsmax *= atomic_read(&slow_work_thread_count);
@@ -395,7 +444,7 @@ static int slow_work_thread(void *_data)
 		vsmax *= atomic_read(&slow_work_thread_count);
 		vsmax /= 100;
 
-		if (slow_work_available(vsmax) && slow_work_execute()) {
+		if (slow_work_available(vsmax) && slow_work_execute(id)) {
 			cond_resched();
 			if (list_empty(&slow_work_queue) &&
 			    list_empty(&vslow_work_queue) &&
@@ -412,6 +461,10 @@ static int slow_work_thread(void *_data)
 			break;
 	}
 
+	spin_lock_irq(&slow_work_queue_lock);
+	__clear_bit(id, slow_work_ids);
+	spin_unlock_irq(&slow_work_queue_lock);
+
 	if (atomic_dec_and_test(&slow_work_thread_count))
 		complete_and_exit(&slow_work_last_thread_exited, 0);
 	return 0;
@@ -475,6 +528,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 }
 
 static const struct slow_work_ops slow_work_new_thread_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= slow_work_new_thread_get_ref,
 	.put_ref	= slow_work_new_thread_put_ref,
 	.execute	= slow_work_new_thread_execute,
@@ -546,12 +600,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
 
 /**
  * slow_work_register_user - Register a user of the facility
+ * @module: The module about to make use of the facility
  *
  * Register a user of the facility, starting up the initial threads if there
  * aren't any other users at this point.  This will return 0 if successful, or
  * an error if not.
  */
-int slow_work_register_user(void)
+int slow_work_register_user(struct module *module)
 {
 	struct task_struct *p;
 	int loop;
@@ -598,14 +653,79 @@ error:
 }
 EXPORT_SYMBOL(slow_work_register_user);
 
+/*
+ * wait for all outstanding items from the calling module to complete
+ * - note that more items may be queued whilst we're waiting
+ */
+static void slow_work_wait_for_items(struct module *module)
+{
+	DECLARE_WAITQUEUE(myself, current);
+	struct slow_work *work;
+	int loop;
+
+	mutex_lock(&slow_work_unreg_sync_lock);
+	add_wait_queue(&slow_work_unreg_wq, &myself);
+
+	for (;;) {
+		spin_lock_irq(&slow_work_queue_lock);
+
+		/* first of all, we wait for the last queued item in each list
+		 * to be processed */
+		list_for_each_entry_reverse(work, &vslow_work_queue, link) {
+			if (work->owner == module) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				slow_work_unreg_work_item = work;
+				goto do_wait;
+			}
+		}
+		list_for_each_entry_reverse(work, &slow_work_queue, link) {
+			if (work->owner == module) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				slow_work_unreg_work_item = work;
+				goto do_wait;
+			}
+		}
+
+		/* then we wait for the items being processed to finish */
+		slow_work_unreg_module = module;
+		smp_mb();
+		for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
+			if (slow_work_thread_processing[loop] == module)
+				goto do_wait;
+		}
+		spin_unlock_irq(&slow_work_queue_lock);
+		break; /* okay, we're done */
+
+	do_wait:
+		spin_unlock_irq(&slow_work_queue_lock);
+		schedule();
+		slow_work_unreg_work_item = NULL;
+		slow_work_unreg_module = NULL;
+	}
+
+	remove_wait_queue(&slow_work_unreg_wq, &myself);
+	mutex_unlock(&slow_work_unreg_sync_lock);
+}
+
 /**
  * slow_work_unregister_user - Unregister a user of the facility
+ * @module: The module whose items should be cleared
  *
  * Unregister a user of the facility, killing all the threads if this was the
  * last one.
+ *
+ * This waits for all the work items belonging to the nominated module to go
+ * away before proceeding.
  */
-void slow_work_unregister_user(void)
+void slow_work_unregister_user(struct module *module)
 {
+	/* first of all, wait for all outstanding items from the calling module
+	 * to complete */
+	if (module)
+		slow_work_wait_for_items(module);
+
+	/* then we can actually go about shutting down the facility if need
+	 * be */
 	mutex_lock(&slow_work_user_lock);
 
 	BUG_ON(slow_work_user_count <= 0);
-- 
cgit v1.2.3-71-gd317


From 4d8bb2cbccf6dccaada509aafeb01c6205c9d8c4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Nov 2009 18:10:39 +0000
Subject: SLOW_WORK: Make slow_work_ops ->get_ref/->put_ref optional

Make the ability for the slow-work facility to take references on a work item
optional as not everyone requires this.

Even the internal slow-work stubs them out, so those can be got rid of too.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  2 +-
 kernel/slow-work.c          | 36 ++++++++++++++++--------------------
 2 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index f12fda31dcdc..c655c517fc68 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -125,7 +125,7 @@ ITEM OPERATIONS
 ===============
 
 Each work item requires a table of operations of type struct slow_work_ops.
-All members are required:
+Only ->execute() is required, getting and putting of a reference are optional.
 
  (*) Get a reference on an item:
 
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index dd08f376e406..fccf421eb5c1 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -145,6 +145,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited);
 static int slow_work_user_count;
 static DEFINE_MUTEX(slow_work_user_lock);
 
+static inline int slow_work_get_ref(struct slow_work *work)
+{
+	if (work->ops->get_ref)
+		return work->ops->get_ref(work);
+
+	return 0;
+}
+
+static inline void slow_work_put_ref(struct slow_work *work)
+{
+	if (work->ops->put_ref)
+		work->ops->put_ref(work);
+}
+
 /*
  * Calculate the maximum number of active threads in the pool that are
  * permitted to process very slow work items.
@@ -248,7 +262,7 @@ static bool slow_work_execute(int id)
 	}
 
 	/* sort out the race between module unloading and put_ref() */
-	work->ops->put_ref(work);
+	slow_work_put_ref(work);
 
 #ifdef CONFIG_MODULES
 	module = slow_work_thread_processing[id];
@@ -309,7 +323,6 @@ int slow_work_enqueue(struct slow_work *work)
 	BUG_ON(slow_work_user_count <= 0);
 	BUG_ON(!work);
 	BUG_ON(!work->ops);
-	BUG_ON(!work->ops->get_ref);
 
 	/* when honouring an enqueue request, we only promise that we will run
 	 * the work function in the future; we do not promise to run it once
@@ -339,7 +352,7 @@ int slow_work_enqueue(struct slow_work *work)
 		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
 			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 		} else {
-			if (work->ops->get_ref(work) < 0)
+			if (slow_work_get_ref(work) < 0)
 				goto cant_get_ref;
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
@@ -479,21 +492,6 @@ static void slow_work_cull_timeout(unsigned long data)
 	wake_up(&slow_work_thread_wq);
 }
 
-/*
- * Get a reference on slow work thread starter
- */
-static int slow_work_new_thread_get_ref(struct slow_work *work)
-{
-	return 0;
-}
-
-/*
- * Drop a reference on slow work thread starter
- */
-static void slow_work_new_thread_put_ref(struct slow_work *work)
-{
-}
-
 /*
  * Start a new slow work thread
  */
@@ -529,8 +527,6 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 
 static const struct slow_work_ops slow_work_new_thread_ops = {
 	.owner		= THIS_MODULE,
-	.get_ref	= slow_work_new_thread_get_ref,
-	.put_ref	= slow_work_new_thread_put_ref,
 	.execute	= slow_work_new_thread_execute,
 };
 
-- 
cgit v1.2.3-71-gd317


From 0160950297c08f8233c89b9f9e7dd59cfb080809 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Nov 2009 18:10:43 +0000
Subject: SLOW_WORK: Add support for cancellation of slow work

Add support for cancellation of queued slow work and delayed slow work items.
The cancellation functions will wait for items that are pending or undergoing
execution to be discarded by the slow work facility.

Attempting to enqueue work that is in the process of being cancelled will
result in ECANCELED.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt | 12 ++++++-
 include/linux/slow-work.h   |  2 ++
 kernel/slow-work.c          | 81 +++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 88 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index c655c517fc68..2e384bd4dead 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -108,7 +108,17 @@ on the item, 0 otherwise.
 
 
 The items are reference counted, so there ought to be no need for a flush
-operation.  When all a module's slow work items have been processed, and the
+operation.  But as the reference counting is optional, means to cancel
+existing work items are also included:
+
+	cancel_slow_work(&myitem);
+
+can be used to cancel pending work.  The above cancel function waits for
+existing work to have been executed (or prevent execution of them, depending
+on timing).
+
+
+When all a module's slow work items have been processed, and the
 module has no further interest in the facility, it should unregister its
 interest:
 
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index 9adb2b30754f..eef20182d5b4 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -51,6 +51,7 @@ struct slow_work {
 #define SLOW_WORK_EXECUTING	1	/* item currently executing */
 #define SLOW_WORK_ENQ_DEFERRED	2	/* item enqueue deferred */
 #define SLOW_WORK_VERY_SLOW	3	/* item is very slow */
+#define SLOW_WORK_CANCELLING	4	/* item is being cancelled, don't enqueue */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
 };
@@ -88,6 +89,7 @@ static inline void vslow_work_init(struct slow_work *work,
 }
 
 extern int slow_work_enqueue(struct slow_work *work);
+extern void slow_work_cancel(struct slow_work *work);
 extern int slow_work_register_user(struct module *owner);
 extern void slow_work_unregister_user(struct module *owner);
 
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index fccf421eb5c1..671cc434532a 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -236,12 +236,17 @@ static bool slow_work_execute(int id)
 	if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
 		BUG();
 
-	work->ops->execute(work);
+	/* don't execute if the work is in the process of being cancelled */
+	if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		work->ops->execute(work);
 
 	if (very_slow)
 		atomic_dec(&vslow_work_executing_count);
 	clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
 
+	/* wake up anyone waiting for this work to be complete */
+	wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
+
 	/* if someone tried to enqueue the item whilst we were executing it,
 	 * then it'll be left unenqueued to avoid multiple threads trying to
 	 * execute it simultaneously
@@ -314,11 +319,16 @@ auto_requeue:
  * allowed to pick items to execute.  This ensures that very slow items won't
  * overly block ones that are just ordinarily slow.
  *
- * Returns 0 if successful, -EAGAIN if not.
+ * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
+ * attempted queued)
  */
 int slow_work_enqueue(struct slow_work *work)
 {
 	unsigned long flags;
+	int ret;
+
+	if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		return -ECANCELED;
 
 	BUG_ON(slow_work_user_count <= 0);
 	BUG_ON(!work);
@@ -335,6 +345,9 @@ int slow_work_enqueue(struct slow_work *work)
 	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
 		spin_lock_irqsave(&slow_work_queue_lock, flags);
 
+		if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
+			goto cancelled;
+
 		/* we promise that we will not attempt to execute the work
 		 * function in more than one thread simultaneously
 		 *
@@ -352,8 +365,9 @@ int slow_work_enqueue(struct slow_work *work)
 		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
 			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 		} else {
-			if (slow_work_get_ref(work) < 0)
-				goto cant_get_ref;
+			ret = slow_work_get_ref(work);
+			if (ret < 0)
+				goto failed;
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -365,12 +379,67 @@ int slow_work_enqueue(struct slow_work *work)
 	}
 	return 0;
 
-cant_get_ref:
+cancelled:
+	ret = -ECANCELED;
+failed:
 	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	return -EAGAIN;
+	return ret;
 }
 EXPORT_SYMBOL(slow_work_enqueue);
 
+static int slow_work_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
+/**
+ * slow_work_cancel - Cancel a slow work item
+ * @work: The work item to cancel
+ *
+ * This function will cancel a previously enqueued work item. If we cannot
+ * cancel the work item, it is guarenteed to have run when this function
+ * returns.
+ */
+void slow_work_cancel(struct slow_work *work)
+{
+	bool wait = true, put = false;
+
+	set_bit(SLOW_WORK_CANCELLING, &work->flags);
+
+	spin_lock_irq(&slow_work_queue_lock);
+
+	if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
+	    !list_empty(&work->link)) {
+		/* the link in the pending queue holds a reference on the item
+		 * that we will need to release */
+		list_del_init(&work->link);
+		wait = false;
+		put = true;
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+
+	} else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
+		/* the executor is holding our only reference on the item, so
+		 * we merely need to wait for it to finish executing */
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+	}
+
+	spin_unlock_irq(&slow_work_queue_lock);
+
+	/* the EXECUTING flag is set by the executor whilst the spinlock is set
+	 * and before the item is dequeued - so assuming the above doesn't
+	 * actually dequeue it, simply waiting for the EXECUTING flag to be
+	 * released here should be sufficient */
+	if (wait)
+		wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
+			    TASK_UNINTERRUPTIBLE);
+
+	clear_bit(SLOW_WORK_CANCELLING, &work->flags);
+	if (put)
+		slow_work_put_ref(work);
+}
+EXPORT_SYMBOL(slow_work_cancel);
+
 /*
  * Schedule a cull of the thread pool at some time in the near future
  */
-- 
cgit v1.2.3-71-gd317


From 6b8268b17a1ffc942bc72d7d00274e433d6b6719 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Nov 2009 18:10:47 +0000
Subject: SLOW_WORK: Add delayed_slow_work support

This adds support for starting slow work with a delay, similar
to the functionality we have for workqueues.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  16 +++++-
 include/linux/slow-work.h   |  29 ++++++++++
 kernel/slow-work.c          | 129 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 171 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 2e384bd4dead..a9d1b0ffdded 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -41,6 +41,13 @@ expand files, provided the time taken to do so isn't too long.
 Operations of both types may sleep during execution, thus tying up the thread
 loaned to it.
 
+A further class of work item is available, based on the slow work item class:
+
+ (*) Delayed slow work items.
+
+These are slow work items that have a timer to defer queueing of the item for
+a while.
+
 
 THREAD-TO-CLASS ALLOCATION
 --------------------------
@@ -93,6 +100,10 @@ Slow work items may then be set up by:
 
 	slow_work_init(&myitem, &myitem_ops);
 
+     or:
+
+	delayed_slow_work_init(&myitem, &myitem_ops);
+
      or:
 
 	vslow_work_init(&myitem, &myitem_ops);
@@ -104,7 +115,9 @@ A suitably set up work item can then be enqueued for processing:
 	int ret = slow_work_enqueue(&myitem);
 
 This will return a -ve error if the thread pool is unable to gain a reference
-on the item, 0 otherwise.
+on the item, 0 otherwise, or (for delayed work):
+
+	int ret = delayed_slow_work_enqueue(&myitem, my_jiffy_delay);
 
 
 The items are reference counted, so there ought to be no need for a flush
@@ -112,6 +125,7 @@ operation.  But as the reference counting is optional, means to cancel
 existing work items are also included:
 
 	cancel_slow_work(&myitem);
+	cancel_delayed_slow_work(&myitem);
 
 can be used to cancel pending work.  The above cancel function waits for
 existing work to have been executed (or prevent execution of them, depending
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index eef20182d5b4..b245b9a9cc0b 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -17,6 +17,7 @@
 #ifdef CONFIG_SLOW_WORK
 
 #include <linux/sysctl.h>
+#include <linux/timer.h>
 
 struct slow_work;
 
@@ -52,10 +53,16 @@ struct slow_work {
 #define SLOW_WORK_ENQ_DEFERRED	2	/* item enqueue deferred */
 #define SLOW_WORK_VERY_SLOW	3	/* item is very slow */
 #define SLOW_WORK_CANCELLING	4	/* item is being cancelled, don't enqueue */
+#define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
 };
 
+struct delayed_slow_work {
+	struct slow_work	work;
+	struct timer_list	timer;
+};
+
 /**
  * slow_work_init - Initialise a slow work item
  * @work: The work item to initialise
@@ -71,6 +78,20 @@ static inline void slow_work_init(struct slow_work *work,
 	INIT_LIST_HEAD(&work->link);
 }
 
+/**
+ * slow_work_init - Initialise a delayed slow work item
+ * @work: The work item to initialise
+ * @ops: The operations to use to handle the slow work item
+ *
+ * Initialise a delayed slow work item.
+ */
+static inline void delayed_slow_work_init(struct delayed_slow_work *dwork,
+					  const struct slow_work_ops *ops)
+{
+	init_timer(&dwork->timer);
+	slow_work_init(&dwork->work, ops);
+}
+
 /**
  * vslow_work_init - Initialise a very slow work item
  * @work: The work item to initialise
@@ -93,6 +114,14 @@ extern void slow_work_cancel(struct slow_work *work);
 extern int slow_work_register_user(struct module *owner);
 extern void slow_work_unregister_user(struct module *owner);
 
+extern int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
+				     unsigned long delay);
+
+static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork)
+{
+	slow_work_cancel(&dwork->work);
+}
+
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
 #endif
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 671cc434532a..f67e1daae93d 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -406,11 +406,40 @@ void slow_work_cancel(struct slow_work *work)
 	bool wait = true, put = false;
 
 	set_bit(SLOW_WORK_CANCELLING, &work->flags);
+	smp_mb();
+
+	/* if the work item is a delayed work item with an active timer, we
+	 * need to wait for the timer to finish _before_ getting the spinlock,
+	 * lest we deadlock against the timer routine
+	 *
+	 * the timer routine will leave DELAYED set if it notices the
+	 * CANCELLING flag in time
+	 */
+	if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
+		struct delayed_slow_work *dwork =
+			container_of(work, struct delayed_slow_work, work);
+		del_timer_sync(&dwork->timer);
+	}
 
 	spin_lock_irq(&slow_work_queue_lock);
 
-	if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
-	    !list_empty(&work->link)) {
+	if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
+		/* the timer routine aborted or never happened, so we are left
+		 * holding the timer's reference on the item and should just
+		 * drop the pending flag and wait for any ongoing execution to
+		 * finish */
+		struct delayed_slow_work *dwork =
+			container_of(work, struct delayed_slow_work, work);
+
+		BUG_ON(timer_pending(&dwork->timer));
+		BUG_ON(!list_empty(&work->link));
+
+		clear_bit(SLOW_WORK_DELAYED, &work->flags);
+		put = true;
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+
+	} else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
+		   !list_empty(&work->link)) {
 		/* the link in the pending queue holds a reference on the item
 		 * that we will need to release */
 		list_del_init(&work->link);
@@ -440,6 +469,102 @@ void slow_work_cancel(struct slow_work *work)
 }
 EXPORT_SYMBOL(slow_work_cancel);
 
+/*
+ * Handle expiry of the delay timer, indicating that a delayed slow work item
+ * should now be queued if not cancelled
+ */
+static void delayed_slow_work_timer(unsigned long data)
+{
+	struct slow_work *work = (struct slow_work *) data;
+	unsigned long flags;
+	bool queued = false, put = false;
+
+	spin_lock_irqsave(&slow_work_queue_lock, flags);
+	if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
+		clear_bit(SLOW_WORK_DELAYED, &work->flags);
+
+		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+			/* we discard the reference the timer was holding in
+			 * favour of the one the executor holds */
+			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+			put = true;
+		} else {
+			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+				list_add_tail(&work->link, &vslow_work_queue);
+			else
+				list_add_tail(&work->link, &slow_work_queue);
+			queued = true;
+		}
+	}
+
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	if (put)
+		slow_work_put_ref(work);
+	if (queued)
+		wake_up(&slow_work_thread_wq);
+}
+
+/**
+ * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
+ * @dwork: The delayed work item to queue
+ * @delay: When to start executing the work, in jiffies from now
+ *
+ * This is similar to slow_work_enqueue(), but it adds a delay before the work
+ * is actually queued for processing.
+ *
+ * The item can have delayed processing requested on it whilst it is being
+ * executed.  The delay will begin immediately, and if it expires before the
+ * item finishes executing, the item will be placed back on the queue when it
+ * has done executing.
+ */
+int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
+			      unsigned long delay)
+{
+	struct slow_work *work = &dwork->work;
+	unsigned long flags;
+	int ret;
+
+	if (delay == 0)
+		return slow_work_enqueue(&dwork->work);
+
+	BUG_ON(slow_work_user_count <= 0);
+	BUG_ON(!work);
+	BUG_ON(!work->ops);
+
+	if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		return -ECANCELED;
+
+	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+		spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+		if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+			goto cancelled;
+
+		/* the timer holds a reference whilst it is pending */
+		ret = work->ops->get_ref(work);
+		if (ret < 0)
+			goto cant_get_ref;
+
+		if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
+			BUG();
+		dwork->timer.expires = jiffies + delay;
+		dwork->timer.data = (unsigned long) work;
+		dwork->timer.function = delayed_slow_work_timer;
+		add_timer(&dwork->timer);
+
+		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	}
+
+	return 0;
+
+cancelled:
+	ret = -ECANCELED;
+cant_get_ref:
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(delayed_slow_work_enqueue);
+
 /*
  * Schedule a cull of the thread pool at some time in the near future
  */
-- 
cgit v1.2.3-71-gd317


From 8fba10a42d191de612e60e7009c8f0313f90a9b3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:51 +0000
Subject: SLOW_WORK: Allow the work items to be viewed through a /proc file

Allow the executing and queued work items to be viewed through a /proc file
for debugging purposes.  The contents look something like the following:

    THR PID   ITEM ADDR        FL MARK  DESC
    === ===== ================ == ===== ==========
      0  3005 ffff880023f52348  a 952ms FSC: OBJ17d3: LOOK
      1  3006 ffff880024e33668  2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
      2  3165 ffff8800296dd180  a 424ms FSC: OBJ17e4: LOOK
      3  4089 ffff8800262c8d78  a 212ms FSC: OBJ17ea: CRTN
      4  4090 ffff88002792bed8  2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
      5  4092 ffff88002a0ef308  2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
      6  4094 ffff88002abaf4b8  2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
      7  4095 ffff88002bb188e0  a 388ms FSC: OBJ17e9: CRTN
    vsq     - ffff880023d99668  1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
    vsq     - ffff8800295d1740  1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
    vsq     - ffff880025ba3308  1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
    vsq     - ffff880024ec83e0  1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
    vsq     - ffff880026618e00  1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
    vsq     - ffff880025a2a4b8  1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
    vsq     - ffff880023cbe6d8  9 212ms FSC: OBJ17eb: LOOK
    vsq     - ffff880024d37590  9 212ms FSC: OBJ17ec: LOOK
    vsq     - ffff880027746cb0  9 212ms FSC: OBJ17ed: LOOK
    vsq     - ffff880024d37ae8  9 212ms FSC: OBJ17ee: LOOK
    vsq     - ffff880024d37cb0  9 212ms FSC: OBJ17ef: LOOK
    vsq     - ffff880025036550  9 212ms FSC: OBJ17f0: LOOK
    vsq     - ffff8800250368e0  9 212ms FSC: OBJ17f1: LOOK
    vsq     - ffff880025036aa8  9 212ms FSC: OBJ17f2: LOOK

In the 'THR' column, executing items show the thread they're occupying and
queued threads indicate which queue they're on.  'PID' shows the process ID of
a slow-work thread that's executing something.  'FL' shows the work item flags.
'MARK' indicates how long since an item was queued or began executing.  Lastly,
the 'DESC' column permits the owner of an item to give some information.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  60 +++++++++++-
 include/linux/slow-work.h   |  11 +++
 init/Kconfig                |  10 ++
 kernel/Makefile             |   1 +
 kernel/slow-work-proc.c     | 227 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/slow-work.c          |  44 ++++++---
 kernel/slow-work.h          |  72 ++++++++++++++
 7 files changed, 413 insertions(+), 12 deletions(-)
 create mode 100644 kernel/slow-work-proc.c
 create mode 100644 kernel/slow-work.h

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index a9d1b0ffdded..f120238e70fe 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -149,7 +149,8 @@ ITEM OPERATIONS
 ===============
 
 Each work item requires a table of operations of type struct slow_work_ops.
-Only ->execute() is required, getting and putting of a reference are optional.
+Only ->execute() is required; the getting and putting of a reference and the
+describing of an item are all optional.
 
  (*) Get a reference on an item:
 
@@ -179,6 +180,16 @@ Only ->execute() is required, getting and putting of a reference are optional.
      This should perform the work required of the item.  It may sleep, it may
      perform disk I/O and it may wait for locks.
 
+ (*) View an item through /proc:
+
+	void (*desc)(struct slow_work *work, struct seq_file *m);
+
+     If supplied, this should print to 'm' a small string describing the work
+     the item is to do.  This should be no more than about 40 characters, and
+     shouldn't include a newline character.
+
+     See the 'Viewing executing and queued items' section below.
+
 
 ==================
 POOL CONFIGURATION
@@ -203,3 +214,50 @@ The slow-work thread pool has a number of configurables:
      is bounded to between 1 and one fewer than the number of active threads.
      This ensures there is always at least one thread that can process very
      slow work items, and always at least one thread that won't.
+
+
+==================================
+VIEWING EXECUTING AND QUEUED ITEMS
+==================================
+
+If CONFIG_SLOW_WORK_PROC is enabled, a proc file is made available:
+
+	/proc/slow_work_rq
+
+through which the list of work items being executed and the queues of items to
+be executed may be viewed.  The owner of a work item is given the chance to
+add some information of its own.
+
+The contents look something like the following:
+
+    THR PID   ITEM ADDR        FL MARK  DESC
+    === ===== ================ == ===== ==========
+      0  3005 ffff880023f52348  a 952ms FSC: OBJ17d3: LOOK
+      1  3006 ffff880024e33668  2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
+      2  3165 ffff8800296dd180  a 424ms FSC: OBJ17e4: LOOK
+      3  4089 ffff8800262c8d78  a 212ms FSC: OBJ17ea: CRTN
+      4  4090 ffff88002792bed8  2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
+      5  4092 ffff88002a0ef308  2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
+      6  4094 ffff88002abaf4b8  2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
+      7  4095 ffff88002bb188e0  a 388ms FSC: OBJ17e9: CRTN
+    vsq     - ffff880023d99668  1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
+    vsq     - ffff8800295d1740  1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
+    vsq     - ffff880025ba3308  1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
+    vsq     - ffff880024ec83e0  1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
+    vsq     - ffff880026618e00  1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
+    vsq     - ffff880025a2a4b8  1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
+    vsq     - ffff880023cbe6d8  9 212ms FSC: OBJ17eb: LOOK
+    vsq     - ffff880024d37590  9 212ms FSC: OBJ17ec: LOOK
+    vsq     - ffff880027746cb0  9 212ms FSC: OBJ17ed: LOOK
+    vsq     - ffff880024d37ae8  9 212ms FSC: OBJ17ee: LOOK
+    vsq     - ffff880024d37cb0  9 212ms FSC: OBJ17ef: LOOK
+    vsq     - ffff880025036550  9 212ms FSC: OBJ17f0: LOOK
+    vsq     - ffff8800250368e0  9 212ms FSC: OBJ17f1: LOOK
+    vsq     - ffff880025036aa8  9 212ms FSC: OBJ17f2: LOOK
+
+In the 'THR' column, executing items show the thread they're occupying and
+queued threads indicate which queue they're on.  'PID' shows the process ID of
+a slow-work thread that's executing something.  'FL' shows the work item flags.
+'MARK' indicates how long since an item was queued or began executing.  Lastly,
+the 'DESC' column permits the owner of an item to give some information.
+
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index b245b9a9cc0b..f41485145ed1 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -20,6 +20,9 @@
 #include <linux/timer.h>
 
 struct slow_work;
+#ifdef CONFIG_SLOW_WORK_PROC
+struct seq_file;
+#endif
 
 /*
  * The operations used to support slow work items
@@ -38,6 +41,11 @@ struct slow_work_ops {
 
 	/* execute a work item */
 	void (*execute)(struct slow_work *work);
+
+#ifdef CONFIG_SLOW_WORK_PROC
+	/* describe a work item for /proc */
+	void (*desc)(struct slow_work *work, struct seq_file *m);
+#endif
 };
 
 /*
@@ -56,6 +64,9 @@ struct slow_work {
 #define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
+#ifdef CONFIG_SLOW_WORK_PROC
+	struct timespec		mark;	/* jiffies at which queued or exec begun */
+#endif
 };
 
 struct delayed_slow_work {
diff --git a/init/Kconfig b/init/Kconfig
index 9e03ef8b311e..ab5c64801fe5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1098,6 +1098,16 @@ config SLOW_WORK
 
 	  See Documentation/slow-work.txt.
 
+config SLOW_WORK_PROC
+	bool "Slow work debugging through /proc"
+	default n
+	depends on SLOW_WORK && PROC_FS
+	help
+	  Display the contents of the slow work run queue through /proc,
+	  including items currently executing.
+
+	  See Documentation/slow-work.txt.
+
 endmenu		# General setup
 
 config HAVE_GENERIC_DMA_COHERENT
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..776ffed1556d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_SLOW_WORK_PROC) += slow-work-proc.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/slow-work-proc.c b/kernel/slow-work-proc.c
new file mode 100644
index 000000000000..3988032571f5
--- /dev/null
+++ b/kernel/slow-work-proc.c
@@ -0,0 +1,227 @@
+/* Slow work debugging
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include "slow-work.h"
+
+#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
+#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
+#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
+
+void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
+{
+	seq_puts(m, "Slow-work: New thread");
+}
+
+/*
+ * Render the time mark field on a work item into a 5-char time with units plus
+ * a space
+ */
+static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
+{
+	struct timespec now, diff;
+
+	now = CURRENT_TIME;
+	diff = timespec_sub(now, work->mark);
+
+	if (diff.tv_sec < 0)
+		seq_puts(m, "  -ve ");
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
+		seq_printf(m, "%3luns ", diff.tv_nsec);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
+		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
+		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
+	else if (diff.tv_sec <= 1)
+		seq_puts(m, "   1s ");
+	else if (diff.tv_sec < 60)
+		seq_printf(m, "%4lus ", diff.tv_sec);
+	else if (diff.tv_sec < 60 * 60)
+		seq_printf(m, "%4lum ", diff.tv_sec / 60);
+	else if (diff.tv_sec < 60 * 60 * 24)
+		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
+	else
+		seq_puts(m, "exces ");
+}
+
+/*
+ * Describe a slow work item for /proc
+ */
+static int slow_work_runqueue_show(struct seq_file *m, void *v)
+{
+	struct slow_work *work;
+	struct list_head *p = v;
+	unsigned long id;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
+		return 0;
+	case 2:
+		seq_puts(m, "=== ===== ================ == ===== ==========\n");
+		return 0;
+
+	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
+		id = (unsigned long) v - 3;
+
+		read_lock(&slow_work_execs_lock);
+		work = slow_work_execs[id];
+		if (work) {
+			smp_read_barrier_depends();
+
+			seq_printf(m, "%3lu %5d %16p %2lx ",
+				   id, slow_work_pids[id], work, work->flags);
+			slow_work_print_mark(m, work);
+
+			if (work->ops->desc)
+				work->ops->desc(work, m);
+			seq_putc(m, '\n');
+		}
+		read_unlock(&slow_work_execs_lock);
+		return 0;
+
+	default:
+		work = list_entry(p, struct slow_work, link);
+		seq_printf(m, "%3s     - %16p %2lx ",
+			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
+			   work, work->flags);
+		slow_work_print_mark(m, work);
+
+		if (work->ops->desc)
+			work->ops->desc(work, m);
+		seq_putc(m, '\n');
+		return 0;
+	}
+}
+
+/*
+ * map the iterator to a work item
+ */
+static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
+{
+	struct list_head *p;
+	unsigned long count, id;
+
+	switch (*_pos >> ITERATOR_SHIFT) {
+	case 0x0:
+		if (*_pos == 0)
+			*_pos = 1;
+		if (*_pos < 3)
+			return (void *)(unsigned long) *_pos;
+		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
+			for (id = *_pos - 3;
+			     id < SLOW_WORK_THREAD_LIMIT;
+			     id++, (*_pos)++)
+				if (slow_work_execs[id])
+					return (void *)(unsigned long) *_pos;
+		*_pos = 0x1UL << ITERATOR_SHIFT;
+
+	case 0x1:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &slow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+
+	case 0x2:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &vslow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
+{
+	spin_lock_irq(&slow_work_queue_lock);
+	return slow_work_runqueue_index(m, _pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	struct list_head *p = v;
+	unsigned long selector = *_pos >> ITERATOR_SHIFT;
+
+	(*_pos)++;
+	switch (selector) {
+	case 0x0:
+		return slow_work_runqueue_index(m, _pos);
+
+	case 0x1:
+		if (*_pos >> ITERATOR_SHIFT == 0x1) {
+			p = p->next;
+			if (p != &slow_work_queue)
+				return p;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+		p = &vslow_work_queue;
+
+	case 0x2:
+		if (*_pos >> ITERATOR_SHIFT == 0x2) {
+			p = p->next;
+			if (p != &vslow_work_queue)
+				return p;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * clean up after reading
+ */
+static void slow_work_runqueue_stop(struct seq_file *m, void *v)
+{
+	spin_unlock_irq(&slow_work_queue_lock);
+}
+
+static const struct seq_operations slow_work_runqueue_ops = {
+	.start		= slow_work_runqueue_start,
+	.stop		= slow_work_runqueue_stop,
+	.next		= slow_work_runqueue_next,
+	.show		= slow_work_runqueue_show,
+};
+
+/*
+ * open "/proc/slow_work_rq" to list queue contents
+ */
+static int slow_work_runqueue_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slow_work_runqueue_ops);
+}
+
+const struct file_operations slow_work_runqueue_fops = {
+	.owner		= THIS_MODULE,
+	.open		= slow_work_runqueue_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index f67e1daae93d..b763bc2d2670 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,13 +16,8 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/wait.h>
-
-#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
-					 * things to do */
-#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
-					 * OOM */
-
-#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+#include <linux/proc_fs.h>
+#include "slow-work.h"
 
 static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
@@ -116,6 +111,15 @@ static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
 static DEFINE_MUTEX(slow_work_unreg_sync_lock);
 #endif
 
+/*
+ * Data for tracking currently executing items for indication through /proc
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
+pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
+DEFINE_RWLOCK(slow_work_execs_lock);
+#endif
+
 /*
  * The queues of work items and the lock governing access to them.  These are
  * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
@@ -124,9 +128,9 @@ static DEFINE_MUTEX(slow_work_unreg_sync_lock);
  * There are two queues of work items: one for slow work items, and one for
  * very slow work items.
  */
-static LIST_HEAD(slow_work_queue);
-static LIST_HEAD(vslow_work_queue);
-static DEFINE_SPINLOCK(slow_work_queue_lock);
+LIST_HEAD(slow_work_queue);
+LIST_HEAD(vslow_work_queue);
+DEFINE_SPINLOCK(slow_work_queue_lock);
 
 /*
  * The thread controls.  A variable used to signal to the threads that they
@@ -182,7 +186,7 @@ static unsigned slow_work_calc_vsmax(void)
  * Attempt to execute stuff queued on a slow thread.  Return true if we managed
  * it, false if there was nothing to do.
  */
-static bool slow_work_execute(int id)
+static noinline bool slow_work_execute(int id)
 {
 #ifdef CONFIG_MODULES
 	struct module *module;
@@ -227,6 +231,10 @@ static bool slow_work_execute(int id)
 	if (work)
 		slow_work_thread_processing[id] = work->owner;
 #endif
+	if (work) {
+		slow_work_mark_time(work);
+		slow_work_begin_exec(id, work);
+	}
 
 	spin_unlock_irq(&slow_work_queue_lock);
 
@@ -247,6 +255,8 @@ static bool slow_work_execute(int id)
 	/* wake up anyone waiting for this work to be complete */
 	wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
 
+	slow_work_end_exec(id, work);
+
 	/* if someone tried to enqueue the item whilst we were executing it,
 	 * then it'll be left unenqueued to avoid multiple threads trying to
 	 * execute it simultaneously
@@ -285,6 +295,7 @@ auto_requeue:
 	 * - we transfer our ref on the item back to the appropriate queue
 	 * - don't wake another thread up as we're awake already
 	 */
+	slow_work_mark_time(work);
 	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 		list_add_tail(&work->link, &vslow_work_queue);
 	else
@@ -368,6 +379,7 @@ int slow_work_enqueue(struct slow_work *work)
 			ret = slow_work_get_ref(work);
 			if (ret < 0)
 				goto failed;
+			slow_work_mark_time(work);
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -489,6 +501,7 @@ static void delayed_slow_work_timer(unsigned long data)
 			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 			put = true;
 		} else {
+			slow_work_mark_time(work);
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -627,6 +640,7 @@ static int slow_work_thread(void *_data)
 	id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
 	BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
 	__set_bit(id, slow_work_ids);
+	slow_work_set_thread_pid(id, current->pid);
 	spin_unlock_irq(&slow_work_queue_lock);
 
 	sprintf(current->comm, "kslowd%03u", id);
@@ -669,6 +683,7 @@ static int slow_work_thread(void *_data)
 	}
 
 	spin_lock_irq(&slow_work_queue_lock);
+	slow_work_set_thread_pid(id, 0);
 	__clear_bit(id, slow_work_ids);
 	spin_unlock_irq(&slow_work_queue_lock);
 
@@ -722,6 +737,9 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 static const struct slow_work_ops slow_work_new_thread_ops = {
 	.owner		= THIS_MODULE,
 	.execute	= slow_work_new_thread_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+	.desc		= slow_work_new_thread_desc,
+#endif
 };
 
 /*
@@ -948,6 +966,10 @@ static int __init init_slow_work(void)
 #ifdef CONFIG_SYSCTL
 	if (slow_work_max_max_threads < nr_cpus * 2)
 		slow_work_max_max_threads = nr_cpus * 2;
+#endif
+#ifdef CONFIG_SLOW_WORK_PROC
+	proc_create("slow_work_rq", S_IFREG | 0400, NULL,
+		    &slow_work_runqueue_fops);
 #endif
 	return 0;
 }
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
new file mode 100644
index 000000000000..3c2f007f3ad6
--- /dev/null
+++ b/kernel/slow-work.h
@@ -0,0 +1,72 @@
+/* Slow work private definitions
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
+					 * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
+					 * OOM */
+
+#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+
+/*
+ * slow-work.c
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+extern struct slow_work *slow_work_execs[];
+extern pid_t slow_work_pids[];
+extern rwlock_t slow_work_execs_lock;
+#endif
+
+extern struct list_head slow_work_queue;
+extern struct list_head vslow_work_queue;
+extern spinlock_t slow_work_queue_lock;
+
+/*
+ * slow-work-proc.c
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+extern const struct file_operations slow_work_runqueue_fops;
+
+extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
+#endif
+
+/*
+ * Helper functions
+ */
+static inline void slow_work_set_thread_pid(int id, pid_t pid)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	slow_work_pids[id] = pid;
+#endif
+}
+
+static inline void slow_work_mark_time(struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	work->mark = CURRENT_TIME;
+#endif
+}
+
+static inline void slow_work_begin_exec(int id, struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	slow_work_execs[id] = work;
+#endif
+}
+
+static inline void slow_work_end_exec(int id, struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	write_lock(&slow_work_execs_lock);
+	slow_work_execs[id] = NULL;
+	write_unlock(&slow_work_execs_lock);
+#endif
+}
-- 
cgit v1.2.3-71-gd317


From 3bde31a4ac225cb5805be02eff6eaaf7e0766ccd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:57 +0000
Subject: SLOW_WORK: Allow a requeueable work item to sleep till the thread is
 needed

Add a function to allow a requeueable work item to sleep till the thread
processing it is needed by the slow-work facility to perform other work.

Sometimes a work item can't progress immediately, but must wait for the
completion of another work item that's currently being processed by another
slow-work thread.

In some circumstances, the waiting item could instead - theoretically - put
itself back on the queue and yield its thread back to the slow-work facility,
thus waiting till it gets processing time again before attempting to progress.
This would allow other work items processing time on that thread.

However, this only works if there is something on the queue for it to queue
behind - otherwise it will just get a thread again immediately, and will end
up cycling between the queue and the thread, eating up valuable CPU time.

So, slow_work_sleep_till_thread_needed() is provided such that an item can put
itself on a wait queue that will wake it up when the event it is actually
interested in occurs, then call this function in lieu of calling schedule().

This function will then sleep until either the item's event occurs or another
work item appears on the queue.  If another work item is queued, but the
item's event hasn't occurred, then the work item should requeue itself and
yield the thread back to the slow-work facility by returning.

This can be used by CacheFiles for an object that is being created on one
thread to wait for an object being deleted on another thread where there is
nothing on the queue for the creation to go and wait behind.  As soon as an
item appears on the queue that could be given thread time instead, CacheFiles
can stick the creating object back on the queue and return to the slow-work
facility - assuming the object deletion didn't also complete.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt | 44 +++++++++++++++++++++
 include/linux/slow-work.h   |  3 ++
 kernel/slow-work.c          | 94 ++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 132 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 0169c9d9dd16..52bc31433723 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -158,6 +158,50 @@ with a requeue pending).  This can be used to work out whether an item on which
 another depends is on the queue, thus allowing a dependent item to be queued
 after it.
 
+If the above shows an item on which another depends not to be queued, then the
+owner of the dependent item might need to wait.  However, to avoid locking up
+the threads unnecessarily be sleeping in them, it can make sense under some
+circumstances to return the work item to the queue, thus deferring it until
+some other items have had a chance to make use of the yielded thread.
+
+To yield a thread and defer an item, the work function should simply enqueue
+the work item again and return.  However, this doesn't work if there's nothing
+actually on the queue, as the thread just vacated will jump straight back into
+the item's work function, thus busy waiting on a CPU.
+
+Instead, the item should use the thread to wait for the dependency to go away,
+but rather than using schedule() or schedule_timeout() to sleep, it should use
+the following function:
+
+	bool requeue = slow_work_sleep_till_thread_needed(
+			struct slow_work *work,
+			signed long *_timeout);
+
+This will add a second wait and then sleep, such that it will be woken up if
+either something appears on the queue that could usefully make use of the
+thread - and behind which this item can be queued, or if the event the caller
+set up to wait for happens.  True will be returned if something else appeared
+on the queue and this work function should perhaps return, of false if
+something else woke it up.  The timeout is as for schedule_timeout().
+
+For example:
+
+	wq = bit_waitqueue(&my_flags, MY_BIT);
+	init_wait(&wait);
+	requeue = false;
+	do {
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (!test_bit(MY_BIT, &my_flags))
+			break;
+		requeue = slow_work_sleep_till_thread_needed(&my_work,
+							     &timeout);
+	} while (timeout > 0 && !requeue);
+	finish_wait(wq, &wait);
+	if (!test_bit(MY_BIT, &my_flags)
+		goto do_my_thing;
+	if (requeue)
+		return; // to slow_work
+
 
 ===============
 ITEM OPERATIONS
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index bfd3ab4c8898..5035a2691739 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -152,6 +152,9 @@ static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork)
 	slow_work_cancel(&dwork->work);
 }
 
+extern bool slow_work_sleep_till_thread_needed(struct slow_work *work,
+					       signed long *_timeout);
+
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
 #endif
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b763bc2d2670..da94f3c101af 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -132,6 +132,15 @@ LIST_HEAD(slow_work_queue);
 LIST_HEAD(vslow_work_queue);
 DEFINE_SPINLOCK(slow_work_queue_lock);
 
+/*
+ * The following are two wait queues that get pinged when a work item is placed
+ * on an empty queue.  These allow work items that are hogging a thread by
+ * sleeping in a way that could be deferred to yield their thread and enqueue
+ * themselves.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
+static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
+
 /*
  * The thread controls.  A variable used to signal to the threads that they
  * should exit when the queue is empty, a waitqueue used by the threads to wait
@@ -305,6 +314,50 @@ auto_requeue:
 	return true;
 }
 
+/**
+ * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
+ * work: The work item under execution that wants to sleep
+ * _timeout: Scheduler sleep timeout
+ *
+ * Allow a requeueable work item to sleep on a slow-work processor thread until
+ * that thread is needed to do some other work or the sleep is interrupted by
+ * some other event.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * False is returned if there is nothing on the queue; true is returned if the
+ * work item should be requeued
+ */
+bool slow_work_sleep_till_thread_needed(struct slow_work *work,
+					signed long *_timeout)
+{
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
+
+	DEFINE_WAIT(wait);
+
+	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+		wfo_wq = &vslow_work_queue_waits_for_occupation;
+		queue = &vslow_work_queue;
+	} else {
+		wfo_wq = &slow_work_queue_waits_for_occupation;
+		queue = &slow_work_queue;
+	}
+
+	if (!list_empty(queue))
+		return true;
+
+	add_wait_queue_exclusive(wfo_wq, &wait);
+	if (list_empty(queue))
+		*_timeout = schedule_timeout(*_timeout);
+	finish_wait(wfo_wq, &wait);
+
+	return !list_empty(queue);
+}
+EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
+
 /**
  * slow_work_enqueue - Schedule a slow work item for processing
  * @work: The work item to queue
@@ -335,6 +388,8 @@ auto_requeue:
  */
 int slow_work_enqueue(struct slow_work *work)
 {
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
 	unsigned long flags;
 	int ret;
 
@@ -354,6 +409,14 @@ int slow_work_enqueue(struct slow_work *work)
 	 * maintaining our promise
 	 */
 	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+		if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+			wfo_wq = &vslow_work_queue_waits_for_occupation;
+			queue = &vslow_work_queue;
+		} else {
+			wfo_wq = &slow_work_queue_waits_for_occupation;
+			queue = &slow_work_queue;
+		}
+
 		spin_lock_irqsave(&slow_work_queue_lock, flags);
 
 		if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
@@ -380,11 +443,13 @@ int slow_work_enqueue(struct slow_work *work)
 			if (ret < 0)
 				goto failed;
 			slow_work_mark_time(work);
-			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
-				list_add_tail(&work->link, &vslow_work_queue);
-			else
-				list_add_tail(&work->link, &slow_work_queue);
+			list_add_tail(&work->link, queue);
 			wake_up(&slow_work_thread_wq);
+
+			/* if someone who could be requeued is sleeping on a
+			 * thread, then ask them to yield their thread */
+			if (work->link.prev == queue)
+				wake_up(wfo_wq);
 		}
 
 		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
@@ -487,9 +552,19 @@ EXPORT_SYMBOL(slow_work_cancel);
  */
 static void delayed_slow_work_timer(unsigned long data)
 {
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
 	struct slow_work *work = (struct slow_work *) data;
 	unsigned long flags;
-	bool queued = false, put = false;
+	bool queued = false, put = false, first = false;
+
+	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+		wfo_wq = &vslow_work_queue_waits_for_occupation;
+		queue = &vslow_work_queue;
+	} else {
+		wfo_wq = &slow_work_queue_waits_for_occupation;
+		queue = &slow_work_queue;
+	}
 
 	spin_lock_irqsave(&slow_work_queue_lock, flags);
 	if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
@@ -502,17 +577,18 @@ static void delayed_slow_work_timer(unsigned long data)
 			put = true;
 		} else {
 			slow_work_mark_time(work);
-			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
-				list_add_tail(&work->link, &vslow_work_queue);
-			else
-				list_add_tail(&work->link, &slow_work_queue);
+			list_add_tail(&work->link, queue);
 			queued = true;
+			if (work->link.prev == queue)
+				first = true;
 		}
 	}
 
 	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
 	if (put)
 		slow_work_put_ref(work);
+	if (first)
+		wake_up(wfo_wq);
 	if (queued)
 		wake_up(&slow_work_thread_wq);
 }
-- 
cgit v1.2.3-71-gd317


From 34769945f7cd9ab470413ffe64426e3ad069f49e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 20 Nov 2009 11:46:21 +0100
Subject: genirq: Fix spurious irq seqfile conversion

single_open data argument must be PDE(inode)->data instead of NULL
otherwise seq_file->private is always NULL and we always read the
spurious data of irq 0.

Reported-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4e47f11da6a7..0832145fea97 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -136,7 +136,7 @@ out:
 
 static int default_affinity_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, default_affinity_show, NULL);
+	return single_open(file, default_affinity_show, PDE(inode)->data);
 }
 
 static const struct file_operations default_affinity_proc_fops = {
-- 
cgit v1.2.3-71-gd317


From 453f19eea7dbad837425e9b07d84568d14898794 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:43 +0100
Subject: perf: Allow for custom overflow handlers

in-kernel perf users might wish to have custom actions on the
sample interrupt.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.222339539@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 6 ++++++
 kernel/perf_event.c        | 8 +++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b5cdac0de370..a430ac3074af 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -567,6 +567,8 @@ struct perf_pending_entry {
 
 typedef void (*perf_callback_t)(struct perf_event *, void *);
 
+struct perf_sample_data;
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -658,6 +660,10 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	void (*overflow_handler)(struct perf_event *event,
+			int nmi, struct perf_sample_data *data,
+			struct pt_regs *regs);
+
 #ifdef CONFIG_EVENT_PROFILE
 	struct event_filter		*filter;
 #endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3852e2656bb0..1dfb6cc4fdea 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3710,7 +3710,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 			perf_event_disable(event);
 	}
 
-	perf_event_output(event, nmi, data, regs);
+	if (event->overflow_handler)
+		event->overflow_handler(event, nmi, data, regs);
+	else
+		perf_event_output(event, nmi, data, regs);
+
 	return ret;
 }
 
@@ -4836,6 +4840,8 @@ inherit_event(struct perf_event *parent_event,
 	if (parent_event->attr.freq)
 		child_event->hw.sample_period = parent_event->hw.sample_period;
 
+	child_event->overflow_handler = parent_event->overflow_handler;
+
 	/*
 	 * Link it up in the child's context:
 	 */
-- 
cgit v1.2.3-71-gd317


From 0cff784ae41cc125368ae77f1c01328ae2fdc6b3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:44 +0100
Subject: perf: Optimize some swcounter attr.sample_period==1 paths

Avoid the rather expensive perf_swevent_set_period() if we know
we have to sample every single event anyway.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.299508332@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 1dfb6cc4fdea..8e55b440e28a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3759,16 +3759,16 @@ again:
 	return nr;
 }
 
-static void perf_swevent_overflow(struct perf_event *event,
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 				    int nmi, struct perf_sample_data *data,
 				    struct pt_regs *regs)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int throttle = 0;
-	u64 overflow;
 
 	data->period = event->hw.last_period;
-	overflow = perf_swevent_set_period(event);
+	if (!overflow)
+		overflow = perf_swevent_set_period(event);
 
 	if (hwc->interrupts == MAX_INTERRUPTS)
 		return;
@@ -3801,14 +3801,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 
 	atomic64_add(nr, &event->count);
 
+	if (!regs)
+		return;
+
 	if (!hwc->sample_period)
 		return;
 
-	if (!regs)
+	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+		return perf_swevent_overflow(event, 1, nmi, data, regs);
+
+	if (atomic64_add_negative(nr, &hwc->period_left))
 		return;
 
-	if (!atomic64_add_negative(nr, &hwc->period_left))
-		perf_swevent_overflow(event, nmi, data, regs);
+	perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
 static int perf_swevent_is_counting(struct perf_event *event)
-- 
cgit v1.2.3-71-gd317


From 81520183878a8813c71c9372de28bb70913ba549 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:45 +0100
Subject: perf: Optimize perf_swevent_ctx_event()

Remove a rcu_read_{,un}lock() pair and a few conditionals.

We can remove the rcu_read_lock() by increasing the scope of one
in the calling function.

We can do away with the system_state check if the machine still
boots after this patch (seems to be the case).

We can do away with the list_empty() check because the bare
list_for_each_entry_rcu() reduces to that now that we've removed
everything else.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.378188589@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8e55b440e28a..cda17acfcaf8 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3886,15 +3886,10 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 {
 	struct perf_event *event;
 
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
-	rcu_read_unlock();
 }
 
 static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
@@ -3926,9 +3921,9 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 	(*recursion)++;
 	barrier();
 
+	rcu_read_lock();
 	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
 				 nr, nmi, data, regs);
-	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
 	 * events ends up in.
-- 
cgit v1.2.3-71-gd317


From d6ff86cfb50a72df820e7e839836d55d245306fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:46 +0100
Subject: perf: Optimize perf_event_task_ctx()

Remove a rcu_read_{,un}lock() pair and a few conditionals.

We can remove the rcu_read_lock() by increasing the scope of one
in the calling function.

We can do away with the system_state check if the machine still
boots after this patch (seems to be the case).

We can do away with the list_empty() check because the bare
list_for_each_entry_rcu() reduces to that now that we've removed
everything else.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.452227115@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index cda17acfcaf8..2afb305944af 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3267,15 +3267,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 {
 	struct perf_event *event;
 
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (perf_event_task_match(event))
 			perf_event_task_output(event, task_event);
 	}
-	rcu_read_unlock();
 }
 
 static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3283,11 +3278,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx = task_event->task_ctx;
 
+	rcu_read_lock();
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_event_task_ctx(&cpuctx->ctx, task_event);
 	put_cpu_var(perf_cpu_context);
 
-	rcu_read_lock();
 	if (!ctx)
 		ctx = rcu_dereference(task_event->task->perf_event_ctxp);
 	if (ctx)
-- 
cgit v1.2.3-71-gd317


From f6595f3a9680c86b6332f881a7ae2cbbcfdc8619 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:47 +0100
Subject: perf: Optimize perf_event_comm_ctx()

Remove a rcu_read_{,un}lock() pair and a few conditionals.

We can remove the rcu_read_lock() by increasing the scope of one
in the calling function.

We can do away with the system_state check if the machine still
boots after this patch (seems to be the case).

We can do away with the list_empty() check because the bare
list_for_each_entry_rcu() reduces to that now that we've removed
everything else.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.527608793@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2afb305944af..4deefaace90e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3374,15 +3374,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
 {
 	struct perf_event *event;
 
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (perf_event_comm_match(event))
 			perf_event_comm_output(event, comm_event);
 	}
-	rcu_read_unlock();
 }
 
 static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3401,11 +3396,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
+	rcu_read_lock();
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 	put_cpu_var(perf_cpu_context);
 
-	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
 	 * events ends up in.
-- 
cgit v1.2.3-71-gd317


From f6d9dd237da400effb265f3554c64413f8a3e7b4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:48 +0100
Subject: perf: Optimize perf_event_mmap_ctx()

Remove a rcu_read_{,un}lock() pair and a few conditionals.

We can remove the rcu_read_lock() by increasing the scope of one
in the calling function.

We can do away with the system_state check if the machine still
boots after this patch (seems to be the case).

We can do away with the list_empty() check because the bare
list_for_each_entry_rcu() reduces to that now that we've removed
everything else.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.606459548@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 4deefaace90e..68fbf4ff6888 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3493,15 +3493,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
 {
 	struct perf_event *event;
 
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (perf_event_mmap_match(event, mmap_event))
 			perf_event_mmap_output(event, mmap_event);
 	}
-	rcu_read_unlock();
 }
 
 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3557,11 +3552,11 @@ got_name:
 
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
+	rcu_read_lock();
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
 	put_cpu_var(perf_cpu_context);
 
-	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
 	 * events ends up in.
-- 
cgit v1.2.3-71-gd317


From abf4868b8548cae18d4fe8bbfb4e207443be01be Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:49 +0100
Subject: perf: Fix PERF_FORMAT_GROUP scale info

As Corey reported, the total_enabled and total_running times
could occasionally be 0, even though there were events counted.

It turns out this is because we record the times before reading
the counter while the latter updates the times.

This patch corrects that.

While looking at this code I found that there is a lot of
locking iffyness around, the following patches correct most of
that.

Reported-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.685559857@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 52 +++++++++++++++++++++-------------------------------
 1 file changed, 21 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 68fbf4ff6888..9a18ff28ea5b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1784,30 +1784,15 @@ u64 perf_event_read_value(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_read_value);
 
-static int perf_event_read_entry(struct perf_event *event,
-				   u64 read_format, char __user *buf)
-{
-	int n = 0, count = 0;
-	u64 values[2];
-
-	values[n++] = perf_event_read_value(event);
-	if (read_format & PERF_FORMAT_ID)
-		values[n++] = primary_event_id(event);
-
-	count = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, count))
-		return -EFAULT;
-
-	return count;
-}
-
 static int perf_event_read_group(struct perf_event *event,
 				   u64 read_format, char __user *buf)
 {
 	struct perf_event *leader = event->group_leader, *sub;
-	int n = 0, size = 0, err = -EFAULT;
-	u64 values[3];
+	int n = 0, size = 0, ret = 0;
+	u64 values[5];
+	u64 count;
+
+	count = perf_event_read_value(leader);
 
 	values[n++] = 1 + leader->nr_siblings;
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
@@ -1818,28 +1803,33 @@ static int perf_event_read_group(struct perf_event *event,
 		values[n++] = leader->total_time_running +
 			atomic64_read(&leader->child_total_time_running);
 	}
+	values[n++] = count;
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_event_id(leader);
 
 	size = n * sizeof(u64);
 
 	if (copy_to_user(buf, values, size))
 		return -EFAULT;
 
-	err = perf_event_read_entry(leader, read_format, buf + size);
-	if (err < 0)
-		return err;
-
-	size += err;
+	ret += size;
 
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		err = perf_event_read_entry(sub, read_format,
-				buf + size);
-		if (err < 0)
-			return err;
+		n = 0;
 
-		size += err;
+		values[n++] = perf_event_read_value(sub);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_event_id(sub);
+
+		size = n * sizeof(u64);
+
+		if (copy_to_user(buf + size, values, size))
+			return -EFAULT;
+
+		ret += size;
 	}
 
-	return size;
+	return ret;
 }
 
 static int perf_event_read_one(struct perf_event *event,
-- 
cgit v1.2.3-71-gd317


From 02ffdbc866c8b1c8644601e9aa6155700eed4c91 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:50 +0100
Subject: perf: Optimize perf_event_task_sched_out

Remove an update_context_time() call from the
perf_event_task_sched_out() path and into the branch its needed.

The call was both superfluous, because __perf_event_sched_out()
already does it, and wrong, because it was done without holding
ctx->lock.

Place it in perf_event_sync_stat(), which is the only place it
is needed and which does already hold ctx->lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.779516394@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9a18ff28ea5b..65f4dab0ce60 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1120,6 +1120,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	if (!ctx->nr_stat)
 		return;
 
+	update_context_time(ctx);
+
 	event = list_first_entry(&ctx->event_list,
 				   struct perf_event, event_entry);
 
@@ -1163,8 +1165,6 @@ void perf_event_task_sched_out(struct task_struct *task,
 	if (likely(!ctx || !cpuctx->task_ctx))
 		return;
 
-	update_context_time(ctx);
-
 	rcu_read_lock();
 	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_event_ctxp;
-- 
cgit v1.2.3-71-gd317


From f6f83785222b0ee037f7be90731f62a649292b5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:51 +0100
Subject: perf: Optimize __perf_event_read()

Both callers actually have IRQs disabled, no need doing so
again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.863685796@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 65f4dab0ce60..e66f6c400d13 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1517,7 +1517,6 @@ static void __perf_event_read(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
-	unsigned long flags;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -1529,12 +1528,10 @@ static void __perf_event_read(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	local_irq_save(flags);
 	if (ctx->is_active)
 		update_context_time(ctx);
 	event->pmu->read(event);
 	update_event_times(event);
-	local_irq_restore(flags);
 }
 
 static u64 perf_event_read(struct perf_event *event)
-- 
cgit v1.2.3-71-gd317


From 3dbebf15c5d3e265f751eec72c1538a00da4be27 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:52 +0100
Subject: perf: Simplify __perf_event_sync_stat

Removes constraints from __perf_event_read() by leaving it with
a single callsite; this callsite had ctx->lock held, the other
one does not.

Removes some superfluous code from __perf_event_sync_stat().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.918544317@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e66f6c400d13..af150bbcfc5b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1061,8 +1061,6 @@ static int context_equiv(struct perf_event_context *ctx1,
 		&& !ctx1->pin_count && !ctx2->pin_count;
 }
 
-static void __perf_event_read(void *event);
-
 static void __perf_event_sync_stat(struct perf_event *event,
 				     struct perf_event *next_event)
 {
@@ -1080,8 +1078,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
 	 */
 	switch (event->state) {
 	case PERF_EVENT_STATE_ACTIVE:
-		__perf_event_read(event);
-		break;
+		event->pmu->read(event);
+		/* fall-through */
 
 	case PERF_EVENT_STATE_INACTIVE:
 		update_event_times(event);
-- 
cgit v1.2.3-71-gd317


From 58e5ad1de3d6ad931c84f0cc8ef0655c922f30ad Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:53 +0100
Subject: perf: Simplify __perf_event_read

cpuctx is always active, task context is always active for
current

the previous condition verifies that if its a task context its
for current, hence we can assume ctx->is_active.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212509.000272254@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index af150bbcfc5b..028619dd6d0e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1526,10 +1526,9 @@ static void __perf_event_read(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	if (ctx->is_active)
-		update_context_time(ctx);
-	event->pmu->read(event);
+	update_context_time(ctx);
 	update_event_times(event);
+	event->pmu->read(event);
 }
 
 static u64 perf_event_read(struct perf_event *event)
-- 
cgit v1.2.3-71-gd317


From 2b8988c9f7defe319cffe0cd362a7cd356c86f62 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:54 +0100
Subject: perf: Fix time locking

Most sites updating ctx->time and event times do so under
ctx->lock, make sure they all do.

This was made possible by removing the __perf_event_read() call
from __perf_event_sync_stat(), which already had this lock
taken.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212509.102316434@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 028619dd6d0e..fdfae888a67c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1526,8 +1526,11 @@ static void __perf_event_read(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
+	spin_lock(&ctx->lock);
 	update_context_time(ctx);
 	update_event_times(event);
+	spin_unlock(&ctx->lock);
+
 	event->pmu->read(event);
 }
 
@@ -1541,7 +1544,13 @@ static u64 perf_event_read(struct perf_event *event)
 		smp_call_function_single(event->oncpu,
 					 __perf_event_read, event, 1);
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		struct perf_event_context *ctx = event->ctx;
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->lock, flags);
+		update_context_time(ctx);
 		update_event_times(event);
+		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
 	return atomic64_read(&event->count);
-- 
cgit v1.2.3-71-gd317


From 59ed446f792cc07d37b1536b9c4664d14e25e425 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:55 +0100
Subject: perf: Fix event scaling for inherited counters

Properly account the full hierarchy of counters for both the
count (we already did so) and the scale times (new).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212509.153379276@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  3 ++-
 kernel/perf_event.c        | 48 +++++++++++++++++++++++++---------------------
 2 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a430ac3074af..36fe89f72641 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -782,7 +782,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
 				pid_t pid,
 				perf_callback_t callback);
-extern u64 perf_event_read_value(struct perf_event *event);
+extern u64 perf_event_read_value(struct perf_event *event,
+				 u64 *enabled, u64 *running);
 
 struct perf_sample_data {
 	u64				type;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fdfae888a67c..80f40da9a01e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1774,14 +1774,25 @@ static int perf_event_read_size(struct perf_event *event)
 	return size;
 }
 
-u64 perf_event_read_value(struct perf_event *event)
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
 	struct perf_event *child;
 	u64 total = 0;
 
+	*enabled = 0;
+	*running = 0;
+
 	total += perf_event_read(event);
-	list_for_each_entry(child, &event->child_list, child_list)
+	*enabled += event->total_time_enabled +
+			atomic64_read(&event->child_total_time_enabled);
+	*running += event->total_time_running +
+			atomic64_read(&event->child_total_time_running);
+
+	list_for_each_entry(child, &event->child_list, child_list) {
 		total += perf_event_read(child);
+		*enabled += child->total_time_enabled;
+		*running += child->total_time_running;
+	}
 
 	return total;
 }
@@ -1793,19 +1804,15 @@ static int perf_event_read_group(struct perf_event *event,
 	struct perf_event *leader = event->group_leader, *sub;
 	int n = 0, size = 0, ret = 0;
 	u64 values[5];
-	u64 count;
+	u64 count, enabled, running;
 
-	count = perf_event_read_value(leader);
+	count = perf_event_read_value(leader, &enabled, &running);
 
 	values[n++] = 1 + leader->nr_siblings;
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = leader->total_time_enabled +
-			atomic64_read(&leader->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = leader->total_time_running +
-			atomic64_read(&leader->child_total_time_running);
-	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = enabled;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = running;
 	values[n++] = count;
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
@@ -1820,7 +1827,7 @@ static int perf_event_read_group(struct perf_event *event,
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		n = 0;
 
-		values[n++] = perf_event_read_value(sub);
+		values[n++] = perf_event_read_value(sub, &enabled, &running);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
 
@@ -1838,18 +1845,15 @@ static int perf_event_read_group(struct perf_event *event,
 static int perf_event_read_one(struct perf_event *event,
 				 u64 read_format, char __user *buf)
 {
+	u64 enabled, running;
 	u64 values[4];
 	int n = 0;
 
-	values[n++] = perf_event_read_value(event);
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = event->total_time_enabled +
-			atomic64_read(&event->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = event->total_time_running +
-			atomic64_read(&event->child_total_time_running);
-	}
+	values[n++] = perf_event_read_value(event, &enabled, &running);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = enabled;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = running;
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
 
-- 
cgit v1.2.3-71-gd317


From 6f10581aeaa5543a3b7a8c7a87a064375ec357f8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:56 +0100
Subject: perf: Fix locking for PERF_FORMAT_GROUP

We should hold event->child_mutex when iterating the inherited
counters, we should hold ctx->mutex when iterating siblings.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212509.251030114@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 80f40da9a01e..3ede0981f4ac 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1782,6 +1782,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 	*enabled = 0;
 	*running = 0;
 
+	mutex_lock(&event->child_mutex);
 	total += perf_event_read(event);
 	*enabled += event->total_time_enabled +
 			atomic64_read(&event->child_total_time_enabled);
@@ -1793,6 +1794,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 		*enabled += child->total_time_enabled;
 		*running += child->total_time_running;
 	}
+	mutex_unlock(&event->child_mutex);
 
 	return total;
 }
@@ -1802,10 +1804,12 @@ static int perf_event_read_group(struct perf_event *event,
 				   u64 read_format, char __user *buf)
 {
 	struct perf_event *leader = event->group_leader, *sub;
-	int n = 0, size = 0, ret = 0;
+	int n = 0, size = 0, ret = -EFAULT;
+	struct perf_event_context *ctx = leader->ctx;
 	u64 values[5];
 	u64 count, enabled, running;
 
+	mutex_lock(&ctx->mutex);
 	count = perf_event_read_value(leader, &enabled, &running);
 
 	values[n++] = 1 + leader->nr_siblings;
@@ -1820,9 +1824,9 @@ static int perf_event_read_group(struct perf_event *event,
 	size = n * sizeof(u64);
 
 	if (copy_to_user(buf, values, size))
-		return -EFAULT;
+		goto unlock;
 
-	ret += size;
+	ret = size;
 
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		n = 0;
@@ -1833,11 +1837,15 @@ static int perf_event_read_group(struct perf_event *event,
 
 		size = n * sizeof(u64);
 
-		if (copy_to_user(buf + size, values, size))
-			return -EFAULT;
+		if (copy_to_user(buf + size, values, size)) {
+			ret = -EFAULT;
+			goto unlock;
+		}
 
 		ret += size;
 	}
+unlock:
+	mutex_unlock(&ctx->mutex);
 
 	return ret;
 }
@@ -1884,12 +1892,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 		return -ENOSPC;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
-	mutex_lock(&event->child_mutex);
 	if (read_format & PERF_FORMAT_GROUP)
 		ret = perf_event_read_group(event, read_format, buf);
 	else
 		ret = perf_event_read_one(event, read_format, buf);
-	mutex_unlock(&event->child_mutex);
 
 	return ret;
 }
-- 
cgit v1.2.3-71-gd317


From 8904b18046c2f050107f6449e887e7c1142b9ab9 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@gmail.com>
Date: Fri, 20 Nov 2009 22:19:57 +0100
Subject: perf_events: Fix default watermark calculation

This patch fixes the default watermark value for the sampling
buffer. With the existing calculation (watermark =
max(PAGE_SIZE, max_size / 2)), no notification was ever received
when the buffer was exactly 1 page. This was because you would
never cross the threshold (there is no partial samples).

In certain configuration, there was no possibilty detecting the
problem because there was not enough space left to store the
LOST record.In fact, there may be a more generic problem here.
The kernel should ensure that there is alaways enough space to
store one LOST record.

This patch sets the default watermark to half the buffer size.
With such limit, we are guaranteed to get a notification even
with a single page buffer assuming no sample is bigger than a
page.

Signed-off-by: Stephane Eranian <eranian@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212509.344964101@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1256302576-6169-1-git-send-email-eranian@gmail.com>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3ede0981f4ac..718fa939b1a7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2340,7 +2340,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 	}
 
 	if (!data->watermark)
-		data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
+		data->watermark = max_size / 2;
 
 
 	rcu_assign_pointer(event->data, data);
-- 
cgit v1.2.3-71-gd317


From ce71b9df8893ec954e56c5979df6da274f20f65e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Nov 2009 05:26:55 +0100
Subject: tracing: Use the perf recursion protection from trace event

When we commit a trace to perf, we first check if we are
recursing in the same buffer so that we don't mess-up the buffer
with a recursing trace. But later on, we do the same check from
perf to avoid commit recursion. The recursion check is desired
early before we touch the buffer but we want to do this check
only once.

Then export the recursion protection from perf and use it from
the trace events before submitting a trace.

v2: Put appropriate Reported-by tag

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
LKML-Reference: <1258864015-10579-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  9 ++---
 include/linux/perf_event.h         |  4 +++
 include/trace/ftrace.h             | 23 +++++++------
 kernel/perf_event.c                | 68 +++++++++++++++++++++++++-------------
 kernel/trace/trace_event_profile.c | 14 ++++----
 kernel/trace/trace_kprobe.c        | 48 ++++++++++-----------------
 kernel/trace/trace_syscalls.c      | 47 ++++++++++----------------
 7 files changed, 106 insertions(+), 107 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 43360c1d8f70..47bbdf9c38d0 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -137,13 +137,8 @@ struct ftrace_event_call {
 
 #define FTRACE_MAX_PROFILE_SIZE	2048
 
-struct perf_trace_buf {
-	char	buf[FTRACE_MAX_PROFILE_SIZE];
-	int	recursion;
-};
-
-extern struct perf_trace_buf	*perf_trace_buf;
-extern struct perf_trace_buf	*perf_trace_buf_nmi;
+extern char *perf_trace_buf;
+extern char *perf_trace_buf_nmi;
 
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 36fe89f72641..74e98b1d3391 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -874,6 +874,8 @@ extern int perf_output_begin(struct perf_output_handle *handle,
 extern void perf_output_end(struct perf_output_handle *handle);
 extern void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len);
+extern int perf_swevent_get_recursion_context(int **recursion);
+extern void perf_swevent_put_recursion_context(int *recursion);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -902,6 +904,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
 static inline void perf_event_init(void)				{ }
+static int perf_swevent_get_recursion_context(int **recursion)	{ return -1; }
+static void perf_swevent_put_recursion_context(int *recursion)		{ }
 
 #endif
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 4945d1c99864..c222ef5238bf 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -724,16 +724,19 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 static void ftrace_profile_##call(proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+	extern int perf_swevent_get_recursion_context(int **recursion); \
+	extern void perf_swevent_put_recursion_context(int *recursion); \
 	struct ftrace_event_call *event_call = &event_##call;		\
 	extern void perf_tp_event(int, u64, u64, void *, int);		\
 	struct ftrace_raw_##call *entry;				\
-	struct perf_trace_buf *trace_buf;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
 	struct trace_entry *ent;					\
 	int __entry_size;						\
 	int __data_size;						\
+	char *trace_buf;						\
 	char *raw_data;							\
+	int *recursion;							\
 	int __cpu;							\
 	int pc;								\
 									\
@@ -749,6 +752,10 @@ static void ftrace_profile_##call(proto)				\
 		return;							\
 									\
 	local_irq_save(irq_flags);					\
+									\
+	if (perf_swevent_get_recursion_context(&recursion))		\
+		goto end_recursion;						\
+									\
 	__cpu = smp_processor_id();					\
 									\
 	if (in_nmi())							\
@@ -759,13 +766,7 @@ static void ftrace_profile_##call(proto)				\
 	if (!trace_buf)							\
 		goto end;						\
 									\
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);			\
-	if (trace_buf->recursion++)					\
-		goto end_recursion;					\
-									\
-	barrier();							\
-									\
-	raw_data = trace_buf->buf;					\
+	raw_data = per_cpu_ptr(trace_buf, __cpu);			\
 									\
 	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;		\
 	entry = (struct ftrace_raw_##call *)raw_data;			\
@@ -780,9 +781,9 @@ static void ftrace_profile_##call(proto)				\
 	perf_tp_event(event_call->id, __addr, __count, entry,		\
 			     __entry_size);				\
 									\
-end_recursion:								\
-	trace_buf->recursion--;						\
-end:									\
+end:								\
+	perf_swevent_put_recursion_context(recursion);			\
+end_recursion:									\
 	local_irq_restore(irq_flags);					\
 									\
 }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 718fa939b1a7..aba822722300 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3880,34 +3880,42 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 	}
 }
 
-static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
+/*
+ * Must be called with preemption disabled
+ */
+int perf_swevent_get_recursion_context(int **recursion)
 {
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
 	if (in_nmi())
-		return &cpuctx->recursion[3];
+		*recursion = &cpuctx->recursion[3];
+	else if (in_irq())
+		*recursion = &cpuctx->recursion[2];
+	else if (in_softirq())
+		*recursion = &cpuctx->recursion[1];
+	else
+		*recursion = &cpuctx->recursion[0];
 
-	if (in_irq())
-		return &cpuctx->recursion[2];
+	if (**recursion)
+		return -1;
 
-	if (in_softirq())
-		return &cpuctx->recursion[1];
+	(**recursion)++;
 
-	return &cpuctx->recursion[0];
+	return 0;
 }
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-				    u64 nr, int nmi,
-				    struct perf_sample_data *data,
-				    struct pt_regs *regs)
+void perf_swevent_put_recursion_context(int *recursion)
 {
-	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
-	int *recursion = perf_swevent_recursion_context(cpuctx);
-	struct perf_event_context *ctx;
-
-	if (*recursion)
-		goto out;
+	(*recursion)--;
+}
 
-	(*recursion)++;
-	barrier();
+static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
+			       u64 nr, int nmi,
+			       struct perf_sample_data *data,
+			       struct pt_regs *regs)
+{
+	struct perf_event_context *ctx;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
 	rcu_read_lock();
 	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
@@ -3920,12 +3928,25 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 	if (ctx)
 		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
 	rcu_read_unlock();
+}
 
-	barrier();
-	(*recursion)--;
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
+{
+	int *recursion;
+
+	preempt_disable();
+
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto out;
+
+	__do_perf_sw_event(type, event_id, nr, nmi, data, regs);
 
+	perf_swevent_put_recursion_context(recursion);
 out:
-	put_cpu_var(perf_cpu_context);
+	preempt_enable();
 }
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4159,7 +4180,8 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 	if (!regs)
 		regs = task_pt_regs(current);
 
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+	/* Trace events already protected against recursion */
+	__do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 				&data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index e0d351b01f5a..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -9,31 +9,33 @@
 #include "trace.h"
 
 
-struct perf_trace_buf *perf_trace_buf;
+char *perf_trace_buf;
 EXPORT_SYMBOL_GPL(perf_trace_buf);
 
-struct perf_trace_buf *perf_trace_buf_nmi;
+char *perf_trace_buf_nmi;
 EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
 
+typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
+
 /* Count the events in use (per event id, not per instance) */
 static int	total_profile_count;
 
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
-	struct perf_trace_buf *buf;
+	char *buf;
 	int ret = -ENOMEM;
 
 	if (atomic_inc_return(&event->profile_count))
 		return 0;
 
 	if (!total_profile_count) {
-		buf = alloc_percpu(struct perf_trace_buf);
+		buf = (char *)alloc_percpu(perf_trace_t);
 		if (!buf)
 			goto fail_buf;
 
 		rcu_assign_pointer(perf_trace_buf, buf);
 
-		buf = alloc_percpu(struct perf_trace_buf);
+		buf = (char *)alloc_percpu(perf_trace_t);
 		if (!buf)
 			goto fail_buf_nmi;
 
@@ -79,7 +81,7 @@ int ftrace_profile_enable(int event_id)
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
-	struct perf_trace_buf *buf, *nmi_buf;
+	char *buf, *nmi_buf;
 
 	if (!atomic_add_negative(-1, &event->profile_count))
 		return;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3696476f307d..22e6f68b05b3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1208,11 +1208,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry *entry;
-	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
+	char *trace_buf;
 	char *raw_data;
+	int *recursion;
 
 	pc = preempt_count();
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1227,6 +1228,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	 * This also protects the rcu read side
 	 */
 	local_irq_save(irq_flags);
+
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto end_recursion;
+
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -1237,18 +1242,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, __cpu);
 
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1263,9 +1257,9 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ip, 1, entry, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(recursion);
+end_recursion:
 	local_irq_restore(irq_flags);
 
 	return 0;
@@ -1278,10 +1272,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry *entry;
-	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
+	char *trace_buf;
+	int *recursion;
 	char *raw_data;
 
 	pc = preempt_count();
@@ -1297,6 +1292,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	 * This also protects the rcu read side
 	 */
 	local_irq_save(irq_flags);
+
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto end_recursion;
+
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -1307,18 +1306,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, __cpu);
 
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1334,9 +1322,9 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(recursion);
+end_recursion:
 	local_irq_restore(irq_flags);
 
 	return 0;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 51213b0aa81b..0bb934875263 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -477,10 +477,11 @@ static int sys_prof_refcount_exit;
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
-	struct perf_trace_buf *trace_buf;
 	struct syscall_trace_enter *rec;
 	unsigned long flags;
+	char *trace_buf;
 	char *raw_data;
+	int *recursion;
 	int syscall_nr;
 	int size;
 	int cpu;
@@ -505,6 +506,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
 
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto end_recursion;
+
 	cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -515,18 +519,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, cpu);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -539,9 +532,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 			       (unsigned long *)&rec->args);
 	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(recursion);
+end_recursion:
 	local_irq_restore(flags);
 }
 
@@ -588,10 +581,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
-	struct perf_trace_buf *trace_buf;
 	unsigned long flags;
 	int syscall_nr;
+	char *trace_buf;
 	char *raw_data;
+	int *recursion;
 	int size;
 	int cpu;
 
@@ -617,6 +611,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
+
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto end_recursion;
+
 	cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -627,18 +625,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, cpu);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -652,9 +639,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
 	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(recursion);
+end_recursion:
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3-71-gd317


From 28889bf9e2db29747d58cd47a92d727f927c3aee Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Nov 2009 05:21:33 +0100
Subject: tracing: Forget about the NMI buffer for syscall events

We are never in an NMI context when we commit a syscall trace to
perf. So just forget about the nmi buffer there.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jason Baron <jbaron@redhat.com>
LKML-Reference: <1258863695-10464-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 0bb934875263..41b6dd963daa 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -511,10 +511,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 
 	cpu = smp_processor_id();
 
-	if (in_nmi())
-		trace_buf = rcu_dereference(perf_trace_buf_nmi);
-	else
-		trace_buf = rcu_dereference(perf_trace_buf);
+	trace_buf = rcu_dereference(perf_trace_buf);
 
 	if (!trace_buf)
 		goto end;
@@ -617,10 +614,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
 	cpu = smp_processor_id();
 
-	if (in_nmi())
-		trace_buf = rcu_dereference(perf_trace_buf_nmi);
-	else
-		trace_buf = rcu_dereference(perf_trace_buf);
+	trace_buf = rcu_dereference(perf_trace_buf);
 
 	if (!trace_buf)
 		goto end;
-- 
cgit v1.2.3-71-gd317


From b3a75542d329ce4e1c66b293cefeb4429a2af043 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Nov 2009 05:21:34 +0100
Subject: hw-breakpoints: Remove x86 specific headers from core file

Remove asm/processor.h and asm/debugreg.h as these headers are
not used anymore in the hw-breakpoints core file.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1258863695-10464-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hw_breakpoint.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 9ea9414e0e58..b6d6fa273eeb 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,12 +40,6 @@
 
 #include <linux/hw_breakpoint.h>
 
-#include <asm/processor.h>
-
-#ifdef CONFIG_X86
-#include <asm/debugreg.h>
-#endif
-
 /*
  * Constraints data
  */
-- 
cgit v1.2.3-71-gd317


From 96b02d78a7e47cd189f6b307c5513fec6b2155dc Mon Sep 17 00:00:00 2001
From: Márton Németh <nm127@freemail.hu>
Date: Sat, 21 Nov 2009 23:10:15 +0100
Subject: perf_event: Remove redundant zero fill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The buffer is first zeroed out by memset(). Then strncpy() is
used to fill the content. The strncpy() function also pads the
string till the end of the specified length, which is redundant.
The strncpy() does not ensures that the string will be properly
closed with 0. Use strlcpy() instead.

The semantic match that finds this kind of pattern is as
follows: (http://coccinelle.lip6.fr/)

// <smpl>
@@
expression buffer;
expression size;
expression str;
@@
	memset(buffer, 0, size);
	...
-	strncpy(
+	strlcpy(
	buffer, str, sizeof(buffer)
	);
@@
expression buffer;
expression size;
expression str;
@@
	memset(&buffer, 0, size);
	...
-	strncpy(
+	strlcpy(
	&buffer, str, sizeof(buffer));
@@
expression buffer;
identifier field;
expression size;
expression str;
@@
	memset(buffer, 0, size);
	...
-	strncpy(
+	strlcpy(
	buffer->field, str, sizeof(buffer->field)
	);
@@
expression buffer;
identifier field;
expression size;
expression str;
@@
	memset(&buffer, 0, size);
	...
-	strncpy(
+	strlcpy(
	buffer.field, str, sizeof(buffer.field));
// </smpl>

On strncpy() vs strlcpy() see
http://www.gratisoft.us/todd/papers/strlcpy.html .

Signed-off-by: Márton Németh <nm127@freemail.hu>
Cc: Julia Lawall <julia@diku.dk>
Cc: cocci@diku.dk
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <4B086547.5040100@freemail.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index aba822722300..b26cb03c1914 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3391,7 +3391,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	char comm[TASK_COMM_LEN];
 
 	memset(comm, 0, sizeof(comm));
-	strncpy(comm, comm_event->task->comm, sizeof(comm));
+	strlcpy(comm, comm_event->task->comm, sizeof(comm));
 	size = ALIGN(strlen(comm)+1, sizeof(u64));
 
 	comm_event->comm = comm;
-- 
cgit v1.2.3-71-gd317


From 645e8cc0c9f01f07f384fd522b782e5e6ae9de18 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Nov 2009 12:20:19 +0100
Subject: perf_events: Fix modular build

Fix:

  ERROR: "perf_swevent_put_recursion_context" [fs/ext4/ext4.ko] undefined!
  ERROR: "perf_swevent_get_recursion_context" [fs/ext4/ext4.ko] undefined!

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
LKML-Reference: <1258864015-10579-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b26cb03c1914..abe1ef47496c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3903,11 +3903,13 @@ int perf_swevent_get_recursion_context(int **recursion)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
 void perf_swevent_put_recursion_context(int *recursion)
 {
 	(*recursion)--;
 }
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
 			       u64 nr, int nmi,
-- 
cgit v1.2.3-71-gd317


From b668c9cf3e58739dac54a1d6f42f2b4bdd980b3e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 22 Nov 2009 08:53:48 -0800
Subject: rcu: Fix grace-period-stall bug on large systems with CPU hotplug

When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure.  This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy.  Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:

1.	Kernel built for more than 32 CPUs on 32-bit systems or for more
	than 64 CPUs on 64-bit systems, so that there is more than one
	rcu_node structure.  (Or CONFIG_RCU_FANOUT is artificially set
	to a number smaller than CONFIG_NR_CPUS.)

2.	The kernel is built with CONFIG_TREE_PREEMPT_RCU.

3.	A task running on a CPU associated with a given leaf rcu_node
	structure blocks while in an RCU read-side critical section
	-and- that CPU has not yet passed through a quiescent state
	for the current RCU grace period.  This will cause the task
	to be queued on the leaf rcu_node's blocked_tasks[] array, in
	particular, on the element of this array corresponding to the
	current grace period.

4.	Each of the remaining CPUs corresponding to this same leaf rcu_node
	structure pass through a quiescent state.  However, the task is
	still in its RCU read-side critical section, so these quiescent
	states cannot be reported further up the rcu_node hierarchy.
	Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
	field are now zero.

5.	Each of the remaining CPUs go offline.  (The events in step
	#4 and #5 can happen in any order as long as each CPU passes
	through a quiescent state before going offline.)

6.	When the last CPU goes offline, __rcu_offline_cpu() will invoke
	rcu_preempt_offline_tasks(), which will move the task to the
	root rcu_node structure, but without reporting a quiescent state
	up the rcu_node hierarchy (and this failure to report a quiescent
	state is the bug).

	But because this leaf rcu_node structure's ->qsmask field is
	already zero and its ->block_tasks[] entries are all empty,
	force_quiescent_state() will skip this rcu_node structure.

	Therefore, grace periods are now hung.

This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_lock_special() and
__rcu_offline_cpu().  Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug.  This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 40 ++++++++++++-----------
 kernel/rcutree.h        |  3 ++
 kernel/rcutree_plugin.h | 85 +++++++++++++++++++++++++++++++++++--------------
 3 files changed, 85 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 9b36d6d7fb97..b79bfcd28e95 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -51,7 +51,7 @@
 
 /* Data structures. */
 
-static struct lock_class_key rcu_root_class;
+static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 
 #define RCU_STATE_INITIALIZER(name) { \
 	.level = { &name.node[0] }, \
@@ -936,6 +936,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
 	unsigned long mask;
+	int need_quiet = 0;
 	struct rcu_data *rdp = rsp->rda[cpu];
 	struct rcu_node *rnp;
 
@@ -949,29 +950,30 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 		spin_lock(&rnp->lock);		/* irqs already disabled. */
 		rnp->qsmaskinit &= ~mask;
 		if (rnp->qsmaskinit != 0) {
-			spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			if (rnp != rdp->mynode)
+				spin_unlock(&rnp->lock); /* irqs remain disabled. */
 			break;
 		}
-
-		/*
-		 * If there was a task blocking the current grace period,
-		 * and if all CPUs have checked in, we need to propagate
-		 * the quiescent state up the rcu_node hierarchy.  But that
-		 * is inconvenient at the moment due to deadlock issues if
-		 * this should end the current grace period.  So set the
-		 * offlined CPU's bit in ->qsmask in order to force the
-		 * next force_quiescent_state() invocation to clean up this
-		 * mess in a deadlock-free manner.
-		 */
-		if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
-			rnp->qsmask |= mask;
-
+		if (rnp == rdp->mynode)
+			need_quiet = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+		else
+			spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		mask = rnp->grpmask;
-		spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 		rnp = rnp->parent;
 	} while (rnp != NULL);
 
-	spin_unlock_irqrestore(&rsp->onofflock, flags);
+	/*
+	 * We still hold the leaf rcu_node structure lock here, and
+	 * irqs are still disabled.  The reason for this subterfuge is
+	 * because invoking task_quiet() with ->onofflock held leads
+	 * to deadlock.
+	 */
+	spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
+	rnp = rdp->mynode;
+	if (need_quiet)
+		task_quiet(rnp, flags);
+	else
+		spin_unlock_irqrestore(&rnp->lock, flags);
 
 	rcu_adopt_orphan_cbs(rsp);
 }
@@ -1731,6 +1733,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 		rnp = rsp->level[i];
 		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
 			spin_lock_init(&rnp->lock);
+			lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
 			rnp->gpnum = 0;
 			rnp->qsmask = 0;
 			rnp->qsmaskinit = 0;
@@ -1753,7 +1756,6 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 			INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
 		}
 	}
-	lockdep_set_class(&rcu_get_root(rsp)->lock, &rcu_root_class);
 }
 
 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 17a28a08b559..a81188c42929 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -305,6 +305,9 @@ static void rcu_bootup_announce(void);
 long rcu_batches_completed(void);
 static void rcu_preempt_note_context_switch(int cpu);
 static int rcu_preempted_readers(struct rcu_node *rnp);
+#ifdef CONFIG_HOTPLUG_CPU
+static void task_quiet(struct rcu_node *rnp, unsigned long flags);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 static void rcu_print_task_stall(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 5ca2d26c5971..0bdb592eee66 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -160,11 +160,51 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 	return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
 }
 
+/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period.  The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return, but irqs remain
+ * disabled.
+ */
+static void task_quiet(struct rcu_node *rnp, unsigned long flags)
+	__releases(rnp->lock)
+{
+	unsigned long mask;
+	struct rcu_node *rnp_p;
+
+	if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;  /* Still need more quiescent states! */
+	}
+
+	rnp_p = rnp->parent;
+	if (rnp_p == NULL) {
+		/*
+		 * Either there is only one rcu_node in the tree,
+		 * or tasks were kicked up to root rcu_node due to
+		 * CPUs going offline.
+		 */
+		cpu_quiet_msk_finish(&rcu_preempt_state, flags);
+		return;
+	}
+
+	/* Report up the rest of the hierarchy. */
+	mask = rnp->grpmask;
+	spin_unlock(&rnp->lock);	/* irqs remain disabled. */
+	spin_lock(&rnp_p->lock);	/* irqs already disabled. */
+	cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
+}
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
 static void rcu_read_unlock_special(struct task_struct *t)
 {
 	int empty;
 	unsigned long flags;
-	unsigned long mask;
 	struct rcu_node *rnp;
 	int special;
 
@@ -213,30 +253,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		/*
 		 * If this was the last task on the current list, and if
 		 * we aren't waiting on any CPUs, report the quiescent state.
-		 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
-		 * drop rnp->lock and restore irq.
+		 * Note that task_quiet() releases rnp->lock.
 		 */
-		if (!empty && rnp->qsmask == 0 &&
-		    !rcu_preempted_readers(rnp)) {
-			struct rcu_node *rnp_p;
-
-			if (rnp->parent == NULL) {
-				/* Only one rcu_node in the tree. */
-				cpu_quiet_msk_finish(&rcu_preempt_state, flags);
-				return;
-			}
-			/* Report up the rest of the hierarchy. */
-			mask = rnp->grpmask;
+		if (empty)
 			spin_unlock_irqrestore(&rnp->lock, flags);
-			rnp_p = rnp->parent;
-			spin_lock_irqsave(&rnp_p->lock, flags);
-			WARN_ON_ONCE(rnp->qsmask);
-			cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
-			return;
-		}
-		spin_unlock(&rnp->lock);
+		else
+			task_quiet(rnp, flags);
+	} else {
+		local_irq_restore(flags);
 	}
-	local_irq_restore(flags);
 }
 
 /*
@@ -303,6 +328,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  * rcu_node.  The reason for not just moving them to the immediate
  * parent is to remove the need for rcu_read_unlock_special() to
  * make more than two attempts to acquire the target rcu_node's lock.
+ * Returns true if there were tasks blocking the current RCU grace
+ * period.
  *
  * Returns 1 if there was previously a task blocking the current grace
  * period on the specified rcu_node structure.
@@ -316,7 +343,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 	int i;
 	struct list_head *lp;
 	struct list_head *lp_root;
-	int retval = rcu_preempted_readers(rnp);
+	int retval;
 	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	struct task_struct *tp;
 
@@ -334,6 +361,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 	 * rcu_nodes in terms of gp_num value.  This fact allows us to
 	 * move the blocked_tasks[] array directly, element by element.
 	 */
+	retval = rcu_preempted_readers(rnp);
 	for (i = 0; i < 2; i++) {
 		lp = &rnp->blocked_tasks[i];
 		lp_root = &rnp_root->blocked_tasks[i];
@@ -346,7 +374,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 			spin_unlock(&rnp_root->lock); /* irqs remain disabled */
 		}
 	}
-
 	return retval;
 }
 
@@ -512,6 +539,16 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 	return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Because preemptible RCU does not exist, no quieting of tasks. */
+static void task_quiet(struct rcu_node *rnp, unsigned long flags)
+{
+	spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 
 /*
-- 
cgit v1.2.3-71-gd317


From 9f680ab41485edfdc96331b70afa7513aa0a7720 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 22 Nov 2009 08:53:49 -0800
Subject: rcu: Eliminate unneeded function wrapping

The functions rcu_init() is a wrapper for __rcu_init(), and also
sets up the CPU-hotplug notifier for rcu_barrier_cpu_hotplug().
But TINY_RCU doesn't need CPU-hotplug notification, and the
rcu_barrier_cpu_hotplug() is a simple wrapper for
rcu_cpu_notify().

So push rcu_init() out to kernel/rcutree.c and kernel/rcutiny.c
and get rid of the wrapper function rcu_barrier_cpu_hotplug().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088302320-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcutiny.h |  2 --
 include/linux/rcutree.h |  3 ---
 kernel/rcupdate.c       | 22 ----------------------
 kernel/rcutiny.c        | 11 +----------
 kernel/rcutree.c        | 17 ++++++++++++++---
 5 files changed, 15 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 2c1fe8373e71..a3b6272af2dd 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -38,7 +38,6 @@ void rcu_bh_qs(int cpu);
 
 #define rcu_init_sched()	do { } while (0)
 extern void rcu_check_callbacks(int cpu, int user);
-extern void __rcu_init(void);
 
 /*
  * Return the number of grace periods.
@@ -69,7 +68,6 @@ static inline void synchronize_rcu_bh_expedited(void)
 }
 
 struct notifier_block;
-extern int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu);
 
 #ifdef CONFIG_NO_HZ
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 9642c6bcb399..111a65257350 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,8 +34,6 @@ struct notifier_block;
 
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
-extern int rcu_cpu_notify(struct notifier_block *self,
-			  unsigned long action, void *hcpu);
 extern int rcu_needs_cpu(int cpu);
 extern int rcu_expedited_torture_stats(char *page);
 
@@ -83,7 +81,6 @@ static inline void synchronize_rcu_bh_expedited(void)
 	synchronize_sched_expedited();
 }
 
-extern void __rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
 
 extern long rcu_batches_completed(void);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 7625f207f65e..eb6b534db318 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -161,28 +161,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
 #endif /* #ifndef CONFIG_TINY_RCU */
 
-static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
-		unsigned long action, void *hcpu)
-{
-	return rcu_cpu_notify(self, action, hcpu);
-}
-
-void __init rcu_init(void)
-{
-	int i;
-
-	__rcu_init();
-	cpu_notifier(rcu_barrier_cpu_hotplug, 0);
-
-	/*
-	 * We don't need protection against CPU-hotplug here because
-	 * this is called early in boot, before either interrupts
-	 * or the scheduler are operational.
-	 */
-	for_each_online_cpu(i)
-		rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
-}
-
 void rcu_scheduler_starting(void)
 {
 	WARN_ON(num_online_cpus() != 1);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index b33ec3aa377a..9f6d9ff2572c 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -177,15 +177,6 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 	__rcu_process_callbacks(&rcu_bh_ctrlblk);
 }
 
-/*
- * Null function to handle CPU being onlined.  Longer term, we want to
- * make TINY_RCU avoid using rcupdate.c, but later...
- */
-int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-	return NOTIFY_OK;
-}
-
 /*
  * Wait for a grace period to elapse.  But it is illegal to invoke
  * synchronize_sched() from within an RCU read-side critical section.
@@ -285,7 +276,7 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
-void __rcu_init(void)
+void __init rcu_init(void)
 {
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b79bfcd28e95..e3d3bbddbcd5 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1644,8 +1644,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
 /*
  * Handle CPU online/offline notification events.
  */
-int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-			     unsigned long action, void *hcpu)
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 
@@ -1781,8 +1781,10 @@ do { \
 	} \
 } while (0)
 
-void __init __rcu_init(void)
+void __init rcu_init(void)
 {
+	int i;
+
 	rcu_bootup_announce();
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
@@ -1791,6 +1793,15 @@ void __init __rcu_init(void)
 	RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
 	__rcu_init_preempt();
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+
+	/*
+	 * We don't need protection against CPU-hotplug here because
+	 * this is called early in boot, before either interrupts
+	 * or the scheduler are operational.
+	 */
+	cpu_notifier(rcu_cpu_notify, 0);
+	for_each_online_cpu(i)
+		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
 }
 
 #include "rcutree_plugin.h"
-- 
cgit v1.2.3-71-gd317


From 6ebb237bece23275d1da149b61a342f0d4d06a08 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 22 Nov 2009 08:53:50 -0800
Subject: rcu: Re-arrange code to reduce #ifdef pain

Remove #ifdefs from kernel/rcupdate.c and
include/linux/rcupdate.h by moving code to
include/linux/rcutiny.h, include/linux/rcutree.h, and
kernel/rcutree.c.

Also remove some definitions that are no longer used.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1258908830885-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcupdate.h |  12 ------
 include/linux/rcutiny.h  |  11 +++++
 include/linux/rcutree.h  |   4 +-
 kernel/rcupdate.c        | 104 -----------------------------------------------
 kernel/rcutree.c         |  80 ++++++++++++++++++++++++++++++++++++
 kernel/rcutree_plugin.h  |  24 +++++++++++
 6 files changed, 118 insertions(+), 117 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 2f1bc42a3b82..24440f4bf476 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,6 @@ struct rcu_head {
 };
 
 /* Exported common interfaces */
-#ifdef CONFIG_TREE_PREEMPT_RCU
-extern void synchronize_rcu(void);
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#define synchronize_rcu synchronize_sched
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 extern void synchronize_rcu_bh(void);
 extern void synchronize_sched(void);
 extern void rcu_barrier(void);
@@ -67,13 +62,6 @@ extern int sched_expedited_torture_stats(char *page);
 
 /* Internal to kernel */
 extern void rcu_init(void);
-extern void rcu_scheduler_starting(void);
-#ifndef CONFIG_TINY_RCU
-extern int rcu_needs_cpu(int cpu);
-#else
-static inline int rcu_needs_cpu(int cpu) { return 0; }
-#endif
-extern int rcu_scheduler_active;
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index a3b6272af2dd..c4ba9a78721e 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -39,6 +39,11 @@ void rcu_bh_qs(int cpu);
 #define rcu_init_sched()	do { } while (0)
 extern void rcu_check_callbacks(int cpu, int user);
 
+static inline int rcu_needs_cpu(int cpu)
+{
+	return 0;
+}
+
 /*
  * Return the number of grace periods.
  */
@@ -57,6 +62,8 @@ static inline long rcu_batches_completed_bh(void)
 
 extern int rcu_expedited_torture_stats(char *page);
 
+#define synchronize_rcu synchronize_sched
+
 static inline void synchronize_rcu_expedited(void)
 {
 	synchronize_sched();
@@ -86,6 +93,10 @@ static inline void rcu_exit_nohz(void)
 
 #endif /* #else #ifdef CONFIG_NO_HZ */
 
+static inline void rcu_scheduler_starting(void)
+{
+}
+
 static inline void exit_rcu(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 111a65257350..c93eee5911b0 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,12 +35,14 @@ struct notifier_block;
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern int rcu_needs_cpu(int cpu);
+extern void rcu_scheduler_starting(void);
 extern int rcu_expedited_torture_stats(char *page);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
 extern void __rcu_read_lock(void);
 extern void __rcu_read_unlock(void);
+extern void synchronize_rcu(void);
 extern void exit_rcu(void);
 
 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
@@ -55,7 +57,7 @@ static inline void __rcu_read_unlock(void)
 	preempt_enable();
 }
 
-#define __synchronize_sched() synchronize_rcu()
+#define synchronize_rcu synchronize_sched
 
 static inline void exit_rcu(void)
 {
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index eb6b534db318..9b7fd4723878 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
-#include <linux/kernel_stat.h>
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
@@ -53,8 +52,6 @@ struct lockdep_map rcu_lock_map =
 EXPORT_SYMBOL_GPL(rcu_lock_map);
 #endif
 
-int rcu_scheduler_active __read_mostly;
-
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
@@ -66,104 +63,3 @@ void wakeme_after_rcu(struct rcu_head  *head)
 	rcu = container_of(head, struct rcu_synchronize, head);
 	complete(&rcu->completion);
 }
-
-#ifndef CONFIG_TINY_RCU
-
-#ifdef CONFIG_TREE_PREEMPT_RCU
-
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void synchronize_rcu(void)
-{
-	struct rcu_synchronize rcu;
-
-	if (!rcu_scheduler_active)
-		return;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-
-/**
- * synchronize_sched - wait until an rcu-sched grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu-sched
- * grace period has elapsed, in other words after all currently executing
- * rcu-sched read-side critical sections have completed.   These read-side
- * critical sections are delimited by rcu_read_lock_sched() and
- * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
- * local_irq_disable(), and so on may be used in place of
- * rcu_read_lock_sched().
- *
- * This means that all preempt_disable code sequences, including NMI and
- * hardware-interrupt handlers, in progress on entry will have completed
- * before this primitive returns.  However, this does not guarantee that
- * softirq handlers will have completed, since in some kernels, these
- * handlers can run in process context, and can block.
- *
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API.  In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
- */
-void synchronize_sched(void)
-{
-	struct rcu_synchronize rcu;
-
-	if (rcu_blocking_is_gp())
-		return;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_sched(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-}
-EXPORT_SYMBOL_GPL(synchronize_sched);
-
-/**
- * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu_bh grace
- * period has elapsed, in other words after all currently executing rcu_bh
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
- * and may be nested.
- */
-void synchronize_rcu_bh(void)
-{
-	struct rcu_synchronize rcu;
-
-	if (rcu_blocking_is_gp())
-		return;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_bh(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
-
-#endif /* #ifndef CONFIG_TINY_RCU */
-
-void rcu_scheduler_starting(void)
-{
-	WARN_ON(num_online_cpus() != 1);
-	WARN_ON(nr_context_switches() > 0);
-	rcu_scheduler_active = 1;
-}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e3d3bbddbcd5..4ca7e0292fd8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/time.h>
+#include <linux/kernel_stat.h>
 
 #include "rcutree.h"
 
@@ -79,6 +80,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
+static int rcu_scheduler_active __read_mostly;
+
 
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
@@ -1396,6 +1399,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 
+/**
+ * synchronize_sched - wait until an rcu-sched grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-sched
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-sched read-side critical sections have completed.   These read-side
+ * critical sections are delimited by rcu_read_lock_sched() and
+ * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
+ * local_irq_disable(), and so on may be used in place of
+ * rcu_read_lock_sched().
+ *
+ * This means that all preempt_disable code sequences, including NMI and
+ * hardware-interrupt handlers, in progress on entry will have completed
+ * before this primitive returns.  However, this does not guarantee that
+ * softirq handlers will have completed, since in some kernels, these
+ * handlers can run in process context, and can block.
+ *
+ * This primitive provides the guarantees made by the (now removed)
+ * synchronize_kernel() API.  In contrast, synchronize_rcu() only
+ * guarantees that rcu_read_lock() sections will have completed.
+ * In "classic RCU", these two guarantees happen to be one and
+ * the same, but can differ in realtime RCU implementations.
+ */
+void synchronize_sched(void)
+{
+	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_sched(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
+/**
+ * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu_bh grace
+ * period has elapsed, in other words after all currently executing rcu_bh
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
+ * and may be nested.
+ */
+void synchronize_rcu_bh(void)
+{
+	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_bh(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
 /*
  * Check to see if there is any immediate RCU-related work to be done
  * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1480,6 +1545,21 @@ int rcu_needs_cpu(int cpu)
 	       rcu_preempt_needs_cpu(cpu);
 }
 
+/*
+ * This function is invoked towards the end of the scheduler's initialization
+ * process.  Before this is called, the idle task might contain
+ * RCU read-side critical sections (during which time, this idle
+ * task is booting the system).  After this function is called, the
+ * idle tasks are prohibited from containing RCU read-side critical
+ * sections.
+ */
+void rcu_scheduler_starting(void)
+{
+	WARN_ON(num_online_cpus() != 1);
+	WARN_ON(nr_context_switches() > 0);
+	rcu_scheduler_active = 1;
+}
+
 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0bdb592eee66..1d295c789d3d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -425,6 +425,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
+	struct rcu_synchronize rcu;
+
+	if (!rcu_scheduler_active)
+		return;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
 /*
  * Wait for an rcu-preempt grace period.  We are supposed to expedite the
  * grace period, but this is the crude slow compatability hack, so just
-- 
cgit v1.2.3-71-gd317


From 98e4833ba3c314c99dc364012fba6ac894230ad0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Nov 2009 08:03:09 +0100
Subject: ring-buffer benchmark: Run producer/consumer threads at nice +19

The ring-buffer benchmark threads run on nice 0 by default, using
up a lot of CPU time and slowing down the system:

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
  1024 root      20   0     0    0    0 D 95.3  0.0   4:01.67 rb_producer
  1023 root      20   0     0    0    0 R 93.5  0.0   2:54.33 rb_consumer
 21569 mingo     40   0 14852 1048  772 R  3.6  0.1   0:00.05 top
     1 root      40   0  4080  928  668 S  0.0  0.0   0:23.98 init

Renice them to +19 to make them less intrusive.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer_benchmark.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 70df73e4ff21..3875d49da990 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -399,6 +399,12 @@ static int __init ring_buffer_benchmark_init(void)
 	if (IS_ERR(producer))
 		goto out_kill;
 
+	/*
+	 * Run them as low-prio background tasks by default:
+	 */
+	set_user_nice(consumer, 19);
+	set_user_nice(producer, 19);
+
 	return 0;
 
  out_kill:
-- 
cgit v1.2.3-71-gd317


From 6e3d8330ae2c4b2c11a9577a0130d2ecda1c610d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Nov 2009 10:19:20 +0100
Subject: perf events: Do not generate function trace entries in perf code

Decreases perf overhead when function tracing is enabled,
by about 50%.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/Makefile | 1 +
 kernel/Makefile              | 1 +
 2 files changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..1d2cb383410e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
 # Don't trace early stages of a secondary CPU boot
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 # Make sure load_percpu_segment has no stackprotector
diff --git a/kernel/Makefile b/kernel/Makefile
index 17b575ec7d07..6b7ce8173dfd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
-- 
cgit v1.2.3-71-gd317


From 457dc928f586f3f4b930206965e6db270034e97e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Nov 2009 11:03:28 +0100
Subject: tracing, function tracer: Clean up strstrip() usage

Clean up strstrip() usage - which also addresses this build warning:

  kernel/trace/ftrace.c: In function 'ftrace_pid_write':
  kernel/trace/ftrace.c:3004: warning: ignoring return value of 'strstrip', declared with attribute warn_unused_result

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7f9b51e8184b..1dc101d09765 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2985,7 +2985,7 @@ static ssize_t
 ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
-	char buf[64];
+	char buf[64], *tmp;
 	long val;
 	int ret;
 
@@ -3001,11 +3001,11 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
 	 * to clean the filter quietly.
 	 */
-	strstrip(buf);
-	if (strlen(buf) == 0)
+	tmp = strstrip(buf);
+	if (strlen(tmp) == 0)
 		return 1;
 
-	ret = strict_strtol(buf, 10, &val);
+	ret = strict_strtol(tmp, 10, &val);
 	if (ret < 0)
 		return ret;
 
@@ -3391,4 +3391,3 @@ void ftrace_graph_stop(void)
 	ftrace_stop();
 }
 #endif
-
-- 
cgit v1.2.3-71-gd317


From a4234bfcf4d72a10a99176cdef007345e9c3b4aa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Nov 2009 10:57:59 +0100
Subject: perf_events: Optimize the swcounter hotpath

The structure init creates a bit memcpy, which shows
up big time in perf annotate output:

          :      ffffffff810a859d <__perf_sw_event>:
     1.68 :      ffffffff810a859d:       55                      push   %rbp
     1.69 :      ffffffff810a859e:       41 89 fa                mov    %edi,%r10d
     0.01 :      ffffffff810a85a1:       49 89 c9                mov    %rcx,%r9
     0.00 :      ffffffff810a85a4:       31 c0                   xor    %eax,%eax
     1.71 :      ffffffff810a85a6:       b9 16 00 00 00          mov    $0x16,%ecx
     0.00 :      ffffffff810a85ab:       48 89 e5                mov    %rsp,%rbp
     0.00 :      ffffffff810a85ae:       48 83 ec 60             sub    $0x60,%rsp
     1.52 :      ffffffff810a85b2:       48 8d 7d a0             lea    -0x60(%rbp),%rdi
    85.20 :      ffffffff810a85b6:       f3 ab                   rep stos %eax,%es:(%rdi)

None of the callees depends on the structure being pre-initialized,
so only initialize ->addr. This gets rid of the memcpy overhead.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index abe1ef47496c..20df8aba8da5 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3954,12 +3954,12 @@ out:
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
 {
-	struct perf_sample_data data = {
-		.addr = addr,
-	};
+	struct perf_sample_data data;
 
-	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
-				&data, regs);
+	data.addr = addr;
+	data.raw  = NULL;
+
+	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
 }
 
 static void perf_swevent_read(struct perf_event *event)
-- 
cgit v1.2.3-71-gd317


From a66a3052e2d4c5815d7ad26887b1d4193206e691 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Nov 2009 11:37:23 +0100
Subject: perf_events: Undo copy/paste damage

We had two almost identical functions, avoid the duplication.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20091123103819.537537928@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 20df8aba8da5..e2daa10bb5ce 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1704,16 +1704,10 @@ static void free_event(struct perf_event *event)
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
 
-/*
- * Called when the last reference to the file is gone.
- */
-static int perf_release(struct inode *inode, struct file *file)
+int perf_event_release_kernel(struct perf_event *event)
 {
-	struct perf_event *event = file->private_data;
 	struct perf_event_context *ctx = event->ctx;
 
-	file->private_data = NULL;
-
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_event_remove_from_context(event);
@@ -1728,26 +1722,19 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 
-int perf_event_release_kernel(struct perf_event *event)
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
 {
-	struct perf_event_context *ctx = event->ctx;
-
-	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
-	perf_event_remove_from_context(event);
-	mutex_unlock(&ctx->mutex);
-
-	mutex_lock(&event->owner->perf_event_mutex);
-	list_del_init(&event->owner_entry);
-	mutex_unlock(&event->owner->perf_event_mutex);
-	put_task_struct(event->owner);
+	struct perf_event *event = file->private_data;
 
-	free_event(event);
+	file->private_data = NULL;
 
-	return 0;
+	return perf_event_release_kernel(event);
 }
-EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 
 static int perf_event_read_size(struct perf_event *event)
 {
-- 
cgit v1.2.3-71-gd317


From 6c2bfcbe58e0dd39554be88940149f5aa11e17d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Nov 2009 11:37:24 +0100
Subject: perf_events: Fix style nits

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091123103819.613427378@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e2daa10bb5ce..1f14481c2337 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -447,9 +447,8 @@ retry:
 	 * can remove the event safely, if the call above did not
 	 * succeed.
 	 */
-	if (!list_empty(&event->group_entry)) {
+	if (!list_empty(&event->group_entry))
 		list_del_event(event, ctx);
-	}
 	spin_unlock_irq(&ctx->lock);
 }
 
@@ -1033,10 +1032,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
 	update_context_time(ctx);
 
 	perf_disable();
-	if (ctx->nr_active)
+	if (ctx->nr_active) {
 		list_for_each_entry(event, &ctx->group_list, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-
+	}
 	perf_enable();
  out:
 	spin_unlock(&ctx->lock);
-- 
cgit v1.2.3-71-gd317


From 2e2af50b1fab3c40636839a7f439c167ae559533 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Nov 2009 11:37:25 +0100
Subject: perf_events: Disable events when we detach them

If we leave the event in STATE_INACTIVE, any read of the event
after the detach will increase the running count but not the
enabled count and cause funny scaling artefacts.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091123103819.689055515@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 1f14481c2337..fb851ec34461 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -294,6 +294,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader != event)
 		event->group_leader->nr_siblings--;
 
+	event->state = PERF_EVENT_STATE_OFF;
+
 	/*
 	 * If this was a group event with sibling events then
 	 * upgrade the siblings to singleton events by adding them
-- 
cgit v1.2.3-71-gd317


From 5e942bb33371254a474653123cd9e13a4c89ee44 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Nov 2009 11:37:26 +0100
Subject: perf_events: Update the context time on exit

It appeared we did call update_event_times() on exit, but we
failed to update the context time, which renders the former
moot.

Locking is a bit iffy, we call update_event_times under
ctx->mutex instead of ctx->lock - the next patch fixes this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091123103819.764207355@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fb851ec34461..8be2574b89b6 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4983,6 +4983,7 @@ void perf_event_exit_task(struct task_struct *child)
 	 * the events from it.
 	 */
 	unclone_ctx(child_ctx);
+	update_context_time(child_ctx);
 	spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	/*
-- 
cgit v1.2.3-71-gd317


From f67218c3e93abaf0f480bb94b53d234853ffe4de Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Nov 2009 11:37:27 +0100
Subject: perf_events: Fix __perf_event_exit_task() vs. update_event_times()
 locking

Move the update_event_times() call in __perf_event_exit_task()
into list_del_event() because that holds the proper lock
(ctx->lock) and seems a more natural place to do the last time
update.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091123103819.842455480@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 78 ++++++++++++++++++++++++++---------------------------
 1 file changed, 39 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8be2574b89b6..50f11b5f8c3d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -246,6 +246,44 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 	put_ctx(ctx);
 }
 
+static inline u64 perf_clock(void)
+{
+	return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+	u64 now = perf_clock();
+
+	ctx->time += now - ctx->timestamp;
+	ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a event.
+ */
+static void update_event_times(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	u64 run_end;
+
+	if (event->state < PERF_EVENT_STATE_INACTIVE ||
+	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+		return;
+
+	event->total_time_enabled = ctx->time - event->tstamp_enabled;
+
+	if (event->state == PERF_EVENT_STATE_INACTIVE)
+		run_end = event->tstamp_stopped;
+	else
+		run_end = ctx->time;
+
+	event->total_time_running = run_end - event->tstamp_running;
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -294,6 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader != event)
 		event->group_leader->nr_siblings--;
 
+	update_event_times(event);
 	event->state = PERF_EVENT_STATE_OFF;
 
 	/*
@@ -454,44 +493,6 @@ retry:
 	spin_unlock_irq(&ctx->lock);
 }
 
-static inline u64 perf_clock(void)
-{
-	return cpu_clock(smp_processor_id());
-}
-
-/*
- * Update the record of the current time in a context.
- */
-static void update_context_time(struct perf_event_context *ctx)
-{
-	u64 now = perf_clock();
-
-	ctx->time += now - ctx->timestamp;
-	ctx->timestamp = now;
-}
-
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
-	struct perf_event_context *ctx = event->ctx;
-	u64 run_end;
-
-	if (event->state < PERF_EVENT_STATE_INACTIVE ||
-	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-		return;
-
-	event->total_time_enabled = ctx->time - event->tstamp_enabled;
-
-	if (event->state == PERF_EVENT_STATE_INACTIVE)
-		run_end = event->tstamp_stopped;
-	else
-		run_end = ctx->time;
-
-	event->total_time_running = run_end - event->tstamp_running;
-}
-
 /*
  * Update total_time_enabled and total_time_running for all events in a group.
  */
@@ -4931,7 +4932,6 @@ __perf_event_exit_task(struct perf_event *child_event,
 {
 	struct perf_event *parent_event;
 
-	update_event_times(child_event);
 	perf_event_remove_from_context(child_event);
 
 	parent_event = child_event->parent;
-- 
cgit v1.2.3-71-gd317


From 4ed7c92d68a5387ba5f7030dc76eab03558e27f5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Nov 2009 11:37:29 +0100
Subject: perf_events: Undo some recursion damage

Make perf_swevent_get_recursion_context return a context number
and disable preemption.

This could be used to remove the IRQ disable from the trace bit
and index the per-cpu buffer with.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091123103819.993226816@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h    |  8 ++---
 include/trace/ftrace.h        | 17 ++++++-----
 kernel/perf_event.c           | 71 +++++++++++++++++++------------------------
 kernel/trace/trace_kprobe.c   | 14 +++++----
 kernel/trace/trace_syscalls.c | 14 +++++----
 5 files changed, 61 insertions(+), 63 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 74e98b1d3391..43adbd7f0010 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -874,8 +874,8 @@ extern int perf_output_begin(struct perf_output_handle *handle,
 extern void perf_output_end(struct perf_output_handle *handle);
 extern void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len);
-extern int perf_swevent_get_recursion_context(int **recursion);
-extern void perf_swevent_put_recursion_context(int *recursion);
+extern int perf_swevent_get_recursion_context(void);
+extern void perf_swevent_put_recursion_context(int rctx);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -904,8 +904,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
 static inline void perf_event_init(void)				{ }
-static int perf_swevent_get_recursion_context(int **recursion)	{ return -1; }
-static void perf_swevent_put_recursion_context(int *recursion)		{ }
+static inline int  perf_swevent_get_recursion_context(void)  { return -1; }
+static inline void perf_swevent_put_recursion_context(int rctx)		{ }
 
 #endif
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index c222ef5238bf..c3417c13e3ed 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -724,8 +724,8 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 static void ftrace_profile_##call(proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
-	extern int perf_swevent_get_recursion_context(int **recursion); \
-	extern void perf_swevent_put_recursion_context(int *recursion); \
+	extern int perf_swevent_get_recursion_context(void);		\
+	extern void perf_swevent_put_recursion_context(int rctx);	\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	extern void perf_tp_event(int, u64, u64, void *, int);		\
 	struct ftrace_raw_##call *entry;				\
@@ -736,8 +736,8 @@ static void ftrace_profile_##call(proto)				\
 	int __data_size;						\
 	char *trace_buf;						\
 	char *raw_data;							\
-	int *recursion;							\
 	int __cpu;							\
+	int rctx;							\
 	int pc;								\
 									\
 	pc = preempt_count();						\
@@ -753,8 +753,9 @@ static void ftrace_profile_##call(proto)				\
 									\
 	local_irq_save(irq_flags);					\
 									\
-	if (perf_swevent_get_recursion_context(&recursion))		\
-		goto end_recursion;						\
+	rctx = perf_swevent_get_recursion_context();			\
+	if (rctx < 0)							\
+		goto end_recursion;					\
 									\
 	__cpu = smp_processor_id();					\
 									\
@@ -781,9 +782,9 @@ static void ftrace_profile_##call(proto)				\
 	perf_tp_event(event_call->id, __addr, __count, entry,		\
 			     __entry_size);				\
 									\
-end:								\
-	perf_swevent_put_recursion_context(recursion);			\
-end_recursion:									\
+end:									\
+	perf_swevent_put_recursion_context(rctx);			\
+end_recursion:								\
 	local_irq_restore(irq_flags);					\
 									\
 }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 50f11b5f8c3d..0b0d5f72fe7d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3869,45 +3869,50 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 	}
 }
 
-/*
- * Must be called with preemption disabled
- */
-int perf_swevent_get_recursion_context(int **recursion)
+int perf_swevent_get_recursion_context(void)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	int rctx;
 
 	if (in_nmi())
-		*recursion = &cpuctx->recursion[3];
+		rctx = 3;
 	else if (in_irq())
-		*recursion = &cpuctx->recursion[2];
+		rctx = 2;
 	else if (in_softirq())
-		*recursion = &cpuctx->recursion[1];
+		rctx = 1;
 	else
-		*recursion = &cpuctx->recursion[0];
+		rctx = 0;
 
-	if (**recursion)
+	if (cpuctx->recursion[rctx]) {
+		put_cpu_var(perf_cpu_context);
 		return -1;
+	}
 
-	(**recursion)++;
+	cpuctx->recursion[rctx]++;
+	barrier();
 
-	return 0;
+	return rctx;
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-void perf_swevent_put_recursion_context(int *recursion)
+void perf_swevent_put_recursion_context(int rctx)
 {
-	(*recursion)--;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	barrier();
+	cpuctx->recursion[rctx]++;
+	put_cpu_var(perf_cpu_context);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
-			       u64 nr, int nmi,
-			       struct perf_sample_data *data,
-			       struct pt_regs *regs)
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
+	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 	rcu_read_lock();
 	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
 				 nr, nmi, data, regs);
@@ -3921,34 +3926,22 @@ static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
 	rcu_read_unlock();
 }
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-				    u64 nr, int nmi,
-				    struct perf_sample_data *data,
-				    struct pt_regs *regs)
-{
-	int *recursion;
-
-	preempt_disable();
-
-	if (perf_swevent_get_recursion_context(&recursion))
-		goto out;
-
-	__do_perf_sw_event(type, event_id, nr, nmi, data, regs);
-
-	perf_swevent_put_recursion_context(recursion);
-out:
-	preempt_enable();
-}
-
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data;
+	int rctx;
+
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
+		return;
 
 	data.addr = addr;
 	data.raw  = NULL;
 
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+
+	perf_swevent_put_recursion_context(rctx);
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -4172,7 +4165,7 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 		regs = task_pt_regs(current);
 
 	/* Trace events already protected against recursion */
-	__do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 				&data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 22e6f68b05b3..79ce6a2bd74f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1213,7 +1213,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	unsigned long irq_flags;
 	char *trace_buf;
 	char *raw_data;
-	int *recursion;
+	int rctx;
 
 	pc = preempt_count();
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1229,7 +1229,8 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	 */
 	local_irq_save(irq_flags);
 
-	if (perf_swevent_get_recursion_context(&recursion))
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		goto end_recursion;
 
 	__cpu = smp_processor_id();
@@ -1258,7 +1259,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	perf_tp_event(call->id, entry->ip, 1, entry, size);
 
 end:
-	perf_swevent_put_recursion_context(recursion);
+	perf_swevent_put_recursion_context(rctx);
 end_recursion:
 	local_irq_restore(irq_flags);
 
@@ -1276,8 +1277,8 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
 	char *trace_buf;
-	int *recursion;
 	char *raw_data;
+	int rctx;
 
 	pc = preempt_count();
 	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1293,7 +1294,8 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	 */
 	local_irq_save(irq_flags);
 
-	if (perf_swevent_get_recursion_context(&recursion))
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		goto end_recursion;
 
 	__cpu = smp_processor_id();
@@ -1323,7 +1325,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
 
 end:
-	perf_swevent_put_recursion_context(recursion);
+	perf_swevent_put_recursion_context(rctx);
 end_recursion:
 	local_irq_restore(irq_flags);
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 41b6dd963daa..9189cbe86079 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -481,8 +481,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	unsigned long flags;
 	char *trace_buf;
 	char *raw_data;
-	int *recursion;
 	int syscall_nr;
+	int rctx;
 	int size;
 	int cpu;
 
@@ -506,7 +506,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
 
-	if (perf_swevent_get_recursion_context(&recursion))
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		goto end_recursion;
 
 	cpu = smp_processor_id();
@@ -530,7 +531,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 
 end:
-	perf_swevent_put_recursion_context(recursion);
+	perf_swevent_put_recursion_context(rctx);
 end_recursion:
 	local_irq_restore(flags);
 }
@@ -582,7 +583,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	int syscall_nr;
 	char *trace_buf;
 	char *raw_data;
-	int *recursion;
+	int rctx;
 	int size;
 	int cpu;
 
@@ -609,7 +610,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
 
-	if (perf_swevent_get_recursion_context(&recursion))
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		goto end_recursion;
 
 	cpu = smp_processor_id();
@@ -634,7 +636,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
 
 end:
-	perf_swevent_put_recursion_context(recursion);
+	perf_swevent_put_recursion_context(rctx);
 end_recursion:
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3-71-gd317


From acd1d7c1f8f3d848a3c5327dc09f8c1efb971678 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 23 Nov 2009 15:00:36 +0100
Subject: perf_events: Restore sanity to scaling land

It is quite possible to call update_event_times() on a context
that isn't actually running and thereby confuse the thing.

perf stat was reporting !100% scale values for software counters
(2e2af50b perf_events: Disable events when we detach them,
solved the worst of that, but there was still some left).

The thing that happens is that because we are not self-reaping
(we have a caring parent) there is a time between the last
schedule (out) and having do_exit() called which will detach the
events.

This period would be accounted as enabled,!running because the
event->state==INACTIVE, even though !event->ctx->is_active.

Similar issues could have been observed by calling read() on a
event while the attached task was not scheduled in.

Solve this by teaching update_event_times() about
ctx->is_active.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1258984836.4531.480.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 0b0d5f72fe7d..0aafe85362fd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -274,7 +274,12 @@ static void update_event_times(struct perf_event *event)
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
 
-	event->total_time_enabled = ctx->time - event->tstamp_enabled;
+	if (ctx->is_active)
+		run_end = ctx->time;
+	else
+		run_end = event->tstamp_stopped;
+
+	event->total_time_enabled = run_end - event->tstamp_enabled;
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE)
 		run_end = event->tstamp_stopped;
-- 
cgit v1.2.3-71-gd317


From ba6909b719a5ccc0c8100d2895bb7ff557b2eeae Mon Sep 17 00:00:00 2001
From: "K.Prasad" <prasad@linux.vnet.ibm.com>
Date: Mon, 23 Nov 2009 21:17:13 +0530
Subject: hw-breakpoint: Attribute authorship of hw-breakpoint related files

Attribute authorship to developers of hw-breakpoint related
files.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091123154713.GA5593@in.ibm.com>
[ v2: moved it to latest -tip ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/hw_breakpoint.c         | 4 ++++
 kernel/hw_breakpoint.c                  | 4 ++++
 samples/hw_breakpoint/data_breakpoint.c | 2 ++
 3 files changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 752daebe91c6..4d267fb77828 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -16,6 +16,10 @@
  * Copyright (C) 2007 Alan Stern
  * Copyright (C) 2009 IBM Corporation
  * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
  */
 
 /*
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index b6d6fa273eeb..c16662268d75 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -18,6 +18,10 @@
  * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
  *
  * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
  */
 
 /*
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index 5bc9819a819e..95063818bcf4 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -23,6 +23,8 @@
  * that variable.
  *
  * Copyright (C) IBM Corporation, 2009
+ *
+ * Author: K.Prasad <prasad@linux.vnet.ibm.com>
  */
 #include <linux/module.h>	/* Needed by all modules */
 #include <linux/kernel.h>	/* Needed for KERN_INFO */
-- 
cgit v1.2.3-71-gd317


From fdf6bc95229821e3d9405eba28925b76e92b74d0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 23 Nov 2009 15:42:33 +0100
Subject: hw-breakpoints: Check the breakpoint params from perf tools

Perf tools create perf events as disabled in the beginning.
Breakpoints are then considered like ptrace temporary
breakpoints, only meant to reserve a breakpoint slot until we
get all the necessary informations from the user.

In this case, we don't check the address that is breakpointed as
it is NULL in the ptrace case.

But perf tools don't have the same purpose, events are created
disabled to wait for all events to be created before enabling
all of them. We want to check the breakpoint parameters in this
case.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1258987355-8751-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hw_breakpoint.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c16662268d75..06d372fc026d 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -267,7 +267,16 @@ int __register_perf_hw_breakpoint(struct perf_event *bp)
 	if (ret)
 		return ret;
 
-	if (!bp->attr.disabled)
+	/*
+	 * Ptrace breakpoints can be temporary perf events only
+	 * meant to reserve a slot. In this case, it is created disabled and
+	 * we don't want to check the params right now (as we put a null addr)
+	 * But perf tools create events as disabled and we want to check
+	 * the params for them.
+	 * This is a quick hack that will be removed soon, once we remove
+	 * the tmp breakpoints from ptrace
+	 */
+	if (!bp->attr.disabled || bp->callback == perf_bp_event)
 		ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
 
 	return ret;
-- 
cgit v1.2.3-71-gd317


From f5ffe02e5046003ae7e2ce70d3d1c2a73331268b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 23 Nov 2009 15:42:34 +0100
Subject: perf: Add kernel side syscall events support for breakpoints

Add the remaining necessary bits to support breakpoints created
through perf syscall.

We don't use the software counter interface as:

- We don't need to check against recursion, this is already done
  in hardware breakpoints arch level.

- We already know the perf event we are dealing with when the
  event is to be committed.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1258987355-8751-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 0aafe85362fd..9425c9600c89 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3831,6 +3831,20 @@ static int perf_swevent_is_counting(struct perf_event *event)
 static int perf_tp_event_match(struct perf_event *event,
 				struct perf_sample_data *data);
 
+static int perf_exclude_event(struct perf_event *event,
+			      struct pt_regs *regs)
+{
+	if (regs) {
+		if (event->attr.exclude_user && user_mode(regs))
+			return 1;
+
+		if (event->attr.exclude_kernel && !user_mode(regs))
+			return 1;
+	}
+
+	return 0;
+}
+
 static int perf_swevent_match(struct perf_event *event,
 				enum perf_type_id type,
 				u32 event_id,
@@ -3842,16 +3856,12 @@ static int perf_swevent_match(struct perf_event *event,
 
 	if (event->attr.type != type)
 		return 0;
+
 	if (event->attr.config != event_id)
 		return 0;
 
-	if (regs) {
-		if (event->attr.exclude_user && user_mode(regs))
-			return 0;
-
-		if (event->attr.exclude_kernel && !user_mode(regs))
-			return 0;
-	}
+	if (perf_exclude_event(event, regs))
+		return 0;
 
 	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
 	    !perf_tp_event_match(event, data))
@@ -4282,9 +4292,15 @@ static const struct pmu *bp_perf_event_init(struct perf_event *bp)
 	return &perf_ops_bp;
 }
 
-void perf_bp_event(struct perf_event *bp, void *regs)
+void perf_bp_event(struct perf_event *bp, void *data)
 {
-	/* TODO */
+	struct perf_sample_data sample;
+	struct pt_regs *regs = data;
+
+	sample.addr = bp->attr.bp_addr;
+
+	if (!perf_exclude_event(bp, regs))
+		perf_swevent_add(bp, 1, 1, &sample, regs);
 }
 #else
 static void bp_perf_event_destroy(struct perf_event *event)
-- 
cgit v1.2.3-71-gd317


From 429947248f814e90f416ab4f68a871ab628000c3 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Fri, 20 Nov 2009 17:40:37 +0100
Subject: sched_feat_write(): Update ppos instead of file->f_pos

sched_feat_write() should update ppos instead of file->f_pos.

(This reduces some BKL dependencies of this code.)

Signed-off-by: Jan Blunck <jblunck@suse.de>
Cc: jkacur@redhat.com
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
LKML-Reference: <1258735245-25826-8-git-send-email-jblunck@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ab9a034c4a17..93474a7935ae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -771,7 +771,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	if (!sched_feat_names[i])
 		return -EINVAL;
 
-	filp->f_pos += cnt;
+	*ppos += cnt;
 
 	return cnt;
 }
-- 
cgit v1.2.3-71-gd317


From c4a5af54c8ef277a59189fc9358e190f3c1b8206 Mon Sep 17 00:00:00 2001
From: "Andrew G. Morgan" <morgan@kernel.org>
Date: Mon, 23 Nov 2009 04:57:52 +0000
Subject: Silence the existing API for capability version compatibility check.

When libcap, or other libraries attempt to confirm/determine the supported
capability version magic, they generally supply a NULL dataptr to capget().

In this case, while returning the supported/preferred magic (via a
modified header content), the return code of this system call may be 0,
-EINVAL, or -EFAULT.

No libcap code depends on the previous -EINVAL etc. return code, and
all of the above three return codes can accompany a valid (successful)
attempt to determine the requested magic value.

This patch cleans up the system call to return 0, if the call is
successfully being used to determine the supported/preferred capability
magic value.

Signed-off-by: Andrew G. Morgan <morgan@kernel.org>
Acked-by: Steve Grubb <sgrubb@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/capability.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index c2316d3fa094..c450375e855f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -169,8 +169,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 	kernel_cap_t pE, pI, pP;
 
 	ret = cap_validate_magic(header, &tocopy);
-	if (ret != 0)
-		return ret;
+	if ((dataptr == NULL) || (ret != 0))
+		return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
 
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
-- 
cgit v1.2.3-71-gd317


From b3a222e52e4d4be77cc4520a57af1a4a0d8222d1 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 23 Nov 2009 16:21:30 -0600
Subject: remove CONFIG_SECURITY_FILE_CAPABILITIES compile option

As far as I know, all distros currently ship kernels with default
CONFIG_SECURITY_FILE_CAPABILITIES=y.  Since having the option on
leaves a 'no_file_caps' option to boot without file capabilities,
the main reason to keep the option is that turning it off saves
you (on my s390x partition) 5k.  In particular, vmlinux sizes
came to:

without patch fscaps=n:		 	53598392
without patch fscaps=y:		 	53603406
with this patch applied:		53603342

with the security-next tree.

Against this we must weigh the fact that there is no simple way for
userspace to figure out whether file capabilities are supported,
while things like per-process securebits, capability bounding
sets, and adding bits to pI if CAP_SETPCAP is in pE are not supported
with SECURITY_FILE_CAPABILITIES=n, leaving a bit of a problem for
applications wanting to know whether they can use them and/or why
something failed.

It also adds another subtly different set of semantics which we must
maintain at the risk of severe security regressions.

So this patch removes the SECURITY_FILE_CAPABILITIES compile
option.  It drops the kernel size by about 50k over the stock
SECURITY_FILE_CAPABILITIES=y kernel, by removing the
cap_limit_ptraced_target() function.

Changelog:
	Nov 20: remove cap_limit_ptraced_target() as it's logic
		was ifndef'ed.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Andrew G. Morgan" <morgan@kernel.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/capability.h |  2 --
 include/linux/init_task.h  |  4 ---
 kernel/capability.c        |  2 --
 security/Kconfig           |  9 ------
 security/commoncap.c       | 72 ++--------------------------------------------
 5 files changed, 2 insertions(+), 87 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index c8f2a5f70ed5..39e5ff512fbe 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -92,9 +92,7 @@ struct vfs_cap_data {
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
 #define _KERNEL_CAPABILITY_U32S    _LINUX_CAPABILITY_U32S_3
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 extern int file_caps_enabled;
-#endif
 
 typedef struct kernel_cap_struct {
 	__u32 cap[_KERNEL_CAPABILITY_U32S];
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 21a6f5d9af22..8d10aa7fd4c9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -83,16 +83,12 @@ extern struct group_info init_groups;
 #define INIT_IDS
 #endif
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 /*
  * Because of the reduced scope of CAP_SETPCAP when filesystem
  * capabilities are in effect, it is safe to allow CAP_SETPCAP to
  * be available in the default configuration.
  */
 # define CAP_INIT_BSET  CAP_FULL_SET
-#else
-# define CAP_INIT_BSET  CAP_INIT_EFF_SET
-#endif
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 #define INIT_TASK_RCU_PREEMPT(tsk)					\
diff --git a/kernel/capability.c b/kernel/capability.c
index c450375e855f..7f876e60521f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
 EXPORT_SYMBOL(__cap_full_set);
 EXPORT_SYMBOL(__cap_init_eff_set);
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 int file_caps_enabled = 1;
 
 static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
 	return 1;
 }
 __setup("no_file_caps", file_caps_disable);
-#endif
 
 /*
  * More recent versions of libcap are available from:
diff --git a/security/Kconfig b/security/Kconfig
index 95cc08913ca1..226b9556b25f 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -91,15 +91,6 @@ config SECURITY_PATH
 	  implement pathname based access controls.
 	  If you are unsure how to answer this question, answer N.
 
-config SECURITY_FILE_CAPABILITIES
-	bool "File POSIX Capabilities"
-	default n
-	help
-	  This enables filesystem capabilities, allowing you to give
-	  binaries a subset of root's powers without using setuid 0.
-
-	  If in doubt, answer N.
-
 config INTEL_TXT
 	bool "Enable Intel(R) Trusted Execution Technology (Intel(R) TXT)"
 	depends on HAVE_INTEL_TXT
diff --git a/security/commoncap.c b/security/commoncap.c
index 45b87af4ae5d..f800fdb3de94 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -173,7 +173,6 @@ int cap_capget(struct task_struct *target, kernel_cap_t *effective,
  */
 static inline int cap_inh_is_capped(void)
 {
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 
 	/* they are so limited unless the current task has the CAP_SETPCAP
 	 * capability
@@ -181,7 +180,6 @@ static inline int cap_inh_is_capped(void)
 	if (cap_capable(current, current_cred(), CAP_SETPCAP,
 			SECURITY_CAP_AUDIT) == 0)
 		return 0;
-#endif
 	return 1;
 }
 
@@ -239,8 +237,6 @@ static inline void bprm_clear_caps(struct linux_binprm *bprm)
 	bprm->cap_effective = false;
 }
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
-
 /**
  * cap_inode_need_killpriv - Determine if inode change affects privileges
  * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
@@ -421,49 +417,6 @@ out:
 	return rc;
 }
 
-#else
-int cap_inode_need_killpriv(struct dentry *dentry)
-{
-	return 0;
-}
-
-int cap_inode_killpriv(struct dentry *dentry)
-{
-	return 0;
-}
-
-int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
-{
-	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
- 	return -ENODATA;
-}
-
-static inline int get_file_caps(struct linux_binprm *bprm, bool *effective)
-{
-	bprm_clear_caps(bprm);
-	return 0;
-}
-#endif
-
-/*
- * Determine whether a exec'ing process's new permitted capabilities should be
- * limited to just what it already has.
- *
- * This prevents processes that are being ptraced from gaining access to
- * CAP_SETPCAP, unless the process they're tracing already has it, and the
- * binary they're executing has filecaps that elevate it.
- *
- *  Returns 1 if they should be limited, 0 if they are not.
- */
-static inline int cap_limit_ptraced_target(void)
-{
-#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
-	if (capable(CAP_SETPCAP))
-		return 0;
-#endif
-	return 1;
-}
-
 /**
  * cap_bprm_set_creds - Set up the proposed credentials for execve().
  * @bprm: The execution parameters, including the proposed creds
@@ -523,9 +476,8 @@ skip:
 			new->euid = new->uid;
 			new->egid = new->gid;
 		}
-		if (cap_limit_ptraced_target())
-			new->cap_permitted = cap_intersect(new->cap_permitted,
-							   old->cap_permitted);
+		new->cap_permitted = cap_intersect(new->cap_permitted,
+						   old->cap_permitted);
 	}
 
 	new->suid = new->fsuid = new->euid;
@@ -739,7 +691,6 @@ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
 	return 0;
 }
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 /*
  * Rationale: code calling task_setscheduler, task_setioprio, and
  * task_setnice, assumes that
@@ -820,22 +771,6 @@ static long cap_prctl_drop(struct cred *new, unsigned long cap)
 	return 0;
 }
 
-#else
-int cap_task_setscheduler (struct task_struct *p, int policy,
-			   struct sched_param *lp)
-{
-	return 0;
-}
-int cap_task_setioprio (struct task_struct *p, int ioprio)
-{
-	return 0;
-}
-int cap_task_setnice (struct task_struct *p, int nice)
-{
-	return 0;
-}
-#endif
-
 /**
  * cap_task_prctl - Implement process control functions for this security module
  * @option: The process control function requested
@@ -866,7 +801,6 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		error = !!cap_raised(new->cap_bset, arg2);
 		goto no_change;
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 	case PR_CAPBSET_DROP:
 		error = cap_prctl_drop(new, arg2);
 		if (error < 0)
@@ -917,8 +851,6 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		error = new->securebits;
 		goto no_change;
 
-#endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
-
 	case PR_GET_KEEPCAPS:
 		if (issecure(SECURE_KEEP_CAPS))
 			error = 1;
-- 
cgit v1.2.3-71-gd317


From 184d3da8ef0ca552dffa0fdd35c046e058a2cf9a Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 23 Nov 2009 21:40:49 -0800
Subject: perf_events: Fix bogus copy_to_user() in perf_event_read_group()

When using an event group, the value and id for non leaders events
were wrong due to invalid offset into the outgoing buffer.

Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: paulus@samba.org
Cc: perfmon2-devel@lists.sourceforge.net
LKML-Reference: <4b0b71e1.0508d00a.075e.ffff84a3@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9425c9600c89..accfd7bfe387 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1831,7 +1831,7 @@ static int perf_event_read_group(struct perf_event *event,
 
 		size = n * sizeof(u64);
 
-		if (copy_to_user(buf + size, values, size)) {
+		if (copy_to_user(buf + ret, values, size)) {
 			ret = -EFAULT;
 			goto unlock;
 		}
-- 
cgit v1.2.3-71-gd317


From 36ace27e3e60d44ea69ce394b2e45386ae98d9d9 Mon Sep 17 00:00:00 2001
From: Tim Blechmann <tim@klingt.org>
Date: Tue, 24 Nov 2009 11:55:45 +0100
Subject: sched: Optimize branch hint in pick_next_task_fair()

Branch hint profiling on my nehalem machine showed 90%
incorrect branch hints:

  15728471 158903754  90 pick_next_task_fair
  sched_fair.c    1555

Signed-off-by: Tim Blechmann <tim@klingt.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0BBBB1.2050100@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f28a2671a1a6..24086e7e7593 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1704,7 +1704,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!cfs_rq->nr_running)
 		return NULL;
 
 	do {
-- 
cgit v1.2.3-71-gd317


From 710390d90f143a9ebb87a475215140f426792efd Mon Sep 17 00:00:00 2001
From: Tim Blechmann <tim@klingt.org>
Date: Tue, 24 Nov 2009 11:55:27 +0100
Subject: sched: Optimize branch hint in context_switch()

Branch hint profiling on my nehalem machine showed over 90%
incorrect branch hints:

  10420275 170645395  94 context_switch                 sched.c
   3043
  10408421 171098521  94 context_switch                 sched.c
   3050

Signed-off-by: Tim Blechmann <tim@klingt.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0BBB9F.6080304@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 93474a7935ae..010d5e16b4c5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2829,14 +2829,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	arch_start_context_switch(prev);
 
-	if (unlikely(!mm)) {
+	if (likely(!mm)) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 
-	if (unlikely(!prev->mm)) {
+	if (likely(!prev->mm)) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
-- 
cgit v1.2.3-71-gd317


From fe6126722718e51fba4879517c11ac12d9775bcc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 24 Nov 2009 20:38:22 +0100
Subject: perf_events: Fix bad software/trace event recursion counting

Commit 4ed7c92d68a5387ba5f7030dc76eab03558e27f5
(perf_events: Undo some recursion damage) has introduced a bad
reference counting of the recursion context. putting the context
behaves like getting it, dropping every software/trace events
after the first one in a context.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1259091502-5171-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index accfd7bfe387..35df94e344f2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3914,7 +3914,7 @@ void perf_swevent_put_recursion_context(int rctx)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	barrier();
-	cpuctx->recursion[rctx]++;
+	cpuctx->recursion[rctx]--;
 	put_cpu_var(perf_cpu_context);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
-- 
cgit v1.2.3-71-gd317


From 99df5a6a215f026e62287083de2b44b22edd3623 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 25 Nov 2009 01:14:59 -0600
Subject: trace/syscalls: Change ret param in struct syscall_trace_exit to long

Commit ee949a86b3aef15845ea677aa60231008de62672 ("tracing/syscalls:
Use long for syscall ret format and field definitions") changed the
syscall exit return type to long, but forgot to change it in the
struct.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1259133299-23594-3-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4da6ede74401..1d7f4830a80d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -100,7 +100,7 @@ struct syscall_trace_enter {
 struct syscall_trace_exit {
 	struct trace_entry	ent;
 	int			nr;
-	unsigned long		ret;
+	long			ret;
 };
 
 struct kprobe_trace_entry {
-- 
cgit v1.2.3-71-gd317


From 93335a21557e80f6a99bc2812c634e488139043c Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Wed, 25 Nov 2009 15:23:41 +0200
Subject: sched.c: Call debug_show_all_locks() when dumping all tasks

In commit v2.6.21-691-g39bc89f ("make SysRq-T show all tasks
again") the interface of show_state_filter() was changed: zero
valued 'state_filter' specifies "dump all tasks" (instead of -1).

However, the condition for calling debug_show_all_locks() ("show
locks if all tasks are dumped") was not updated accordingly.

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Cc: peterz@infradead.org
LKML-Reference: <4b0d2fe4.0ab6660a.6437.3cfc@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 010d5e16b4c5..a57c6aee6d4a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6915,7 +6915,7 @@ void show_state_filter(unsigned long state_filter)
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
-	if (state_filter == -1)
+	if (!state_filter)
 		debug_show_all_locks();
 }
 
-- 
cgit v1.2.3-71-gd317


From 7ac074340480018681a0d72b324d4487543bdc0e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Nov 2009 13:22:21 -0500
Subject: ring-buffer-benchmark: Add parameters to set produce/consumer
 priorities

Running the ring-buffer-benchmark's threads at the lowest priority may
work well for keeping it in the background, but it is not appropriate
for the benchmarks.

This patch adds 4 parameters to the module:

  consumer_fifo
  consumer_nice
  producer_fifo
  producer_nice

By default the consumer and producer still run at nice +19.

If the *_fifo options are set, they will override the *_nice values.

 modprobe ring_buffer_benchmark consumer_nice=0 producer_fifo=10

The above will set the consumer thread to a nice value of 0, and
the producer thread to a RT SCHED_FIFO priority of 10.

Note, this patch also fixes a bug where calling set_user_nice on the
consumer thread would oops the kernel when the parameter "disable_reader"
is set.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 58 ++++++++++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 3875d49da990..b2477caf09c2 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -39,6 +39,24 @@ static int write_iteration = 50;
 module_param(write_iteration, uint, 0644);
 MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
 
+static int producer_nice = 19;
+static int consumer_nice = 19;
+
+static int producer_fifo = -1;
+static int consumer_fifo = -1;
+
+module_param(producer_nice, uint, 0644);
+MODULE_PARM_DESC(producer_nice, "nice prio for producer");
+
+module_param(consumer_nice, uint, 0644);
+MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
+
+module_param(producer_fifo, uint, 0644);
+MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+
+module_param(consumer_fifo, uint, 0644);
+MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
+
 static int read_events;
 
 static int kill_test;
@@ -270,6 +288,27 @@ static void ring_buffer_producer(void)
 
 	if (kill_test)
 		trace_printk("ERROR!\n");
+
+	if (!disable_reader) {
+		if (consumer_fifo < 0)
+			trace_printk("Running Consumer at nice: %d\n",
+				     consumer_nice);
+		else
+			trace_printk("Running Consumer at SCHED_FIFO %d\n",
+				     consumer_fifo);
+	}
+	if (producer_fifo < 0)
+		trace_printk("Running Producer at nice: %d\n",
+			     producer_nice);
+	else
+		trace_printk("Running Producer at SCHED_FIFO %d\n",
+			     producer_fifo);
+
+	/* Let the user know that the test is running at low priority */
+	if (producer_fifo < 0 && consumer_fifo < 0 &&
+	    producer_nice == 19 && consumer_nice == 19)
+		trace_printk("WARNING!!! This test is running at lowest priority.\n");
+
 	trace_printk("Time:     %lld (usecs)\n", time);
 	trace_printk("Overruns: %lld\n", overruns);
 	if (disable_reader)
@@ -402,8 +441,23 @@ static int __init ring_buffer_benchmark_init(void)
 	/*
 	 * Run them as low-prio background tasks by default:
 	 */
-	set_user_nice(consumer, 19);
-	set_user_nice(producer, 19);
+	if (!disable_reader) {
+		if (consumer_fifo >= 0) {
+			struct sched_param param = {
+				.sched_priority = consumer_fifo
+			};
+			sched_setscheduler(consumer, SCHED_FIFO, &param);
+		} else
+			set_user_nice(consumer, consumer_nice);
+	}
+
+	if (producer_fifo >= 0) {
+		struct sched_param param = {
+			.sched_priority = consumer_fifo
+		};
+		sched_setscheduler(producer, SCHED_FIFO, &param);
+	} else
+		set_user_nice(producer, producer_nice);
 
 	return 0;
 
-- 
cgit v1.2.3-71-gd317


From d99be40aff88722ab03ee295e4f6c13a4cca9a3d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 26 Nov 2009 05:35:40 +0100
Subject: ksym_tracer: Fix breakpoint removal after modification

The error path of a breakpoint modification is broken in
the ksym tracer. A modified breakpoint hlist node is immediately
released after its removal. Also we leak a breakpoint in this
case.

Fix the path.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1259210142-5714-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_ksym.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 11935b53a6cb..9f040e42f516 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -339,14 +339,20 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 					ksym_hbp_handler, true);
 			if (IS_ERR(entry->ksym_hbp))
 				entry->ksym_hbp = NULL;
-			if (!entry->ksym_hbp)
+
+			/* modified without problem */
+			if (entry->ksym_hbp) {
+				ret = 0;
 				goto out;
+			}
+		} else {
+			ret = 0;
 		}
+		/* Error or "symbol:---" case: drop it */
 		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
 		kfree(entry);
-		ret = 0;
 		goto out;
 	} else {
 		/* Check for malformed request: (4) */
-- 
cgit v1.2.3-71-gd317


From c6567f642e20bcc79abed030f44be5b0d6da2ded Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 26 Nov 2009 05:35:41 +0100
Subject: hw-breakpoints: Improve in-kernel event creation error granularity

In fail case, perf_event_create_kernel_counter() returns NULL
instead of an error, which doesn't help us to inform the user
about the origin of the problem from the outer most callers.
Often we can just return -EINVAL, which doesn't help anyone when
it's eventually about a memory allocation failure.

Then, this patch makes perf_event_create_kernel_counter() always
return a detailed error code.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1259210142-5714-2-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 35df94e344f2..34a1b9d7633e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4780,14 +4780,17 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 */
 
 	ctx = find_get_context(pid, cpu);
-	if (IS_ERR(ctx))
-		return NULL;
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto err_exit;
+	}
 
 	event = perf_event_alloc(attr, cpu, ctx, NULL,
 				     NULL, callback, GFP_KERNEL);
-	err = PTR_ERR(event);
-	if (IS_ERR(event))
+	if (IS_ERR(event)) {
+		err = PTR_ERR(event);
 		goto err_put_context;
+	}
 
 	event->filp = NULL;
 	WARN_ON_ONCE(ctx->parent_ctx);
@@ -4804,11 +4807,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
-err_put_context:
-	if (err < 0)
-		put_ctx(ctx);
-
-	return NULL;
+ err_put_context:
+	put_ctx(ctx);
+ err_exit:
+	return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 
-- 
cgit v1.2.3-71-gd317


From 605bfaee9078cd0b01d83402315389839ee4bb5c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 26 Nov 2009 05:35:42 +0100
Subject: hw-breakpoints: Simplify error handling in breakpoint creation
 requests

This simplifies the error handling when we create a breakpoint.
We don't need to check the NULL return value corner case anymore
since we have improved perf_event_create_kernel_counter() to
always return an error code in the failure case.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1259210142-5714-3-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c                |  8 +-------
 kernel/hw_breakpoint.c                  |  4 ++--
 kernel/trace/trace_ksym.c               | 16 ++++------------
 samples/hw_breakpoint/data_breakpoint.c |  3 ---
 4 files changed, 7 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b25f8947ed7a..75e0cd847bd6 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -657,10 +657,7 @@ restore:
 					       tsk, true);
 		thread->ptrace_bps[i] = NULL;
 
-		if (!bp) { /* incorrect bp, or we have a bug in bp API */
-			rc = -EINVAL;
-			break;
-		}
+		/* Incorrect bp, or we have a bug in bp API */
 		if (IS_ERR(bp)) {
 			rc = PTR_ERR(bp);
 			bp = NULL;
@@ -729,9 +726,6 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 					       tsk,
 					       bp->attr.disabled);
 	}
-
-	if (!bp)
-		return -EIO;
 	/*
 	 * CHECKME: the previous code returned -EIO if the addr wasn't a
 	 * valid task virtual addr. The new one will return -EINVAL in this
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 06d372fc026d..dd3fb4a999d3 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -442,7 +442,7 @@ register_wide_hw_breakpoint(unsigned long addr,
 
 		*pevent = bp;
 
-		if (IS_ERR(bp) || !bp) {
+		if (IS_ERR(bp)) {
 			err = PTR_ERR(bp);
 			goto fail;
 		}
@@ -453,7 +453,7 @@ register_wide_hw_breakpoint(unsigned long addr,
 fail:
 	for_each_possible_cpu(cpu) {
 		pevent = per_cpu_ptr(cpu_events, cpu);
-		if (IS_ERR(*pevent) || !*pevent)
+		if (IS_ERR(*pevent))
 			break;
 		unregister_hw_breakpoint(*pevent);
 	}
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 9f040e42f516..c538b15b95d6 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -200,12 +200,9 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 	entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr,
 					entry->len, entry->type,
 					ksym_hbp_handler, true);
+
 	if (IS_ERR(entry->ksym_hbp)) {
-		entry->ksym_hbp = NULL;
 		ret = PTR_ERR(entry->ksym_hbp);
-	}
-
-	if (!entry->ksym_hbp) {
 		printk(KERN_INFO "ksym_tracer request failed. Try again"
 					" later!!\n");
 		goto err;
@@ -332,21 +329,16 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 	if (changed) {
 		unregister_wide_hw_breakpoint(entry->ksym_hbp);
 		entry->type = op;
+		ret = 0;
 		if (op > 0) {
 			entry->ksym_hbp =
 				register_wide_hw_breakpoint(entry->ksym_addr,
 					entry->len, entry->type,
 					ksym_hbp_handler, true);
 			if (IS_ERR(entry->ksym_hbp))
-				entry->ksym_hbp = NULL;
-
-			/* modified without problem */
-			if (entry->ksym_hbp) {
-				ret = 0;
+				ret = PTR_ERR(entry->ksym_hbp);
+			else
 				goto out;
-			}
-		} else {
-			ret = 0;
 		}
 		/* Error or "symbol:---" case: drop it */
 		ksym_filter_entry_count--;
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index 95063818bcf4..ee7f9fbaffbd 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -61,9 +61,6 @@ static int __init hw_break_module_init(void)
 	if (IS_ERR(sample_hbp)) {
 		ret = PTR_ERR(sample_hbp);
 		goto fail;
-	} else if (!sample_hbp) {
-		ret = -EINVAL;
-		goto fail;
 	}
 
 	printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
-- 
cgit v1.2.3-71-gd317


From 11e6635763bdc0e24b39a38876574660755acffc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 25 Nov 2009 23:01:50 -0800
Subject: kernel/hw_breakpoint.c: Fix local/global shadowing

If the new percpu tree is combined with the perf events tree
the following new warning triggers:

 kernel/hw_breakpoint.c: In function 'toggle_bp_task_slot':
 kernel/hw_breakpoint.c:151: warning: 'task_bp_pinned' is used uninitialized in this function

Because it's not valid anymore to define a local variable
and a percpu variable (even if it's file scope local) with
the same name.

Rename the local variable to resolve this.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <200911260701.nAQ71owx016356@imap1.linux-foundation.org>
[ v2: added changelog ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hw_breakpoint.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index dd3fb4a999d3..32e1018191be 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -121,7 +121,7 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
 	int count = 0;
 	struct perf_event *bp;
 	struct perf_event_context *ctx = tsk->perf_event_ctxp;
-	unsigned int *task_bp_pinned;
+	unsigned int *tsk_pinned;
 	struct list_head *list;
 	unsigned long flags;
 
@@ -146,15 +146,15 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
 	if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
 		return;
 
-	task_bp_pinned = per_cpu(task_bp_pinned, cpu);
+	tsk_pinned = per_cpu(task_bp_pinned, cpu);
 	if (enable) {
-		task_bp_pinned[count]++;
+		tsk_pinned[count]++;
 		if (count > 0)
-			task_bp_pinned[count-1]--;
+			tsk_pinned[count-1]--;
 	} else {
-		task_bp_pinned[count]--;
+		tsk_pinned[count]--;
 		if (count > 0)
-			task_bp_pinned[count-1]++;
+			tsk_pinned[count-1]++;
 	}
 }
 
-- 
cgit v1.2.3-71-gd317


From f6630114d9198aa959ac95c131334c020038f253 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 17 Nov 2009 18:22:15 -0600
Subject: sched: Limit the number of scheduler debug messages

Remove the verbose scheduler debug messages unless kernel
parameter "sched_debug" set.  /proc/sched_debug unchanged.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091118002221.489305000@alcatraz.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt |  2 ++
 kernel/sched.c                      | 13 +++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9107b387e91f..f2a9507b27b2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2182,6 +2182,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	sbni=		[NET] Granch SBNI12 leased line adapter
 
+	sched_debug	[KNL] Enables verbose scheduler debug messages.
+
 	sc1200wdt=	[HW,WDT] SC1200 WDT (watchdog) driver
 			Format: <io>[,<timeout>[,<isapnp>]]
 
diff --git a/kernel/sched.c b/kernel/sched.c
index a57c6aee6d4a..48ff66a6892d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7720,6 +7720,16 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
+static __read_mostly int sched_domain_debug_enabled;
+
+static int __init sched_domain_debug_setup(char *str)
+{
+	sched_domain_debug_enabled = 1;
+
+	return 0;
+}
+early_param("sched_debug", sched_domain_debug_setup);
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
@@ -7806,6 +7816,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 	cpumask_var_t groupmask;
 	int level = 0;
 
+	if (!sched_domain_debug_enabled)
+		return;
+
 	if (!sd) {
 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
 		return;
-- 
cgit v1.2.3-71-gd317


From feae3203d711db0a9965300ee6d592257fdaae4f Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 17 Nov 2009 18:22:13 -0600
Subject: timers, init: Limit the number of per cpu calibration bootup messages

Limit the number of per cpu calibration messages by only
printing out results for the first cpu to boot.

Also, don't print "CPUx is down" as this is expected, and we
don't need 4096 reminders... ;-)

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091118002219.889552000@alcatraz.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/calibrate.c | 24 +++++++++++++++---------
 kernel/cpu.c     |  5 ++---
 2 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/init/calibrate.c b/init/calibrate.c
index a379c9061199..6eb48e53d61c 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -123,23 +123,26 @@ void __cpuinit calibrate_delay(void)
 {
 	unsigned long ticks, loopbit;
 	int lps_precision = LPS_PREC;
+	static bool printed;
 
 	if (preset_lpj) {
 		loops_per_jiffy = preset_lpj;
-		printk(KERN_INFO
-			"Calibrating delay loop (skipped) preset value.. ");
-	} else if ((smp_processor_id() == 0) && lpj_fine) {
+		if (!printed)
+			pr_info("Calibrating delay loop (skipped) "
+				"preset value.. ");
+	} else if ((!printed) && lpj_fine) {
 		loops_per_jiffy = lpj_fine;
-		printk(KERN_INFO
-			"Calibrating delay loop (skipped), "
+		pr_info("Calibrating delay loop (skipped), "
 			"value calculated using timer frequency.. ");
 	} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
-		printk(KERN_INFO
-			"Calibrating delay using timer specific routine.. ");
+		if (!printed)
+			pr_info("Calibrating delay using timer "
+				"specific routine.. ");
 	} else {
 		loops_per_jiffy = (1<<12);
 
-		printk(KERN_INFO "Calibrating delay loop... ");
+		if (!printed)
+			pr_info("Calibrating delay loop... ");
 		while ((loops_per_jiffy <<= 1) != 0) {
 			/* wait for "start of" clock tick */
 			ticks = jiffies;
@@ -170,7 +173,10 @@ void __cpuinit calibrate_delay(void)
 				loops_per_jiffy &= ~loopbit;
 		}
 	}
-	printk(KERN_CONT "%lu.%02lu BogoMIPS (lpj=%lu)\n",
+	if (!printed)
+		pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
 			loops_per_jiffy/(500000/HZ),
 			(loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy);
+
+	printed = true;
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..7c4e2713df0a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -392,10 +392,9 @@ int disable_nonboot_cpus(void)
 		if (cpu == first_cpu)
 			continue;
 		error = _cpu_down(cpu, 1);
-		if (!error) {
+		if (!error)
 			cpumask_set_cpu(cpu, frozen_cpus);
-			printk("CPU%d is down\n", cpu);
-		} else {
+		else {
 			printk(KERN_ERR "Error taking CPU%d down: %d\n",
 				cpu, error);
 			break;
-- 
cgit v1.2.3-71-gd317


From 80bbf6b641c8843b9d751a1f299aa7ee073ab9d4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Nov 2009 21:20:53 +0100
Subject: hw-breakpoints: Fix unused function in off-case

bp_perf_event_destroy() is unused in its off-case version, let's
remove it to fix the following warning reported by Stephen
Rothwell in linux-next:

  kernel/perf_event.c:4306: warning: 'bp_perf_event_destroy' defined but not used

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1259180453-5813-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 34a1b9d7633e..f8c7939c57cf 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4303,10 +4303,6 @@ void perf_bp_event(struct perf_event *bp, void *data)
 		perf_swevent_add(bp, 1, 1, &sample, regs);
 }
 #else
-static void bp_perf_event_destroy(struct perf_event *event)
-{
-}
-
 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
 {
 	return NULL;
-- 
cgit v1.2.3-71-gd317


From d1eb650ff4130972fa21462fa49cd35a2865403b Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 24 Nov 2009 16:56:45 -0500
Subject: tracepoint: Move signal sending tracepoint to events/signal.h

Move signal sending event to events/signal.h. This patch also
renames sched_signal_send event to signal_generate.

Changes in v4:
 - Fix a typo of task_struct pointer.

Changes in v3:
 - Add docbook style comments

Changes in v2:
 - Add siginfo argument
 - Add siginfo storing macro

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Reviewed-by: Jason Baron <jbaron@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <20091124215645.30449.60208.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/DocBook/tracepoint.tmpl |  5 +++
 include/trace/events/sched.h          | 25 -------------
 include/trace/events/signal.h         | 66 +++++++++++++++++++++++++++++++++++
 kernel/signal.c                       |  5 +--
 4 files changed, 74 insertions(+), 27 deletions(-)
 create mode 100644 include/trace/events/signal.h

(limited to 'kernel')

diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
index b0756d0fd579..8bca1d5cec09 100644
--- a/Documentation/DocBook/tracepoint.tmpl
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -86,4 +86,9 @@
 !Iinclude/trace/events/irq.h
   </chapter>
 
+  <chapter id="signal">
+   <title>SIGNAL</title>
+!Iinclude/trace/events/signal.h
+  </chapter>
+
 </book>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9d316b22388c..cfceb0b73e20 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -287,31 +287,6 @@ TRACE_EVENT(sched_process_fork,
 		__entry->child_comm, __entry->child_pid)
 );
 
-/*
- * Tracepoint for sending a signal:
- */
-TRACE_EVENT(sched_signal_send,
-
-	TP_PROTO(int sig, struct task_struct *p),
-
-	TP_ARGS(sig, p),
-
-	TP_STRUCT__entry(
-		__field(	int,	sig			)
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid	= p->pid;
-		__entry->sig	= sig;
-	),
-
-	TP_printk("sig=%d comm=%s pid=%d",
-		  __entry->sig, __entry->comm, __entry->pid)
-);
-
 /*
  * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
  *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
new file mode 100644
index 000000000000..ef51756a801d
--- /dev/null
+++ b/include/trace/events/signal.h
@@ -0,0 +1,66 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM signal
+
+#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SIGNAL_H
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#define TP_STORE_SIGINFO(__entry, info)				\
+	do {							\
+		if (info == SEND_SIG_NOINFO) {			\
+			__entry->errno	= 0;			\
+			__entry->code	= SI_USER;		\
+		} else if (info == SEND_SIG_PRIV) {		\
+			__entry->errno	= 0;			\
+			__entry->code	= SI_KERNEL;		\
+		} else {					\
+			__entry->errno	= info->si_errno;	\
+			__entry->code	= info->si_code;	\
+		}						\
+	} while (0)
+
+/**
+ * signal_generate - called when a signal is generated
+ * @sig: signal number
+ * @info: pointer to struct siginfo
+ * @task: pointer to struct task_struct
+ *
+ * Current process sends a 'sig' signal to 'task' process with
+ * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV,
+ * 'info' is not a pointer and you can't access its field. Instead,
+ * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV
+ * means that si_code is SI_KERNEL.
+ */
+TRACE_EVENT(signal_generate,
+
+	TP_PROTO(int sig, struct siginfo *info, struct task_struct *task),
+
+	TP_ARGS(sig, info, task),
+
+	TP_STRUCT__entry(
+		__field(	int,	sig			)
+		__field(	int,	errno			)
+		__field(	int,	code			)
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+	),
+
+	TP_fast_assign(
+		__entry->sig	= sig;
+		TP_STORE_SIGINFO(__entry, info);
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->pid	= task->pid;
+	),
+
+	TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d",
+		  __entry->sig, __entry->errno, __entry->code,
+		  __entry->comm, __entry->pid)
+);
+
+#endif /* _TRACE_SIGNAL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 6705320784fd..a1e0cc6b32c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,8 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
-#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/signal.h>
 
 #include <asm/param.h>
 #include <asm/uaccess.h>
@@ -834,7 +835,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	struct sigqueue *q;
 	int override_rlimit;
 
-	trace_sched_signal_send(sig, t);
+	trace_signal_generate(sig, info, t);
 
 	assert_spin_locked(&t->sighand->siglock);
 
-- 
cgit v1.2.3-71-gd317


From f9d4257e01d266e67420cc99d456b6d4c8464f54 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 24 Nov 2009 16:56:51 -0500
Subject: tracepoint: Add signal deliver event

Add a tracepoint where a process gets a signal. This tracepoint
shows signal-number, sa-handler and sa-flag.

Changes in v3:
 - Add docbook style comments

Changes in v2:
 - Add siginfo argument
 - Fix comment

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Reviewed-by: Jason Baron <jbaron@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <20091124215651.30449.20926.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/events/signal.h | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/signal.c               |  3 +++
 2 files changed, 42 insertions(+)

(limited to 'kernel')

diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
index ef51756a801d..a6d71de0dc0d 100644
--- a/include/trace/events/signal.h
+++ b/include/trace/events/signal.h
@@ -60,6 +60,45 @@ TRACE_EVENT(signal_generate,
 		  __entry->comm, __entry->pid)
 );
 
+/**
+ * signal_deliver - called when a signal is delivered
+ * @sig: signal number
+ * @info: pointer to struct siginfo
+ * @ka: pointer to struct k_sigaction
+ *
+ * A 'sig' signal is delivered to current process with 'info' siginfo,
+ * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or
+ * SIG_DFL.
+ * Note that some signals reported by signal_generate tracepoint can be
+ * lost, ignored or modified (by debugger) before hitting this tracepoint.
+ * This means, this can show which signals are actually delivered, but
+ * matching generated signals and delivered signals may not be correct.
+ */
+TRACE_EVENT(signal_deliver,
+
+	TP_PROTO(int sig, struct siginfo *info, struct k_sigaction *ka),
+
+	TP_ARGS(sig, info, ka),
+
+	TP_STRUCT__entry(
+		__field(	int,		sig		)
+		__field(	int,		errno		)
+		__field(	int,		code		)
+		__field(	unsigned long,	sa_handler	)
+		__field(	unsigned long,	sa_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->sig	= sig;
+		TP_STORE_SIGINFO(__entry, info);
+		__entry->sa_handler	= (unsigned long)ka->sa.sa_handler;
+		__entry->sa_flags	= ka->sa.sa_flags;
+	),
+
+	TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx",
+		  __entry->sig, __entry->errno, __entry->code,
+		  __entry->sa_handler, __entry->sa_flags)
+);
 #endif /* _TRACE_SIGNAL_H */
 
 /* This part must be outside protection */
diff --git a/kernel/signal.c b/kernel/signal.c
index a1e0cc6b32c3..349d44937406 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1840,6 +1840,9 @@ relock:
 			ka = &sighand->action[signr-1];
 		}
 
+		/* Trace actually delivered signals. */
+		trace_signal_deliver(signr, info, ka);
+
 		if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
 			continue;
 		if (ka->sa.sa_handler != SIG_DFL) {
-- 
cgit v1.2.3-71-gd317


From ba005e1f417295d28cd1563ab82bc33af07fb16a Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 24 Nov 2009 16:56:58 -0500
Subject: tracepoint: Add signal loss events

Add signal_overflow_fail and signal_lose_info tracepoints
for signal-lost events.

Changes in v3:
 - Add docbook style comments

Changes in v2:
 - Use siginfo string macro

Suggested-by: Roland McGrath <roland@redhat.com>
Reviewed-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <20091124215658.30449.9934.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/events/signal.h | 68 +++++++++++++++++++++++++++++++++++++++++++
 kernel/signal.c               | 19 ++++++++----
 2 files changed, 82 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
index a6d71de0dc0d..a510b75ac304 100644
--- a/include/trace/events/signal.h
+++ b/include/trace/events/signal.h
@@ -99,6 +99,74 @@ TRACE_EVENT(signal_deliver,
 		  __entry->sig, __entry->errno, __entry->code,
 		  __entry->sa_handler, __entry->sa_flags)
 );
+
+/**
+ * signal_overflow_fail - called when signal queue is overflow
+ * @sig: signal number
+ * @group: signal to process group or not (bool)
+ * @info: pointer to struct siginfo
+ *
+ * Kernel fails to generate 'sig' signal with 'info' siginfo, because
+ * siginfo queue is overflow, and the signal is dropped.
+ * 'group' is not 0 if the signal will be sent to a process group.
+ * 'sig' is always one of RT signals.
+ */
+TRACE_EVENT(signal_overflow_fail,
+
+	TP_PROTO(int sig, int group, struct siginfo *info),
+
+	TP_ARGS(sig, group, info),
+
+	TP_STRUCT__entry(
+		__field(	int,	sig	)
+		__field(	int,	group	)
+		__field(	int,	errno	)
+		__field(	int,	code	)
+	),
+
+	TP_fast_assign(
+		__entry->sig	= sig;
+		__entry->group	= group;
+		TP_STORE_SIGINFO(__entry, info);
+	),
+
+	TP_printk("sig=%d group=%d errno=%d code=%d",
+		  __entry->sig, __entry->group, __entry->errno, __entry->code)
+);
+
+/**
+ * signal_lose_info - called when siginfo is lost
+ * @sig: signal number
+ * @group: signal to process group or not (bool)
+ * @info: pointer to struct siginfo
+ *
+ * Kernel generates 'sig' signal but loses 'info' siginfo, because siginfo
+ * queue is overflow.
+ * 'group' is not 0 if the signal will be sent to a process group.
+ * 'sig' is always one of non-RT signals.
+ */
+TRACE_EVENT(signal_lose_info,
+
+	TP_PROTO(int sig, int group, struct siginfo *info),
+
+	TP_ARGS(sig, group, info),
+
+	TP_STRUCT__entry(
+		__field(	int,	sig	)
+		__field(	int,	group	)
+		__field(	int,	errno	)
+		__field(	int,	code	)
+	),
+
+	TP_fast_assign(
+		__entry->sig	= sig;
+		__entry->group	= group;
+		TP_STORE_SIGINFO(__entry, info);
+	),
+
+	TP_printk("sig=%d group=%d errno=%d code=%d",
+		  __entry->sig, __entry->group, __entry->errno, __entry->code)
+);
 #endif /* _TRACE_SIGNAL_H */
 
 /* This part must be outside protection */
diff --git a/kernel/signal.c b/kernel/signal.c
index 349d44937406..93e72e5feae6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -897,12 +897,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			break;
 		}
 	} else if (!is_si_special(info)) {
-		if (sig >= SIGRTMIN && info->si_code != SI_USER)
-		/*
-		 * Queue overflow, abort.  We may abort if the signal was rt
-		 * and sent by user using something other than kill().
-		 */
+		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
+			/*
+			 * Queue overflow, abort.  We may abort if the
+			 * signal was rt and sent by user using something
+			 * other than kill().
+			 */
+			trace_signal_overflow_fail(sig, group, info);
 			return -EAGAIN;
+		} else {
+			/*
+			 * This is a silent loss of information.  We still
+			 * send the signal, but the *info bits are lost.
+			 */
+			trace_signal_lose_info(sig, group, info);
+		}
 	}
 
 out_set:
-- 
cgit v1.2.3-71-gd317


From d180c5bccec02612256fd8076ff3c1fac3429553 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:48:30 +0900
Subject: sched: Introduce task_times() to replace task_{u,s}time() pair

Functions task_{u,s}time() are called in pair in almost all
cases.  However task_stime() is implemented to call task_utime()
from its inside, so such paired calls run task_utime() twice.

It means we do heavy divisions (div_u64 + do_div) twice to get
utime and stime which can be obtained at same time by one set
of divisions.

This patch introduces a function task_times(*tsk, *utime,
*stime) to retrieve utime and stime at once in better, optimized
way.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
LKML-Reference: <4B0E16AE.906@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c       |  3 +--
 include/linux/sched.h |  1 +
 kernel/exit.c         |  7 +++++--
 kernel/sched.c        | 55 ++++++++++++++++++++++++++++++++-------------------
 kernel/sys.c          |  3 +--
 5 files changed, 43 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index e209f64ab27b..330deda70d08 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -535,8 +535,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (!whole) {
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		utime = task_utime(task);
-		stime = task_stime(task);
+		task_times(task, &utime, &stime);
 		gtime = task_gtime(task);
 	}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78ba664474f3..fe6ae1516640 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1723,6 +1723,7 @@ static inline void put_task_struct(struct task_struct *t)
 extern cputime_t task_utime(struct task_struct *p);
 extern cputime_t task_stime(struct task_struct *p);
 extern cputime_t task_gtime(struct task_struct *p);
+extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..29068ab2670a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,6 +91,8 @@ static void __exit_signal(struct task_struct *tsk)
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
 	else {
+		cputime_t utime, stime;
+
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
@@ -110,8 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, task_utime(tsk));
-		sig->stime = cputime_add(sig->stime, task_stime(tsk));
+		task_times(tsk, &utime, &stime);
+		sig->utime = cputime_add(sig->utime, utime);
+		sig->stime = cputime_add(sig->stime, stime);
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
diff --git a/kernel/sched.c b/kernel/sched.c
index 315ba4059f93..475a6f2b7158 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5191,6 +5191,14 @@ cputime_t task_stime(struct task_struct *p)
 {
 	return p->stime;
 }
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	if (ut)
+		*ut = task_utime(p);
+	if (st)
+		*st = task_stime(p);
+}
 #else
 
 #ifndef nsecs_to_cputime
@@ -5198,41 +5206,48 @@ cputime_t task_stime(struct task_struct *p)
 	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
 #endif
 
-cputime_t task_utime(struct task_struct *p)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	cputime_t utime = p->utime, total = utime + p->stime;
-	u64 temp;
+	cputime_t rtime, utime = p->utime, total = utime + p->stime;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
+	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
-		temp *= utime;
+		u64 temp;
+
+		temp = (u64)(rtime * utime);
 		do_div(temp, total);
-	}
-	utime = (cputime_t)temp;
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
 
+	/*
+	 * Compare with previous values, to keep monotonicity:
+	 */
 	p->prev_utime = max(p->prev_utime, utime);
-	return p->prev_utime;
+	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+	if (ut)
+		*ut = p->prev_utime;
+	if (st)
+		*st = p->prev_stime;
+}
+
+cputime_t task_utime(struct task_struct *p)
+{
+	cputime_t utime;
+	task_times(p, &utime, NULL);
+	return utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
 	cputime_t stime;
-
-	/*
-	 * Use CFS's precise accounting. (we subtract utime from
-	 * the total, to make sure the total observed by userspace
-	 * grows monotonically - apps rely on that):
-	 */
-	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
-
-	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, stime);
-
-	return p->prev_stime;
+	task_times(p, NULL, &stime);
+	return stime;
 }
 #endif
 
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760d9c51..bbdfce0d4347 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1346,8 +1346,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	utime = stime = cputime_zero;
 
 	if (who == RUSAGE_THREAD) {
-		utime = task_utime(current);
-		stime = task_stime(current);
+		task_times(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
 		maxrss = p->signal->maxrss;
 		goto out;
-- 
cgit v1.2.3-71-gd317


From d5b7c78e975302a1bab28263266c39ecb71acad4 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:49:05 +0900
Subject: sched: Remove task_{u,s,g}time()

Now all task_{u,s}time() pairs are replaced by task_times().
And task_gtime() is too simple to be an inline function.

Cleanup them all.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
LKML-Reference: <4B0E16D1.70902@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c       |  4 ++--
 include/linux/sched.h |  3 ---
 kernel/exit.c         |  2 +-
 kernel/sched.c        | 33 ++-------------------------------
 4 files changed, 5 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 330deda70d08..ca61a88aed66 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				gtime = cputime_add(gtime, task_gtime(t));
+				gtime = cputime_add(gtime, t->gtime);
 				t = next_thread(t);
 			} while (t != task);
 
@@ -536,7 +536,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
 		task_times(task, &utime, &stime);
-		gtime = task_gtime(task);
+		gtime = task->gtime;
 	}
 
 	/* scale priority and nice values from timeslices to -20..20 */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe6ae1516640..0395b0f4df3a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1720,9 +1720,6 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
-extern cputime_t task_utime(struct task_struct *p);
-extern cputime_t task_stime(struct task_struct *p);
-extern cputime_t task_gtime(struct task_struct *p);
 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index 29068ab2670a..2eaf68b634e3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,7 +115,7 @@ static void __exit_signal(struct task_struct *tsk)
 		task_times(tsk, &utime, &stime);
 		sig->utime = cputime_add(sig->utime, utime);
 		sig->stime = cputime_add(sig->stime, stime);
-		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
+		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
diff --git a/kernel/sched.c b/kernel/sched.c
index 475a6f2b7158..82251c21f785 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5182,22 +5182,12 @@ void account_idle_ticks(unsigned long ticks)
  * Use precise platform statistics if available:
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-cputime_t task_utime(struct task_struct *p)
-{
-	return p->utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
-	return p->stime;
-}
-
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	if (ut)
-		*ut = task_utime(p);
+		*ut = p->utime;
 	if (st)
-		*st = task_stime(p);
+		*st = p->stime;
 }
 #else
 
@@ -5235,27 +5225,8 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	if (st)
 		*st = p->prev_stime;
 }
-
-cputime_t task_utime(struct task_struct *p)
-{
-	cputime_t utime;
-	task_times(p, &utime, NULL);
-	return utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
-	cputime_t stime;
-	task_times(p, NULL, &stime);
-	return stime;
-}
 #endif
 
-inline cputime_t task_gtime(struct task_struct *p)
-{
-	return p->gtime;
-}
-
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
-- 
cgit v1.2.3-71-gd317


From b7b20df91d43d5e59578b8fc16e895c0c8cbd9b5 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:49:27 +0900
Subject: sched, time: Define nsecs_to_jiffies()

Use of msecs_to_jiffies() for nsecs_to_cputime() have some
problems:

 - The type of msecs_to_jiffies()'s argument is unsigned int, so
   it cannot convert msecs greater than UINT_MAX = about 49.7 days.

 - msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument
   is set, assuming that input was negative value.  So it cannot
   convert msecs greater than INT_MAX = about 24.8 days too.

This patch defines a new function nsecs_to_jiffies() that can
deal greater values, and that can deal all incoming values as
unsigned.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Amrico Wang <xiyou.wangcong@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <johnstul@linux.vnet.ibm.com>
LKML-Reference: <4B0E16E7.5070307@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/jiffies.h |  1 +
 kernel/sched.c          |  3 +--
 kernel/time.c           | 30 ++++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 1a9cf78bfce5..6811f4bfc6e7 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
 extern unsigned long clock_t_to_jiffies(unsigned long x);
 extern u64 jiffies_64_to_clock_t(u64 x);
 extern u64 nsec_to_clock_t(u64 x);
+extern unsigned long nsecs_to_jiffies(u64 n);
 
 #define TIMESTAMP_SIZE	30
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 82251c21f785..b3d4e2be95aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5192,8 +5192,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 #else
 
 #ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) \
-	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
 #endif
 
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..804798005d19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x)
 #endif
 }
 
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n:	nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+#if (NSEC_PER_SEC % HZ) == 0
+	/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
+	return div_u64(n, NSEC_PER_SEC / HZ);
+#elif (HZ % 512) == 0
+	/* overflow after 292 years if HZ = 1024 */
+	return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
+#else
+	/*
+	 * Generic case - optimized for cases where HZ is a multiple of 3.
+	 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
+	 */
+	return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
+#endif
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
-- 
cgit v1.2.3-71-gd317


From b2e74a265ded1a185f762ebaab967e9e0d008dd8 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 26 Nov 2009 09:24:30 -0800
Subject: perf_events: Fix read() bogus counts when in error state

When a pinned group cannot be scheduled it goes into error state.

Normally a group cannot go out of error state without being
explicitly re-enabled or disabled. There was a bug in per-thread
mode, whereby upon termination of the thread, the group would
transition from error to off leading to bogus counts and timing
information returned by read().

Fix it by clearing the error state.

Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: perfmon2-devel@lists.sourceforge.net
LKML-Reference: <4b0eb9ce.0508d00a.573b.ffffeab6@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f8c7939c57cf..0b9ca2d834dd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -338,7 +338,16 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 		event->group_leader->nr_siblings--;
 
 	update_event_times(event);
-	event->state = PERF_EVENT_STATE_OFF;
+
+	/*
+	 * If event was in error state, then keep it
+	 * that way, otherwise bogus counts will be
+	 * returned on read(). The only way to get out
+	 * of error state is by explicit re-enabling
+	 * of the event
+	 */
+	if (event->state > PERF_EVENT_STATE_OFF)
+		event->state = PERF_EVENT_STATE_OFF;
 
 	/*
 	 * If this was a group event with sibling events then
-- 
cgit v1.2.3-71-gd317


From e5af02261668350b43eb7381648930bde8e872f7 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Fri, 27 Nov 2009 13:28:20 +1100
Subject: softlockup: Fix hung_task_check_count sysctl

I'm seeing spikes of up to 0.5ms in khungtaskd on a large
machine. To reduce this source of jitter I tried setting
hung_task_check_count to 0:

 # echo 0 > /proc/sys/kernel/hung_task_check_count

which didn't have the intended response. Change to a post
increment of max_count, so a value of 0 means check 0 tasks.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: msb@google.com
LKML-Reference: <20091127022820.GU32182@kryten>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d4e841747400..0c642d51aac2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 
 	rcu_read_lock();
 	do_each_thread(g, t) {
-		if (!--max_count)
+		if (!max_count--)
 			goto unlock;
 		if (!--batch_count) {
 			batch_count = HUNG_TASK_BATCHING;
-- 
cgit v1.2.3-71-gd317


From 5fa10b28e57f94a90535cfeafe89dcee9f47d540 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Nov 2009 04:55:53 +0100
Subject: hw-breakpoints: Use struct perf_event_attr to define user breakpoints

In-kernel user breakpoints are created using functions in which
we pass breakpoint parameters as individual variables: address,
length and type.

Although it fits well for x86, this just does not scale across
archictectures that may support this api later as these may have
more or different needs. Pass in a perf_event_attr structure
instead because it is meant to evolve as much as possible into
a generic hardware breakpoint parameter structure.

Reported-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1259294154-5197-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c      | 74 ++++++++++++++++++++----------------
 include/linux/hw_breakpoint.h | 36 ++++++++----------
 kernel/hw_breakpoint.c        | 87 +++++++++----------------------------------
 3 files changed, 75 insertions(+), 122 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 75e0cd847bd6..2941b32ea666 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -593,6 +593,34 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 	return dr7;
 }
 
+static struct perf_event *
+ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+			 struct task_struct *tsk)
+{
+	int err;
+	int gen_len, gen_type;
+	DEFINE_BREAKPOINT_ATTR(attr);
+
+	/*
+	 * We shoud have at least an inactive breakpoint at this
+	 * slot. It means the user is writing dr7 without having
+	 * written the address register first
+	 */
+	if (!bp)
+		return ERR_PTR(-EINVAL);
+
+	err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+	if (err)
+		return ERR_PTR(err);
+
+	attr = bp->attr;
+	attr.bp_len = gen_len;
+	attr.bp_type = gen_type;
+	attr.disabled = 0;
+
+	return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
+}
+
 /*
  * Handle ptrace writes to debug register 7.
  */
@@ -603,7 +631,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 	int i, orig_ret = 0, rc = 0;
 	int enabled, second_pass = 0;
 	unsigned len, type;
-	int gen_len, gen_type;
 	struct perf_event *bp;
 
 	data &= ~DR_CONTROL_RESERVED;
@@ -634,33 +661,12 @@ restore:
 			continue;
 		}
 
-		/*
-		 * We shoud have at least an inactive breakpoint at this
-		 * slot. It means the user is writing dr7 without having
-		 * written the address register first
-		 */
-		if (!bp) {
-			rc = -EINVAL;
-			break;
-		}
-
-		rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
-		if (rc)
-			break;
-
-		/*
-		 * This is a temporary thing as bp is unregistered/registered
-		 * to simulate modification
-		 */
-		bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len,
-					       gen_type, bp->callback,
-					       tsk, true);
-		thread->ptrace_bps[i] = NULL;
+		bp = ptrace_modify_breakpoint(bp, len, type, tsk);
 
 		/* Incorrect bp, or we have a bug in bp API */
 		if (IS_ERR(bp)) {
 			rc = PTR_ERR(bp);
-			bp = NULL;
+			thread->ptrace_bps[i] = NULL;
 			break;
 		}
 		thread->ptrace_bps[i] = bp;
@@ -707,24 +713,26 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 {
 	struct perf_event *bp;
 	struct thread_struct *t = &tsk->thread;
+	DEFINE_BREAKPOINT_ATTR(attr);
 
 	if (!t->ptrace_bps[nr]) {
 		/*
 		 * Put stub len and type to register (reserve) an inactive but
 		 * correct bp
 		 */
-		bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1,
-						 HW_BREAKPOINT_W,
-						 ptrace_triggered, tsk,
-						 false);
+		attr.bp_addr = addr;
+		attr.bp_len = HW_BREAKPOINT_LEN_1;
+		attr.bp_type = HW_BREAKPOINT_W;
+		attr.disabled = 1;
+
+		bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
 	} else {
 		bp = t->ptrace_bps[nr];
 		t->ptrace_bps[nr] = NULL;
-		bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len,
-					       bp->attr.bp_type,
-					       bp->callback,
-					       tsk,
-					       bp->attr.disabled);
+
+		attr = bp->attr;
+		attr.bp_addr = addr;
+		bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
 	}
 	/*
 	 * CHECKME: the previous code returned -EIO if the addr wasn't a
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index c9f7f7c7b0e0..5da472e434b7 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -20,6 +20,14 @@ enum {
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
+/* As it's for in-kernel or ptrace use, we want it to be pinned */
+#define DEFINE_BREAKPOINT_ATTR(name)	\
+struct perf_event_attr name = {		\
+	.type = PERF_TYPE_BREAKPOINT,	\
+	.size = sizeof(name),		\
+	.pinned = 1,			\
+};
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
@@ -36,22 +44,16 @@ static inline int hw_breakpoint_len(struct perf_event *bp)
 }
 
 extern struct perf_event *
-register_user_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
+register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_callback_t triggered,
-			    struct task_struct *tsk,
-			    bool active);
+			    struct task_struct *tsk);
 
 /* FIXME: only change from the attr, and don't unregister */
 extern struct perf_event *
 modify_user_hw_breakpoint(struct perf_event *bp,
-			  unsigned long addr,
-			  int len,
-			  int type,
+			  struct perf_event_attr *attr,
 			  perf_callback_t triggered,
-			  struct task_struct *tsk,
-			  bool active);
+			  struct task_struct *tsk);
 
 /*
  * Kernel breakpoints are not associated with any particular thread.
@@ -89,20 +91,14 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
 #else /* !CONFIG_HAVE_HW_BREAKPOINT */
 
 static inline struct perf_event *
-register_user_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
+register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_callback_t triggered,
-			    struct task_struct *tsk,
-			    bool active)		{ return NULL; }
+			    struct task_struct *tsk)	{ return NULL; }
 static inline struct perf_event *
 modify_user_hw_breakpoint(struct perf_event *bp,
-			  unsigned long addr,
-			  int len,
-			  int type,
+			  struct perf_event_attr *attr,
 			  perf_callback_t triggered,
-			  struct task_struct *tsk,
-			  bool active)			{ return NULL; }
+			  struct task_struct *tsk)	{ return NULL; }
 static inline struct perf_event *
 register_wide_hw_breakpoint_cpu(unsigned long addr,
 				int len,
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 32e1018191be..2a47514f12fd 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -289,90 +289,32 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 	return __register_perf_hw_breakpoint(bp);
 }
 
-/*
- * Register a breakpoint bound to a task and a given cpu.
- * If cpu is -1, the breakpoint is active for the task in every cpu
- * If the task is -1, the breakpoint is active for every tasks in the given
- * cpu.
- */
-static struct perf_event *
-register_user_hw_breakpoint_cpu(unsigned long addr,
-				int len,
-				int type,
-				perf_callback_t triggered,
-				pid_t pid,
-				int cpu,
-				bool active)
-{
-	struct perf_event_attr *attr;
-	struct perf_event *bp;
-
-	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-	if (!attr)
-		return ERR_PTR(-ENOMEM);
-
-	attr->type = PERF_TYPE_BREAKPOINT;
-	attr->size = sizeof(*attr);
-	attr->bp_addr = addr;
-	attr->bp_len = len;
-	attr->bp_type = type;
-	/*
-	 * Such breakpoints are used by debuggers to trigger signals when
-	 * we hit the excepted memory op. We can't miss such events, they
-	 * must be pinned.
-	 */
-	attr->pinned = 1;
-
-	if (!active)
-		attr->disabled = 1;
-
-	bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered);
-	kfree(attr);
-
-	return bp;
-}
-
 /**
  * register_user_hw_breakpoint - register a hardware breakpoint for user space
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @active: should we activate it while registering it
- *
  */
 struct perf_event *
-register_user_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
+register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_callback_t triggered,
-			    struct task_struct *tsk,
-			    bool active)
+			    struct task_struct *tsk)
 {
-	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
-					       tsk->pid, -1, active);
+	return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
 /**
  * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
  * @bp: the breakpoint structure to modify
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: new breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @active: should we activate it while registering it
  */
 struct perf_event *
-modify_user_hw_breakpoint(struct perf_event *bp,
-			  unsigned long addr,
-			  int len,
-			  int type,
+modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
 			  perf_callback_t triggered,
-			  struct task_struct *tsk,
-			  bool active)
+			  struct task_struct *tsk)
 {
 	/*
 	 * FIXME: do it without unregistering
@@ -381,8 +323,7 @@ modify_user_hw_breakpoint(struct perf_event *bp,
 	 */
 	unregister_hw_breakpoint(bp);
 
-	return register_user_hw_breakpoint(addr, len, type, triggered,
-					   tsk, active);
+	return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
 
@@ -406,8 +347,16 @@ register_kernel_hw_breakpoint_cpu(unsigned long addr,
 				  int cpu,
 				  bool active)
 {
-	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
-					       -1, cpu, active);
+	DEFINE_BREAKPOINT_ATTR(attr);
+
+	attr.bp_addr = addr;
+	attr.bp_len = len;
+	attr.bp_type = type;
+
+	if (!active)
+		attr.disabled = 1;
+
+	return perf_event_create_kernel_counter(&attr, cpu, -1, triggered);
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From dd1853c3f493f6d22d9e5390b192a07b73d2ac0a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Nov 2009 04:55:54 +0100
Subject: hw-breakpoints: Use struct perf_event_attr to define kernel
 breakpoints

Kernel breakpoints are created using functions in which we pass
breakpoint parameters as individual variables: address, length
and type.

Although it fits well for x86, this just does not scale across
architectures that may support this api later as these may have
more or different needs. Pass in a perf_event_attr structure
instead because it is meant to evolve as much as possible into
a generic hardware breakpoint parameter structure.

Reported-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1259294154-5197-2-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hw_breakpoint.h           | 35 ++++++++++++---------------
 kernel/hw_breakpoint.c                  | 35 ++++-----------------------
 kernel/trace/trace_ksym.c               | 42 ++++++++++++++++-----------------
 samples/hw_breakpoint/data_breakpoint.c | 10 ++++----
 4 files changed, 44 insertions(+), 78 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 5da472e434b7..a03daed08c59 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -28,6 +28,13 @@ struct perf_event_attr name = {		\
 	.pinned = 1,			\
 };
 
+static inline void hw_breakpoint_init(struct perf_event_attr *attr)
+{
+	attr->type = PERF_TYPE_BREAKPOINT;
+	attr->size = sizeof(*attr);
+	attr->pinned = 1;
+}
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
@@ -59,19 +66,13 @@ modify_user_hw_breakpoint(struct perf_event *bp,
  * Kernel breakpoints are not associated with any particular thread.
  */
 extern struct perf_event *
-register_wide_hw_breakpoint_cpu(unsigned long addr,
-				int len,
-				int type,
+register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
 				perf_callback_t triggered,
-				int cpu,
-				bool active);
+				int cpu);
 
 extern struct perf_event **
-register_wide_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
-			    perf_callback_t triggered,
-			    bool active);
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+			    perf_callback_t triggered);
 
 extern int register_perf_hw_breakpoint(struct perf_event *bp);
 extern int __register_perf_hw_breakpoint(struct perf_event *bp);
@@ -100,18 +101,12 @@ modify_user_hw_breakpoint(struct perf_event *bp,
 			  perf_callback_t triggered,
 			  struct task_struct *tsk)	{ return NULL; }
 static inline struct perf_event *
-register_wide_hw_breakpoint_cpu(unsigned long addr,
-				int len,
-				int type,
+register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
 				perf_callback_t triggered,
-				int cpu,
-				bool active)		{ return NULL; }
+				int cpu)		{ return NULL; }
 static inline struct perf_event **
-register_wide_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
-			    perf_callback_t triggered,
-			    bool active)		{ return NULL; }
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+			    perf_callback_t triggered)	{ return NULL; }
 static inline int
 register_perf_hw_breakpoint(struct perf_event *bp)	{ return -ENOSYS; }
 static inline int
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2a47514f12fd..cf5ee1628411 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -339,42 +339,16 @@ void unregister_hw_breakpoint(struct perf_event *bp)
 }
 EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
 
-static struct perf_event *
-register_kernel_hw_breakpoint_cpu(unsigned long addr,
-				  int len,
-				  int type,
-				  perf_callback_t triggered,
-				  int cpu,
-				  bool active)
-{
-	DEFINE_BREAKPOINT_ATTR(attr);
-
-	attr.bp_addr = addr;
-	attr.bp_len = len;
-	attr.bp_type = type;
-
-	if (!active)
-		attr.disabled = 1;
-
-	return perf_event_create_kernel_counter(&attr, cpu, -1, triggered);
-}
-
 /**
  * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
- * @active: should we activate it while registering it
  *
  * @return a set of per_cpu pointers to perf events
  */
 struct perf_event **
-register_wide_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
-			    perf_callback_t triggered,
-			    bool active)
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+			    perf_callback_t triggered)
 {
 	struct perf_event **cpu_events, **pevent, *bp;
 	long err;
@@ -386,8 +360,7 @@ register_wide_hw_breakpoint(unsigned long addr,
 
 	for_each_possible_cpu(cpu) {
 		pevent = per_cpu_ptr(cpu_events, cpu);
-		bp = register_kernel_hw_breakpoint_cpu(addr, len, type,
-					triggered, cpu, active);
+		bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
 
 		*pevent = bp;
 
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index c538b15b95d6..ddfa0fd43bc0 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -42,9 +42,7 @@
 
 struct trace_ksym {
 	struct perf_event	**ksym_hbp;
-	unsigned long		ksym_addr;
-	int			type;
-	int			len;
+	struct perf_event_attr	attr;
 #ifdef CONFIG_PROFILE_KSYM_TRACER
 	unsigned long		counter;
 #endif
@@ -71,7 +69,7 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
-		if ((entry->ksym_addr == hbp_hit_addr) &&
+		if ((entry->attr.bp_addr == hbp_hit_addr) &&
 		    (entry->counter <= MAX_UL_INT)) {
 			entry->counter++;
 			break;
@@ -192,14 +190,15 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 	if (!entry)
 		return -ENOMEM;
 
-	entry->type = op;
-	entry->ksym_addr = addr;
-	entry->len = HW_BREAKPOINT_LEN_4;
+	hw_breakpoint_init(&entry->attr);
+
+	entry->attr.bp_type = op;
+	entry->attr.bp_addr = addr;
+	entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
 
 	ret = -EAGAIN;
-	entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr,
-					entry->len, entry->type,
-					ksym_hbp_handler, true);
+	entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
+					ksym_hbp_handler);
 
 	if (IS_ERR(entry->ksym_hbp)) {
 		ret = PTR_ERR(entry->ksym_hbp);
@@ -236,12 +235,12 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
 	mutex_lock(&ksym_tracer_mutex);
 
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr);
-		if (entry->type == HW_BREAKPOINT_R)
+		ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
+		if (entry->attr.bp_type == HW_BREAKPOINT_R)
 			ret = trace_seq_puts(s, "r--\n");
-		else if (entry->type == HW_BREAKPOINT_W)
+		else if (entry->attr.bp_type == HW_BREAKPOINT_W)
 			ret = trace_seq_puts(s, "-w-\n");
-		else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
+		else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
 			ret = trace_seq_puts(s, "rw-\n");
 		WARN_ON_ONCE(!ret);
 	}
@@ -317,9 +316,9 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 
 	ret = -EINVAL;
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		if (entry->ksym_addr == ksym_addr) {
+		if (entry->attr.bp_addr == ksym_addr) {
 			/* Check for malformed request: (6) */
-			if (entry->type != op)
+			if (entry->attr.bp_type != op)
 				changed = 1;
 			else
 				goto out;
@@ -328,13 +327,12 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 	}
 	if (changed) {
 		unregister_wide_hw_breakpoint(entry->ksym_hbp);
-		entry->type = op;
+		entry->attr.bp_type = op;
 		ret = 0;
 		if (op > 0) {
 			entry->ksym_hbp =
-				register_wide_hw_breakpoint(entry->ksym_addr,
-					entry->len, entry->type,
-					ksym_hbp_handler, true);
+				register_wide_hw_breakpoint(&entry->attr,
+					ksym_hbp_handler);
 			if (IS_ERR(entry->ksym_hbp))
 				ret = PTR_ERR(entry->ksym_hbp);
 			else
@@ -489,7 +487,7 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v)
 
 	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
 
-	access_type = entry->type;
+	access_type = entry->attr.bp_type;
 
 	switch (access_type) {
 	case HW_BREAKPOINT_R:
@@ -505,7 +503,7 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v)
 		seq_puts(m, "  NA          ");
 	}
 
-	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
+	if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
 		seq_printf(m, "  %-36s", fn_name);
 	else
 		seq_printf(m, "  %-36s", "<NA>");
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index ee7f9fbaffbd..29525500df00 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -51,13 +51,13 @@ static void sample_hbp_handler(struct perf_event *temp, void *data)
 static int __init hw_break_module_init(void)
 {
 	int ret;
-	unsigned long addr;
+	DEFINE_BREAKPOINT_ATTR(attr);
 
-	addr = kallsyms_lookup_name(ksym_name);
+	attr.bp_addr = kallsyms_lookup_name(ksym_name);
+	attr.bp_len = HW_BREAKPOINT_LEN_4;
+	attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
 
-	sample_hbp = register_wide_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4,
-						 HW_BREAKPOINT_W | HW_BREAKPOINT_R,
-						 sample_hbp_handler, true);
+	sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler);
 	if (IS_ERR(sample_hbp)) {
 		ret = PTR_ERR(sample_hbp);
 		goto fail;
-- 
cgit v1.2.3-71-gd317


From 0f1ef51d244809f417bdf45cdb00109fb6005672 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 26 Nov 2009 15:49:33 +0800
Subject: trace_syscalls: Add syscall nr field

Field syscall number is missed in syscall_enter_define_fields()/
syscall_exit_define_fields().

Syscall number is also needed for event filter or other users.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4B0E330D.1070206@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9189cbe86079..63aa8070365d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -261,6 +261,10 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
 	if (ret)
 		return ret;
 
+	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+	if (ret)
+		return ret;
+
 	for (i = 0; i < meta->nb_args; i++) {
 		ret = trace_define_field(call, meta->types[i],
 					 meta->args[i], offset,
@@ -281,6 +285,10 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
 	if (ret)
 		return ret;
 
+	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+	if (ret)
+		return ret;
+
 	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
 				 FILTER_OTHER);
 
-- 
cgit v1.2.3-71-gd317


From abab9d37d2a826fcf588c5f30152dbe05c40111c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Nov 2009 16:32:21 +0800
Subject: trace_kprobes: Fix memory leak

tp->nr_args is not set before we "goto error",
it causes memory leak for free_trace_probe() use tp->nr_args
to free memory of args.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0CEB95.2060107@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 79ce6a2bd74f..82e85836d05e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -704,10 +704,12 @@ static int create_trace_probe(int argc, char **argv)
 		ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
 		if (ret) {
 			pr_info("Parse error at argument%d. (%d)\n", i, ret);
+			kfree(tp->args[i].name);
 			goto error;
 		}
+
+		tp->nr_args++;
 	}
-	tp->nr_args = i;
 
 	ret = register_trace_probe(tp);
 	if (ret)
-- 
cgit v1.2.3-71-gd317


From 3d9b2e1ddf42dd3df38af7794fa5e39cce760f3b Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Nov 2009 16:32:47 +0800
Subject: trace_kprobes: Always show group name

Sometimes the group name is not "kprobes",
It'll be better if we can read it from tracing/kprobe_events.

 # echo 'r:laijs/vfs_read vfs_read %ax' > kprobe_events
 # cat kprobe_events
 r:laijs/vfs_read vfs_read %ax=%ax

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0CEBAF.6000104@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 82e85836d05e..96e1944229be 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -760,7 +760,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	char buf[MAX_ARGSTR_LEN + 1];
 
 	seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
-	seq_printf(m, ":%s", tp->call.name);
+	seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
 
 	if (tp->symbol)
 		seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
-- 
cgit v1.2.3-71-gd317


From 52a11f354970e7301e1d1a029b87535be45abec9 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Nov 2009 16:33:15 +0800
Subject: trace_kprobes: Don't output zero offset

"symbol_name+0" is not so friendly.
It makes the output longer.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0CEBCB.7080309@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 96e1944229be..72d0c65c8676 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -243,7 +243,11 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
 		ret = snprintf(buf, n, "@0x%p", ff->data);
 	else if (ff->func == fetch_symbol) {
 		struct symbol_cache *sc = ff->data;
-		ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
+		if (sc->offset)
+			ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
+					sc->offset);
+		else
+			ret = snprintf(buf, n, "@%s", sc->symbol);
 	} else if (ff->func == fetch_retvalue)
 		ret = snprintf(buf, n, "$retval");
 	else if (ff->func == fetch_stack_address)
@@ -762,10 +766,12 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
 	seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
 
-	if (tp->symbol)
+	if (!tp->symbol)
+		seq_printf(m, " 0x%p", tp->rp.kp.addr);
+	else if (tp->rp.kp.offset)
 		seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
 	else
-		seq_printf(m, " 0x%p", tp->rp.kp.addr);
+		seq_printf(m, " %s", probe_symbol(tp));
 
 	for (i = 0; i < tp->nr_args; i++) {
 		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
-- 
cgit v1.2.3-71-gd317


From 8e7cac79808b62f242069a6ac88d364d35621371 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 29 Nov 2009 16:34:48 +0200
Subject: core: Fix user return notifier on fork()

fork() clones all thread_info flags, including
TIF_USER_RETURN_NOTIFY; if the new task is first scheduled on a cpu
which doesn't have user return notifiers set, this causes user
return notifiers to trigger without any way of clearing itself.

This is easy to trigger with a forky workload on the host in
parallel with kvm, resulting in a cpu in an endless loop on the
verge of returning to userspace.

Fix by dropping the TIF_USER_RETURN_NOTIFY immediately after fork.

Signed-off-by: Avi Kivity <avi@redhat.com>
LKML-Reference: <1259505288-16559-1-git-send-email-avi@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/user-return-notifier.h | 7 +++++++
 kernel/fork.c                        | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
index b6ac056291d7..9c4a445bb43c 100644
--- a/include/linux/user-return-notifier.h
+++ b/include/linux/user-return-notifier.h
@@ -26,6 +26,11 @@ static inline void propagate_user_return_notify(struct task_struct *prev,
 
 void fire_user_return_notifiers(void);
 
+static inline void clear_user_return_notifier(struct task_struct *p)
+{
+	clear_tsk_thread_flag(p, TIF_USER_RETURN_NOTIFY);
+}
+
 #else
 
 struct user_return_notifier {};
@@ -37,6 +42,8 @@ static inline void propagate_user_return_notify(struct task_struct *prev,
 
 static inline void fire_user_return_notifiers(void) {}
 
+static inline void clear_user_return_notifier(struct task_struct *p) {}
+
 #endif
 
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 266c6af6ef1b..1b7512d5a64a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
 #include <linux/magic.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		goto out;
 
 	setup_thread_stack(tsk, orig);
+	clear_user_return_notifier(tsk);
 	stackend = end_of_stack(tsk);
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
-- 
cgit v1.2.3-71-gd317


From ba8665d7dd95eb6093ee06f8f624b6acb1e73206 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 30 Nov 2009 19:19:20 -0500
Subject: trace_kprobes: Fix a memory leak bug and check kstrdup() return value

Fix a memory leak case in create_trace_probe(). When an argument
is too long (> MAX_ARGSTR_LEN), it just jumps to error path. In
that case tp->args[i].name is not released.
This also fixes a bug to check kstrdup()'s return value.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091201001919.10235.56455.stgit@harusame>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 72d0c65c8676..aff5f80b59b8 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -483,7 +483,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
 	return ret;
 }
 
-static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+/* Recursive argument parser */
+static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 {
 	int ret = 0;
 	unsigned long param;
@@ -543,7 +544,7 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 			if (!id)
 				return -ENOMEM;
 			id->offset = offset;
-			ret = parse_probe_arg(arg, &id->orig, is_return);
+			ret = __parse_probe_arg(arg, &id->orig, is_return);
 			if (ret)
 				kfree(id);
 			else {
@@ -560,6 +561,16 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 	return ret;
 }
 
+/* String length checking wrapper */
+static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+	if (strlen(arg) > MAX_ARGSTR_LEN) {
+		pr_info("Argument is too long.: %s\n",  arg);
+		return -ENOSPC;
+	}
+	return __parse_probe_arg(arg, ff, is_return);
+}
+
 /* Return 1 if name is reserved or already used by another argument */
 static int conflict_field_name(const char *name,
 			       struct probe_arg *args, int narg)
@@ -698,13 +709,14 @@ static int create_trace_probe(int argc, char **argv)
 		}
 
 		tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
-
-		/* Parse fetch argument */
-		if (strlen(arg) > MAX_ARGSTR_LEN) {
-			pr_info("Argument%d(%s) is too long.\n", i, arg);
-			ret = -ENOSPC;
+		if (!tp->args[i].name) {
+			pr_info("Failed to allocate argument%d name '%s'.\n",
+				i, argv[i]);
+			ret = -ENOMEM;
 			goto error;
 		}
+
+		/* Parse fetch argument */
 		ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
 		if (ret) {
 			pr_info("Parse error at argument%d. (%d)\n", i, ret);
-- 
cgit v1.2.3-71-gd317


From 59d069eb5ae9b033ed1c124c92e1532c4a958991 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 17:30:08 +0800
Subject: perf_event: Initialize data.period in perf_swevent_hrtimer()

In current code in perf_swevent_hrtimer(), data.period is not
initialized, The result is obvious wrong:

 # ./perf record -f -e cpu-clock make
 # ./perf report
 # Samples: 1740
 #
 # Overhead   Command                                   ......
 # ........  ........  ..........................................
 #
   1025422183050275328.00%        sh  libc-2.9.90.so ...
   1025422183050275328.00%      perl  libperl.so     ...
   1025422168240043264.00%      perl  [kernel]       ...
   1025422030011210752.00%      perl  [kernel]       ...

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org>
LKML-Reference: <4B14E220.2050107@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 0b9ca2d834dd..040ee517c808 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4010,6 +4010,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	event->pmu->read(event);
 
 	data.addr = 0;
+	data.period = event->hw.last_period;
 	regs = get_irq_regs();
 	/*
 	 * In case we exclude kernel IPs or are somehow not in interrupt
-- 
cgit v1.2.3-71-gd317


From fa1dae4906982b5d896c07613b1fe42456133b1c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Dec 2009 13:52:08 +0000
Subject: SLOW_WORK: Fix the CONFIG_MODULES=n case

Commits 3d7a641 ("SLOW_WORK: Wait for outstanding work items belonging to a
module to clear") introduced some code to make sure that all of a module's
slow-work items were complete before that module was removed, and commit
3bde31a ("SLOW_WORK: Allow a requeueable work item to sleep till the thread is
needed") further extended that, breaking it in the process if CONFIG_MODULES=n:

    CC      kernel/slow-work.o
  kernel/slow-work.c: In function 'slow_work_execute':
  kernel/slow-work.c:313: error: 'slow_work_thread_processing' undeclared (first use in this function)
  kernel/slow-work.c:313: error: (Each undeclared identifier is reported only once
  kernel/slow-work.c:313: error: for each function it appears in.)
  kernel/slow-work.c: In function 'slow_work_wait_for_items':
  kernel/slow-work.c:950: error: 'slow_work_unreg_sync_lock' undeclared (first use in this function)
  kernel/slow-work.c:951: error: 'slow_work_unreg_wq' undeclared (first use in this function)
  kernel/slow-work.c:961: error: 'slow_work_unreg_work_item' undeclared (first use in this function)
  kernel/slow-work.c:974: error: 'slow_work_unreg_module' undeclared (first use in this function)
  kernel/slow-work.c:977: error: 'slow_work_thread_processing' undeclared (first use in this function)
  make[1]: *** [kernel/slow-work.o] Error 1

Fix this by:

 (1) Extracting the bits of slow_work_execute() that are contingent on
     CONFIG_MODULES, and the bits that should be, into inline functions and
     placing them into the #ifdef'd section that defines the relevant variables
     and adding stubs for moduleless kernels.  This allows the removal of some
     #ifdefs.

 (2) #ifdef'ing out the contents of slow_work_wait_for_items() in moduleless
     kernels.

The four functions related to handling module unloading synchronisation (and
their associated variables) could be offloaded into a separate .c file, but
each function is only used once and three of them are tiny, so doing so would
prevent them from being inlined.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/slow-work.c | 46 +++++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index da94f3c101af..b5c17f15f9de 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -109,6 +109,30 @@ static struct module *slow_work_unreg_module;
 static struct slow_work *slow_work_unreg_work_item;
 static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
 static DEFINE_MUTEX(slow_work_unreg_sync_lock);
+
+static void slow_work_set_thread_processing(int id, struct slow_work *work)
+{
+	if (work)
+		slow_work_thread_processing[id] = work->owner;
+}
+static void slow_work_done_thread_processing(int id, struct slow_work *work)
+{
+	struct module *module = slow_work_thread_processing[id];
+
+	slow_work_thread_processing[id] = NULL;
+	smp_mb();
+	if (slow_work_unreg_work_item == work ||
+	    slow_work_unreg_module == module)
+		wake_up_all(&slow_work_unreg_wq);
+}
+static void slow_work_clear_thread_processing(int id)
+{
+	slow_work_thread_processing[id] = NULL;
+}
+#else
+static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
+static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
+static void slow_work_clear_thread_processing(int id) {}
 #endif
 
 /*
@@ -197,9 +221,6 @@ static unsigned slow_work_calc_vsmax(void)
  */
 static noinline bool slow_work_execute(int id)
 {
-#ifdef CONFIG_MODULES
-	struct module *module;
-#endif
 	struct slow_work *work = NULL;
 	unsigned vsmax;
 	bool very_slow;
@@ -236,10 +257,7 @@ static noinline bool slow_work_execute(int id)
 		very_slow = false; /* avoid the compiler warning */
 	}
 
-#ifdef CONFIG_MODULES
-	if (work)
-		slow_work_thread_processing[id] = work->owner;
-#endif
+	slow_work_set_thread_processing(id, work);
 	if (work) {
 		slow_work_mark_time(work);
 		slow_work_begin_exec(id, work);
@@ -287,15 +305,7 @@ static noinline bool slow_work_execute(int id)
 
 	/* sort out the race between module unloading and put_ref() */
 	slow_work_put_ref(work);
-
-#ifdef CONFIG_MODULES
-	module = slow_work_thread_processing[id];
-	slow_work_thread_processing[id] = NULL;
-	smp_mb();
-	if (slow_work_unreg_work_item == work ||
-	    slow_work_unreg_module == module)
-		wake_up_all(&slow_work_unreg_wq);
-#endif
+	slow_work_done_thread_processing(id, work);
 
 	return true;
 
@@ -310,7 +320,7 @@ auto_requeue:
 	else
 		list_add_tail(&work->link, &slow_work_queue);
 	spin_unlock_irq(&slow_work_queue_lock);
-	slow_work_thread_processing[id] = NULL;
+	slow_work_clear_thread_processing(id);
 	return true;
 }
 
@@ -943,6 +953,7 @@ EXPORT_SYMBOL(slow_work_register_user);
  */
 static void slow_work_wait_for_items(struct module *module)
 {
+#ifdef CONFIG_MODULES
 	DECLARE_WAITQUEUE(myself, current);
 	struct slow_work *work;
 	int loop;
@@ -989,6 +1000,7 @@ static void slow_work_wait_for_items(struct module *module)
 
 	remove_wait_queue(&slow_work_unreg_wq, &myself);
 	mutex_unlock(&slow_work_unreg_sync_lock);
+#endif /* CONFIG_MODULES */
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From f13a48bd798a159291ca583b95453171b88b7448 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Dec 2009 15:36:11 +0000
Subject: SLOW_WORK: Move slow_work's proc file to debugfs

Move slow_work's debugging proc file to debugfs.

Signed-off-by: David Howells <dhowells@redhat.com>
Requested-and-acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/slow-work.txt |   4 +-
 include/linux/slow-work.h   |   8 +-
 init/Kconfig                |   8 +-
 kernel/Makefile             |   2 +-
 kernel/slow-work-debugfs.c  | 227 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/slow-work-proc.c     | 227 --------------------------------------------
 kernel/slow-work.c          |  18 ++--
 kernel/slow-work.h          |   6 +-
 8 files changed, 253 insertions(+), 247 deletions(-)
 create mode 100644 kernel/slow-work-debugfs.c
 delete mode 100644 kernel/slow-work-proc.c

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 52bc31433723..9dbf4470c7e1 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -279,9 +279,9 @@ The slow-work thread pool has a number of configurables:
 VIEWING EXECUTING AND QUEUED ITEMS
 ==================================
 
-If CONFIG_SLOW_WORK_PROC is enabled, a proc file is made available:
+If CONFIG_SLOW_WORK_DEBUG is enabled, a debugfs file is made available:
 
-	/proc/slow_work_rq
+	/sys/kernel/debug/slow_work/runqueue
 
 through which the list of work items being executed and the queues of items to
 be executed may be viewed.  The owner of a work item is given the chance to
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index 5035a2691739..13337bf6c3f5 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -20,7 +20,7 @@
 #include <linux/timer.h>
 
 struct slow_work;
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 struct seq_file;
 #endif
 
@@ -42,8 +42,8 @@ struct slow_work_ops {
 	/* execute a work item */
 	void (*execute)(struct slow_work *work);
 
-#ifdef CONFIG_SLOW_WORK_PROC
-	/* describe a work item for /proc */
+#ifdef CONFIG_SLOW_WORK_DEBUG
+	/* describe a work item for debugfs */
 	void (*desc)(struct slow_work *work, struct seq_file *m);
 #endif
 };
@@ -64,7 +64,7 @@ struct slow_work {
 #define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	struct timespec		mark;	/* jiffies at which queued or exec begun */
 #endif
 };
diff --git a/init/Kconfig b/init/Kconfig
index ab5c64801fe5..39923ccc287b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1098,12 +1098,12 @@ config SLOW_WORK
 
 	  See Documentation/slow-work.txt.
 
-config SLOW_WORK_PROC
-	bool "Slow work debugging through /proc"
+config SLOW_WORK_DEBUG
+	bool "Slow work debugging through debugfs"
 	default n
-	depends on SLOW_WORK && PROC_FS
+	depends on SLOW_WORK && DEBUG_FS
 	help
-	  Display the contents of the slow work run queue through /proc,
+	  Display the contents of the slow work run queue through debugfs,
 	  including items currently executing.
 
 	  See Documentation/slow-work.txt.
diff --git a/kernel/Makefile b/kernel/Makefile
index 776ffed1556d..d7c13d249b2d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_PROC) += slow-work-proc.o
+obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
new file mode 100644
index 000000000000..e45c43645298
--- /dev/null
+++ b/kernel/slow-work-debugfs.c
@@ -0,0 +1,227 @@
+/* Slow work debugging
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include "slow-work.h"
+
+#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
+#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
+#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
+
+void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
+{
+	seq_puts(m, "Slow-work: New thread");
+}
+
+/*
+ * Render the time mark field on a work item into a 5-char time with units plus
+ * a space
+ */
+static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
+{
+	struct timespec now, diff;
+
+	now = CURRENT_TIME;
+	diff = timespec_sub(now, work->mark);
+
+	if (diff.tv_sec < 0)
+		seq_puts(m, "  -ve ");
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
+		seq_printf(m, "%3luns ", diff.tv_nsec);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
+		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
+		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
+	else if (diff.tv_sec <= 1)
+		seq_puts(m, "   1s ");
+	else if (diff.tv_sec < 60)
+		seq_printf(m, "%4lus ", diff.tv_sec);
+	else if (diff.tv_sec < 60 * 60)
+		seq_printf(m, "%4lum ", diff.tv_sec / 60);
+	else if (diff.tv_sec < 60 * 60 * 24)
+		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
+	else
+		seq_puts(m, "exces ");
+}
+
+/*
+ * Describe a slow work item for debugfs
+ */
+static int slow_work_runqueue_show(struct seq_file *m, void *v)
+{
+	struct slow_work *work;
+	struct list_head *p = v;
+	unsigned long id;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
+		return 0;
+	case 2:
+		seq_puts(m, "=== ===== ================ == ===== ==========\n");
+		return 0;
+
+	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
+		id = (unsigned long) v - 3;
+
+		read_lock(&slow_work_execs_lock);
+		work = slow_work_execs[id];
+		if (work) {
+			smp_read_barrier_depends();
+
+			seq_printf(m, "%3lu %5d %16p %2lx ",
+				   id, slow_work_pids[id], work, work->flags);
+			slow_work_print_mark(m, work);
+
+			if (work->ops->desc)
+				work->ops->desc(work, m);
+			seq_putc(m, '\n');
+		}
+		read_unlock(&slow_work_execs_lock);
+		return 0;
+
+	default:
+		work = list_entry(p, struct slow_work, link);
+		seq_printf(m, "%3s     - %16p %2lx ",
+			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
+			   work, work->flags);
+		slow_work_print_mark(m, work);
+
+		if (work->ops->desc)
+			work->ops->desc(work, m);
+		seq_putc(m, '\n');
+		return 0;
+	}
+}
+
+/*
+ * map the iterator to a work item
+ */
+static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
+{
+	struct list_head *p;
+	unsigned long count, id;
+
+	switch (*_pos >> ITERATOR_SHIFT) {
+	case 0x0:
+		if (*_pos == 0)
+			*_pos = 1;
+		if (*_pos < 3)
+			return (void *)(unsigned long) *_pos;
+		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
+			for (id = *_pos - 3;
+			     id < SLOW_WORK_THREAD_LIMIT;
+			     id++, (*_pos)++)
+				if (slow_work_execs[id])
+					return (void *)(unsigned long) *_pos;
+		*_pos = 0x1UL << ITERATOR_SHIFT;
+
+	case 0x1:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &slow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+
+	case 0x2:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &vslow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
+{
+	spin_lock_irq(&slow_work_queue_lock);
+	return slow_work_runqueue_index(m, _pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	struct list_head *p = v;
+	unsigned long selector = *_pos >> ITERATOR_SHIFT;
+
+	(*_pos)++;
+	switch (selector) {
+	case 0x0:
+		return slow_work_runqueue_index(m, _pos);
+
+	case 0x1:
+		if (*_pos >> ITERATOR_SHIFT == 0x1) {
+			p = p->next;
+			if (p != &slow_work_queue)
+				return p;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+		p = &vslow_work_queue;
+
+	case 0x2:
+		if (*_pos >> ITERATOR_SHIFT == 0x2) {
+			p = p->next;
+			if (p != &vslow_work_queue)
+				return p;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * clean up after reading
+ */
+static void slow_work_runqueue_stop(struct seq_file *m, void *v)
+{
+	spin_unlock_irq(&slow_work_queue_lock);
+}
+
+static const struct seq_operations slow_work_runqueue_ops = {
+	.start		= slow_work_runqueue_start,
+	.stop		= slow_work_runqueue_stop,
+	.next		= slow_work_runqueue_next,
+	.show		= slow_work_runqueue_show,
+};
+
+/*
+ * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
+ */
+static int slow_work_runqueue_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slow_work_runqueue_ops);
+}
+
+const struct file_operations slow_work_runqueue_fops = {
+	.owner		= THIS_MODULE,
+	.open		= slow_work_runqueue_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/kernel/slow-work-proc.c b/kernel/slow-work-proc.c
deleted file mode 100644
index 3988032571f5..000000000000
--- a/kernel/slow-work-proc.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Slow work debugging
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include "slow-work.h"
-
-#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
-#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
-#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
-
-void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
-{
-	seq_puts(m, "Slow-work: New thread");
-}
-
-/*
- * Render the time mark field on a work item into a 5-char time with units plus
- * a space
- */
-static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
-{
-	struct timespec now, diff;
-
-	now = CURRENT_TIME;
-	diff = timespec_sub(now, work->mark);
-
-	if (diff.tv_sec < 0)
-		seq_puts(m, "  -ve ");
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
-		seq_printf(m, "%3luns ", diff.tv_nsec);
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
-		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
-		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
-	else if (diff.tv_sec <= 1)
-		seq_puts(m, "   1s ");
-	else if (diff.tv_sec < 60)
-		seq_printf(m, "%4lus ", diff.tv_sec);
-	else if (diff.tv_sec < 60 * 60)
-		seq_printf(m, "%4lum ", diff.tv_sec / 60);
-	else if (diff.tv_sec < 60 * 60 * 24)
-		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
-	else
-		seq_puts(m, "exces ");
-}
-
-/*
- * Describe a slow work item for /proc
- */
-static int slow_work_runqueue_show(struct seq_file *m, void *v)
-{
-	struct slow_work *work;
-	struct list_head *p = v;
-	unsigned long id;
-
-	switch ((unsigned long) v) {
-	case 1:
-		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
-		return 0;
-	case 2:
-		seq_puts(m, "=== ===== ================ == ===== ==========\n");
-		return 0;
-
-	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
-		id = (unsigned long) v - 3;
-
-		read_lock(&slow_work_execs_lock);
-		work = slow_work_execs[id];
-		if (work) {
-			smp_read_barrier_depends();
-
-			seq_printf(m, "%3lu %5d %16p %2lx ",
-				   id, slow_work_pids[id], work, work->flags);
-			slow_work_print_mark(m, work);
-
-			if (work->ops->desc)
-				work->ops->desc(work, m);
-			seq_putc(m, '\n');
-		}
-		read_unlock(&slow_work_execs_lock);
-		return 0;
-
-	default:
-		work = list_entry(p, struct slow_work, link);
-		seq_printf(m, "%3s     - %16p %2lx ",
-			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
-			   work, work->flags);
-		slow_work_print_mark(m, work);
-
-		if (work->ops->desc)
-			work->ops->desc(work, m);
-		seq_putc(m, '\n');
-		return 0;
-	}
-}
-
-/*
- * map the iterator to a work item
- */
-static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
-{
-	struct list_head *p;
-	unsigned long count, id;
-
-	switch (*_pos >> ITERATOR_SHIFT) {
-	case 0x0:
-		if (*_pos == 0)
-			*_pos = 1;
-		if (*_pos < 3)
-			return (void *)(unsigned long) *_pos;
-		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
-			for (id = *_pos - 3;
-			     id < SLOW_WORK_THREAD_LIMIT;
-			     id++, (*_pos)++)
-				if (slow_work_execs[id])
-					return (void *)(unsigned long) *_pos;
-		*_pos = 0x1UL << ITERATOR_SHIFT;
-
-	case 0x1:
-		count = *_pos & ITERATOR_COUNTER;
-		list_for_each(p, &slow_work_queue) {
-			if (count == 0)
-				return p;
-			count--;
-		}
-		*_pos = 0x2UL << ITERATOR_SHIFT;
-
-	case 0x2:
-		count = *_pos & ITERATOR_COUNTER;
-		list_for_each(p, &vslow_work_queue) {
-			if (count == 0)
-				return p;
-			count--;
-		}
-		*_pos = 0x3UL << ITERATOR_SHIFT;
-
-	default:
-		return NULL;
-	}
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
-{
-	spin_lock_irq(&slow_work_queue_lock);
-	return slow_work_runqueue_index(m, _pos);
-}
-
-/*
- * move to the next line
- */
-static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
-{
-	struct list_head *p = v;
-	unsigned long selector = *_pos >> ITERATOR_SHIFT;
-
-	(*_pos)++;
-	switch (selector) {
-	case 0x0:
-		return slow_work_runqueue_index(m, _pos);
-
-	case 0x1:
-		if (*_pos >> ITERATOR_SHIFT == 0x1) {
-			p = p->next;
-			if (p != &slow_work_queue)
-				return p;
-		}
-		*_pos = 0x2UL << ITERATOR_SHIFT;
-		p = &vslow_work_queue;
-
-	case 0x2:
-		if (*_pos >> ITERATOR_SHIFT == 0x2) {
-			p = p->next;
-			if (p != &vslow_work_queue)
-				return p;
-		}
-		*_pos = 0x3UL << ITERATOR_SHIFT;
-
-	default:
-		return NULL;
-	}
-}
-
-/*
- * clean up after reading
- */
-static void slow_work_runqueue_stop(struct seq_file *m, void *v)
-{
-	spin_unlock_irq(&slow_work_queue_lock);
-}
-
-static const struct seq_operations slow_work_runqueue_ops = {
-	.start		= slow_work_runqueue_start,
-	.stop		= slow_work_runqueue_stop,
-	.next		= slow_work_runqueue_next,
-	.show		= slow_work_runqueue_show,
-};
-
-/*
- * open "/proc/slow_work_rq" to list queue contents
- */
-static int slow_work_runqueue_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &slow_work_runqueue_ops);
-}
-
-const struct file_operations slow_work_runqueue_fops = {
-	.owner		= THIS_MODULE,
-	.open		= slow_work_runqueue_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b5c17f15f9de..00889bd3c590 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,7 +16,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/wait.h>
-#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include "slow-work.h"
 
 static void slow_work_cull_timeout(unsigned long);
@@ -138,7 +138,7 @@ static void slow_work_clear_thread_processing(int id) {}
 /*
  * Data for tracking currently executing items for indication through /proc
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
 pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
 DEFINE_RWLOCK(slow_work_execs_lock);
@@ -823,7 +823,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 static const struct slow_work_ops slow_work_new_thread_ops = {
 	.owner		= THIS_MODULE,
 	.execute	= slow_work_new_thread_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= slow_work_new_thread_desc,
 #endif
 };
@@ -1055,9 +1055,15 @@ static int __init init_slow_work(void)
 	if (slow_work_max_max_threads < nr_cpus * 2)
 		slow_work_max_max_threads = nr_cpus * 2;
 #endif
-#ifdef CONFIG_SLOW_WORK_PROC
-	proc_create("slow_work_rq", S_IFREG | 0400, NULL,
-		    &slow_work_runqueue_fops);
+#ifdef CONFIG_SLOW_WORK_DEBUG
+	{
+		struct dentry *dbdir;
+
+		dbdir = debugfs_create_dir("slow_work", NULL);
+		if (dbdir && !IS_ERR(dbdir))
+			debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
+					    NULL, &slow_work_runqueue_fops);
+	}
 #endif
 	return 0;
 }
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 3c2f007f3ad6..321f3c59d732 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -19,7 +19,7 @@
 /*
  * slow-work.c
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 extern struct slow_work *slow_work_execs[];
 extern pid_t slow_work_pids[];
 extern rwlock_t slow_work_execs_lock;
@@ -30,9 +30,9 @@ extern struct list_head vslow_work_queue;
 extern spinlock_t slow_work_queue_lock;
 
 /*
- * slow-work-proc.c
+ * slow-work-debugfs.c
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 extern const struct file_operations slow_work_runqueue_fops;
 
 extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
-- 
cgit v1.2.3-71-gd317


From bf56a4ea9f1683c5b223fd3a5dbea23f1fa91c34 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:20 +0800
Subject: trace_syscalls: Remove unused event_syscall_enter and
 event_syscall_exit

fix event_enter_##sname->event
fix event_exit_##sname->event

remove unused event_syscall_enter and event_syscall_exit

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D278.4090209@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      | 4 ++--
 include/trace/syscall.h       | 2 --
 kernel/trace/trace_syscalls.c | 8 --------
 3 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b50974a93af0..2f7c539ab96d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -178,7 +178,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	  event_enter_##sname = {					\
 		.name                   = "sys_enter"#sname,		\
 		.system                 = "syscalls",			\
-		.event                  = &event_syscall_enter,		\
+		.event                  = &enter_syscall_print_##sname,	\
 		.raw_init		= init_enter_##sname,		\
 		.show_format		= syscall_enter_format,		\
 		.define_fields		= syscall_enter_define_fields,	\
@@ -214,7 +214,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	  event_exit_##sname = {					\
 		.name                   = "sys_exit"#sname,		\
 		.system                 = "syscalls",			\
-		.event                  = &event_syscall_exit,		\
+		.event                  = &exit_syscall_print_##sname,	\
 		.raw_init		= init_exit_##sname,		\
 		.show_format		= syscall_exit_format,		\
 		.define_fields		= syscall_exit_define_fields,	\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 51ee17d3632a..5f8827c92db7 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -37,8 +37,6 @@ extern unsigned long arch_syscall_addr(int nr);
 extern int syscall_name_to_nr(char *name);
 void set_syscall_enter_id(int num, int id);
 void set_syscall_exit_id(int num, int id);
-extern struct trace_event event_syscall_enter;
-extern struct trace_event event_syscall_exit;
 
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 63aa8070365d..00d6e176f5b6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -444,14 +444,6 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
-struct trace_event event_syscall_enter = {
-	.trace			= print_syscall_enter,
-};
-
-struct trace_event event_syscall_exit = {
-	.trace			= print_syscall_exit,
-};
-
 int __init init_ftrace_syscalls(void)
 {
 	struct syscall_metadata *meta;
-- 
cgit v1.2.3-71-gd317


From 31c16b13349970b2684248c7d8608d2a96ae135d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:30 +0800
Subject: trace_syscalls: Set event_enter_##sname->data to its metadata

Set event_enter_##sname->data to its metadata,
it makes codes simpler.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D282.7050709@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      |  6 ++++--
 include/trace/syscall.h       |  2 +-
 kernel/trace/trace_syscalls.c | 36 +++++++++++-------------------------
 3 files changed, 16 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2f7c539ab96d..d3c9fd01a110 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -153,6 +153,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 #define __SC_STR_TDECL6(t, a, ...)	#t, __SC_STR_TDECL5(__VA_ARGS__)
 
 #define SYSCALL_TRACE_ENTER_EVENT(sname)				\
+	static const struct syscall_metadata __syscall_meta_##sname;	\
 	static struct ftrace_event_call event_enter_##sname;		\
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
@@ -184,11 +185,12 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		.define_fields		= syscall_enter_define_fields,	\
 		.regfunc		= reg_event_syscall_enter,	\
 		.unregfunc		= unreg_event_syscall_enter,	\
-		.data			= "sys"#sname,			\
+		.data			= (void *)&__syscall_meta_##sname,\
 		TRACE_SYS_ENTER_PROFILE_INIT(sname)			\
 	}
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
+	static const struct syscall_metadata __syscall_meta_##sname;	\
 	static struct ftrace_event_call event_exit_##sname;		\
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
@@ -220,7 +222,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		.define_fields		= syscall_exit_define_fields,	\
 		.regfunc		= reg_event_syscall_exit,	\
 		.unregfunc		= unreg_event_syscall_exit,	\
-		.data			= "sys"#sname,			\
+		.data			= (void *)&__syscall_meta_##sname,\
 		TRACE_SYS_EXIT_PROFILE_INIT(sname)			\
 	}
 
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 5f8827c92db7..c5265c81c4e7 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -34,7 +34,7 @@ struct syscall_metadata {
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
-extern int syscall_name_to_nr(char *name);
+extern int syscall_name_to_nr(const char *name);
 void set_syscall_enter_id(int num, int id);
 void set_syscall_exit_id(int num, int id);
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 00d6e176f5b6..39649b1675dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -51,7 +51,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
 	return syscalls_metadata[nr];
 }
 
-int syscall_name_to_nr(char *name)
+int syscall_name_to_nr(const char *name)
 {
 	int i;
 
@@ -172,18 +172,11 @@ extern char *__bad_type_size(void);
 int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
 {
 	int i;
-	int nr;
 	int ret;
-	struct syscall_metadata *entry;
+	struct syscall_metadata *entry = call->data;
 	struct syscall_trace_enter trace;
 	int offset = offsetof(struct syscall_trace_enter, args);
 
-	nr = syscall_name_to_nr(call->data);
-	entry = syscall_nr_to_meta(nr);
-
-	if (!entry)
-		return 0;
-
 	ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
 			       "\tsigned:%u;\n",
 			       SYSCALL_FIELD(int, nr));
@@ -245,18 +238,11 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
 int syscall_enter_define_fields(struct ftrace_event_call *call)
 {
 	struct syscall_trace_enter trace;
-	struct syscall_metadata *meta;
+	struct syscall_metadata *meta = call->data;
 	int ret;
-	int nr;
 	int i;
 	int offset = offsetof(typeof(trace), args);
 
-	nr = syscall_name_to_nr(call->data);
-	meta = syscall_nr_to_meta(nr);
-
-	if (!meta)
-		return 0;
-
 	ret = trace_define_common_fields(call);
 	if (ret)
 		return ret;
@@ -366,9 +352,9 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
-	char *name;
+	const char *name;
 
-	name = (char *)call->data;
+	name = ((struct syscall_metadata *)call->data)->name;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= NR_syscalls)
 		return -ENOSYS;
@@ -389,9 +375,9 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int num;
-	char *name;
+	const char *name;
 
-	name = (char *)call->data;
+	name = ((struct syscall_metadata *)call->data)->name;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= NR_syscalls)
 		return;
@@ -407,9 +393,9 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
-	char *name;
+	const char *name;
 
-	name = call->data;
+	name = ((struct syscall_metadata *)call->data)->name;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= NR_syscalls)
 		return -ENOSYS;
@@ -430,9 +416,9 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int num;
-	char *name;
+	const char *name;
 
-	name = call->data;
+	name = ((struct syscall_metadata *)call->data)->name;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= NR_syscalls)
 		return;
-- 
cgit v1.2.3-71-gd317


From fcc19438dda38dacc8c144e2db3ebc6b9fd4f8b8 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:36 +0800
Subject: trace_syscalls: Remove enter_id exit_id

use ->enter_event->id instead of ->enter_id
use ->exit_event->id instead of ->exit_id

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D288.7030001@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      |  2 --
 include/trace/syscall.h       |  6 ------
 kernel/trace/trace_syscalls.c | 30 ++++++++++--------------------
 3 files changed, 10 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d3c9fd01a110..b9af87560adb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -168,7 +168,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		if (!id)						\
 			return -ENODEV;					\
 		event_enter_##sname.id = id;				\
-		set_syscall_enter_id(num, id);				\
 		INIT_LIST_HEAD(&event_enter_##sname.fields);		\
 		return 0;						\
 	}								\
@@ -205,7 +204,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		if (!id)						\
 			return -ENODEV;					\
 		event_exit_##sname.id = id;				\
-		set_syscall_exit_id(num, id);				\
 		INIT_LIST_HEAD(&event_exit_##sname.fields);		\
 		return 0;						\
 	}								\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index c5265c81c4e7..ca09561cd578 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -15,8 +15,6 @@
  * @nb_args: number of parameters it takes
  * @types: list of types as strings
  * @args: list of args as strings (args[i] matches types[i])
- * @enter_id: associated ftrace enter event id
- * @exit_id: associated ftrace exit event id
  * @enter_event: associated syscall_enter trace event
  * @exit_event: associated syscall_exit trace event
  */
@@ -25,8 +23,6 @@ struct syscall_metadata {
 	int		nb_args;
 	const char	**types;
 	const char	**args;
-	int		enter_id;
-	int		exit_id;
 
 	struct ftrace_event_call *enter_event;
 	struct ftrace_event_call *exit_event;
@@ -35,8 +31,6 @@ struct syscall_metadata {
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
 extern int syscall_name_to_nr(const char *name);
-void set_syscall_enter_id(int num, int id);
-void set_syscall_exit_id(int num, int id);
 
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 39649b1675dd..27eb18d69222 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -67,16 +67,6 @@ int syscall_name_to_nr(const char *name)
 	return -1;
 }
 
-void set_syscall_enter_id(int num, int id)
-{
-	syscalls_metadata[num]->enter_id = id;
-}
-
-void set_syscall_exit_id(int num, int id)
-{
-	syscalls_metadata[num]->exit_id = id;
-}
-
 enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags)
 {
@@ -93,7 +83,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
 	if (!entry)
 		goto end;
 
-	if (entry->enter_id != ent->type) {
+	if (entry->enter_event->id != ent->type) {
 		WARN_ON_ONCE(1);
 		goto end;
 	}
@@ -148,7 +138,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
 		return TRACE_TYPE_HANDLED;
 	}
 
-	if (entry->exit_id != ent->type) {
+	if (entry->exit_event->id != ent->type) {
 		WARN_ON_ONCE(1);
 		return TRACE_TYPE_UNHANDLED;
 	}
@@ -302,8 +292,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
 
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
-	event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
-						  size, 0, 0);
+	event = trace_current_buffer_lock_reserve(&buffer,
+			sys_data->enter_event->id, size, 0, 0);
 	if (!event)
 		return;
 
@@ -334,8 +324,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
-				sizeof(*entry), 0, 0);
+	event = trace_current_buffer_lock_reserve(&buffer,
+			sys_data->exit_event->id, sizeof(*entry), 0, 0);
 	if (!event)
 		return;
 
@@ -510,11 +500,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 
 	rec = (struct syscall_trace_enter *) raw_data;
 	tracing_generic_entry_update(&rec->ent, 0, 0);
-	rec->ent.type = sys_data->enter_id;
+	rec->ent.type = sys_data->enter_event->id;
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
+	perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
 
 end:
 	perf_swevent_put_recursion_context(rctx);
@@ -615,11 +605,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	rec = (struct syscall_trace_exit *)raw_data;
 
 	tracing_generic_entry_update(&rec->ent, 0, 0);
-	rec->ent.type = sys_data->exit_id;
+	rec->ent.type = sys_data->exit_event->id;
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
+	perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
 
 end:
 	perf_swevent_put_recursion_context(rctx);
-- 
cgit v1.2.3-71-gd317


From c252f65793874b56d50395ab604db465ce688665 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:47 +0800
Subject: trace_syscalls: Add syscall_nr field to struct syscall_metadata

Add syscall_nr field to struct syscall_metadata,
it helps us to get syscall number easier.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D293.6090800@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      |  4 ++--
 include/trace/syscall.h       |  3 ++-
 kernel/trace/trace_syscalls.c | 22 +++++++++-------------
 3 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b9af87560adb..3c280d7ecb76 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -161,7 +161,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	static int init_enter_##sname(struct ftrace_event_call *call)	\
 	{								\
 		int num, id;						\
-		num = syscall_name_to_nr("sys"#sname);			\
+		num = __syscall_meta_##sname.syscall_nr;		\
 		if (num < 0)						\
 			return -ENOSYS;					\
 		id = register_ftrace_event(&enter_syscall_print_##sname);\
@@ -197,7 +197,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	static int init_exit_##sname(struct ftrace_event_call *call)	\
 	{								\
 		int num, id;						\
-		num = syscall_name_to_nr("sys"#sname);			\
+		num = __syscall_meta_##sname.syscall_nr;		\
 		if (num < 0)						\
 			return -ENOSYS;					\
 		id = register_ftrace_event(&exit_syscall_print_##sname);\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index ca09561cd578..1531eef3071f 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -12,6 +12,7 @@
  * A syscall entry in the ftrace syscalls array.
  *
  * @name: name of the syscall
+ * @syscall_nr: number of the syscall
  * @nb_args: number of parameters it takes
  * @types: list of types as strings
  * @args: list of args as strings (args[i] matches types[i])
@@ -20,6 +21,7 @@
  */
 struct syscall_metadata {
 	const char	*name;
+	int		syscall_nr;
 	int		nb_args;
 	const char	**types;
 	const char	**args;
@@ -30,7 +32,6 @@ struct syscall_metadata {
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
-extern int syscall_name_to_nr(const char *name);
 
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 27eb18d69222..144cc14d8551 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -51,7 +51,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
 	return syscalls_metadata[nr];
 }
 
-int syscall_name_to_nr(const char *name)
+static int syscall_name_to_nr(const char *name)
 {
 	int i;
 
@@ -342,10 +342,8 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
-	const char *name;
 
-	name = ((struct syscall_metadata *)call->data)->name;
-	num = syscall_name_to_nr(name);
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (num < 0 || num >= NR_syscalls)
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
@@ -365,10 +363,8 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int num;
-	const char *name;
 
-	name = ((struct syscall_metadata *)call->data)->name;
-	num = syscall_name_to_nr(name);
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (num < 0 || num >= NR_syscalls)
 		return;
 	mutex_lock(&syscall_trace_lock);
@@ -383,10 +379,8 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
-	const char *name;
 
-	name = ((struct syscall_metadata *)call->data)->name;
-	num = syscall_name_to_nr(name);
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (num < 0 || num >= NR_syscalls)
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
@@ -406,10 +400,8 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int num;
-	const char *name;
 
-	name = ((struct syscall_metadata *)call->data)->name;
-	num = syscall_name_to_nr(name);
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (num < 0 || num >= NR_syscalls)
 		return;
 	mutex_lock(&syscall_trace_lock);
@@ -436,6 +428,10 @@ int __init init_ftrace_syscalls(void)
 	for (i = 0; i < NR_syscalls; i++) {
 		addr = arch_syscall_addr(i);
 		meta = find_syscall_meta(addr);
+		if (!meta)
+			continue;
+
+		meta->syscall_nr = i;
 		syscalls_metadata[i] = meta;
 	}
 
-- 
cgit v1.2.3-71-gd317


From a1301da0997bf73c44dbe584e9070a13adc89672 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:55 +0800
Subject: trace_syscalls: Remove duplicate init_enter_##sname()

use only one init_syscall_trace instead of
many init_enter_##sname()/init_exit_##sname()

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D29B.6090708@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      | 30 ++----------------------------
 include/trace/syscall.h       |  1 +
 kernel/trace/trace_syscalls.c | 12 ++++++++++++
 3 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3c280d7ecb76..cf0d923ea40e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -158,19 +158,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
 	};								\
-	static int init_enter_##sname(struct ftrace_event_call *call)	\
-	{								\
-		int num, id;						\
-		num = __syscall_meta_##sname.syscall_nr;		\
-		if (num < 0)						\
-			return -ENOSYS;					\
-		id = register_ftrace_event(&enter_syscall_print_##sname);\
-		if (!id)						\
-			return -ENODEV;					\
-		event_enter_##sname.id = id;				\
-		INIT_LIST_HEAD(&event_enter_##sname.fields);		\
-		return 0;						\
-	}								\
 	TRACE_SYS_ENTER_PROFILE(sname);					\
 	static struct ftrace_event_call __used				\
 	  __attribute__((__aligned__(4)))				\
@@ -179,7 +166,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		.name                   = "sys_enter"#sname,		\
 		.system                 = "syscalls",			\
 		.event                  = &enter_syscall_print_##sname,	\
-		.raw_init		= init_enter_##sname,		\
+		.raw_init		= init_syscall_trace,		\
 		.show_format		= syscall_enter_format,		\
 		.define_fields		= syscall_enter_define_fields,	\
 		.regfunc		= reg_event_syscall_enter,	\
@@ -194,19 +181,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
 	};								\
-	static int init_exit_##sname(struct ftrace_event_call *call)	\
-	{								\
-		int num, id;						\
-		num = __syscall_meta_##sname.syscall_nr;		\
-		if (num < 0)						\
-			return -ENOSYS;					\
-		id = register_ftrace_event(&exit_syscall_print_##sname);\
-		if (!id)						\
-			return -ENODEV;					\
-		event_exit_##sname.id = id;				\
-		INIT_LIST_HEAD(&event_exit_##sname.fields);		\
-		return 0;						\
-	}								\
 	TRACE_SYS_EXIT_PROFILE(sname);					\
 	static struct ftrace_event_call __used				\
 	  __attribute__((__aligned__(4)))				\
@@ -215,7 +189,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		.name                   = "sys_exit"#sname,		\
 		.system                 = "syscalls",			\
 		.event                  = &exit_syscall_print_##sname,	\
-		.raw_init		= init_exit_##sname,		\
+		.raw_init		= init_syscall_trace,		\
 		.show_format		= syscall_exit_format,		\
 		.define_fields		= syscall_exit_define_fields,	\
 		.regfunc		= reg_event_syscall_exit,	\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 1531eef3071f..dff9371e5274 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -32,6 +32,7 @@ struct syscall_metadata {
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
+extern int init_syscall_trace(struct ftrace_event_call *call);
 
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 144cc14d8551..c6514093c95a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -412,6 +412,18 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
+int init_syscall_trace(struct ftrace_event_call *call)
+{
+	int id;
+
+	id = register_ftrace_event(call->event);
+	if (!id)
+		return -ENODEV;
+	call->id = id;
+	INIT_LIST_HEAD(&call->fields);
+	return 0;
+}
+
 int __init init_ftrace_syscalls(void)
 {
 	struct syscall_metadata *meta;
-- 
cgit v1.2.3-71-gd317


From 3bbe84e9d385205d638035ee9dcc4db1b486ea08 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:24:01 +0800
Subject: trace_syscalls: Simplify syscall profile

use only one prof_sysenter_enable() instead of
prof_sysenter_enable_##sname()

use only one prof_sysenter_disable() instead of
prof_sysenter_disable_##sname()

use only one prof_sysexit_enable() instead of
prof_sysexit_enable_##sname()

use only one prof_sysexit_disable() instead of
prof_sysexit_disable_##sname()

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D2A1.8060304@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      | 31 ++++---------------------------
 include/trace/syscall.h       |  8 ++++----
 kernel/trace/trace_syscalls.c | 24 ++++++++----------------
 3 files changed, 16 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cf0d923ea40e..c2df3a593236 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -99,37 +99,16 @@ struct perf_event_attr;
 #define __SC_TEST6(t6, a6, ...)	__SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
 
 #ifdef CONFIG_EVENT_PROFILE
-#define TRACE_SYS_ENTER_PROFILE(sname)					       \
-static int prof_sysenter_enable_##sname(struct ftrace_event_call *unused)      \
-{									       \
-	return reg_prof_syscall_enter("sys"#sname);			       \
-}									       \
-									       \
-static void prof_sysenter_disable_##sname(struct ftrace_event_call *unused)    \
-{									       \
-	unreg_prof_syscall_enter("sys"#sname);				       \
-}
-
-#define TRACE_SYS_EXIT_PROFILE(sname)					       \
-static int prof_sysexit_enable_##sname(struct ftrace_event_call *unused)       \
-{									       \
-	return reg_prof_syscall_exit("sys"#sname);			       \
-}									       \
-									       \
-static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
-{                                                                              \
-	unreg_prof_syscall_exit("sys"#sname);				       \
-}
 
 #define TRACE_SYS_ENTER_PROFILE_INIT(sname)				       \
 	.profile_count = ATOMIC_INIT(-1),				       \
-	.profile_enable = prof_sysenter_enable_##sname,			       \
-	.profile_disable = prof_sysenter_disable_##sname,
+	.profile_enable = prof_sysenter_enable,				       \
+	.profile_disable = prof_sysenter_disable,
 
 #define TRACE_SYS_EXIT_PROFILE_INIT(sname)				       \
 	.profile_count = ATOMIC_INIT(-1),				       \
-	.profile_enable = prof_sysexit_enable_##sname,			       \
-	.profile_disable = prof_sysexit_disable_##sname,
+	.profile_enable = prof_sysexit_enable,				       \
+	.profile_disable = prof_sysexit_disable,
 #else
 #define TRACE_SYS_ENTER_PROFILE(sname)
 #define TRACE_SYS_ENTER_PROFILE_INIT(sname)
@@ -158,7 +137,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
 	};								\
-	TRACE_SYS_ENTER_PROFILE(sname);					\
 	static struct ftrace_event_call __used				\
 	  __attribute__((__aligned__(4)))				\
 	  __attribute__((section("_ftrace_events")))			\
@@ -181,7 +159,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
 	};								\
-	TRACE_SYS_EXIT_PROFILE(sname);					\
 	static struct ftrace_event_call __used				\
 	  __attribute__((__aligned__(4)))				\
 	  __attribute__((section("_ftrace_events")))			\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index dff9371e5274..961fda3556bb 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -50,10 +50,10 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
 #endif
 #ifdef CONFIG_EVENT_PROFILE
-int reg_prof_syscall_enter(char *name);
-void unreg_prof_syscall_enter(char *name);
-int reg_prof_syscall_exit(char *name);
-void unreg_prof_syscall_exit(char *name);
+int prof_sysenter_enable(struct ftrace_event_call *call);
+void prof_sysenter_disable(struct ftrace_event_call *call);
+int prof_sysexit_enable(struct ftrace_event_call *call);
+void prof_sysexit_disable(struct ftrace_event_call *call);
 
 #endif
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6514093c95a..1e85b6cc26aa 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -520,14 +520,12 @@ end_recursion:
 	local_irq_restore(flags);
 }
 
-int reg_prof_syscall_enter(char *name)
+int prof_sysenter_enable(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
 
-	num = syscall_name_to_nr(name);
-	if (num < 0 || num >= NR_syscalls)
-		return -ENOSYS;
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_prof_refcount_enter)
@@ -543,13 +541,11 @@ int reg_prof_syscall_enter(char *name)
 	return ret;
 }
 
-void unreg_prof_syscall_enter(char *name)
+void prof_sysenter_disable(struct ftrace_event_call *call)
 {
 	int num;
 
-	num = syscall_name_to_nr(name);
-	if (num < 0 || num >= NR_syscalls)
-		return;
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
 	sys_prof_refcount_enter--;
@@ -625,14 +621,12 @@ end_recursion:
 	local_irq_restore(flags);
 }
 
-int reg_prof_syscall_exit(char *name)
+int prof_sysexit_enable(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
 
-	num = syscall_name_to_nr(name);
-	if (num < 0 || num >= NR_syscalls)
-		return -ENOSYS;
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_prof_refcount_exit)
@@ -648,13 +642,11 @@ int reg_prof_syscall_exit(char *name)
 	return ret;
 }
 
-void unreg_prof_syscall_exit(char *name)
+void prof_sysexit_disable(struct ftrace_event_call *call)
 {
 	int num;
 
-	num = syscall_name_to_nr(name);
-	if (num < 0 || num >= NR_syscalls)
-		return;
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
 	sys_prof_refcount_exit--;
-- 
cgit v1.2.3-71-gd317


From 7be077f56370cd52c48c08272b0867132f87bc48 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:24:06 +0800
Subject: trace_syscalls: Remove unused syscall_name_to_nr()

After duplications are removed, syscall_name_to_nr() is unused.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D2A6.6060803@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 1e85b6cc26aa..57501d90096a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -51,22 +51,6 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
 	return syscalls_metadata[nr];
 }
 
-static int syscall_name_to_nr(const char *name)
-{
-	int i;
-
-	if (!syscalls_metadata)
-		return -1;
-
-	for (i = 0; i < NR_syscalls; i++) {
-		if (syscalls_metadata[i]) {
-			if (!strcmp(syscalls_metadata[i]->name, name))
-				return i;
-		}
-	}
-	return -1;
-}
-
 enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags)
 {
-- 
cgit v1.2.3-71-gd317


From ec70ccd806111ba3caf596def91a8580138b12db Mon Sep 17 00:00:00 2001
From: Kristian Høgsberg <krh@bitplanet.net>
Date: Tue, 1 Dec 2009 15:05:01 -0500
Subject: perf: Don't free perf_mmap_data until work has been done
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the CONFIG_PERF_USE_VMALLOC case, perf_mmap_data_free() only
schedules the cleanup of the perf_mmap_data struct.  In that
case we have to wait until the work has been done before we free
data.

Signed-off-by: Kristian Høgsberg <krh@bitplanet.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org>
LKML-Reference: <1259697901-1747-1-git-send-email-krh@bitplanet.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 040ee517c808..6b7ddba1dd64 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2210,6 +2210,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
 	perf_mmap_free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
 		perf_mmap_free_page((unsigned long)data->data_pages[i]);
+	kfree(data);
 }
 
 #else
@@ -2250,6 +2251,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
 		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
 
 	vfree(base);
+	kfree(data);
 }
 
 static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2355,7 +2357,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
 
 	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
 	perf_mmap_data_free(data);
-	kfree(data);
 }
 
 static void perf_mmap_data_release(struct perf_event *event)
-- 
cgit v1.2.3-71-gd317


From 8592e6486a177a02f048567cb928bc3a1f9a86c3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 2 Dec 2009 12:56:46 +0900
Subject: sched: Revert 498657a478c60be092208422fefa9c7b248729c2

498657a478c60be092208422fefa9c7b248729c2 incorrectly assumed
that preempt wasn't disabled around context_switch() and thus
was fixing imaginary problem.  It also broke KVM because it
depended on ->sched_in() to be called with irq enabled so that
it can do smp calls from there.

Revert the incorrect commit and add comment describing different
contexts under with the two callbacks are invoked.

Avi: spotted transposed in/out in the added comment.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Avi Kivity <avi@redhat.com>
Cc: peterz@infradead.org
Cc: efault@gmx.de
Cc: rusty@rustcorp.com.au
LKML-Reference: <1259726212-30259-2-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/preempt.h | 5 +++++
 kernel/sched.c          | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 72b1a10a59b6..2e681d9555bd 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -105,6 +105,11 @@ struct preempt_notifier;
  * @sched_out: we've just been preempted
  *    notifier: struct preempt_notifier for the task being preempted
  *    next: the task that's kicking us out
+ *
+ * Please note that sched_in and out are called under different
+ * contexts.  sched_out is called with rq lock held and irq disabled
+ * while sched_in is called without rq lock and irq enabled.  This
+ * difference is intentional and depended upon by its users.
  */
 struct preempt_ops {
 	void (*sched_in)(struct preempt_notifier *notifier, int cpu);
diff --git a/kernel/sched.c b/kernel/sched.c
index b3d4e2be95aa..1031cae39c4c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2768,9 +2768,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	perf_event_task_sched_in(current, cpu_of(rq));
-	fire_sched_in_preempt_notifiers(current);
 	finish_lock_switch(rq, prev);
 
+	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
-- 
cgit v1.2.3-71-gd317


From 1786bf009f18f722afbb62143c8541e7e60a4e92 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 30 Nov 2009 12:54:51 +0200
Subject: core: Clean up user return notifers use of per_cpu

Instead of using per_cpu(..., raw_smp_processor_id()), use
__get_cpu_var(...).

Signed-off-by: Avi Kivity <avi@redhat.com>
LKML-Reference: <1259578491-4589-1-git-send-email-avi@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/user-return-notifier.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 03e2d6fd9b18..eb27fd3430a2 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -6,8 +6,6 @@
 
 static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
 
-#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
-
 /*
  * Request a notification when the current cpu returns to userspace.  Must be
  * called in atomic context.  The notifier will also be called in atomic
@@ -16,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
 void user_return_notifier_register(struct user_return_notifier *urn)
 {
 	set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
-	hlist_add_head(&urn->link, &URN_LIST_HEAD);
+	hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
 }
 EXPORT_SYMBOL_GPL(user_return_notifier_register);
 
@@ -27,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
 void user_return_notifier_unregister(struct user_return_notifier *urn)
 {
 	hlist_del(&urn->link);
-	if (hlist_empty(&URN_LIST_HEAD))
+	if (hlist_empty(&__get_cpu_var(return_notifier_list)))
 		clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
 }
 EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
-- 
cgit v1.2.3-71-gd317


From bdddd2963c0264c56f18043f6fa829d3c1d3d1c0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 2 Dec 2009 14:09:16 +1030
Subject: sched: Fix isolcpus boot option

Anton Blanchard wrote:

> We allocate and zero cpu_isolated_map after the isolcpus
> __setup option has run. This means cpu_isolated_map always
> ends up empty and if CPUMASK_OFFSTACK is enabled we write to a
> cpumask that hasn't been allocated.

I introduced this regression in 49557e620339cb13 (sched: Fix
boot crash by zalloc()ing most of the cpu masks).

Use the bootmem allocator if they set isolcpus=, otherwise
allocate and zero like normal.

Reported-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: peterz@infradead.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
LKML-Reference: <200912021409.17013.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Anton Blanchard <anton@samba.org>
---
 kernel/sched.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1031cae39c4c..4883fee99314 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8061,6 +8061,7 @@ static cpumask_var_t cpu_isolated_map;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 	cpulist_parse(str, cpu_isolated_map);
 	return 1;
 }
@@ -9609,7 +9610,9 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
 	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+	/* May be allocated at isolcpus cmdline parse time */
+	if (cpu_isolated_map == NULL)
+		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	perf_event_init();
-- 
cgit v1.2.3-71-gd317


From d99ca3b977fc5a93141304f571475c2af9e6c1c5 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 2 Dec 2009 17:26:47 +0900
Subject: sched, cputime: Cleanups related to task_times()

- Remove if({u,s}t)s because no one call it with NULL now.
- Use cputime_{add,sub}().
- Add ifndef-endif for prev_{u,s}time since they are used
  only when !VIRT_CPU_ACCOUNTING.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B1624C7.7040302@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 ++
 kernel/fork.c         |  2 ++
 kernel/sched.c        | 16 ++++++----------
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0395b0f4df3a..dff85e58264e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1331,7 +1331,9 @@ struct task_struct {
 
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	cputime_t prev_utime, prev_stime;
+#endif
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time; 		/* monotonic time */
 	struct timespec real_start_time;	/* boot based time */
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c49257c..ad7cb6d1193c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1066,8 +1066,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->gtime = cputime_zero;
 	p->utimescaled = cputime_zero;
 	p->stimescaled = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	p->prev_utime = cputime_zero;
 	p->prev_stime = cputime_zero;
+#endif
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 4883fee99314..17e2c1db2bde 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5184,10 +5184,8 @@ void account_idle_ticks(unsigned long ticks)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	if (ut)
-		*ut = p->utime;
-	if (st)
-		*st = p->stime;
+	*ut = p->utime;
+	*st = p->stime;
 }
 #else
 
@@ -5197,7 +5195,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	cputime_t rtime, utime = p->utime, total = utime + p->stime;
+	cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
 
 	/*
 	 * Use CFS's precise accounting:
@@ -5217,12 +5215,10 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	 * Compare with previous values, to keep monotonicity:
 	 */
 	p->prev_utime = max(p->prev_utime, utime);
-	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+	p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
 
-	if (ut)
-		*ut = p->prev_utime;
-	if (st)
-		*st = p->prev_stime;
+	*ut = p->prev_utime;
+	*st = p->prev_stime;
 }
 #endif
 
-- 
cgit v1.2.3-71-gd317


From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 2 Dec 2009 17:28:07 +0900
Subject: sched, cputime: Introduce thread_group_times()

This is a real fix for problem of utime/stime values decreasing
described in the thread:

   http://lkml.org/lkml/2009/11/3/522

Now cputime is accounted in the following way:

 - {u,s}time in task_struct are increased every time when the thread
   is interrupted by a tick (timer interrupt).

 - When a thread exits, its {u,s}time are added to signal->{u,s}time,
   after adjusted by task_times().

 - When all threads in a thread_group exits, accumulated {u,s}time
   (and also c{u,s}time) in signal struct are added to c{u,s}time
   in signal struct of the group's parent.

So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.

And accounted values are used by:

 - task_times(), to get cputime of a thread:
   This function returns adjusted values that originates from raw
   {u,s}time and scaled by sum_exec_runtime that accounted by CFS.

 - thread_group_cputime(), to get cputime of a thread group:
   This function returns sum of all {u,s}time of living threads in
   the group, plus {u,s}time in the signal struct that is sum of
   adjusted cputimes of all exited threads belonged to the group.

The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:

  group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)

This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).

To fix this, we could do:

  group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)

But task_times() contains hard divisions, so applying it for
every thread should be avoided.

This patch fixes the above problem in the following way:

 - Modify thread's exit (= __exit_signal()) not to use task_times().
   It means {u,s}time in signal struct accumulates raw values instead
   of adjusted values.  As the result it makes thread_group_cputime()
   to return pure sum of "raw" values.

 - Introduce a new function thread_group_times(*task, *utime, *stime)
   that converts "raw" values of thread_group_cputime() to "adjusted"
   values, in same calculation procedure as task_times().

 - Modify group's exit (= wait_task_zombie()) to use this introduced
   thread_group_times().  It make c{u,s}time in signal struct to
   have adjusted values like before this patch.

 - Replace some thread_group_cputime() by thread_group_times().
   This replacements are only applied where conveys the "adjusted"
   cputime to users, and where already uses task_times() near by it.
   (i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)

This patch have a positive side effect:

 - Before this patch, if a group contains many short-life threads
   (e.g. runs 0.9ms and not interrupted by ticks), the group's
   cputime could be invisible since thread's cputime was accumulated
   after adjusted: imagine adjustment function as adj(ticks, runtime),
     {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
   After this patch it will not happen because the adjustment is
   applied after accumulated.

v2:
 - remove if()s, put new variables into signal_struct.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c       |  5 +----
 include/linux/sched.h |  4 ++++
 kernel/exit.c         | 23 ++++++++++++-----------
 kernel/fork.c         |  3 +++
 kernel/sched.c        | 41 +++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c          | 18 ++++++++----------
 6 files changed, 69 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ca61a88aed66..2571da43c736 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -506,7 +506,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
@@ -517,9 +516,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_cputime(task, &cputime);
-			utime = cputime.utime;
-			stime = cputime.stime;
+			thread_group_times(task, &utime, &stime);
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dff85e58264e..34238bd10ebf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -624,6 +624,9 @@ struct signal_struct {
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	cputime_t prev_utime, prev_stime;
+#endif
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -1723,6 +1726,7 @@ static inline void put_task_struct(struct task_struct *t)
 }
 
 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index 2eaf68b634e3..b221ad65fd20 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,8 +91,6 @@ static void __exit_signal(struct task_struct *tsk)
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
 	else {
-		cputime_t utime, stime;
-
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
@@ -112,9 +110,8 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		task_times(tsk, &utime, &stime);
-		sig->utime = cputime_add(sig->utime, utime);
-		sig->stime = cputime_add(sig->stime, stime);
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
 		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -1208,6 +1205,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 		unsigned long maxrss;
+		cputime_t tgutime, tgstime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1223,20 +1221,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
+		 *
+		 * We use thread_group_times() to get times for the thread
+		 * group, which consolidates times for all threads in the
+		 * group including the group leader.
 		 */
+		thread_group_times(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(p->utime,
-			cputime_add(sig->utime,
-				    sig->cutime)));
+			cputime_add(tgutime,
+				    sig->cutime));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(p->stime,
-			cputime_add(sig->stime,
-				    sig->cstime)));
+			cputime_add(tgstime,
+				    sig->cstime));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index ad7cb6d1193c..3d6f121bbe8a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	sig->prev_utime = sig->prev_stime = cputime_zero;
+#endif
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 17e2c1db2bde..e6ba726941ae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5187,6 +5187,16 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->utime;
 	*st = p->stime;
 }
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
 #else
 
 #ifndef nsecs_to_cputime
@@ -5220,6 +5230,37 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->prev_utime;
 	*st = p->prev_stime;
 }
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct signal_struct *sig = p->signal;
+	struct task_cputime cputime;
+	cputime_t rtime, utime, total;
+
+	thread_group_cputime(p, &cputime);
+
+	total = cputime_add(cputime.utime, cputime.stime);
+	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * cputime.utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	sig->prev_utime = max(sig->prev_utime, utime);
+	sig->prev_stime = max(sig->prev_stime,
+			      cputime_sub(rtime, sig->prev_utime));
+
+	*ut = sig->prev_utime;
+	*st = sig->prev_stime;
+}
 #endif
 
 /*
diff --git a/kernel/sys.c b/kernel/sys.c
index bbdfce0d4347..9968c5fb55b9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -911,16 +911,15 @@ change_okay:
 
 void do_sys_times(struct tms *tms)
 {
-	struct task_cputime cputime;
-	cputime_t cutime, cstime;
+	cputime_t tgutime, tgstime, cutime, cstime;
 
-	thread_group_cputime(current, &cputime);
 	spin_lock_irq(&current->sighand->siglock);
+	thread_group_times(current, &tgutime, &tgstime);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
 	spin_unlock_irq(&current->sighand->siglock);
-	tms->tms_utime = cputime_to_clock_t(cputime.utime);
-	tms->tms_stime = cputime_to_clock_t(cputime.stime);
+	tms->tms_utime = cputime_to_clock_t(tgutime);
+	tms->tms_stime = cputime_to_clock_t(tgstime);
 	tms->tms_cutime = cputime_to_clock_t(cutime);
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
-	cputime_t utime, stime;
-	struct task_cputime cputime;
+	cputime_t tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
 
 	memset((char *) r, 0, sizeof *r);
@@ -1372,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 				break;
 
 		case RUSAGE_SELF:
-			thread_group_cputime(p, &cputime);
-			utime = cputime_add(utime, cputime.utime);
-			stime = cputime_add(stime, cputime.stime);
+			thread_group_times(p, &tgutime, &tgstime);
+			utime = cputime_add(utime, tgutime);
+			stime = cputime_add(stime, tgstime);
 			r->ru_nvcsw += p->signal->nvcsw;
 			r->ru_nivcsw += p->signal->nivcsw;
 			r->ru_minflt += p->signal->min_flt;
-- 
cgit v1.2.3-71-gd317


From 35dead4235e2b67da7275b4122fed37099c2f462 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 3 Dec 2009 00:29:15 +0100
Subject: modules: don't export section names of empty sections via sysfs

On the parisc architecture we face for each and every loaded kernel module
this kernel "badness warning":
  sysfs: cannot create duplicate filename '/module/ac97_bus/sections/.text'
  Badness at fs/sysfs/dir.c:487

Reason for that is, that on parisc all kernel modules do have multiple
.text sections due to the usage of the -ffunction-sections compiler flag
which is needed to reach all jump targets on this platform.

An objdump on such a kernel module gives:
Sections:
Idx Name          Size      VMA       LMA       File off  Algn
  0 .note.gnu.build-id 00000024  00000000  00000000  00000034  2**2
                  CONTENTS, ALLOC, LOAD, READONLY, DATA
  1 .text         00000000  00000000  00000000  00000058  2**0
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  2 .text.ac97_bus_match 0000001c  00000000  00000000  00000058  2**2
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  3 .text         00000000  00000000  00000000  000000d4  2**0
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
...
Since the .text sections are empty (size of 0 bytes) and won't be
loaded by the kernel module loader anyway, I don't see a reason
why such sections need to be listed under
/sys/module/<module_name>/sections/<section_name> either.

The attached patch does solve this issue by not exporting section
names which are empty.

This fixes bugzilla http://bugzilla.kernel.org/show_bug.cgi?id=14703

Signed-off-by: Helge Deller <deller@gmx.de>
CC: rusty@rustcorp.com.au
CC: akpm@linux-foundation.org
CC: James.Bottomley@HansenPartnership.com
CC: roland@redhat.com
CC: dave@hiauly1.hia.nrc.ca
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 8b7d8805819d..5842a71cf052 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1187,7 +1187,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 
 	/* Count loaded sections and allocate structures */
 	for (i = 0; i < nsect; i++)
-		if (sechdrs[i].sh_flags & SHF_ALLOC)
+		if (sechdrs[i].sh_flags & SHF_ALLOC
+		    && sechdrs[i].sh_size)
 			nloaded++;
 	size[0] = ALIGN(sizeof(*sect_attrs)
 			+ nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1207,6 +1208,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 	for (i = 0; i < nsect; i++) {
 		if (! (sechdrs[i].sh_flags & SHF_ALLOC))
 			continue;
+		if (!sechdrs[i].sh_size)
+			continue;
 		sattr->address = sechdrs[i].sh_addr;
 		sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
 					GFP_KERNEL);
-- 
cgit v1.2.3-71-gd317


From d3f6bad3911736e44ba11f3f3f6ac4e8c837fdfc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 2 Dec 2009 12:10:13 -0800
Subject: rcu: Rename "quiet" functions

The number of "quiet" functions has grown recently, and the
names are no longer very descriptive.  The point of all of these
functions is to do some portion of the task of reporting a
quiescent state, so rename them accordingly:

o	cpu_quiet() becomes rcu_report_qs_rdp(), which reports a
	quiescent state to the per-CPU rcu_data structure.  If this
	turns out to be a new quiescent state for this grace period,
	then rcu_report_qs_rnp() will be invoked to propagate the
	quiescent state up the rcu_node hierarchy.

o	cpu_quiet_msk() becomes rcu_report_qs_rnp(), which reports
	a quiescent state for a given CPU (or possibly a set of CPUs)
	up the rcu_node hierarchy.

o	cpu_quiet_msk_finish() becomes rcu_report_qs_rsp(), which
	reports a full set of quiescent states to the global rcu_state
	structure.

o	task_quiet() becomes rcu_report_unblock_qs_rnp(), which reports
	a quiescent state due to a task exiting an RCU read-side critical
	section that had previously blocked in that same critical section.
	As indicated by the new name, this type of quiescent state is
	reported up the rcu_node hierarchy (using rcu_report_qs_rnp()
	to do so).

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12597846163698-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 66 +++++++++++++++++++++++++++----------------------
 kernel/rcutree.h        |  3 ++-
 kernel/rcutree_plugin.h | 12 ++++-----
 3 files changed, 45 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4ca7e0292fd8..a9f51031d3e8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -740,11 +740,13 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 }
 
 /*
- * Clean up after the prior grace period and let rcu_start_gp() start up
- * the next grace period if one is needed.  Note that the caller must
- * hold rnp->lock, as required by rcu_start_gp(), which will release it.
+ * Report a full set of quiescent states to the specified rcu_state
+ * data structure.  This involves cleaning up after the prior grace
+ * period and letting rcu_start_gp() start up the next grace period
+ * if one is needed.  Note that the caller must hold rnp->lock, as
+ * required by rcu_start_gp(), which will release it.
  */
-static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
+static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
@@ -754,15 +756,16 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
 }
 
 /*
- * Similar to cpu_quiet(), for which it is a helper function.  Allows
- * a group of CPUs to be quieted at one go, though all the CPUs in the
- * group must be represented by the same leaf rcu_node structure.
- * That structure's lock must be held upon entry, and it is released
- * before return.
+ * Similar to rcu_report_qs_rdp(), for which it is a helper function.
+ * Allows quiescent states for a group of CPUs to be reported at one go
+ * to the specified rcu_node structure, though all the CPUs in the group
+ * must be represented by the same rcu_node structure (which need not be
+ * a leaf rcu_node structure, though it often will be).  That structure's
+ * lock must be held upon entry, and it is released before return.
  */
 static void
-cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
-	      unsigned long flags)
+rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
+		  struct rcu_node *rnp, unsigned long flags)
 	__releases(rnp->lock)
 {
 	struct rcu_node *rnp_c;
@@ -798,21 +801,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
 
 	/*
 	 * Get here if we are the last CPU to pass through a quiescent
-	 * state for this grace period.  Invoke cpu_quiet_msk_finish()
+	 * state for this grace period.  Invoke rcu_report_qs_rsp()
 	 * to clean up and start the next grace period if one is needed.
 	 */
-	cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
+	rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
 }
 
 /*
- * Record a quiescent state for the specified CPU, which must either be
- * the current CPU.  The lastcomp argument is used to make sure we are
- * still in the grace period of interest.  We don't want to end the current
- * grace period based on quiescent states detected in an earlier grace
- * period!
+ * Record a quiescent state for the specified CPU to that CPU's rcu_data
+ * structure.  This must be either called from the specified CPU, or
+ * called when the specified CPU is known to be offline (and when it is
+ * also known that no other CPU is concurrently trying to help the offline
+ * CPU).  The lastcomp argument is used to make sure we are still in the
+ * grace period of interest.  We don't want to end the current grace period
+ * based on quiescent states detected in an earlier grace period!
  */
 static void
-cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
 {
 	unsigned long flags;
 	unsigned long mask;
@@ -827,8 +832,8 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
 		 * The race with GP start is resolved by the fact that we
 		 * hold the leaf rcu_node lock, so that the per-CPU bits
 		 * cannot yet be initialized -- so we would simply find our
-		 * CPU's bit already cleared in cpu_quiet_msk() if this race
-		 * occurred.
+		 * CPU's bit already cleared in rcu_report_qs_rnp() if this
+		 * race occurred.
 		 */
 		rdp->passed_quiesc = 0;	/* try again later! */
 		spin_unlock_irqrestore(&rnp->lock, flags);
@@ -846,7 +851,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
 		 */
 		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 
-		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+		rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
 	}
 }
 
@@ -877,8 +882,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 	if (!rdp->passed_quiesc)
 		return;
 
-	/* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
-	cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+	/*
+	 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
+	 * judge of that).
+	 */
+	rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -968,13 +976,13 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 	/*
 	 * We still hold the leaf rcu_node structure lock here, and
 	 * irqs are still disabled.  The reason for this subterfuge is
-	 * because invoking task_quiet() with ->onofflock held leads
-	 * to deadlock.
+	 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
+	 * held leads to deadlock.
 	 */
 	spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
 	rnp = rdp->mynode;
 	if (need_quiet)
-		task_quiet(rnp, flags);
+		rcu_report_unblock_qs_rnp(rnp, flags);
 	else
 		spin_unlock_irqrestore(&rnp->lock, flags);
 
@@ -1164,8 +1172,8 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
 		}
 		if (mask != 0 && rnp->completed == lastcomp) {
 
-			/* cpu_quiet_msk() releases rnp->lock. */
-			cpu_quiet_msk(mask, rsp, rnp, flags);
+			/* rcu_report_qs_rnp() releases rnp->lock. */
+			rcu_report_qs_rnp(mask, rsp, rnp, flags);
 			continue;
 		}
 		spin_unlock_irqrestore(&rnp->lock, flags);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index a81188c42929..8bb03cb07447 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -306,7 +306,8 @@ long rcu_batches_completed(void);
 static void rcu_preempt_note_context_switch(int cpu);
 static int rcu_preempted_readers(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void task_quiet(struct rcu_node *rnp, unsigned long flags);
+static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
+				      unsigned long flags);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 static void rcu_print_task_stall(struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 1d295c789d3d..c9f0c975c003 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -167,7 +167,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
  * irqs disabled, and this lock is released upon return, but irqs remain
  * disabled.
  */
-static void task_quiet(struct rcu_node *rnp, unsigned long flags)
+static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 	__releases(rnp->lock)
 {
 	unsigned long mask;
@@ -185,7 +185,7 @@ static void task_quiet(struct rcu_node *rnp, unsigned long flags)
 		 * or tasks were kicked up to root rcu_node due to
 		 * CPUs going offline.
 		 */
-		cpu_quiet_msk_finish(&rcu_preempt_state, flags);
+		rcu_report_qs_rsp(&rcu_preempt_state, flags);
 		return;
 	}
 
@@ -193,7 +193,7 @@ static void task_quiet(struct rcu_node *rnp, unsigned long flags)
 	mask = rnp->grpmask;
 	spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 	spin_lock(&rnp_p->lock);	/* irqs already disabled. */
-	cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
+	rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
 }
 
 /*
@@ -253,12 +253,12 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		/*
 		 * If this was the last task on the current list, and if
 		 * we aren't waiting on any CPUs, report the quiescent state.
-		 * Note that task_quiet() releases rnp->lock.
+		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
 		 */
 		if (empty)
 			spin_unlock_irqrestore(&rnp->lock, flags);
 		else
-			task_quiet(rnp, flags);
+			rcu_report_unblock_qs_rnp(rnp, flags);
 	} else {
 		local_irq_restore(flags);
 	}
@@ -566,7 +566,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 #ifdef CONFIG_HOTPLUG_CPU
 
 /* Because preemptible RCU does not exist, no quieting of tasks. */
-static void task_quiet(struct rcu_node *rnp, unsigned long flags)
+static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 {
 	spin_unlock_irqrestore(&rnp->lock, flags);
 }
-- 
cgit v1.2.3-71-gd317


From cf244dc01bf68e1ad338b82447f8686d24ea4435 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 2 Dec 2009 12:10:14 -0800
Subject: rcu: Enable fourth level of TREE_RCU hierarchy

Enable a fourth level of rcu_node hierarchy for TREE_RCU and
TREE_PREEMPT_RCU.  This is for stress-testing and experiemental
purposes only, although in theory this would enable 16,777,216
CPUs on 64-bit systems, though only 1,048,576 CPUs on 32-bit
systems. Normal experimental use of this fourth level will
normally set CONFIG_RCU_FANOUT=2, requiring a 16-CPU system,
though the more adventurous (and more fortunate) experimenters
may wish to chose CONFIG_RCU_FANOUT=3 for 81-CPU systems or even
CONFIG_RCU_FANOUT=4 for 256-CPU systems.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12597846161257-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c |  6 +++++-
 kernel/rcutree.h | 15 +++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a9f51031d3e8..d47e03e5792a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,7 +60,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
 		NUM_RCU_LVL_1, \
 		NUM_RCU_LVL_2, \
-		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+		NUM_RCU_LVL_3, \
+		NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
 	}, \
 	.signaled = RCU_GP_IDLE, \
 	.gpnum = -300, \
@@ -1877,6 +1878,9 @@ void __init rcu_init(void)
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#if NUM_RCU_LVL_4 != 0
+	printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
+#endif /* #if NUM_RCU_LVL_4 != 0 */
 	RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
 	RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
 	__rcu_init_preempt();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 8bb03cb07447..df2e0b694744 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
  * In practice, this has not been tested, so there is probably some
  * bug somewhere.
  */
-#define MAX_RCU_LVLS 3
+#define MAX_RCU_LVLS 4
 #define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
 #define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
 #define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
+#define RCU_FANOUT_FOURTH     (RCU_FANOUT_CUBE * RCU_FANOUT)
 
 #if NR_CPUS <= RCU_FANOUT
 #  define NUM_RCU_LVLS	      1
@@ -45,23 +46,33 @@
 #  define NUM_RCU_LVL_1	      (NR_CPUS)
 #  define NUM_RCU_LVL_2	      0
 #  define NUM_RCU_LVL_3	      0
+#  define NUM_RCU_LVL_4	      0
 #elif NR_CPUS <= RCU_FANOUT_SQ
 #  define NUM_RCU_LVLS	      2
 #  define NUM_RCU_LVL_0	      1
 #  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
 #  define NUM_RCU_LVL_2	      (NR_CPUS)
 #  define NUM_RCU_LVL_3	      0
+#  define NUM_RCU_LVL_4	      0
 #elif NR_CPUS <= RCU_FANOUT_CUBE
 #  define NUM_RCU_LVLS	      3
 #  define NUM_RCU_LVL_0	      1
 #  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
 #  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
 #  define NUM_RCU_LVL_3	      NR_CPUS
+#  define NUM_RCU_LVL_4	      0
+#elif NR_CPUS <= RCU_FANOUT_FOURTH
+#  define NUM_RCU_LVLS	      4
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
+#  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_3	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
+#  define NUM_RCU_LVL_4	      NR_CPUS
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
 #endif /* #if (NR_CPUS) <= RCU_FANOUT */
 
-#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
 
 /*
-- 
cgit v1.2.3-71-gd317


From d9a3da0699b24a589b27a61e1a5b5bd30d9db669 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 2 Dec 2009 12:10:15 -0800
Subject: rcu: Add expedited grace-period support for preemptible RCU

Implement an synchronize_rcu_expedited() for preemptible RCU
that actually is expedited.  This uses
synchronize_sched_expedited() to force all threads currently
running in a preemptible-RCU read-side critical section onto the
appropriate ->blocked_tasks[] list, then takes a snapshot of all
of these lists and waits for them to drain.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1259784616158-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c     |  34 ++++++---
 kernel/rcutree.c        |  10 ++-
 kernel/rcutree.h        |  35 ++++++++-
 kernel/rcutree_plugin.h | 198 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/rcutree_trace.c  |  10 ++-
 5 files changed, 260 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 3dd0ca23e191..a621a67ef4e3 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p)
 		cur_ops->deferred_free(rp);
 }
 
+static int rcu_no_completed(void)
+{
+	return 0;
+}
+
 static void rcu_torture_deferred_free(struct rcu_torture *p)
 {
 	call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = {
 	.name		= "rcu_sync"
 };
 
+static struct rcu_torture_ops rcu_expedited_ops = {
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= rcu_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_torture_read_unlock,
+	.completed	= rcu_no_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= synchronize_rcu_expedited,
+	.cb_barrier	= NULL,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_expedited"
+};
+
 /*
  * Definitions for rcu_bh torture testing.
  */
@@ -581,11 +601,6 @@ static void sched_torture_read_unlock(int idx)
 	preempt_enable();
 }
 
-static int sched_torture_completed(void)
-{
-	return 0;
-}
-
 static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
 {
 	call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -602,7 +617,7 @@ static struct rcu_torture_ops sched_ops = {
 	.readlock	= sched_torture_read_lock,
 	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock	= sched_torture_read_unlock,
-	.completed	= sched_torture_completed,
+	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sched_torture_deferred_free,
 	.sync		= sched_torture_synchronize,
 	.cb_barrier	= rcu_barrier_sched,
@@ -617,7 +632,7 @@ static struct rcu_torture_ops sched_sync_ops = {
 	.readlock	= sched_torture_read_lock,
 	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock	= sched_torture_read_unlock,
-	.completed	= sched_torture_completed,
+	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= sched_torture_synchronize,
 	.cb_barrier	= NULL,
@@ -631,7 +646,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
 	.readlock	= sched_torture_read_lock,
 	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock	= sched_torture_read_unlock,
-	.completed	= sched_torture_completed,
+	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_sched_expedited,
 	.cb_barrier	= NULL,
@@ -1116,7 +1131,8 @@ rcu_torture_init(void)
 	int cpu;
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] =
-		{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+		{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
+		  &rcu_bh_ops, &rcu_bh_sync_ops,
 		  &srcu_ops, &srcu_expedited_ops,
 		  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
 
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d47e03e5792a..53ae9598f798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -948,7 +948,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
 	unsigned long mask;
-	int need_quiet = 0;
+	int need_report = 0;
 	struct rcu_data *rdp = rsp->rda[cpu];
 	struct rcu_node *rnp;
 
@@ -967,7 +967,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 			break;
 		}
 		if (rnp == rdp->mynode)
-			need_quiet = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+			need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
 		else
 			spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		mask = rnp->grpmask;
@@ -982,10 +982,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 	 */
 	spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
 	rnp = rdp->mynode;
-	if (need_quiet)
+	if (need_report & RCU_OFL_TASKS_NORM_GP)
 		rcu_report_unblock_qs_rnp(rnp, flags);
 	else
 		spin_unlock_irqrestore(&rnp->lock, flags);
+	if (need_report & RCU_OFL_TASKS_EXP_GP)
+		rcu_report_exp_rnp(rsp, rnp);
 
 	rcu_adopt_orphan_cbs(rsp);
 }
@@ -1843,6 +1845,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 			rnp->level = i;
 			INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
 			INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
+			INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
+			INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
 		}
 	}
 }
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index df2e0b694744..d2a0046f63b2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -104,8 +104,12 @@ struct rcu_node {
 				/*  an rcu_data structure, otherwise, each */
 				/*  bit corresponds to a child rcu_node */
 				/*  structure. */
+	unsigned long expmask;	/* Groups that have ->blocked_tasks[] */
+				/*  elements that need to drain to allow the */
+				/*  current expedited grace period to */
+				/*  complete (only for TREE_PREEMPT_RCU). */
 	unsigned long qsmaskinit;
-				/* Per-GP initialization for qsmask. */
+				/* Per-GP initial value for qsmask & expmask. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
 				/*  Only one bit will be set in this mask. */
 	int	grplo;		/* lowest-numbered CPU or group here. */
@@ -113,7 +117,7 @@ struct rcu_node {
 	u8	grpnum;		/* CPU/group number for next level up. */
 	u8	level;		/* root is at level 0. */
 	struct rcu_node *parent;
-	struct list_head blocked_tasks[2];
+	struct list_head blocked_tasks[4];
 				/* Tasks blocked in RCU read-side critsect. */
 				/*  Grace period number (->gpnum) x blocked */
 				/*  by tasks on the (x & 0x1) element of the */
@@ -128,6 +132,21 @@ struct rcu_node {
 	for ((rnp) = &(rsp)->node[0]; \
 	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
 
+/*
+ * Do a breadth-first scan of the non-leaf rcu_node structures for the
+ * specified rcu_state structure.  Note that if there is a singleton
+ * rcu_node tree with but one rcu_node structure, this loop is a no-op.
+ */
+#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
+	for ((rnp) = &(rsp)->node[0]; \
+	     (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
+
+/*
+ * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
+ * structure.  Note that if there is a singleton rcu_node tree with but
+ * one rcu_node structure, this loop -will- visit the rcu_node structure.
+ * It is still a leaf node, even if it is also the root node.
+ */
 #define rcu_for_each_leaf_node(rsp, rnp) \
 	for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
 	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
@@ -261,7 +280,7 @@ struct rcu_state {
 	long	gpnum;				/* Current gp number. */
 	long	completed;			/* # of last completed gp. */
 
-	/* End  of fields guarded by root rcu_node's lock. */
+	/* End of fields guarded by root rcu_node's lock. */
 
 	spinlock_t onofflock;			/* exclude on/offline and */
 						/*  starting new GP.  Also */
@@ -293,6 +312,13 @@ struct rcu_state {
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 };
 
+/* Return values for rcu_preempt_offline_tasks(). */
+
+#define RCU_OFL_TASKS_NORM_GP	0x1		/* Tasks blocking normal */
+						/*  GP were moved to root. */
+#define RCU_OFL_TASKS_EXP_GP	0x2		/* Tasks blocking expedited */
+						/*  GP were moved to root. */
+
 #ifdef RCU_TREE_NONCORE
 
 /*
@@ -333,6 +359,9 @@ static void rcu_preempt_offline_cpu(int cpu);
 static void rcu_preempt_check_callbacks(int cpu);
 static void rcu_preempt_process_callbacks(void);
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
+#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
 static int rcu_preempt_pending(int cpu);
 static int rcu_preempt_needs_cpu(int cpu);
 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c9f0c975c003..37fbccdf41d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,12 +24,15 @@
  *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  */
 
+#include <linux/delay.h>
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
 struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
 
+static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+
 /*
  * Tell them what RCU they are running.
  */
@@ -157,7 +160,10 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
  */
 static int rcu_preempted_readers(struct rcu_node *rnp)
 {
-	return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
+	int phase = rnp->gpnum & 0x1;
+
+	return !list_empty(&rnp->blocked_tasks[phase]) ||
+	       !list_empty(&rnp->blocked_tasks[phase + 2]);
 }
 
 /*
@@ -204,6 +210,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 static void rcu_read_unlock_special(struct task_struct *t)
 {
 	int empty;
+	int empty_exp;
 	unsigned long flags;
 	struct rcu_node *rnp;
 	int special;
@@ -247,6 +254,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			spin_unlock(&rnp->lock);  /* irqs remain disabled. */
 		}
 		empty = !rcu_preempted_readers(rnp);
+		empty_exp = !rcu_preempted_readers_exp(rnp);
+		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		list_del_init(&t->rcu_node_entry);
 		t->rcu_blocked_node = NULL;
 
@@ -259,6 +268,13 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			spin_unlock_irqrestore(&rnp->lock, flags);
 		else
 			rcu_report_unblock_qs_rnp(rnp, flags);
+
+		/*
+		 * If this was the last task on the expedited lists,
+		 * then we need to report up the rcu_node hierarchy.
+		 */
+		if (!empty_exp && !rcu_preempted_readers_exp(rnp))
+			rcu_report_exp_rnp(&rcu_preempt_state, rnp);
 	} else {
 		local_irq_restore(flags);
 	}
@@ -343,7 +359,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 	int i;
 	struct list_head *lp;
 	struct list_head *lp_root;
-	int retval;
+	int retval = 0;
 	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	struct task_struct *tp;
 
@@ -353,7 +369,9 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 	}
 	WARN_ON_ONCE(rnp != rdp->mynode &&
 		     (!list_empty(&rnp->blocked_tasks[0]) ||
-		      !list_empty(&rnp->blocked_tasks[1])));
+		      !list_empty(&rnp->blocked_tasks[1]) ||
+		      !list_empty(&rnp->blocked_tasks[2]) ||
+		      !list_empty(&rnp->blocked_tasks[3])));
 
 	/*
 	 * Move tasks up to root rcu_node.  Rely on the fact that the
@@ -361,8 +379,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 	 * rcu_nodes in terms of gp_num value.  This fact allows us to
 	 * move the blocked_tasks[] array directly, element by element.
 	 */
-	retval = rcu_preempted_readers(rnp);
-	for (i = 0; i < 2; i++) {
+	if (rcu_preempted_readers(rnp))
+		retval |= RCU_OFL_TASKS_NORM_GP;
+	if (rcu_preempted_readers_exp(rnp))
+		retval |= RCU_OFL_TASKS_EXP_GP;
+	for (i = 0; i < 4; i++) {
 		lp = &rnp->blocked_tasks[i];
 		lp_root = &rnp_root->blocked_tasks[i];
 		while (!list_empty(lp)) {
@@ -449,14 +470,159 @@ void synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(struct rcu_node *rnp)
+{
+	return !list_empty(&rnp->blocked_tasks[2]) ||
+	       !list_empty(&rnp->blocked_tasks[3]);
+}
+
+/*
+ * return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period.  Works only for preemptible
+ * RCU -- other RCU implementation use other means.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+	return !rcu_preempted_readers_exp(rnp) &&
+	       ACCESS_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.  This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree.  (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+	unsigned long flags;
+	unsigned long mask;
+
+	spin_lock_irqsave(&rnp->lock, flags);
+	for (;;) {
+		if (!sync_rcu_preempt_exp_done(rnp))
+			break;
+		if (rnp->parent == NULL) {
+			wake_up(&sync_rcu_preempt_exp_wq);
+			break;
+		}
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock); /* irqs remain disabled */
+		rnp = rnp->parent;
+		spin_lock(&rnp->lock); /* irqs already disabled */
+		rnp->expmask &= ~mask;
+	}
+	spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure.  If there are no such
+ * tasks, report it up the rcu_node hierarchy.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
+ */
+static void
+sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+	int must_wait;
+
+	spin_lock(&rnp->lock); /* irqs already disabled */
+	list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
+	list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
+	must_wait = rcu_preempted_readers_exp(rnp);
+	spin_unlock(&rnp->lock); /* irqs remain disabled */
+	if (!must_wait)
+		rcu_report_exp_rnp(rsp, rnp);
+}
+
 /*
- * Wait for an rcu-preempt grace period.  We are supposed to expedite the
- * grace period, but this is the crude slow compatability hack, so just
- * invoke synchronize_rcu().
+ * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
+ * is to invoke synchronize_sched_expedited() to push all the tasks to
+ * the ->blocked_tasks[] lists, move all entries from the first set of
+ * ->blocked_tasks[] lists to the second set, and finally wait for this
+ * second set to drain.
  */
 void synchronize_rcu_expedited(void)
 {
-	synchronize_rcu();
+	unsigned long flags;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp = &rcu_preempt_state;
+	long snap;
+	int trycount = 0;
+
+	smp_mb(); /* Caller's modifications seen first by other CPUs. */
+	snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+	smp_mb(); /* Above access cannot bleed into critical section. */
+
+	/*
+	 * Acquire lock, falling back to synchronize_rcu() if too many
+	 * lock-acquisition failures.  Of course, if someone does the
+	 * expedited grace period for us, just leave.
+	 */
+	while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
+		if (trycount++ < 10)
+			udelay(trycount * num_online_cpus());
+		else {
+			synchronize_rcu();
+			return;
+		}
+		if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
+			goto mb_ret; /* Others did our work for us. */
+	}
+	if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
+		goto unlock_mb_ret; /* Others did our work for us. */
+
+	/* force all RCU readers onto blocked_tasks[]. */
+	synchronize_sched_expedited();
+
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Initialize ->expmask for all non-leaf rcu_node structures. */
+	rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
+		spin_lock(&rnp->lock); /* irqs already disabled. */
+		rnp->expmask = rnp->qsmaskinit;
+		spin_unlock(&rnp->lock); /* irqs remain disabled. */
+	}
+
+	/* Snapshot current state of ->blocked_tasks[] lists. */
+	rcu_for_each_leaf_node(rsp, rnp)
+		sync_rcu_preempt_exp_init(rsp, rnp);
+	if (NUM_RCU_NODES > 1)
+		sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+
+	/* Wait for snapshotted ->blocked_tasks[] lists to drain. */
+	rnp = rcu_get_root(rsp);
+	wait_event(sync_rcu_preempt_exp_wq,
+		   sync_rcu_preempt_exp_done(rnp));
+
+	/* Clean up and exit. */
+	smp_mb(); /* ensure expedited GP seen before counter increment. */
+	ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
+unlock_mb_ret:
+	mutex_unlock(&sync_rcu_preempt_exp_mutex);
+mb_ret:
+	smp_mb(); /* ensure subsequent action seen after grace period. */
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
@@ -655,6 +821,20 @@ void synchronize_rcu_expedited(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Because preemptable RCU does not exist, there is never any need to
+ * report on tasks preempted in RCU read-side critical sections during
+ * expedited RCU grace periods.
+ */
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+	return;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
 /*
  * Because preemptable RCU does not exist, it never has any work to do.
  */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 1984cdc51e9a..9d2c88423b31 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -157,6 +157,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 {
 	long gpnum;
 	int level = 0;
+	int phase;
 	struct rcu_node *rnp;
 
 	gpnum = rsp->gpnum;
@@ -173,10 +174,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 			seq_puts(m, "\n");
 			level = rnp->level;
 		}
-		seq_printf(m, "%lx/%lx %c>%c %d:%d ^%d    ",
+		phase = gpnum & 0x1;
+		seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d    ",
 			   rnp->qsmask, rnp->qsmaskinit,
-			   "T."[list_empty(&rnp->blocked_tasks[gpnum & 1])],
-			   "T."[list_empty(&rnp->blocked_tasks[!(gpnum & 1)])],
+			   "T."[list_empty(&rnp->blocked_tasks[phase])],
+			   "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
+			   "T."[list_empty(&rnp->blocked_tasks[!phase])],
+			   "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
 			   rnp->grplo, rnp->grphi, rnp->grpnum);
 	}
 	seq_puts(m, "\n");
-- 
cgit v1.2.3-71-gd317


From c02260277e472095ffb3ad893be5eeab9dcefde3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 2 Dec 2009 20:49:16 +0100
Subject: mutex: Better control mutex adaptive spinning config

Introduce CONFIG_MUTEX_SPIN_ON_OWNER so that we can centralize
in a single place the conditions that determine its definition
and use.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1259783357-8542-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/Kconfig.locks | 3 +++
 kernel/mutex.c       | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index d1f86db71451..88c92fb44618 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -197,3 +197,6 @@ config INLINE_WRITE_UNLOCK_IRQ
 
 config INLINE_WRITE_UNLOCK_IRQRESTORE
 	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+config MUTEX_SPIN_ON_OWNER
+	def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f8..632f04c57d82 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	preempt_disable();
 	mutex_acquire(&lock->dep_map, subclass, 0, ip);
-#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
-    !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 	/*
 	 * Optimistic spinning.
 	 *
-- 
cgit v1.2.3-71-gd317


From c08f782985eed9959438368e84ce1d7f2ed03d95 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 2 Dec 2009 20:49:17 +0100
Subject: mutex: Fix missing conditions to build mutex_spin_on_owner()

We don't need to build mutex_spin_on_owner() if we have
CONFIG_DEBUG_MUTEXES or CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES as
it won't be used under such configs.

Use CONFIG_MUTEX_SPIN_ON_OWNER as it gathers all the necessary
checks before building it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1259783357-8542-2-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3c11ae0a948d..ec0af1fcb195 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5481,7 +5481,7 @@ need_resched_nonpreemptible:
 }
 EXPORT_SYMBOL(schedule);
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 /*
  * Look out! "owner" is an entirely speculative pointer
  * access and not reliable.
-- 
cgit v1.2.3-71-gd317


From 8153a10c08f1312af563bb92532002e46d3f504a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 3 Dec 2009 01:25:58 +0000
Subject: ipv4 05/05: add sysctl to accept packets with local source addresses

commit 8ec1e0ebe26087bfc5c0394ada5feb5758014fc8
Author: Patrick McHardy <kaber@trash.net>
Date:   Thu Dec 3 12:16:35 2009 +0100

    ipv4: add sysctl to accept packets with local source addresses

    Change fib_validate_source() to accept packets with a local source address when
    the "accept_local" sysctl is set for the incoming inet device. Combined with the
    previous patches, this allows to communicate between multiple local interfaces
    over the wire.

    Signed-off-by: Patrick McHardy <kaber@trash.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/inetdevice.h             |  1 +
 include/linux/sysctl.h                 |  1 +
 kernel/sysctl_check.c                  |  1 +
 net/ipv4/devinet.c                     |  1 +
 net/ipv4/fib_frontend.c                | 11 +++++++----
 6 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 989f5538b8dd..006b39dec87d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -731,6 +731,12 @@ accept_source_route - BOOLEAN
 	default TRUE (router)
 		FALSE (host)
 
+accept_local - BOOLEAN
+	Accept packets with local source addresses. In combination with
+	suitable routing, this can be used to direct packets between two
+	local interfaces over the wire and have them accepted properly.
+	default FALSE
+
 rp_filter - INTEGER
 	0 - No source validation.
 	1 - Strict mode as defined in RFC3704 Strict Reverse Path
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index eecfa559bfb4..699e85c01a4d 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -83,6 +83,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_RPFILTER(in_dev)		IN_DEV_MAXCONF((in_dev), RP_FILTER)
 #define IN_DEV_SOURCE_ROUTE(in_dev)	IN_DEV_ANDCONF((in_dev), \
 						       ACCEPT_SOURCE_ROUTE)
+#define IN_DEV_ACCEPT_LOCAL(in_dev)	IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL)
 #define IN_DEV_BOOTP_RELAY(in_dev)	IN_DEV_ANDCONF((in_dev), BOOTP_RELAY)
 
 #define IN_DEV_LOG_MARTIANS(in_dev)	IN_DEV_ORCONF((in_dev), LOG_MARTIANS)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1e4743ee6831..9f047d73a216 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -490,6 +490,7 @@ enum
 	NET_IPV4_CONF_PROMOTE_SECONDARIES=20,
 	NET_IPV4_CONF_ARP_ACCEPT=21,
 	NET_IPV4_CONF_ARP_NOTIFY=22,
+	NET_IPV4_CONF_ACCEPT_LOCAL=23,
 	__NET_IPV4_CONF_MAX
 };
 
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b6e7aaea4604..f1d676e4b368 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -220,6 +220,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
 	{ NET_IPV4_CONF_PROMOTE_SECONDARIES,	"promote_secondaries" },
 	{ NET_IPV4_CONF_ARP_ACCEPT,		"arp_accept" },
 	{ NET_IPV4_CONF_ARP_NOTIFY,		"arp_notify" },
+	{ NET_IPV4_CONF_ACCEPT_LOCAL,		"accept_local" },
 	{}
 };
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c100709d6ddf..e3126612fcbb 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1468,6 +1468,7 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
 		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
 					"accept_source_route"),
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
 		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
 		DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
 		DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3b373a8b0473..3323168ee52d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -241,16 +241,17 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			    .iif = oif };
 
 	struct fib_result res;
-	int no_addr, rpf;
+	int no_addr, rpf, accept_local;
 	int ret;
 	struct net *net;
 
-	no_addr = rpf = 0;
+	no_addr = rpf = accept_local = 0;
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		no_addr = in_dev->ifa_list == NULL;
 		rpf = IN_DEV_RPFILTER(in_dev);
+		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
 	}
 	rcu_read_unlock();
 
@@ -260,8 +261,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 	net = dev_net(dev);
 	if (fib_lookup(net, &fl, &res))
 		goto last_resort;
-	if (res.type != RTN_UNICAST)
-		goto e_inval_res;
+	if (res.type != RTN_UNICAST) {
+		if (res.type != RTN_LOCAL || !accept_local)
+			goto e_inval_res;
+	}
 	*spec_dst = FIB_RES_PREFSRC(res);
 	fib_combine_itag(itag, &res);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-- 
cgit v1.2.3-71-gd317


From fb3d38b9904888aa8e36d88b2388dc70934b2169 Mon Sep 17 00:00:00 2001
From: Liuweni <qingshenlwy@gmail.com>
Date: Mon, 2 Nov 2009 15:55:03 +0100
Subject: fix kerneldoc for set_irq_msi()

Signed-off-by: Liuweni <qingshenlwy@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/irq/chip.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..ae70e93b29b5 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -166,11 +166,11 @@ int set_irq_data(unsigned int irq, void *data)
 EXPORT_SYMBOL(set_irq_data);
 
 /**
- *	set_irq_data - set irq type data for an irq
+ *	set_irq_msi - set irq type data for an irq
  *	@irq:	Interrupt number
  *	@entry:	Pointer to MSI descriptor data
  *
- *	Set the hardware irq controller data for an irq
+ *	Set the MSI descriptor entry for an irq
  */
 int set_irq_msi(unsigned int irq, struct msi_desc *entry)
 {
-- 
cgit v1.2.3-71-gd317


From fbfecd3712f917ca210a55c157233d88b785896b Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Wed, 28 Oct 2009 20:11:04 +0100
Subject: tree-wide: fix typos "couter" -> "counter"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch was generated by

	git grep -E -i -l 'couter' | xargs -r perl -p -i -e 's/couter/counter/'

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/net/s2io.c             | 2 +-
 drivers/s390/block/dasd_proc.c | 2 +-
 kernel/irq/spurious.c          | 2 +-
 sound/synth/emux/soundfont.c   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
index ddccf5fa56b6..075eedc7e768 100644
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -3238,7 +3238,7 @@ static u64 s2io_mdio_read(u32 mmd_type, u64 addr, struct net_device *dev)
 
 /**
  *  s2io_chk_xpak_counter - Function to check the status of the xpak counters
- *  @counter      : couter value to be updated
+ *  @counter      : counter value to be updated
  *  @flag         : flag to indicate the status
  *  @type         : counter type
  *  Description:
diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c
index 654daa3cdfda..5f23eca82804 100644
--- a/drivers/s390/block/dasd_proc.c
+++ b/drivers/s390/block/dasd_proc.c
@@ -215,7 +215,7 @@ dasd_statistics_read(char *page, char **start, off_t off,
 	}
 
 	prof = &dasd_global_profile;
-	/* prevent couter 'overflow' on output */
+	/* prevent counter 'overflow' on output */
 	for (factor = 1; (prof->dasd_io_reqs / factor) > 9999999;
 	     factor *= 10);
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..abd4fb77f8ce 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -230,7 +230,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		/*
 		 * If we are seeing only the odd spurious IRQ caused by
 		 * bus asynchronicity then don't eventually trigger an error,
-		 * otherwise the couter becomes a doomsday timer for otherwise
+		 * otherwise the counter becomes a doomsday timer for otherwise
 		 * working systems
 		 */
 		if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/sound/synth/emux/soundfont.c b/sound/synth/emux/soundfont.c
index 63c8f45c0c22..67c91230c197 100644
--- a/sound/synth/emux/soundfont.c
+++ b/sound/synth/emux/soundfont.c
@@ -374,7 +374,7 @@ sf_zone_new(struct snd_sf_list *sflist, struct snd_soundfont *sf)
 
 
 /*
- * increment sample couter
+ * increment sample counter
  */
 static void
 set_sample_counter(struct snd_sf_list *sflist, struct snd_soundfont *sf,
-- 
cgit v1.2.3-71-gd317


From af901ca181d92aac3a7dc265144a9081a86d8f39 Mon Sep 17 00:00:00 2001
From: André Goddard Rosa <andre.goddard@gmail.com>
Date: Sat, 14 Nov 2009 13:09:05 -0200
Subject: tree-wide: fix assorted typos all over the place
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

That is "success", "unknown", "through", "performance", "[re|un]mapping"
, "access", "default", "reasonable", "[con]currently", "temperature"
, "channel", "[un]used", "application", "example","hierarchy", "therefore"
, "[over|under]flow", "contiguous", "threshold", "enough" and others.

Signed-off-by: André Goddard Rosa <andre.goddard@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/ABI/testing/procfs-diskstats         |  2 +-
 Documentation/ABI/testing/sysfs-block              |  2 +-
 Documentation/DocBook/mtdnand.tmpl                 |  2 +-
 Documentation/DocBook/v4l/videodev2.h.xml          |  2 +-
 Documentation/DocBook/writing-an-alsa-driver.tmpl  |  2 +-
 Documentation/dvb/README.dvb-usb                   |  2 +-
 Documentation/lguest/lguest.c                      |  2 +-
 Documentation/scsi/ChangeLog.megaraid_sas          |  2 +-
 Documentation/spi/spi-summary                      |  2 +-
 Documentation/sysctl/vm.txt                        |  2 +-
 Documentation/video4linux/gspca.txt                |  2 +-
 Documentation/vm/page-types.c                      |  2 +-
 arch/alpha/mm/numa.c                               |  2 +-
 arch/arm/common/scoop.c                            |  2 +-
 .../mach-bcmring/include/mach/csp/dmacHw_priv.h    |  2 +-
 arch/arm/mach-bcmring/include/mach/dma.h           |  2 +-
 arch/arm/mach-lh7a40x/include/mach/hardware.h      |  2 +-
 arch/arm/mach-orion5x/pci.c                        |  2 +-
 arch/arm/mach-pxa/include/mach/palmld.h            |  2 +-
 arch/arm/mach-pxa/include/mach/palmt5.h            |  2 +-
 arch/arm/mach-pxa/include/mach/palmtc.h            |  2 +-
 arch/arm/mach-pxa/include/mach/palmte2.h           |  2 +-
 arch/arm/mach-pxa/include/mach/palmtx.h            |  2 +-
 arch/arm/mach-pxa/include/mach/palmz72.h           |  2 +-
 arch/arm/mach-s3c6400/setup-sdhci.c                |  2 +-
 arch/arm/mach-s3c6410/setup-sdhci.c                |  2 +-
 arch/arm/mach-sa1100/dma.c                         |  2 +-
 arch/arm/plat-mxc/include/mach/iomux-mx3.h         |  2 +-
 arch/arm/plat-mxc/include/mach/iomux-mxc91231.h    |  2 +-
 arch/arm/plat-mxc/pwm.c                            |  2 +-
 arch/arm/plat-omap/dma.c                           |  2 +-
 arch/arm/plat-omap/include/mach/omap16xx.h         |  2 +-
 arch/arm/plat-s3c24xx/include/plat/map.h           |  2 +-
 arch/avr32/boards/hammerhead/Kconfig               |  2 +-
 arch/blackfin/kernel/traps.c                       |  2 +-
 .../mach-bf518/include/mach/defBF51x_base.h        |  4 +-
 .../mach-bf527/include/mach/defBF52x_base.h        |  4 +-
 arch/blackfin/mach-bf537/include/mach/defBF534.h   |  4 +-
 arch/blackfin/mach-bf548/include/mach/defBF544.h   |  4 +-
 arch/blackfin/mach-bf548/include/mach/defBF547.h   |  4 +-
 arch/blackfin/mach-bf548/include/mach/defBF548.h   |  4 +-
 arch/blackfin/mach-bf548/include/mach/defBF549.h   |  4 +-
 arch/cris/mm/fault.c                               |  2 +-
 arch/ia64/hp/common/sba_iommu.c                    |  2 +-
 arch/ia64/ia32/ia32_entry.S                        |  2 +-
 arch/ia64/include/asm/perfmon_default_smpl.h       |  2 +-
 arch/ia64/include/asm/sn/shubio.h                  |  2 +-
 arch/ia64/kernel/esi.c                             |  2 +-
 arch/ia64/kernel/perfmon.c                         |  2 +-
 arch/m68k/ifpsp060/src/fpsp.S                      | 28 +++++-----
 arch/m68k/ifpsp060/src/pfpsp.S                     | 26 ++++-----
 arch/m68k/include/asm/bootinfo.h                   |  2 +-
 arch/microblaze/lib/memcpy.c                       |  2 +-
 arch/microblaze/lib/memmove.c                      |  2 +-
 arch/microblaze/lib/memset.c                       |  2 +-
 arch/mips/include/asm/mach-pnx833x/gpio.h          |  2 +-
 arch/mips/include/asm/sgi/ioc.h                    |  2 +-
 arch/mips/include/asm/sibyte/sb1250_mac.h          |  2 +-
 arch/mips/include/asm/sn/sn0/hubio.h               |  2 +-
 arch/mips/kernel/smtc.c                            |  2 +-
 arch/mips/math-emu/dp_sub.c                        |  2 +-
 arch/mips/txx9/generic/smsc_fdc37m81x.c            |  2 +-
 arch/powerpc/include/asm/reg_fsl_emb.h             |  2 +-
 arch/powerpc/kernel/kgdb.c                         |  2 +-
 arch/powerpc/kernel/tau_6xx.c                      |  2 +-
 arch/powerpc/oprofile/op_model_cell.c              |  4 +-
 arch/powerpc/platforms/52xx/mpc52xx_pci.c          |  2 +-
 arch/powerpc/platforms/powermac/pci.c              |  2 +-
 arch/powerpc/sysdev/dart_iommu.c                   |  2 +-
 arch/s390/math-emu/math.c                          |  4 +-
 arch/x86/include/asm/desc_defs.h                   |  4 +-
 arch/x86/include/asm/mmzone_32.h                   |  2 +-
 arch/x86/include/asm/uv/uv_bau.h                   |  2 +-
 arch/x86/kernel/acpi/boot.c                        |  2 +-
 arch/x86/kernel/amd_iommu.c                        |  4 +-
 arch/x86/kernel/cpu/perf_event.c                   |  2 +-
 arch/x86/kernel/kprobes.c                          |  4 +-
 arch/x86/mm/kmmio.c                                |  4 +-
 block/blk-iopoll.c                                 |  2 +-
 drivers/ata/ata_piix.c                             |  2 +-
 drivers/ata/sata_fsl.c                             |  6 +--
 drivers/atm/iphase.c                               |  2 +-
 drivers/base/dd.c                                  |  2 +-
 drivers/bluetooth/btmrvl_sdio.c                    |  2 +-
 drivers/bluetooth/hci_ldisc.c                      |  2 +-
 drivers/char/mem.c                                 |  2 +-
 drivers/char/mspec.c                               |  2 +-
 drivers/char/n_r3964.c                             |  2 +-
 drivers/char/rio/route.h                           |  2 +-
 drivers/crypto/hifn_795x.c                         |  2 +-
 drivers/dma/at_hdmac.c                             |  2 +-
 drivers/firewire/core-topology.c                   |  2 +-
 drivers/gpu/drm/drm_crtc.c                         |  4 +-
 drivers/gpu/drm/i915/i915_gem.c                    |  2 +-
 drivers/gpu/drm/i915/intel_fb.c                    |  2 +-
 drivers/gpu/drm/i915/intel_sdvo.c                  |  2 +-
 drivers/gpu/drm/radeon/r600.c                      |  4 +-
 drivers/gpu/drm/radeon/radeon_fb.c                 |  2 +-
 drivers/gpu/drm/radeon/radeon_state.c              |  2 +-
 drivers/gpu/drm/radeon/radeon_ttm.c                |  2 +-
 drivers/gpu/drm/radeon/rv770.c                     |  4 +-
 drivers/gpu/drm/ttm/ttm_bo_util.c                  |  2 +-
 drivers/hwmon/adm1029.c                            |  2 +-
 drivers/hwmon/lm93.c                               |  2 +-
 drivers/ieee1394/dv1394.c                          |  2 +-
 drivers/infiniband/hw/ipath/ipath_iba6110.c        |  2 +-
 drivers/infiniband/hw/ipath/ipath_sd7220.c         |  4 +-
 drivers/infiniband/hw/mlx4/qp.c                    |  2 +-
 drivers/input/serio/hp_sdc.c                       |  2 +-
 drivers/input/serio/hp_sdc_mlc.c                   |  2 +-
 drivers/input/touchscreen/atmel-wm97xx.c           |  2 +-
 drivers/input/touchscreen/mainstone-wm97xx.c       |  4 +-
 drivers/input/touchscreen/zylonite-wm97xx.c        |  2 +-
 drivers/isdn/capi/capidrv.c                        |  2 +-
 drivers/isdn/hardware/eicon/di.c                   |  2 +-
 drivers/isdn/hardware/eicon/maintidi.c             |  4 +-
 drivers/isdn/hardware/mISDN/hfcsusb.c              |  2 +-
 drivers/isdn/hardware/mISDN/hfcsusb.h              |  2 +-
 drivers/isdn/hardware/mISDN/mISDNisar.c            |  2 +-
 drivers/isdn/hisax/hfc_usb.c                       |  2 +-
 drivers/isdn/i4l/isdn_ppp.c                        |  6 +--
 drivers/isdn/i4l/isdn_ttyfax.c                     |  2 +-
 drivers/isdn/mISDN/dsp_core.c                      |  2 +-
 drivers/isdn/mISDN/tei.c                           |  2 +-
 drivers/macintosh/therm_windtunnel.c               |  2 +-
 drivers/media/common/saa7146_i2c.c                 |  2 +-
 drivers/media/dvb/dvb-core/dvb_frontend.h          |  2 +-
 drivers/media/dvb/dvb-usb/anysee.c                 |  2 +-
 drivers/media/dvb/dvb-usb/dibusb-mb.c              |  2 +-
 drivers/media/dvb/dvb-usb/dvb-usb-remote.c         |  2 +-
 drivers/media/dvb/dvb-usb/usb-urb.c                |  4 +-
 drivers/media/dvb/frontends/au8522_decoder.c       |  2 +-
 drivers/media/dvb/frontends/cx24110.c              |  4 +-
 drivers/media/dvb/frontends/cx24113.c              |  2 +-
 drivers/media/dvb/frontends/dib3000mb.c            |  2 +-
 drivers/media/dvb/frontends/lgdt330x.c             |  4 +-
 drivers/media/dvb/frontends/stb0899_drv.c          |  2 +-
 drivers/media/dvb/ttpci/av7110.c                   |  4 +-
 drivers/media/dvb/ttpci/budget-patch.c             |  2 +-
 drivers/media/radio/radio-mr800.c                  |  2 +-
 drivers/media/video/cx231xx/cx231xx-avcore.c       |  8 +--
 drivers/media/video/cx23885/cx23885-dvb.c          |  2 +-
 drivers/media/video/cx88/cx88-core.c               |  2 +-
 drivers/media/video/davinci/dm355_ccdc.c           |  2 +-
 drivers/media/video/davinci/vpss.c                 |  2 +-
 drivers/media/video/gspca/sonixb.c                 |  2 +-
 drivers/media/video/gspca/spca500.c                |  2 +-
 drivers/media/video/gspca/spca501.c                |  6 +--
 drivers/media/video/gspca/sunplus.c                |  2 +-
 drivers/media/video/gspca/zc3xx.c                  |  2 +-
 drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h |  2 +-
 drivers/media/video/s2255drv.c                     |  2 +-
 drivers/media/video/zoran/zoran.h                  |  2 +-
 drivers/message/i2o/i2o_block.c                    |  2 +-
 drivers/message/i2o/iop.c                          |  4 +-
 drivers/misc/sgi-gru/grufile.c                     |  2 +-
 drivers/mmc/host/s3cmci.c                          |  2 +-
 drivers/mtd/devices/slram.c                        |  2 +-
 drivers/mtd/nand/diskonchip.c                      |  2 +-
 drivers/mtd/nand/nand_ecc.c                        |  2 +-
 drivers/mtd/nand/s3c2410.c                         |  2 +-
 drivers/net/82596.c                                |  2 +-
 drivers/net/amd8111e.c                             |  7 ++-
 drivers/net/appletalk/cops.c                       |  2 +-
 drivers/net/ariadne.h                              |  2 +-
 drivers/net/atl1c/atl1c_main.c                     |  2 +-
 drivers/net/benet/be_cmds.h                        |  2 +-
 drivers/net/benet/be_main.c                        |  2 +-
 drivers/net/bnx2x_reg.h                            |  2 +-
 drivers/net/cxgb3/sge.c                            |  2 +-
 drivers/net/ehea/ehea_ethtool.c                    |  4 +-
 drivers/net/hamradio/baycom_ser_fdx.c              |  2 +-
 drivers/net/iseries_veth.c                         |  2 +-
 drivers/net/lasi_82596.c                           |  2 +-
 drivers/net/lib82596.c                             |  2 +-
 drivers/net/mlx4/en_rx.c                           |  2 +-
 drivers/net/mlx4/en_tx.c                           |  2 +-
 drivers/net/mlx4/mlx4_en.h                         |  2 +-
 drivers/net/ps3_gelic_net.c                        |  2 +-
 drivers/net/sis900.c                               |  4 +-
 drivers/net/skfp/h/smc.h                           |  8 +--
 drivers/net/skfp/skfddi.c                          |  2 +-
 drivers/net/smsc911x.c                             |  2 +-
 drivers/net/smsc911x.h                             |  2 +-
 drivers/net/spider_net.c                           |  2 +-
 drivers/net/stmmac/gmac.c                          |  2 +-
 drivers/net/stmmac/gmac.h                          |  4 +-
 drivers/net/tokenring/smctr.c                      |  2 +-
 drivers/net/ucc_geth.c                             |  2 +-
 drivers/net/ucc_geth.h                             | 20 +++----
 drivers/net/usb/smsc95xx.c                         |  2 +-
 drivers/net/wan/lmc/lmc_main.c                     |  2 +-
 drivers/net/wimax/i2400m/rx.c                      |  2 +-
 drivers/net/wireless/ath/ath5k/phy.c               |  6 +--
 drivers/net/wireless/ath/ath9k/rc.c                |  2 +-
 drivers/net/wireless/ipw2x00/ipw2100.c             |  6 +--
 drivers/net/wireless/ipw2x00/ipw2200.c             |  8 +--
 drivers/net/wireless/ipw2x00/libipw_module.c       |  2 +-
 drivers/net/wireless/iwmc3200wifi/hal.c            |  2 +-
 drivers/net/wireless/iwmc3200wifi/rx.c             |  2 +-
 drivers/net/wireless/libertas/if_sdio.c            |  2 +-
 drivers/net/wireless/prism54/isl_ioctl.c           |  4 +-
 drivers/net/wireless/rt2x00/rt2400pci.h            |  2 +-
 drivers/net/wireless/rt2x00/rt2500pci.h            |  2 +-
 drivers/net/wireless/rt2x00/rt2500usb.h            |  2 +-
 drivers/net/wireless/rt2x00/rt61pci.h              |  2 +-
 drivers/net/wireless/rt2x00/rt73usb.h              |  2 +-
 drivers/net/wireless/wavelan_cs.c                  |  2 +-
 drivers/net/wireless/zd1211rw/zd_mac.c             |  2 +-
 drivers/parisc/ccio-dma.c                          |  2 +-
 drivers/platform/x86/thinkpad_acpi.c               |  2 +-
 drivers/pnp/pnpbios/rsparser.c                     |  8 +--
 drivers/ps3/ps3-sys-manager.c                      |  2 +-
 drivers/rtc/rtc-v3020.c                            |  2 +-
 drivers/s390/char/fs3270.c                         |  2 +-
 drivers/s390/cio/chp.c                             |  2 +-
 drivers/s390/cio/cmf.c                             |  2 +-
 drivers/sbus/char/envctrl.c                        |  4 +-
 drivers/scsi/53c700.c                              |  2 +-
 drivers/scsi/aacraid/aacraid.h                     |  6 +--
 drivers/scsi/aacraid/comminit.c                    |  2 +-
 drivers/scsi/aic7xxx/aic79xx.seq                   |  2 +-
 drivers/scsi/aic7xxx/aic79xx_core.c                |  2 +-
 drivers/scsi/aic7xxx/aic7xxx_core.c                |  2 +-
 drivers/scsi/bfa/include/defs/bfa_defs_pport.h     |  2 +-
 drivers/scsi/bfa/include/defs/bfa_defs_tsensor.h   |  2 +-
 drivers/scsi/hptiop.c                              |  2 +-
 drivers/scsi/libfc/fc_lport.c                      |  2 +-
 drivers/scsi/libiscsi_tcp.c                        |  2 +-
 drivers/scsi/lpfc/lpfc_attr.c                      |  4 +-
 drivers/scsi/lpfc/lpfc_els.c                       |  4 +-
 drivers/scsi/lpfc/lpfc_init.c                      | 62 +++++++++++-----------
 drivers/scsi/lpfc/lpfc_sli.c                       | 10 ++--
 drivers/scsi/megaraid.h                            |  2 +-
 drivers/scsi/megaraid/mbox_defs.h                  |  2 +-
 drivers/scsi/megaraid/megaraid_mbox.c              |  2 +-
 drivers/scsi/mpt2sas/mpt2sas_scsih.c               |  2 +-
 drivers/scsi/ncr53c8xx.c                           |  2 +-
 drivers/scsi/pmcraid.c                             |  6 +--
 drivers/scsi/pmcraid.h                             |  6 +--
 drivers/scsi/scsi_netlink.c                        |  2 +-
 drivers/scsi/scsi_transport_sas.c                  |  6 +--
 drivers/scsi/sym53c8xx_2/sym_glue.c                |  2 +-
 drivers/scsi/sym53c8xx_2/sym_hipd.c                |  2 +-
 drivers/scsi/sym53c8xx_2/sym_hipd.h                |  2 +-
 drivers/serial/8250_pnp.c                          |  4 +-
 drivers/serial/pmac_zilog.h                        |  2 +-
 drivers/serial/ucc_uart.c                          |  2 +-
 drivers/telephony/ixj.c                            |  4 +-
 drivers/usb/atm/ueagle-atm.c                       |  2 +-
 drivers/usb/class/usbtmc.c                         |  2 +-
 drivers/usb/core/message.c                         |  2 +-
 drivers/usb/gadget/f_acm.c                         |  2 +-
 drivers/usb/gadget/pxa27x_udc.c                    |  2 +-
 drivers/usb/host/fhci-sched.c                      |  2 +-
 drivers/usb/wusbcore/crypto.c                      |  2 +-
 drivers/usb/wusbcore/wa-xfer.c                     |  4 +-
 drivers/uwb/i1480/dfu/usb.c                        |  2 +-
 drivers/uwb/wlp/txrx.c                             |  2 +-
 drivers/video/aty/atyfb_base.c                     |  4 +-
 drivers/video/backlight/atmel-pwm-bl.c             |  2 +-
 drivers/video/backlight/tosa_lcd.c                 |  2 +-
 drivers/video/console/sticore.c                    |  2 +-
 drivers/video/gbefb.c                              |  2 +-
 drivers/video/stifb.c                              |  4 +-
 drivers/video/tdfxfb.c                             |  2 +-
 drivers/video/via/dvi.c                            |  4 +-
 drivers/video/vt8623fb.c                           |  2 +-
 drivers/watchdog/coh901327_wdt.c                   |  2 +-
 drivers/watchdog/machzwd.c                         |  2 +-
 drivers/watchdog/wdrtas.c                          |  6 +--
 fs/binfmt_elf.c                                    |  2 +-
 fs/bio.c                                           |  2 +-
 fs/btrfs/extent_map.c                              |  2 +-
 fs/cifs/README                                     |  2 +-
 fs/cifs/cifsglob.h                                 |  2 +-
 fs/cifs/inode.c                                    |  4 +-
 fs/cifs/smbdes.c                                   |  2 +-
 fs/dlm/plock.c                                     |  2 +-
 fs/ext4/inode.c                                    |  6 +--
 fs/ext4/mballoc.c                                  |  2 +-
 fs/jffs2/compr.c                                   |  2 +-
 fs/jffs2/readinode.c                               |  2 +-
 fs/jffs2/xattr.c                                   |  2 +-
 fs/jfs/jfs_dmap.c                                  |  4 +-
 fs/ncpfs/ioctl.c                                   |  2 +-
 fs/ntfs/compress.c                                 |  2 +-
 fs/ntfs/file.c                                     |  4 +-
 fs/ntfs/logfile.c                                  |  2 +-
 fs/ocfs2/alloc.c                                   |  2 +-
 fs/ocfs2/dlm/dlmmaster.c                           |  2 +-
 fs/ocfs2/dlmglue.c                                 |  2 +-
 fs/ocfs2/journal.c                                 |  2 +-
 fs/ocfs2/refcounttree.c                            |  2 +-
 fs/omfs/bitmap.c                                   |  2 +-
 fs/ubifs/recovery.c                                |  2 +-
 fs/xfs/quota/xfs_dquot.h                           |  2 +-
 include/asm-generic/memory_model.h                 |  2 +-
 include/asm-generic/unistd.h                       |  2 +-
 include/linux/chio.h                               |  2 +-
 include/linux/mfd/ezx-pcap.h                       |  4 +-
 include/linux/pktcdvd.h                            |  2 +-
 include/linux/serial_reg.h                         |  8 +--
 include/linux/videodev2.h                          |  2 +-
 include/net/sctp/structs.h                         |  2 +-
 include/net/tcp.h                                  |  2 +-
 include/net/wimax.h                                |  2 +-
 include/sound/wm8993.h                             |  2 +-
 kernel/perf_event.c                                |  4 +-
 lib/Kconfig.debug                                  |  2 +-
 lib/decompress_bunzip2.c                           |  2 +-
 lib/dma-debug.c                                    |  2 +-
 lib/swiotlb.c                                      |  2 +-
 mm/filemap.c                                       |  2 +-
 mm/memcontrol.c                                    |  4 +-
 mm/memory-failure.c                                |  2 +-
 net/ipv4/netfilter/ipt_ECN.c                       |  2 +-
 net/irda/irlap.c                                   | 14 ++---
 net/irda/irlap_event.c                             |  2 +-
 net/irda/irlmp.c                                   |  4 +-
 net/mac80211/mesh_pathtbl.c                        |  4 +-
 net/netlabel/netlabel_domainhash.c                 |  2 +-
 net/sctp/sm_sideeffect.c                           |  2 +-
 net/sunrpc/xprtrdma/svc_rdma_sendto.c              |  2 +-
 net/wimax/op-reset.c                               |  2 +-
 scripts/kconfig/mconf.c                            |  2 +-
 security/selinux/netlabel.c                        |  2 +-
 security/selinux/ss/services.c                     |  2 +-
 sound/Kconfig                                      |  2 +-
 sound/isa/cs423x/cs4236.c                          |  2 +-
 sound/isa/opti9xx/miro.c                           |  2 +-
 sound/isa/opti9xx/opti92x-ad1848.c                 |  2 +-
 sound/oss/dmasound/dmasound_paula.c                |  2 +-
 sound/pci/ca0106/ca0106_proc.c                     |  2 +-
 sound/pci/cs46xx/imgs/cwcdma.asp                   |  9 ++--
 sound/pci/emu10k1/emu10k1x.c                       |  2 +-
 sound/pci/hda/patch_cmedia.c                       |  2 +-
 sound/pci/hda/patch_realtek.c                      |  2 +-
 sound/pci/rme9652/hdspm.c                          |  4 +-
 sound/soc/codecs/uda134x.c                         |  4 +-
 sound/soc/codecs/wm8903.c                          |  6 +--
 sound/soc/codecs/wm8993.c                          |  4 +-
 sound/soc/s3c24xx/s3c24xx_simtec.c                 |  2 +-
 sound/soc/s6000/s6000-pcm.c                        |  2 +-
 sound/sound_core.c                                 |  2 +-
 345 files changed, 516 insertions(+), 508 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index 99233902e09e..f91a973a37fe 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -8,7 +8,7 @@ Description:
 		 1 - major number
 		 2 - minor mumber
 		 3 - device name
-		 4 - reads completed succesfully
+		 4 - reads completed successfully
 		 5 - reads merged
 		 6 - sectors read
 		 7 - time spent reading (ms)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 5f3bedaf8e35..d2f90334bb93 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -4,7 +4,7 @@ Contact:	Jerome Marchand <jmarchan@redhat.com>
 Description:
 		The /sys/block/<disk>/stat files displays the I/O
 		statistics of disk <disk>. They contain 11 fields:
-		 1 - reads completed succesfully
+		 1 - reads completed successfully
 		 2 - reads merged
 		 3 - sectors read
 		 4 - time spent reading (ms)
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index df0d089d0fb9..f508a8a27fea 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -362,7 +362,7 @@ module_exit(board_cleanup);
 	<sect1 id="Multiple_chip_control">
 		<title>Multiple chip control</title>
 		<para>
-			The nand driver can control chip arrays. Therefor the
+			The nand driver can control chip arrays. Therefore the
 			board driver must provide an own select_chip function. This
 			function must (de)select the requested chip.
 			The function pointer in the nand_chip structure must
diff --git a/Documentation/DocBook/v4l/videodev2.h.xml b/Documentation/DocBook/v4l/videodev2.h.xml
index 97002060ac4f..26303e58345f 100644
--- a/Documentation/DocBook/v4l/videodev2.h.xml
+++ b/Documentation/DocBook/v4l/videodev2.h.xml
@@ -492,7 +492,7 @@ struct <link linkend="v4l2-jpegcompression">v4l2_jpegcompression</link> {
                                  * you do, leave them untouched.
                                  * Inluding less markers will make the
                                  * resulting code smaller, but there will
-                                 * be fewer aplications which can read it.
+                                 * be fewer applications which can read it.
                                  * The presence of the APP and COM marker
                                  * is influenced by APP_len and COM_len
                                  * ONLY, not by this property! */
diff --git a/Documentation/DocBook/writing-an-alsa-driver.tmpl b/Documentation/DocBook/writing-an-alsa-driver.tmpl
index 7a2e0e98986a..0d0f7b4d4b1a 100644
--- a/Documentation/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/DocBook/writing-an-alsa-driver.tmpl
@@ -5318,7 +5318,7 @@ struct _snd_pcm_runtime {
       pages of the given size and map them onto the virtually contiguous
       memory.  The virtual pointer is addressed in runtime-&gt;dma_area.
       The physical address (runtime-&gt;dma_addr) is set to zero,
-      because the buffer is physically non-contigous.
+      because the buffer is physically non-contiguous.
       The physical address table is set up in sgbuf-&gt;table.
       You can get the physical address at a certain offset via
       <function>snd_pcm_sgbuf_get_addr()</function>. 
diff --git a/Documentation/dvb/README.dvb-usb b/Documentation/dvb/README.dvb-usb
index bf2a9cdfe7bb..c8238e44ed6b 100644
--- a/Documentation/dvb/README.dvb-usb
+++ b/Documentation/dvb/README.dvb-usb
@@ -85,7 +85,7 @@ http://www.linuxtv.org/wiki/index.php/DVB_USB
 	     - moved transfer control (pid filter, fifo control) from usb driver to frontend, it seems
 	       better settled there (added xfer_ops-struct)
 	     - created a common files for frontends (mc/p/mb)
-  2004-09-28 - added support for a new device (Unkown, vendor ID is Hyper-Paltek)
+  2004-09-28 - added support for a new device (Unknown, vendor ID is Hyper-Paltek)
   2004-09-20 - added support for a new device (Compro DVB-U2000), thanks
 	       to Amaury Demol for reporting
 	     - changed usb TS transfer method (several urbs, stopping transfer
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 098de5bce00a..42208511b5c0 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -304,7 +304,7 @@ static void *map_zeroed_pages(unsigned int num)
 	addr = mmap(NULL, getpagesize() * num,
 		    PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
 	if (addr == MAP_FAILED)
-		err(1, "Mmaping %u pages of /dev/zero", num);
+		err(1, "Mmapping %u pages of /dev/zero", num);
 
 	/*
 	 * One neat mmap feature is that you can close the fd, and it
diff --git a/Documentation/scsi/ChangeLog.megaraid_sas b/Documentation/scsi/ChangeLog.megaraid_sas
index c851ef497795..84524e0cf9c3 100644
--- a/Documentation/scsi/ChangeLog.megaraid_sas
+++ b/Documentation/scsi/ChangeLog.megaraid_sas
@@ -185,7 +185,7 @@ ii.	FW enables WCE bit in Mode Sense cmd for drives that are configured
 	Disks are exposed with WCE=1. User is advised to enable Write Back
 	mode only when the controller has battery backup. At this time
 	Synhronize cache is not supported by the FW. Driver will short-cycle
-	the cmd and return sucess without sending down to FW.
+	the cmd and return success without sending down to FW.
 
 1 Release Date    : Sun Jan. 14 11:21:32 PDT 2007 -
 		 Sumant Patro <Sumant.Patro@lsil.com>/Bo Yang
diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
index deab51ddc33e..4884cb33845d 100644
--- a/Documentation/spi/spi-summary
+++ b/Documentation/spi/spi-summary
@@ -538,7 +538,7 @@ SPI MESSAGE QUEUE
 The bulk of the driver will be managing the I/O queue fed by transfer().
 
 That queue could be purely conceptual.  For example, a driver used only
-for low-frequency sensor acess might be fine using synchronous PIO.
+for low-frequency sensor access might be fine using synchronous PIO.
 
 But the queue will probably be very real, using message->queue, PIO,
 often DMA (especially if the root filesystem is in SPI flash), and
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index a6e360d2055c..fc5790d36cd9 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -370,7 +370,7 @@ The default is 1 percent.
 mmap_min_addr
 
 This file indicates the amount of address space  which a user process will
-be restricted from mmaping.  Since kernel null dereference bugs could
+be restricted from mmapping.  Since kernel null dereference bugs could
 accidentally operate based on the information in the first couple of pages
 of memory userspace processes should not be allowed to write to them.  By
 default this value is set to 0 and no protections will be enforced by the
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
index 3f61825be499..6b29555b58b7 100644
--- a/Documentation/video4linux/gspca.txt
+++ b/Documentation/video4linux/gspca.txt
@@ -6,7 +6,7 @@ The modules are:
 
 xxxx		vend:prod
 ----
-spca501		0000:0000	MystFromOri Unknow Camera
+spca501		0000:0000	MystFromOri Unknown Camera
 m5602		0402:5602	ALi Video Camera Controller
 spca501		040a:0002	Kodak DVC-325
 spca500		040a:0300	Kodak EZ200
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index 3ec4f2a22585..aa7f4d0639c4 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -301,7 +301,7 @@ static char *page_flag_name(uint64_t flags)
 		present = (flags >> i) & 1;
 		if (!page_flag_names[i]) {
 			if (present)
-				fatal("unkown flag bit %d\n", i);
+				fatal("unknown flag bit %d\n", i);
 			continue;
 		}
 		buf[j++] = present ? page_flag_names[i][0] : '_';
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index 10b403554b65..7b2c56d8f930 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -197,7 +197,7 @@ setup_memory_node(int nid, void *kernel_end)
 	}
 
 	if (bootmap_start == -1)
-		panic("couldn't find a contigous place for the bootmap");
+		panic("couldn't find a contiguous place for the bootmap");
 
 	/* Allocate the bootmap and mark the whole MM as reserved.  */
 	bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c
index 7713a08bb10c..37bda5f3dde3 100644
--- a/arch/arm/common/scoop.c
+++ b/arch/arm/common/scoop.c
@@ -82,7 +82,7 @@ static int scoop_gpio_get(struct gpio_chip *chip, unsigned offset)
 {
 	struct scoop_dev *sdev = container_of(chip, struct scoop_dev, gpio);
 
-	/* XXX: I'm usure,  but it seems so */
+	/* XXX: I'm unsure, but it seems so */
 	return ioread16(sdev->base + SCOOP_GPRR) & (1 << (offset + 1));
 }
 
diff --git a/arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h b/arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h
index 375066ad0186..cbf334d1c761 100644
--- a/arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h
+++ b/arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h
@@ -83,7 +83,7 @@ typedef struct {
 *  @brief   Get next available transaction width
 *
 *
-*  @return  On sucess  : Next avail able transaction width
+*  @return  On success  : Next available transaction width
 *           On failure : dmacHw_TRANSACTION_WIDTH_8
 *
 *  @note
diff --git a/arch/arm/mach-bcmring/include/mach/dma.h b/arch/arm/mach-bcmring/include/mach/dma.h
index 847980c85c88..1f2c5319c056 100644
--- a/arch/arm/mach-bcmring/include/mach/dma.h
+++ b/arch/arm/mach-bcmring/include/mach/dma.h
@@ -651,7 +651,7 @@ int dma_map_add_region(DMA_MemMap_t *memMap,	/* Stores state information about t
 /**
 *   Creates a descriptor ring from a memory mapping.
 *
-*   @return 0 on sucess, error code otherwise.
+*   @return 0 on success, error code otherwise.
 */
 /****************************************************************************/
 
diff --git a/arch/arm/mach-lh7a40x/include/mach/hardware.h b/arch/arm/mach-lh7a40x/include/mach/hardware.h
index 48e827d2fa56..59d2ace35217 100644
--- a/arch/arm/mach-lh7a40x/include/mach/hardware.h
+++ b/arch/arm/mach-lh7a40x/include/mach/hardware.h
@@ -31,7 +31,7 @@
 /*
  * This __REG() version gives the same results as the one above,  except
  * that we are fooling gcc somehow so it generates far better and smaller
- * assembly code for access to contigous registers.  It's a shame that gcc
+ * assembly code for access to contiguous registers.  It's a shame that gcc
  * doesn't guess this by itself.
  */
 #include <asm/types.h>
diff --git a/arch/arm/mach-orion5x/pci.c b/arch/arm/mach-orion5x/pci.c
index 36dc5413cc97..bdf96eb523bc 100644
--- a/arch/arm/mach-orion5x/pci.c
+++ b/arch/arm/mach-orion5x/pci.c
@@ -463,7 +463,7 @@ static void __init orion5x_setup_pci_wins(struct mbus_dram_target_info *dram)
 	writel(win_enable, PCI_BAR_ENABLE);
 
 	/*
-	 * Disable automatic update of address remaping when writing to BARs.
+	 * Disable automatic update of address remapping when writing to BARs.
 	 */
 	orion5x_setbits(PCI_ADDR_DECODE_CTRL, 1);
 }
diff --git a/arch/arm/mach-pxa/include/mach/palmld.h b/arch/arm/mach-pxa/include/mach/palmld.h
index 8721b8010221..ae536e86d8e8 100644
--- a/arch/arm/mach-pxa/include/mach/palmld.h
+++ b/arch/arm/mach-pxa/include/mach/palmld.h
@@ -91,7 +91,7 @@
 /* BATTERY */
 #define PALMLD_BAT_MAX_VOLTAGE		4000	/* 4.00V maximum voltage */
 #define PALMLD_BAT_MIN_VOLTAGE		3550	/* 3.55V critical voltage */
-#define PALMLD_BAT_MAX_CURRENT		0	/* unknokn */
+#define PALMLD_BAT_MAX_CURRENT		0	/* unknown */
 #define PALMLD_BAT_MIN_CURRENT		0	/* unknown */
 #define PALMLD_BAT_MAX_CHARGE		1	/* unknown */
 #define PALMLD_BAT_MIN_CHARGE		1	/* unknown */
diff --git a/arch/arm/mach-pxa/include/mach/palmt5.h b/arch/arm/mach-pxa/include/mach/palmt5.h
index d15662aba008..6baf7469d4ec 100644
--- a/arch/arm/mach-pxa/include/mach/palmt5.h
+++ b/arch/arm/mach-pxa/include/mach/palmt5.h
@@ -66,7 +66,7 @@
 /* BATTERY */
 #define PALMT5_BAT_MAX_VOLTAGE		4000	/* 4.00v current voltage */
 #define PALMT5_BAT_MIN_VOLTAGE		3550	/* 3.55v critical voltage */
-#define PALMT5_BAT_MAX_CURRENT		0	/* unknokn */
+#define PALMT5_BAT_MAX_CURRENT		0	/* unknown */
 #define PALMT5_BAT_MIN_CURRENT		0	/* unknown */
 #define PALMT5_BAT_MAX_CHARGE		1	/* unknown */
 #define PALMT5_BAT_MIN_CHARGE		1	/* unknown */
diff --git a/arch/arm/mach-pxa/include/mach/palmtc.h b/arch/arm/mach-pxa/include/mach/palmtc.h
index 3dc9b074ab46..3f9dd3fd4638 100644
--- a/arch/arm/mach-pxa/include/mach/palmtc.h
+++ b/arch/arm/mach-pxa/include/mach/palmtc.h
@@ -68,7 +68,7 @@
 /* BATTERY */
 #define PALMTC_BAT_MAX_VOLTAGE		4000	/* 4.00V maximum voltage */
 #define PALMTC_BAT_MIN_VOLTAGE		3550	/* 3.55V critical voltage */
-#define PALMTC_BAT_MAX_CURRENT		0	/* unknokn */
+#define PALMTC_BAT_MAX_CURRENT		0	/* unknown */
 #define PALMTC_BAT_MIN_CURRENT		0	/* unknown */
 #define PALMTC_BAT_MAX_CHARGE		1	/* unknown */
 #define PALMTC_BAT_MIN_CHARGE		1	/* unknown */
diff --git a/arch/arm/mach-pxa/include/mach/palmte2.h b/arch/arm/mach-pxa/include/mach/palmte2.h
index 12361341f9d8..f89e989a7637 100644
--- a/arch/arm/mach-pxa/include/mach/palmte2.h
+++ b/arch/arm/mach-pxa/include/mach/palmte2.h
@@ -59,7 +59,7 @@
 /* BATTERY */
 #define PALMTE2_BAT_MAX_VOLTAGE		4000	/* 4.00v current voltage */
 #define PALMTE2_BAT_MIN_VOLTAGE		3550	/* 3.55v critical voltage */
-#define PALMTE2_BAT_MAX_CURRENT		0	/* unknokn */
+#define PALMTE2_BAT_MAX_CURRENT		0	/* unknown */
 #define PALMTE2_BAT_MIN_CURRENT		0	/* unknown */
 #define PALMTE2_BAT_MAX_CHARGE		1	/* unknown */
 #define PALMTE2_BAT_MIN_CHARGE		1	/* unknown */
diff --git a/arch/arm/mach-pxa/include/mach/palmtx.h b/arch/arm/mach-pxa/include/mach/palmtx.h
index 1be0db6ed55e..10abc4f2e8e4 100644
--- a/arch/arm/mach-pxa/include/mach/palmtx.h
+++ b/arch/arm/mach-pxa/include/mach/palmtx.h
@@ -94,7 +94,7 @@
 /* BATTERY */
 #define PALMTX_BAT_MAX_VOLTAGE		4000	/* 4.00v current voltage */
 #define PALMTX_BAT_MIN_VOLTAGE		3550	/* 3.55v critical voltage */
-#define PALMTX_BAT_MAX_CURRENT		0	/* unknokn */
+#define PALMTX_BAT_MAX_CURRENT		0	/* unknown */
 #define PALMTX_BAT_MIN_CURRENT		0	/* unknown */
 #define PALMTX_BAT_MAX_CHARGE		1	/* unknown */
 #define PALMTX_BAT_MIN_CHARGE		1	/* unknown */
diff --git a/arch/arm/mach-pxa/include/mach/palmz72.h b/arch/arm/mach-pxa/include/mach/palmz72.h
index 2806ef69ba5a..2bbcf70dd935 100644
--- a/arch/arm/mach-pxa/include/mach/palmz72.h
+++ b/arch/arm/mach-pxa/include/mach/palmz72.h
@@ -49,7 +49,7 @@
 /* Battery */
 #define PALMZ72_BAT_MAX_VOLTAGE		4000	/* 4.00v current voltage */
 #define PALMZ72_BAT_MIN_VOLTAGE		3550	/* 3.55v critical voltage */
-#define PALMZ72_BAT_MAX_CURRENT		0	/* unknokn */
+#define PALMZ72_BAT_MAX_CURRENT		0	/* unknown */
 #define PALMZ72_BAT_MIN_CURRENT		0	/* unknown */
 #define PALMZ72_BAT_MAX_CHARGE		1	/* unknown */
 #define PALMZ72_BAT_MIN_CHARGE		1	/* unknown */
diff --git a/arch/arm/mach-s3c6400/setup-sdhci.c b/arch/arm/mach-s3c6400/setup-sdhci.c
index b93dafbee1f4..1039937403be 100644
--- a/arch/arm/mach-s3c6400/setup-sdhci.c
+++ b/arch/arm/mach-s3c6400/setup-sdhci.c
@@ -30,7 +30,7 @@ char *s3c6400_hsmmc_clksrcs[4] = {
 	[0] = "hsmmc",
 	[1] = "hsmmc",
 	[2] = "mmc_bus",
-	/* [3] = "48m", - note not succesfully used yet */
+	/* [3] = "48m", - note not successfully used yet */
 };
 
 void s3c6400_setup_sdhci_cfg_card(struct platform_device *dev,
diff --git a/arch/arm/mach-s3c6410/setup-sdhci.c b/arch/arm/mach-s3c6410/setup-sdhci.c
index 20666f3bd478..816d2d9f9ef8 100644
--- a/arch/arm/mach-s3c6410/setup-sdhci.c
+++ b/arch/arm/mach-s3c6410/setup-sdhci.c
@@ -30,7 +30,7 @@ char *s3c6410_hsmmc_clksrcs[4] = {
 	[0] = "hsmmc",
 	[1] = "hsmmc",
 	[2] = "mmc_bus",
-	/* [3] = "48m", - note not succesfully used yet */
+	/* [3] = "48m", - note not successfully used yet */
 };
 
 
diff --git a/arch/arm/mach-sa1100/dma.c b/arch/arm/mach-sa1100/dma.c
index cb4521a6f42d..ad660350c296 100644
--- a/arch/arm/mach-sa1100/dma.c
+++ b/arch/arm/mach-sa1100/dma.c
@@ -65,7 +65,7 @@ static irqreturn_t dma_irq_handler(int irq, void *dev_id)
 
 
 /**
- *	sa1100_request_dma - allocate one of the SA11x0's DMA chanels
+ *	sa1100_request_dma - allocate one of the SA11x0's DMA channels
  *	@device: The SA11x0 peripheral targeted by this request
  *	@device_id: An ascii name for the claiming device
  *	@callback: Function to be called when the DMA completes
diff --git a/arch/arm/plat-mxc/include/mach/iomux-mx3.h b/arch/arm/plat-mxc/include/mach/iomux-mx3.h
index 446f86763816..0c7802bbeccb 100644
--- a/arch/arm/plat-mxc/include/mach/iomux-mx3.h
+++ b/arch/arm/plat-mxc/include/mach/iomux-mx3.h
@@ -112,7 +112,7 @@ enum iomux_gp_func {
  * setups a single pin:
  * 	- reserves the pin so that it is not claimed by another driver
  * 	- setups the iomux according to the configuration
- * 	- if the pin is configured as a GPIO, we claim it throug kernel gpiolib
+ * 	- if the pin is configured as a GPIO, we claim it through kernel gpiolib
  */
 int mxc_iomux_alloc_pin(const unsigned int pin, const char *label);
 /*
diff --git a/arch/arm/plat-mxc/include/mach/iomux-mxc91231.h b/arch/arm/plat-mxc/include/mach/iomux-mxc91231.h
index 9f13061192c8..3887f3fe29d4 100644
--- a/arch/arm/plat-mxc/include/mach/iomux-mxc91231.h
+++ b/arch/arm/plat-mxc/include/mach/iomux-mxc91231.h
@@ -48,7 +48,7 @@
  * setups a single pin:
  * 	- reserves the pin so that it is not claimed by another driver
  * 	- setups the iomux according to the configuration
- * 	- if the pin is configured as a GPIO, we claim it throug kernel gpiolib
+ * 	- if the pin is configured as a GPIO, we claim it through kernel gpiolib
  */
 int mxc_iomux_alloc_pin(const unsigned int pin_mode, const char *label);
 /*
diff --git a/arch/arm/plat-mxc/pwm.c b/arch/arm/plat-mxc/pwm.c
index 5cdbd605ac05..4ff6dfe04283 100644
--- a/arch/arm/plat-mxc/pwm.c
+++ b/arch/arm/plat-mxc/pwm.c
@@ -94,7 +94,7 @@ int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
 		 * register to follow the ratio of duty_ns vs. period_ns
 		 * accordingly.
 		 *
-		 * This is good enought for programming the brightness of
+		 * This is good enough for programming the brightness of
 		 * the LCD backlight.
 		 *
 		 * The real implementation would divide PERCLK[0] first by
diff --git a/arch/arm/plat-omap/dma.c b/arch/arm/plat-omap/dma.c
index b53125f41293..0e308913291b 100644
--- a/arch/arm/plat-omap/dma.c
+++ b/arch/arm/plat-omap/dma.c
@@ -1232,7 +1232,7 @@ static void create_dma_lch_chain(int lch_head, int lch_queue)
  * 					      OMAP_DMA_DYNAMIC_CHAIN
  * @params - Channel parameters
  *
- * @return - Succes : 0
+ * @return - Success : 0
  * 	     Failure: -EINVAL/-ENOMEM
  */
 int omap_request_dma_chain(int dev_id, const char *dev_name,
diff --git a/arch/arm/plat-omap/include/mach/omap16xx.h b/arch/arm/plat-omap/include/mach/omap16xx.h
index 0e69b504c25f..7560b4d583a3 100644
--- a/arch/arm/plat-omap/include/mach/omap16xx.h
+++ b/arch/arm/plat-omap/include/mach/omap16xx.h
@@ -124,7 +124,7 @@
 #define TIPB_SWITCH_BASE		 (0xfffbc800)
 #define OMAP16XX_MMCSD2_SSW_MPU_CONF	(TIPB_SWITCH_BASE + 0x160)
 
-/* UART3 Registers Maping through MPU bus */
+/* UART3 Registers Mapping through MPU bus */
 #define UART3_RHR               (OMAP_UART3_BASE + 0)
 #define UART3_THR               (OMAP_UART3_BASE + 0)
 #define UART3_DLL               (OMAP_UART3_BASE + 0)
diff --git a/arch/arm/plat-s3c24xx/include/plat/map.h b/arch/arm/plat-s3c24xx/include/plat/map.h
index c4d133436fc7..bd534d32b993 100644
--- a/arch/arm/plat-s3c24xx/include/plat/map.h
+++ b/arch/arm/plat-s3c24xx/include/plat/map.h
@@ -64,7 +64,7 @@
 /* the calculation for the VA of this must ensure that
  * it is the same distance apart from the UART in the
  * phsyical address space, as the initial mapping for the IO
- * is done as a 1:1 maping. This puts it (currently) at
+ * is done as a 1:1 mapping. This puts it (currently) at
  * 0xFA800000, which is not in the way of any current mapping
  * by the base system.
 */
diff --git a/arch/avr32/boards/hammerhead/Kconfig b/arch/avr32/boards/hammerhead/Kconfig
index fda2331f9789..5c13d785cc70 100644
--- a/arch/avr32/boards/hammerhead/Kconfig
+++ b/arch/avr32/boards/hammerhead/Kconfig
@@ -24,7 +24,7 @@ config BOARD_HAMMERHEAD_SND
 	bool "Atmel AC97 Sound support"
 	help
 	  This enables Sound support for the Hammerhead board. You may
-	  also go trough the ALSA settings to get it working.
+	  also go through the ALSA settings to get it working.
 
 	  Choose 'Y' here if you have ordered a Corona daugther board and
 	  want to make your board funky.
diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c
index 6b7325d634af..78cb3d38f899 100644
--- a/arch/blackfin/kernel/traps.c
+++ b/arch/blackfin/kernel/traps.c
@@ -619,7 +619,7 @@ asmlinkage notrace void trap_c(struct pt_regs *fp)
 
 /*
  * Similar to get_user, do some address checking, then dereference
- * Return true on sucess, false on bad address
+ * Return true on success, false on bad address
  */
 static bool get_instruction(unsigned short *val, unsigned short *address)
 {
diff --git a/arch/blackfin/mach-bf518/include/mach/defBF51x_base.h b/arch/blackfin/mach-bf518/include/mach/defBF51x_base.h
index e06f4112c695..f9fd2b2a2956 100644
--- a/arch/blackfin/mach-bf518/include/mach/defBF51x_base.h
+++ b/arch/blackfin/mach-bf518/include/mach/defBF51x_base.h
@@ -542,7 +542,7 @@
 #define HMDMA0_CONTROL		0xFFC03300	/* Handshake MDMA0 Control Register					*/
 #define HMDMA0_ECINIT		0xFFC03304	/* HMDMA0 Initial Edge Count Register				*/
 #define HMDMA0_BCINIT		0xFFC03308	/* HMDMA0 Initial Block Count Register				*/
-#define HMDMA0_ECURGENT		0xFFC0330C	/* HMDMA0 Urgent Edge Count Threshhold Register		*/
+#define HMDMA0_ECURGENT		0xFFC0330C	/* HMDMA0 Urgent Edge Count Threshold Register		*/
 #define HMDMA0_ECOVERFLOW	0xFFC03310	/* HMDMA0 Edge Count Overflow Interrupt Register	*/
 #define HMDMA0_ECOUNT		0xFFC03314	/* HMDMA0 Current Edge Count Register				*/
 #define HMDMA0_BCOUNT		0xFFC03318	/* HMDMA0 Current Block Count Register				*/
@@ -550,7 +550,7 @@
 #define HMDMA1_CONTROL		0xFFC03340	/* Handshake MDMA1 Control Register					*/
 #define HMDMA1_ECINIT		0xFFC03344	/* HMDMA1 Initial Edge Count Register				*/
 #define HMDMA1_BCINIT		0xFFC03348	/* HMDMA1 Initial Block Count Register				*/
-#define HMDMA1_ECURGENT		0xFFC0334C	/* HMDMA1 Urgent Edge Count Threshhold Register		*/
+#define HMDMA1_ECURGENT		0xFFC0334C	/* HMDMA1 Urgent Edge Count Threshold Register		*/
 #define HMDMA1_ECOVERFLOW	0xFFC03350	/* HMDMA1 Edge Count Overflow Interrupt Register	*/
 #define HMDMA1_ECOUNT		0xFFC03354	/* HMDMA1 Current Edge Count Register				*/
 #define HMDMA1_BCOUNT		0xFFC03358	/* HMDMA1 Current Block Count Register				*/
diff --git a/arch/blackfin/mach-bf527/include/mach/defBF52x_base.h b/arch/blackfin/mach-bf527/include/mach/defBF52x_base.h
index f821700716ee..b9dbb73d7ef0 100644
--- a/arch/blackfin/mach-bf527/include/mach/defBF52x_base.h
+++ b/arch/blackfin/mach-bf527/include/mach/defBF52x_base.h
@@ -544,7 +544,7 @@
 #define HMDMA0_CONTROL		0xFFC03300	/* Handshake MDMA0 Control Register					*/
 #define HMDMA0_ECINIT		0xFFC03304	/* HMDMA0 Initial Edge Count Register				*/
 #define HMDMA0_BCINIT		0xFFC03308	/* HMDMA0 Initial Block Count Register				*/
-#define HMDMA0_ECURGENT		0xFFC0330C	/* HMDMA0 Urgent Edge Count Threshhold Register		*/
+#define HMDMA0_ECURGENT		0xFFC0330C	/* HMDMA0 Urgent Edge Count Threshold Register		*/
 #define HMDMA0_ECOVERFLOW	0xFFC03310	/* HMDMA0 Edge Count Overflow Interrupt Register	*/
 #define HMDMA0_ECOUNT		0xFFC03314	/* HMDMA0 Current Edge Count Register				*/
 #define HMDMA0_BCOUNT		0xFFC03318	/* HMDMA0 Current Block Count Register				*/
@@ -552,7 +552,7 @@
 #define HMDMA1_CONTROL		0xFFC03340	/* Handshake MDMA1 Control Register					*/
 #define HMDMA1_ECINIT		0xFFC03344	/* HMDMA1 Initial Edge Count Register				*/
 #define HMDMA1_BCINIT		0xFFC03348	/* HMDMA1 Initial Block Count Register				*/
-#define HMDMA1_ECURGENT		0xFFC0334C	/* HMDMA1 Urgent Edge Count Threshhold Register		*/
+#define HMDMA1_ECURGENT		0xFFC0334C	/* HMDMA1 Urgent Edge Count Threshold Register		*/
 #define HMDMA1_ECOVERFLOW	0xFFC03350	/* HMDMA1 Edge Count Overflow Interrupt Register	*/
 #define HMDMA1_ECOUNT		0xFFC03354	/* HMDMA1 Current Edge Count Register				*/
 #define HMDMA1_BCOUNT		0xFFC03358	/* HMDMA1 Current Block Count Register				*/
diff --git a/arch/blackfin/mach-bf537/include/mach/defBF534.h b/arch/blackfin/mach-bf537/include/mach/defBF534.h
index cebb14feb1ba..a6d20ca57683 100644
--- a/arch/blackfin/mach-bf537/include/mach/defBF534.h
+++ b/arch/blackfin/mach-bf537/include/mach/defBF534.h
@@ -934,7 +934,7 @@
 #define HMDMA0_CONTROL		0xFFC03300	/* Handshake MDMA0 Control Register                                     */
 #define HMDMA0_ECINIT		0xFFC03304	/* HMDMA0 Initial Edge Count Register                           */
 #define HMDMA0_BCINIT		0xFFC03308	/* HMDMA0 Initial Block Count Register                          */
-#define HMDMA0_ECURGENT		0xFFC0330C	/* HMDMA0 Urgent Edge Count Threshhold Register         */
+#define HMDMA0_ECURGENT		0xFFC0330C	/* HMDMA0 Urgent Edge Count Threshold Register         */
 #define HMDMA0_ECOVERFLOW	0xFFC03310	/* HMDMA0 Edge Count Overflow Interrupt Register        */
 #define HMDMA0_ECOUNT		0xFFC03314	/* HMDMA0 Current Edge Count Register                           */
 #define HMDMA0_BCOUNT		0xFFC03318	/* HMDMA0 Current Block Count Register                          */
@@ -942,7 +942,7 @@
 #define HMDMA1_CONTROL		0xFFC03340	/* Handshake MDMA1 Control Register                                     */
 #define HMDMA1_ECINIT		0xFFC03344	/* HMDMA1 Initial Edge Count Register                           */
 #define HMDMA1_BCINIT		0xFFC03348	/* HMDMA1 Initial Block Count Register                          */
-#define HMDMA1_ECURGENT		0xFFC0334C	/* HMDMA1 Urgent Edge Count Threshhold Register         */
+#define HMDMA1_ECURGENT		0xFFC0334C	/* HMDMA1 Urgent Edge Count Threshold Register         */
 #define HMDMA1_ECOVERFLOW	0xFFC03350	/* HMDMA1 Edge Count Overflow Interrupt Register        */
 #define HMDMA1_ECOUNT		0xFFC03354	/* HMDMA1 Current Edge Count Register                           */
 #define HMDMA1_BCOUNT		0xFFC03358	/* HMDMA1 Current Block Count Register                          */
diff --git a/arch/blackfin/mach-bf548/include/mach/defBF544.h b/arch/blackfin/mach-bf548/include/mach/defBF544.h
index dd414ae4ba4c..39f588dcd382 100644
--- a/arch/blackfin/mach-bf548/include/mach/defBF544.h
+++ b/arch/blackfin/mach-bf548/include/mach/defBF544.h
@@ -491,7 +491,7 @@
 #define                   HMDMA0_CONTROL  0xffc04500   /* Handshake MDMA0 Control Register */
 #define                    HMDMA0_ECINIT  0xffc04504   /* Handshake MDMA0 Initial Edge Count Register */
 #define                    HMDMA0_BCINIT  0xffc04508   /* Handshake MDMA0 Initial Block Count Register */
-#define                  HMDMA0_ECURGENT  0xffc0450c   /* Handshake MDMA0 Urgent Edge Count Threshhold Register */
+#define                  HMDMA0_ECURGENT  0xffc0450c   /* Handshake MDMA0 Urgent Edge Count Threshold Register */
 #define                HMDMA0_ECOVERFLOW  0xffc04510   /* Handshake MDMA0 Edge Count Overflow Interrupt Register */
 #define                    HMDMA0_ECOUNT  0xffc04514   /* Handshake MDMA0 Current Edge Count Register */
 #define                    HMDMA0_BCOUNT  0xffc04518   /* Handshake MDMA0 Current Block Count Register */
@@ -501,7 +501,7 @@
 #define                   HMDMA1_CONTROL  0xffc04540   /* Handshake MDMA1 Control Register */
 #define                    HMDMA1_ECINIT  0xffc04544   /* Handshake MDMA1 Initial Edge Count Register */
 #define                    HMDMA1_BCINIT  0xffc04548   /* Handshake MDMA1 Initial Block Count Register */
-#define                  HMDMA1_ECURGENT  0xffc0454c   /* Handshake MDMA1 Urgent Edge Count Threshhold Register */
+#define                  HMDMA1_ECURGENT  0xffc0454c   /* Handshake MDMA1 Urgent Edge Count Threshold Register */
 #define                HMDMA1_ECOVERFLOW  0xffc04550   /* Handshake MDMA1 Edge Count Overflow Interrupt Register */
 #define                    HMDMA1_ECOUNT  0xffc04554   /* Handshake MDMA1 Current Edge Count Register */
 #define                    HMDMA1_BCOUNT  0xffc04558   /* Handshake MDMA1 Current Block Count Register */
diff --git a/arch/blackfin/mach-bf548/include/mach/defBF547.h b/arch/blackfin/mach-bf548/include/mach/defBF547.h
index 5a9dbabe0a68..c4dcf302d9f5 100644
--- a/arch/blackfin/mach-bf548/include/mach/defBF547.h
+++ b/arch/blackfin/mach-bf548/include/mach/defBF547.h
@@ -470,7 +470,7 @@
 #define                   HMDMA0_CONTROL  0xffc04500   /* Handshake MDMA0 Control Register */
 #define                    HMDMA0_ECINIT  0xffc04504   /* Handshake MDMA0 Initial Edge Count Register */
 #define                    HMDMA0_BCINIT  0xffc04508   /* Handshake MDMA0 Initial Block Count Register */
-#define                  HMDMA0_ECURGENT  0xffc0450c   /* Handshake MDMA0 Urgent Edge Count Threshhold Register */
+#define                  HMDMA0_ECURGENT  0xffc0450c   /* Handshake MDMA0 Urgent Edge Count Threshold Register */
 #define                HMDMA0_ECOVERFLOW  0xffc04510   /* Handshake MDMA0 Edge Count Overflow Interrupt Register */
 #define                    HMDMA0_ECOUNT  0xffc04514   /* Handshake MDMA0 Current Edge Count Register */
 #define                    HMDMA0_BCOUNT  0xffc04518   /* Handshake MDMA0 Current Block Count Register */
@@ -480,7 +480,7 @@
 #define                   HMDMA1_CONTROL  0xffc04540   /* Handshake MDMA1 Control Register */
 #define                    HMDMA1_ECINIT  0xffc04544   /* Handshake MDMA1 Initial Edge Count Register */
 #define                    HMDMA1_BCINIT  0xffc04548   /* Handshake MDMA1 Initial Block Count Register */
-#define                  HMDMA1_ECURGENT  0xffc0454c   /* Handshake MDMA1 Urgent Edge Count Threshhold Register */
+#define                  HMDMA1_ECURGENT  0xffc0454c   /* Handshake MDMA1 Urgent Edge Count Threshold Register */
 #define                HMDMA1_ECOVERFLOW  0xffc04550   /* Handshake MDMA1 Edge Count Overflow Interrupt Register */
 #define                    HMDMA1_ECOUNT  0xffc04554   /* Handshake MDMA1 Current Edge Count Register */
 #define                    HMDMA1_BCOUNT  0xffc04558   /* Handshake MDMA1 Current Block Count Register */
diff --git a/arch/blackfin/mach-bf548/include/mach/defBF548.h b/arch/blackfin/mach-bf548/include/mach/defBF548.h
index 82cd593f7391..a5079980968c 100644
--- a/arch/blackfin/mach-bf548/include/mach/defBF548.h
+++ b/arch/blackfin/mach-bf548/include/mach/defBF548.h
@@ -853,7 +853,7 @@
 #define                   HMDMA0_CONTROL  0xffc04500   /* Handshake MDMA0 Control Register */
 #define                    HMDMA0_ECINIT  0xffc04504   /* Handshake MDMA0 Initial Edge Count Register */
 #define                    HMDMA0_BCINIT  0xffc04508   /* Handshake MDMA0 Initial Block Count Register */
-#define                  HMDMA0_ECURGENT  0xffc0450c   /* Handshake MDMA0 Urgent Edge Count Threshhold Register */
+#define                  HMDMA0_ECURGENT  0xffc0450c   /* Handshake MDMA0 Urgent Edge Count Threshold Register */
 #define                HMDMA0_ECOVERFLOW  0xffc04510   /* Handshake MDMA0 Edge Count Overflow Interrupt Register */
 #define                    HMDMA0_ECOUNT  0xffc04514   /* Handshake MDMA0 Current Edge Count Register */
 #define                    HMDMA0_BCOUNT  0xffc04518   /* Handshake MDMA0 Current Block Count Register */
@@ -863,7 +863,7 @@
 #define                   HMDMA1_CONTROL  0xffc04540   /* Handshake MDMA1 Control Register */
 #define                    HMDMA1_ECINIT  0xffc04544   /* Handshake MDMA1 Initial Edge Count Register */
 #define                    HMDMA1_BCINIT  0xffc04548   /* Handshake MDMA1 Initial Block Count Register */
-#define                  HMDMA1_ECURGENT  0xffc0454c   /* Handshake MDMA1 Urgent Edge Count Threshhold Register */
+#define                  HMDMA1_ECURGENT  0xffc0454c   /* Handshake MDMA1 Urgent Edge Count Threshold Register */
 #define                HMDMA1_ECOVERFLOW  0xffc04550   /* Handshake MDMA1 Edge Count Overflow Interrupt Register */
 #define                    HMDMA1_ECOUNT  0xffc04554   /* Handshake MDMA1 Current Edge Count Register */
 #define                    HMDMA1_BCOUNT  0xffc04558   /* Handshake MDMA1 Current Block Count Register */
diff --git a/arch/blackfin/mach-bf548/include/mach/defBF549.h b/arch/blackfin/mach-bf548/include/mach/defBF549.h
index 6fc6e39ab61b..f7f043560c6f 100644
--- a/arch/blackfin/mach-bf548/include/mach/defBF549.h
+++ b/arch/blackfin/mach-bf548/include/mach/defBF549.h
@@ -1024,7 +1024,7 @@
 #define                   HMDMA0_CONTROL  0xffc04500   /* Handshake MDMA0 Control Register */
 #define                    HMDMA0_ECINIT  0xffc04504   /* Handshake MDMA0 Initial Edge Count Register */
 #define                    HMDMA0_BCINIT  0xffc04508   /* Handshake MDMA0 Initial Block Count Register */
-#define                  HMDMA0_ECURGENT  0xffc0450c   /* Handshake MDMA0 Urgent Edge Count Threshhold Register */
+#define                  HMDMA0_ECURGENT  0xffc0450c   /* Handshake MDMA0 Urgent Edge Count Threshold Register */
 #define                HMDMA0_ECOVERFLOW  0xffc04510   /* Handshake MDMA0 Edge Count Overflow Interrupt Register */
 #define                    HMDMA0_ECOUNT  0xffc04514   /* Handshake MDMA0 Current Edge Count Register */
 #define                    HMDMA0_BCOUNT  0xffc04518   /* Handshake MDMA0 Current Block Count Register */
@@ -1034,7 +1034,7 @@
 #define                   HMDMA1_CONTROL  0xffc04540   /* Handshake MDMA1 Control Register */
 #define                    HMDMA1_ECINIT  0xffc04544   /* Handshake MDMA1 Initial Edge Count Register */
 #define                    HMDMA1_BCINIT  0xffc04548   /* Handshake MDMA1 Initial Block Count Register */
-#define                  HMDMA1_ECURGENT  0xffc0454c   /* Handshake MDMA1 Urgent Edge Count Threshhold Register */
+#define                  HMDMA1_ECURGENT  0xffc0454c   /* Handshake MDMA1 Urgent Edge Count Threshold Register */
 #define                HMDMA1_ECOVERFLOW  0xffc04550   /* Handshake MDMA1 Edge Count Overflow Interrupt Register */
 #define                    HMDMA1_ECOUNT  0xffc04554   /* Handshake MDMA1 Current Edge Count Register */
 #define                    HMDMA1_BCOUNT  0xffc04558   /* Handshake MDMA1 Current Block Count Register */
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 4a7cdd9ea1ee..380df1a73a6e 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -209,7 +209,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	/* Are we prepared to handle this kernel fault?
 	 *
 	 * (The kernel has valid exception-points in the source
-	 *  when it acesses user-memory. When it fails in one
+	 *  when it accesses user-memory. When it fails in one
 	 *  of those points, we find it in a table and do a jump
 	 *  to some fixup code that loads an appropriate error
 	 *  code)
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 674a8374c6d9..f332e3fe4237 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1381,7 +1381,7 @@ sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
 #endif
 
 			/*
-			** Not virtually contigous.
+			** Not virtually contiguous.
 			** Terminate prev chunk.
 			** Start a new chunk.
 			**
diff --git a/arch/ia64/ia32/ia32_entry.S b/arch/ia64/ia32/ia32_entry.S
index af9405cd70e5..02d1fb732951 100644
--- a/arch/ia64/ia32/ia32_entry.S
+++ b/arch/ia64/ia32/ia32_entry.S
@@ -79,7 +79,7 @@ GLOBAL_ENTRY(ia32_ret_from_clone)
 (p6)	br.cond.spnt .ia32_strace_check_retval
 	;;					// prevent RAW on r8
 END(ia32_ret_from_clone)
-	// fall thrugh
+	// fall through
 GLOBAL_ENTRY(ia32_ret_from_syscall)
 	PT_REGS_UNWIND_INFO(0)
 
diff --git a/arch/ia64/include/asm/perfmon_default_smpl.h b/arch/ia64/include/asm/perfmon_default_smpl.h
index 48822c0811d8..74724b24c2b7 100644
--- a/arch/ia64/include/asm/perfmon_default_smpl.h
+++ b/arch/ia64/include/asm/perfmon_default_smpl.h
@@ -67,7 +67,7 @@ typedef struct {
         unsigned long   ip;                     /* where did the overflow interrupt happened  */
         unsigned long   tstamp;                 /* ar.itc when entering perfmon intr. handler */
 
-        unsigned short  cpu;                    /* cpu on which the overfow occured */
+        unsigned short  cpu;                    /* cpu on which the overflow occured */
         unsigned short  set;                    /* event set active when overflow ocurred   */
         int    		tgid;              	/* thread group id (for NPTL, this is getpid()) */
 } pfm_default_smpl_entry_t;
diff --git a/arch/ia64/include/asm/sn/shubio.h b/arch/ia64/include/asm/sn/shubio.h
index 22a6f18a5313..6052422a22b3 100644
--- a/arch/ia64/include/asm/sn/shubio.h
+++ b/arch/ia64/include/asm/sn/shubio.h
@@ -3289,7 +3289,7 @@ typedef ii_icrb0_e_u_t icrbe_t;
 #define IIO_IIDSR_LVL_SHIFT     0
 #define IIO_IIDSR_LVL_MASK      0x000000ff
 
-/* Xtalk timeout threshhold register (IIO_IXTT) */
+/* Xtalk timeout threshold register (IIO_IXTT) */
 #define IXTT_RRSP_TO_SHFT	55	/* read response timeout */
 #define IXTT_RRSP_TO_MASK	(0x1FULL << IXTT_RRSP_TO_SHFT)
 #define IXTT_RRSP_PS_SHFT	32	/* read responsed TO prescalar */
diff --git a/arch/ia64/kernel/esi.c b/arch/ia64/kernel/esi.c
index d5764a3d74af..b091111270cb 100644
--- a/arch/ia64/kernel/esi.c
+++ b/arch/ia64/kernel/esi.c
@@ -84,7 +84,7 @@ static int __init esi_init (void)
 		      case ESI_DESC_ENTRY_POINT:
 			break;
 		      default:
-			printk(KERN_WARNING "Unkown table type %d found in "
+			printk(KERN_WARNING "Unknown table type %d found in "
 			       "ESI table, ignoring rest of table\n", *p);
 			return -ENODEV;
 		}
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index f1782705b1f7..b3a1cb3e6b25 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -3523,7 +3523,7 @@ pfm_use_debug_registers(struct task_struct *task)
  * IA64_THREAD_DBG_VALID set. This indicates a task which was
  * able to use the debug registers for debugging purposes via
  * ptrace(). Therefore we know it was not using them for
- * perfmormance monitoring, so we only decrement the number
+ * performance monitoring, so we only decrement the number
  * of "ptraced" debug register users to keep the count up to date
  */
 int
diff --git a/arch/m68k/ifpsp060/src/fpsp.S b/arch/m68k/ifpsp060/src/fpsp.S
index 6c1a9a217887..73613b5f1ee5 100644
--- a/arch/m68k/ifpsp060/src/fpsp.S
+++ b/arch/m68k/ifpsp060/src/fpsp.S
@@ -753,7 +753,7 @@ fovfl_ovfl_on:
 
 	bra.l		_real_ovfl
 
-# overflow occurred but is disabled. meanwhile, inexact is enabled. therefore,
+# overflow occurred but is disabled. meanwhile, inexact is enabled. Therefore,
 # we must jump to real_inex().
 fovfl_inex_on:
 
@@ -1015,7 +1015,7 @@ funfl_unfl_on2:
 
 	bra.l		_real_unfl
 
-# undeflow occurred but is disabled. meanwhile, inexact is enabled. therefore,
+# underflow occurred but is disabled. meanwhile, inexact is enabled. Therefore,
 # we must jump to real_inex().
 funfl_inex_on:
 
@@ -2963,7 +2963,7 @@ iea_disabled:
 
 	tst.w		%d0			# is instr fmovm?
 	bmi.b		iea_dis_fmovm		# yes
-# instruction is using an extended precision immediate operand. therefore,
+# instruction is using an extended precision immediate operand. Therefore,
 # the total instruction length is 16 bytes.
 iea_dis_immed:
 	mov.l		&0x10,%d0		# 16 bytes of instruction
@@ -9624,7 +9624,7 @@ sok_dnrm:
 	bge.b		sok_norm2		# thank goodness no
 
 # the multiply factor that we're trying to create should be a denorm
-# for the multiply to work. therefore, we're going to actually do a
+# for the multiply to work. Therefore, we're going to actually do a
 # multiply with a denorm which will cause an unimplemented data type
 # exception to be put into the machine which will be caught and corrected
 # later. we don't do this with the DENORMs above because this method
@@ -12216,7 +12216,7 @@ fin_sd_unfl_dis:
 
 #
 # operand will underflow AND underflow or inexact is enabled.
-# therefore, we must return the result rounded to extended precision.
+# Therefore, we must return the result rounded to extended precision.
 #
 fin_sd_unfl_ena:
 	mov.l		FP_SCR0_HI(%a6),FP_SCR1_HI(%a6)
@@ -12746,7 +12746,7 @@ fdiv_zero_load_p:
 
 #
 # The destination was In Range and the source was a ZERO. The result,
-# therefore, is an INF w/ the proper sign.
+# Therefore, is an INF w/ the proper sign.
 # So, determine the sign and return a new INF (w/ the j-bit cleared).
 #
 	global		fdiv_inf_load		# global for fsgldiv
@@ -12996,7 +12996,7 @@ fneg_sd_unfl_dis:
 
 #
 # operand will underflow AND underflow is enabled.
-# therefore, we must return the result rounded to extended precision.
+# Therefore, we must return the result rounded to extended precision.
 #
 fneg_sd_unfl_ena:
 	mov.l		FP_SCR0_HI(%a6),FP_SCR1_HI(%a6)
@@ -13611,7 +13611,7 @@ fabs_sd_unfl_dis:
 
 #
 # operand will underflow AND underflow is enabled.
-# therefore, we must return the result rounded to extended precision.
+# Therefore, we must return the result rounded to extended precision.
 #
 fabs_sd_unfl_ena:
 	mov.l		FP_SCR0_HI(%a6),FP_SCR1_HI(%a6)
@@ -14973,7 +14973,7 @@ fadd_zero_2:
 
 #
 # the ZEROes have opposite signs:
-# - therefore, we return +ZERO if the rounding modes are RN,RZ, or RP.
+# - Therefore, we return +ZERO if the rounding modes are RN,RZ, or RP.
 # - -ZERO is returned in the case of RM.
 #
 fadd_zero_2_chk_rm:
@@ -15425,7 +15425,7 @@ fsub_zero_2:
 
 #
 # the ZEROes have the same signs:
-# - therefore, we return +ZERO if the rounding mode is RN,RZ, or RP
+# - Therefore, we return +ZERO if the rounding mode is RN,RZ, or RP
 # - -ZERO is returned in the case of RM.
 #
 fsub_zero_2_chk_rm:
@@ -15693,7 +15693,7 @@ fsqrt_sd_unfl_dis:
 
 #
 # operand will underflow AND underflow is enabled.
-# therefore, we must return the result rounded to extended precision.
+# Therefore, we must return the result rounded to extended precision.
 #
 fsqrt_sd_unfl_ena:
 	mov.l		FP_SCR0_HI(%a6),FP_SCR1_HI(%a6)
@@ -21000,7 +21000,7 @@ fout_pack_type:
 	tst.l		%d0
 	bne.b		fout_pack_set
 # "mantissa" is all zero which means that the answer is zero. but, the '040
-# algorithm allows the exponent to be non-zero. the 881/2 do not. therefore,
+# algorithm allows the exponent to be non-zero. the 881/2 do not. Therefore,
 # if the mantissa is zero, I will zero the exponent, too.
 # the question now is whether the exponents sign bit is allowed to be non-zero
 # for a zero, also...
@@ -21743,7 +21743,7 @@ denorm_set_stky:
 	rts
 
 #									#
-# dnrm_lp(): normalize exponent/mantissa to specified threshhold	#
+# dnrm_lp(): normalize exponent/mantissa to specified threshold		#
 #									#
 # INPUT:								#
 #	%a0	   : points to the operand to be denormalized		#
@@ -22402,7 +22402,7 @@ unnorm_shift:
 	bgt.b		unnorm_nrm_zero		# yes; denorm only until exp = 0
 
 #
-# exponent would not go < 0. therefore, number stays normalized
+# exponent would not go < 0. Therefore, number stays normalized
 #
 	sub.w		%d0, %d1		# shift exponent value
 	mov.w		FTEMP_EX(%a0), %d0	# load old exponent
diff --git a/arch/m68k/ifpsp060/src/pfpsp.S b/arch/m68k/ifpsp060/src/pfpsp.S
index 51b9f7d879dd..e71ba0ab013c 100644
--- a/arch/m68k/ifpsp060/src/pfpsp.S
+++ b/arch/m68k/ifpsp060/src/pfpsp.S
@@ -752,7 +752,7 @@ fovfl_ovfl_on:
 
 	bra.l		_real_ovfl
 
-# overflow occurred but is disabled. meanwhile, inexact is enabled. therefore,
+# overflow occurred but is disabled. meanwhile, inexact is enabled. Therefore,
 # we must jump to real_inex().
 fovfl_inex_on:
 
@@ -1014,7 +1014,7 @@ funfl_unfl_on2:
 
 	bra.l		_real_unfl
 
-# undeflow occurred but is disabled. meanwhile, inexact is enabled. therefore,
+# underflow occurred but is disabled. meanwhile, inexact is enabled. Therefore,
 # we must jump to real_inex().
 funfl_inex_on:
 
@@ -2962,7 +2962,7 @@ iea_disabled:
 
 	tst.w		%d0			# is instr fmovm?
 	bmi.b		iea_dis_fmovm		# yes
-# instruction is using an extended precision immediate operand. therefore,
+# instruction is using an extended precision immediate operand. Therefore,
 # the total instruction length is 16 bytes.
 iea_dis_immed:
 	mov.l		&0x10,%d0		# 16 bytes of instruction
@@ -5865,7 +5865,7 @@ denorm_set_stky:
 	rts
 
 #									#
-# dnrm_lp(): normalize exponent/mantissa to specified threshhold	#
+# dnrm_lp(): normalize exponent/mantissa to specified threshold		#
 #									#
 # INPUT:								#
 #	%a0	   : points to the operand to be denormalized		#
@@ -6524,7 +6524,7 @@ unnorm_shift:
 	bgt.b		unnorm_nrm_zero		# yes; denorm only until exp = 0
 
 #
-# exponent would not go < 0. therefore, number stays normalized
+# exponent would not go < 0. Therefore, number stays normalized
 #
 	sub.w		%d0, %d1		# shift exponent value
 	mov.w		FTEMP_EX(%a0), %d0	# load old exponent
@@ -7901,7 +7901,7 @@ fout_pack_type:
 	tst.l		%d0
 	bne.b		fout_pack_set
 # "mantissa" is all zero which means that the answer is zero. but, the '040
-# algorithm allows the exponent to be non-zero. the 881/2 do not. therefore,
+# algorithm allows the exponent to be non-zero. the 881/2 do not. Therefore,
 # if the mantissa is zero, I will zero the exponent, too.
 # the question now is whether the exponents sign bit is allowed to be non-zero
 # for a zero, also...
@@ -8647,7 +8647,7 @@ fin_sd_unfl_dis:
 
 #
 # operand will underflow AND underflow or inexact is enabled.
-# therefore, we must return the result rounded to extended precision.
+# Therefore, we must return the result rounded to extended precision.
 #
 fin_sd_unfl_ena:
 	mov.l		FP_SCR0_HI(%a6),FP_SCR1_HI(%a6)
@@ -9177,7 +9177,7 @@ fdiv_zero_load_p:
 
 #
 # The destination was In Range and the source was a ZERO. The result,
-# therefore, is an INF w/ the proper sign.
+# Therefore, is an INF w/ the proper sign.
 # So, determine the sign and return a new INF (w/ the j-bit cleared).
 #
 	global		fdiv_inf_load		# global for fsgldiv
@@ -9427,7 +9427,7 @@ fneg_sd_unfl_dis:
 
 #
 # operand will underflow AND underflow is enabled.
-# therefore, we must return the result rounded to extended precision.
+# Therefore, we must return the result rounded to extended precision.
 #
 fneg_sd_unfl_ena:
 	mov.l		FP_SCR0_HI(%a6),FP_SCR1_HI(%a6)
@@ -10042,7 +10042,7 @@ fabs_sd_unfl_dis:
 
 #
 # operand will underflow AND underflow is enabled.
-# therefore, we must return the result rounded to extended precision.
+# Therefore, we must return the result rounded to extended precision.
 #
 fabs_sd_unfl_ena:
 	mov.l		FP_SCR0_HI(%a6),FP_SCR1_HI(%a6)
@@ -11404,7 +11404,7 @@ fadd_zero_2:
 
 #
 # the ZEROes have opposite signs:
-# - therefore, we return +ZERO if the rounding modes are RN,RZ, or RP.
+# - Therefore, we return +ZERO if the rounding modes are RN,RZ, or RP.
 # - -ZERO is returned in the case of RM.
 #
 fadd_zero_2_chk_rm:
@@ -11856,7 +11856,7 @@ fsub_zero_2:
 
 #
 # the ZEROes have the same signs:
-# - therefore, we return +ZERO if the rounding mode is RN,RZ, or RP
+# - Therefore, we return +ZERO if the rounding mode is RN,RZ, or RP
 # - -ZERO is returned in the case of RM.
 #
 fsub_zero_2_chk_rm:
@@ -12124,7 +12124,7 @@ fsqrt_sd_unfl_dis:
 
 #
 # operand will underflow AND underflow is enabled.
-# therefore, we must return the result rounded to extended precision.
+# Therefore, we must return the result rounded to extended precision.
 #
 fsqrt_sd_unfl_ena:
 	mov.l		FP_SCR0_HI(%a6),FP_SCR1_HI(%a6)
diff --git a/arch/m68k/include/asm/bootinfo.h b/arch/m68k/include/asm/bootinfo.h
index fb8a06b9ab6a..67e7a78ad96b 100644
--- a/arch/m68k/include/asm/bootinfo.h
+++ b/arch/m68k/include/asm/bootinfo.h
@@ -145,7 +145,7 @@ struct bi_record {
 
     /*
      *  Macintosh hardware profile data - unused, see macintosh.h for
-     *  resonable type values
+     *  reasonable type values
      */
 
 #define BI_MAC_VIA1BASE		0x8010	/* Mac VIA1 base address (always present) */
diff --git a/arch/microblaze/lib/memcpy.c b/arch/microblaze/lib/memcpy.c
index 6a907c58a4bc..cc2108b6b260 100644
--- a/arch/microblaze/lib/memcpy.c
+++ b/arch/microblaze/lib/memcpy.c
@@ -9,7 +9,7 @@
  * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
  * http://www.embedded.com/showArticle.jhtml?articleID=19205567
  *
- * Attempts were made, unsuccesfully, to contact the original
+ * Attempts were made, unsuccessfully, to contact the original
  * author of this code (Michael Morrow, Intel).  Below is the original
  * copyright notice.
  *
diff --git a/arch/microblaze/lib/memmove.c b/arch/microblaze/lib/memmove.c
index d4e9f49a71f7..0929198c5e68 100644
--- a/arch/microblaze/lib/memmove.c
+++ b/arch/microblaze/lib/memmove.c
@@ -9,7 +9,7 @@
  * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
  * http://www.embedded.com/showArticle.jhtml?articleID=19205567
  *
- * Attempts were made, unsuccesfully, to contact the original
+ * Attempts were made, unsuccessfully, to contact the original
  * author of this code (Michael Morrow, Intel).  Below is the original
  * copyright notice.
  *
diff --git a/arch/microblaze/lib/memset.c b/arch/microblaze/lib/memset.c
index 941dc8f94b03..4df851d41a29 100644
--- a/arch/microblaze/lib/memset.c
+++ b/arch/microblaze/lib/memset.c
@@ -9,7 +9,7 @@
  * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
  * http://www.embedded.com/showArticle.jhtml?articleID=19205567
  *
- * Attempts were made, unsuccesfully, to contact the original
+ * Attempts were made, unsuccessfully, to contact the original
  * author of this code (Michael Morrow, Intel).  Below is the original
  * copyright notice.
  *
diff --git a/arch/mips/include/asm/mach-pnx833x/gpio.h b/arch/mips/include/asm/mach-pnx833x/gpio.h
index 8de0eb9c98a3..ed3a88da70f6 100644
--- a/arch/mips/include/asm/mach-pnx833x/gpio.h
+++ b/arch/mips/include/asm/mach-pnx833x/gpio.h
@@ -24,7 +24,7 @@
 
 /* BIG FAT WARNING: races danger!
    No protections exist here. Current users are only early init code,
-   when locking is not needed because no cuncurency yet exists there,
+   when locking is not needed because no concurrency yet exists there,
    and GPIO IRQ dispatcher, which does locking.
    However, if many uses will ever happen, proper locking will be needed
    - including locking between different uses
diff --git a/arch/mips/include/asm/sgi/ioc.h b/arch/mips/include/asm/sgi/ioc.h
index 343ed15f8dc4..57a971904cfe 100644
--- a/arch/mips/include/asm/sgi/ioc.h
+++ b/arch/mips/include/asm/sgi/ioc.h
@@ -164,7 +164,7 @@ struct sgioc_regs {
 	u32 _unused5;
 	u8 _write[3];
 	volatile u8 write;
-#define SGIOC_WRITE_NTHRESH	0x01	/* use 4.5db threshhold */
+#define SGIOC_WRITE_NTHRESH	0x01	/* use 4.5db threshold */
 #define SGIOC_WRITE_TPSPEED	0x02	/* use 100ohm TP speed */
 #define SGIOC_WRITE_EPSEL	0x04	/* force cable mode: 1=AUI 0=TP */
 #define SGIOC_WRITE_EASEL	0x08	/* 1=autoselect 0=manual cable selection */
diff --git a/arch/mips/include/asm/sibyte/sb1250_mac.h b/arch/mips/include/asm/sibyte/sb1250_mac.h
index b6faf08ca81d..591b9061fd8e 100644
--- a/arch/mips/include/asm/sibyte/sb1250_mac.h
+++ b/arch/mips/include/asm/sibyte/sb1250_mac.h
@@ -212,7 +212,7 @@
 #define G_MAC_TXD_WEIGHT1(x)        _SB_GETVALUE(x, S_MAC_TXD_WEIGHT1, M_MAC_TXD_WEIGHT1)
 
 /*
- * MAC Fifo Threshhold registers (Table 9-14)
+ * MAC Fifo Threshold registers (Table 9-14)
  * Register: MAC_THRSH_CFG_0
  * Register: MAC_THRSH_CFG_1
  * Register: MAC_THRSH_CFG_2
diff --git a/arch/mips/include/asm/sn/sn0/hubio.h b/arch/mips/include/asm/sn/sn0/hubio.h
index d0c29d4de084..31c76c021bb6 100644
--- a/arch/mips/include/asm/sn/sn0/hubio.h
+++ b/arch/mips/include/asm/sn/sn0/hubio.h
@@ -825,7 +825,7 @@ typedef union iprb_u {
 	struct {
 	    u64	rsvd1:	15,
 		error:	1,	/* Widget rcvd wr resp pkt w/ error */
-		ovflow:	5,	/* Over flow count. perf measurement */
+		ovflow:	5,	/* Overflow count. perf measurement */
 		fire_and_forget: 1, /* Launch Write without response */
 		mode:	2,	/* Widget operation Mode	*/
 		rsvd2:	2,
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 24630fd8ef60..a38e3ee95515 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -1331,7 +1331,7 @@ void smtc_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu)
 		if (!((asid += ASID_INC) & ASID_MASK) ) {
 			if (cpu_has_vtag_icache)
 				flush_icache_all();
-			/* Traverse all online CPUs (hack requires contigous range) */
+			/* Traverse all online CPUs (hack requires contiguous range) */
 			for_each_online_cpu(i) {
 				/*
 				 * We don't need to worry about our own CPU, nor those of
diff --git a/arch/mips/math-emu/dp_sub.c b/arch/mips/math-emu/dp_sub.c
index b30c5b1f1a2c..a2127d685a0d 100644
--- a/arch/mips/math-emu/dp_sub.c
+++ b/arch/mips/math-emu/dp_sub.c
@@ -110,7 +110,7 @@ ieee754dp ieee754dp_sub(ieee754dp x, ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
-		/* FAAL THOROUGH */
+		/* FALL THROUGH */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		/* normalize ym,ye */
diff --git a/arch/mips/txx9/generic/smsc_fdc37m81x.c b/arch/mips/txx9/generic/smsc_fdc37m81x.c
index a2b2d62d88e3..8ebc3848f3ac 100644
--- a/arch/mips/txx9/generic/smsc_fdc37m81x.c
+++ b/arch/mips/txx9/generic/smsc_fdc37m81x.c
@@ -117,7 +117,7 @@ unsigned long __init smsc_fdc37m81x_init(unsigned long port)
 	if (chip_id == SMSC_FDC37M81X_CHIP_ID)
 		smsc_fdc37m81x_config_end();
 	else {
-		printk(KERN_WARNING "%s: unknow chip id 0x%02x\n", __func__,
+		printk(KERN_WARNING "%s: unknown chip id 0x%02x\n", __func__,
 		       chip_id);
 		g_smsc_fdc37m81x_base = 0;
 	}
diff --git a/arch/powerpc/include/asm/reg_fsl_emb.h b/arch/powerpc/include/asm/reg_fsl_emb.h
index 1e180a594589..0de404dfee8b 100644
--- a/arch/powerpc/include/asm/reg_fsl_emb.h
+++ b/arch/powerpc/include/asm/reg_fsl_emb.h
@@ -39,7 +39,7 @@
 #define PMRN_PMLCB2	0x112	/* PM Local Control B2 */
 #define PMRN_PMLCB3	0x113	/* PM Local Control B3 */
 
-#define PMLCB_THRESHMUL_MASK	0x0700	/* Threshhold Multiple Field */
+#define PMLCB_THRESHMUL_MASK	0x0700	/* Threshold Multiple Field */
 #define PMLCB_THRESHMUL_SHIFT	8
 
 #define PMLCB_THRESHOLD_MASK	0x003f	/* Threshold Field */
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 641c74bb8e27..b6bd1eaa1c24 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -52,7 +52,7 @@ static struct hard_trap_info
 	{ 0x2030, 0x08 /* SIGFPE */  },		/* spe fp data */
 	{ 0x2040, 0x08 /* SIGFPE */  },		/* spe fp data */
 	{ 0x2050, 0x08 /* SIGFPE */  },		/* spe fp round */
-	{ 0x2060, 0x0e /* SIGILL */  },		/* performace monitor */
+	{ 0x2060, 0x0e /* SIGILL */  },		/* performance monitor */
 	{ 0x2900, 0x08 /* SIGFPE */  },		/* apu unavailable */
 	{ 0x3100, 0x0e /* SIGALRM */ },		/* fixed interval timer */
 	{ 0x3200, 0x02 /* SIGINT */  }, 	/* watchdog */
diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index c3a56d65c5a9..a753b72efbc0 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -59,7 +59,7 @@ void set_thresholds(unsigned long cpu)
 	mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID);
 
 	/* setup THRM2,
-	 * threshold, valid bit, enable interrupts, interrupt when above threshhold
+	 * threshold, valid bit, enable interrupts, interrupt when above threshold
 	 */
 	mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE);
 #else
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index 52c98edcd703..2c9e52267292 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -1594,7 +1594,7 @@ static void cell_handle_interrupt_spu(struct pt_regs *regs,
 		 * to a latch.  The new values (interrupt setting bits, reset
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
-		 * this to work as desired, the permormance monitor needs to
+		 * this to work as desired, the performance monitor needs to
 		 * be disabled while writing to the latches.  This is a
 		 * HW design issue.
 		 */
@@ -1668,7 +1668,7 @@ static void cell_handle_interrupt_ppu(struct pt_regs *regs,
 		 * to a latch.	The new values (interrupt setting bits, reset
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
-		 * this to work as desired, the permormance monitor needs to
+		 * this to work as desired, the performance monitor needs to
 		 * be disabled while writing to the latches.  This is a
 		 * HW design issue.
 		 */
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pci.c b/arch/powerpc/platforms/52xx/mpc52xx_pci.c
index dd43114e9684..da110bd88346 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pci.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pci.c
@@ -100,7 +100,7 @@ const struct of_device_id mpc52xx_pci_ids[] __initdata = {
 };
 
 /* ======================================================================== */
-/* PCI configuration acess                                                  */
+/* PCI configuration access                                                 */
 /* ======================================================================== */
 
 static int
diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c
index e81403b245b5..ab2027cdf893 100644
--- a/arch/powerpc/platforms/powermac/pci.c
+++ b/arch/powerpc/platforms/powermac/pci.c
@@ -302,7 +302,7 @@ static void __init setup_chaos(struct pci_controller *hose,
  *  1 -> Skip the device but act as if the access was successfull
  *       (return 0xff's on reads, eventually, cache config space
  *       accesses in a later version)
- * -1 -> Hide the device (unsuccessful acess)
+ * -1 -> Hide the device (unsuccessful access)
  */
 static int u3_ht_skip_device(struct pci_controller *hose,
 			     struct pci_bus *bus, unsigned int devfn)
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index ae3c4db86fe8..bafc3f85360d 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -160,7 +160,7 @@ static int dart_build(struct iommu_table *tbl, long index,
 
 	dp = ((unsigned int*)tbl->it_base) + index;
 
-	/* On U3, all memory is contigous, so we can move this
+	/* On U3, all memory is contiguous, so we can move this
 	 * out of the loop.
 	 */
 	l = npages;
diff --git a/arch/s390/math-emu/math.c b/arch/s390/math-emu/math.c
index 3ee78ccb617d..cd4e9c168dd7 100644
--- a/arch/s390/math-emu/math.c
+++ b/arch/s390/math-emu/math.c
@@ -2088,7 +2088,7 @@ int math_emu_ldr(__u8 *opcode) {
         __u16 opc = *((__u16 *) opcode);
 
         if ((opc & 0x90) == 0) {           /* test if rx in {0,2,4,6} */
-                /* we got an exception therfore ry can't be in {0,2,4,6} */
+                /* we got an exception therefore ry can't be in {0,2,4,6} */
 		asm volatile(		/* load rx from fp_regs.fprs[ry] */
 			"	bras	1,0f\n"
 			"	ld	0,0(%1)\n"
@@ -2118,7 +2118,7 @@ int math_emu_ler(__u8 *opcode) {
         __u16 opc = *((__u16 *) opcode);
 
         if ((opc & 0x90) == 0) {           /* test if rx in {0,2,4,6} */
-                /* we got an exception therfore ry can't be in {0,2,4,6} */
+                /* we got an exception therefore ry can't be in {0,2,4,6} */
 		asm volatile(		/* load rx from fp_regs.fprs[ry] */
 			"	bras	1,0f\n"
 			"	le	0,0(%1)\n"
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 9d6684849fd9..278441f39856 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -12,9 +12,9 @@
 #include <linux/types.h>
 
 /*
- * FIXME: Acessing the desc_struct through its fields is more elegant,
+ * FIXME: Accessing the desc_struct through its fields is more elegant,
  * and should be the one valid thing to do. However, a lot of open code
- * still touches the a and b acessors, and doing this allow us to do it
+ * still touches the a and b accessors, and doing this allow us to do it
  * incrementally. We keep the signature as a struct, rather than an union,
  * so we can get rid of it transparently in the future -- glommer
  */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ede6998bd92c..91df7c51806c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
 /*
  * generic node memory support, the following assumptions apply:
  *
- * 1) memory comes in 64Mb contigious chunks which are either present or not
+ * 1) memory comes in 64Mb contiguous chunks which are either present or not
  * 2) we will not have more than 64Gb in total
  *
  * for now assume that 64Gb is max amount of RAM for whole system
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 80e2984f521c..b414d2b401f6 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -55,7 +55,7 @@
 #define DESC_STATUS_SOURCE_TIMEOUT	3
 
 /*
- * source side threshholds at which message retries print a warning
+ * source side thresholds at which message retries print a warning
  */
 #define SOURCE_TIMEOUT_LIMIT		20
 #define DESTINATION_TIMEOUT_LIMIT	20
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..1c2c4838d35c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1122,7 +1122,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	if (!acpi_sci_override_gsi)
 		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
 
-	/* Fill in identity legacy mapings where no override */
+	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
 
 	count =
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0285521e0a99..42ac5e000995 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1637,7 +1637,7 @@ retry:
 			goto out;
 
 		/*
-		 * aperture was sucessfully enlarged by 128 MB, try
+		 * aperture was successfully enlarged by 128 MB, try
 		 * allocation again
 		 */
 		goto retry;
@@ -2396,7 +2396,7 @@ int __init amd_iommu_init_passthrough(void)
 	struct pci_dev *dev = NULL;
 	u16 devid, devid2;
 
-	/* allocate passthroug domain */
+	/* allocate passthrough domain */
 	pt_domain = protection_domain_alloc();
 	if (!pt_domain)
 		return -ENOMEM;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..35be5802ac1e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1229,7 +1229,7 @@ x86_perf_event_set_period(struct perf_event *event,
 		return 0;
 
 	/*
-	 * If we are way outside a reasoable range then just skip forward:
+	 * If we are way outside a reasonable range then just skip forward:
 	 */
 	if (unlikely(left <= -period)) {
 		left = period;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..7d377379fa4a 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -514,7 +514,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 
 /*
  * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
+ * remain disabled throughout this function.
  */
 static int __kprobes kprobe_handler(struct pt_regs *regs)
 {
@@ -851,7 +851,7 @@ no_change:
 
 /*
  * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function.
+ * remain disabled throughout this function.
  */
 static int __kprobes post_kprobe_handler(struct pt_regs *regs)
 {
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917f..d16d576beebf 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -203,7 +203,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
  */
 /*
  * Interrupts are disabled on entry as trap3 is an interrupt gate
- * and they remain disabled thorough out this function.
+ * and they remain disabled throughout this function.
  */
 int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 {
@@ -302,7 +302,7 @@ no_kmmio:
 
 /*
  * Interrupts are disabled on entry as trap1 is an interrupt gate
- * and they remain disabled thorough out this function.
+ * and they remain disabled throughout this function.
  * This must always get called as the pair to kmmio_handler().
  */
 static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index ca564202ed7a..58916afbbda5 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -28,7 +28,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
  * Description:
  *     Add this blk_iopoll structure to the pending poll list and trigger the
  *     raise of the blk iopoll softirq. The driver must already have gotten a
- *     succesful return from blk_iopoll_sched_prep() before calling this.
+ *     successful return from blk_iopoll_sched_prep() before calling this.
  **/
 void blk_iopoll_sched(struct blk_iopoll *iop)
 {
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 9ac4e378992e..3aadded05a05 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -599,7 +599,7 @@ static const struct ich_laptop ich_laptop[] = {
 	{ 0x27DF, 0x1028, 0x02b0 },	/* ICH7 on unknown Dell */
 	{ 0x27DF, 0x1043, 0x1267 },	/* ICH7 on Asus W5F */
 	{ 0x27DF, 0x103C, 0x30A1 },	/* ICH7 on HP Compaq nc2400 */
-	{ 0x27DF, 0x103C, 0x361a },	/* ICH7 on unkown HP  */
+	{ 0x27DF, 0x103C, 0x361a },	/* ICH7 on unknown HP  */
 	{ 0x27DF, 0x1071, 0xD221 },	/* ICH7 on Hercules EC-900 */
 	{ 0x27DF, 0x152D, 0x0778 },	/* ICH7 on unknown Intel */
 	{ 0x24CA, 0x1025, 0x0061 },	/* ICH4 on ACER Aspire 2023WLMi */
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index d344db42a002..0d9d2f20788a 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -43,9 +43,9 @@ enum {
 	/*
 	 * SATA-FSL host controller supports a max. of (15+1) direct PRDEs, and
 	 * chained indirect PRDEs upto a max count of 63.
-	 * We are allocating an array of 63 PRDEs contigiously, but PRDE#15 will
+	 * We are allocating an array of 63 PRDEs contiguously, but PRDE#15 will
 	 * be setup as an indirect descriptor, pointing to it's next
-	 * (contigious) PRDE. Though chained indirect PRDE arrays are
+	 * (contiguous) PRDE. Though chained indirect PRDE arrays are
 	 * supported,it will be more efficient to use a direct PRDT and
 	 * a single chain/link to indirect PRDE array/PRDT.
 	 */
@@ -314,7 +314,7 @@ static unsigned int sata_fsl_fill_sg(struct ata_queued_cmd *qc, void *cmd_desc,
 	u32 ttl_dwords = 0;
 
 	/*
-	 * NOTE : direct & indirect prdt's are contigiously allocated
+	 * NOTE : direct & indirect prdt's are contiguously allocated
 	 */
 	struct prde *prd = (struct prde *)&((struct command_desc *)
 					    cmd_desc)->prdt;
diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index b2c1b37ab2e4..f734b345ac71 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -1132,7 +1132,7 @@ static int rx_pkt(struct atm_dev *dev)
                     IF_ERR(printk(" cause: packet time out\n");)
                 }
                 else {
-                    IF_ERR(printk(" cause: buffer over flow\n");)
+                    IF_ERR(printk(" cause: buffer overflow\n");)
                 }
 		goto out_free_desc;
 	}  
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 979d159b5cd1..ee95c76bfd3d 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(wait_for_device_probe);
  * @dev: device to try to bind to the driver
  *
  * This function returns -ENODEV if the device is not registered,
- * 1 if the device is bound sucessfully and 0 otherwise.
+ * 1 if the device is bound successfully and 0 otherwise.
  *
  * This function must be called with @dev->sem held.  When called for a
  * USB interface, @dev->parent->sem must be held as well.
diff --git a/drivers/bluetooth/btmrvl_sdio.c b/drivers/bluetooth/btmrvl_sdio.c
index 5b33b85790f2..63bfc5436799 100644
--- a/drivers/bluetooth/btmrvl_sdio.c
+++ b/drivers/bluetooth/btmrvl_sdio.c
@@ -535,7 +535,7 @@ static int btmrvl_sdio_card_to_host(struct btmrvl_private *priv)
 		break;
 
 	default:
-		BT_ERR("Unknow packet type:%d", type);
+		BT_ERR("Unknown packet type:%d", type);
 		print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, payload,
 						blksz * buf_block_len);
 
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 4895f0e05322..aa0919386b8c 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -214,7 +214,7 @@ static int hci_uart_send_frame(struct sk_buff *skb)
 	struct hci_uart *hu;
 
 	if (!hdev) {
-		BT_ERR("Frame for uknown device (hdev=NULL)");
+		BT_ERR("Frame for unknown device (hdev=NULL)");
 		return -ENODEV;
 	}
 
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index a074fceb67d3..42e65cf8ab52 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -5,7 +5,7 @@
  *
  *  Added devfs support. 
  *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
- *  Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
+ *  Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
  */
 
 #include <linux/mm.h>
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index 1997270bb6f4..ecb89d798e35 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -248,7 +248,7 @@ static const struct vm_operations_struct mspec_vm_ops = {
 /*
  * mspec_mmap
  *
- * Called when mmaping the device.  Initializes the vma with a fault handler
+ * Called when mmapping the device.  Initializes the vma with a fault handler
  * and private data structure necessary to allocate, track, and free the
  * underlying pages.
  */
diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c
index 6934025a1ac1..c1d8b54c816d 100644
--- a/drivers/char/n_r3964.c
+++ b/drivers/char/n_r3964.c
@@ -602,7 +602,7 @@ static void receive_char(struct r3964_info *pInfo, const unsigned char c)
 		}
 		break;
 	case R3964_WAIT_FOR_RX_REPEAT:
-		/* FALLTROUGH */
+		/* FALLTHROUGH */
 	case R3964_IDLE:
 		if (c == STX) {
 			/* Prevent rx_queue from overflow: */
diff --git a/drivers/char/rio/route.h b/drivers/char/rio/route.h
index 20ed73f3fd7b..46e963771c30 100644
--- a/drivers/char/rio/route.h
+++ b/drivers/char/rio/route.h
@@ -67,7 +67,7 @@
 typedef struct COST_ROUTE COST_ROUTE;
 struct COST_ROUTE {
 	unsigned char cost;	/* Cost down this link */
-	unsigned char route[NODE_BYTES];	/* Nodes thorough this route */
+	unsigned char route[NODE_BYTES];	/* Nodes through this route */
 };
 
 typedef struct ROUTE_STR ROUTE_STR;
diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c
index 5f753fc08730..09ad9154d86c 100644
--- a/drivers/crypto/hifn_795x.c
+++ b/drivers/crypto/hifn_795x.c
@@ -863,7 +863,7 @@ static int hifn_init_pubrng(struct hifn_device *dev)
 		dev->dmareg |= HIFN_DMAIER_PUBDONE;
 		hifn_write_1(dev, HIFN_1_DMA_IER, dev->dmareg);
 
-		dprintk("Chip %s: Public key engine has been sucessfully "
+		dprintk("Chip %s: Public key engine has been successfully "
 				"initialised.\n", dev->name);
 	}
 
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 7585c4164bd5..c52ac9efd0bf 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -99,7 +99,7 @@ static struct at_desc *atc_alloc_descriptor(struct dma_chan *chan,
 }
 
 /**
- * atc_desc_get - get a unsused descriptor from free_list
+ * atc_desc_get - get an unused descriptor from free_list
  * @atchan: channel we want a new descriptor for
  */
 static struct at_desc *atc_desc_get(struct at_dma_chan *atchan)
diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index fddf2b358936..d373d17257e9 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -183,7 +183,7 @@ static inline struct fw_node *fw_node(struct list_head *l)
  * This function builds the tree representation of the topology given
  * by the self IDs from the latest bus reset.  During the construction
  * of the tree, the function checks that the self IDs are valid and
- * internally consistent.  On succcess this function returns the
+ * internally consistent.  On success this function returns the
  * fw_node corresponding to the local card otherwise NULL.
  */
 static struct fw_node *build_tree(struct fw_card *card,
diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index 5cae0b3eee9b..3f7c500b2115 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -272,7 +272,7 @@ EXPORT_SYMBOL(drm_mode_object_find);
  * functions & device file and adds it to the master fd list.
  *
  * RETURNS:
- * Zero on success, error code on falure.
+ * Zero on success, error code on failure.
  */
 int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb,
 			 const struct drm_framebuffer_funcs *funcs)
@@ -2328,7 +2328,7 @@ int drm_mode_connector_property_set_ioctl(struct drm_device *dev,
 	} else if (connector->funcs->set_property)
 		ret = connector->funcs->set_property(connector, property, out_resp->value);
 
-	/* store the property value if succesful */
+	/* store the property value if successful */
 	if (!ret)
 		drm_connector_property_set_value(connector, property, out_resp->value);
 out:
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index abfc27b0c2ea..a2a3fa599923 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1309,7 +1309,7 @@ out_free_list:
  * i915_gem_release_mmap - remove physical page mappings
  * @obj: obj in question
  *
- * Preserve the reservation of the mmaping with the DRM core code, but
+ * Preserve the reservation of the mmapping with the DRM core code, but
  * relinquish ownership of the pages back to the system.
  *
  * It is vital that we remove the page mapping if we have mapped a tiled
diff --git a/drivers/gpu/drm/i915/intel_fb.c b/drivers/gpu/drm/i915/intel_fb.c
index 2b0fe54cd92c..40fcf6fdef38 100644
--- a/drivers/gpu/drm/i915/intel_fb.c
+++ b/drivers/gpu/drm/i915/intel_fb.c
@@ -70,7 +70,7 @@ static struct drm_fb_helper_funcs intel_fb_helper_funcs = {
 
 
 /**
- * Curretly it is assumed that the old framebuffer is reused.
+ * Currently it is assumed that the old framebuffer is reused.
  *
  * LOCKING
  * caller should hold the mode config lock.
diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c
index 083bec2e50f9..e7fa3279e2f8 100644
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -2726,7 +2726,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 	/* Wrap with our custom algo which switches to DDC mode */
 	intel_output->ddc_bus->algo = &intel_sdvo_i2c_bit_algo;
 
-	/* In defaut case sdvo lvds is false */
+	/* In default case sdvo lvds is false */
 	intel_sdvo_get_capabilities(intel_output, &sdvo_priv->caps);
 
 	if (intel_sdvo_output_setup(intel_output,
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index 609719490ec2..00c739c44848 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -389,11 +389,11 @@ int r600_mc_init(struct radeon_device *rdev)
 		 * AGP so that GPU can catch out of VRAM/AGP access
 		 */
 		if (rdev->mc.gtt_location > rdev->mc.mc_vram_size) {
-			/* Enought place before */
+			/* Enough place before */
 			rdev->mc.vram_location = rdev->mc.gtt_location -
 							rdev->mc.mc_vram_size;
 		} else if (tmp > rdev->mc.mc_vram_size) {
-			/* Enought place after */
+			/* Enough place after */
 			rdev->mc.vram_location = rdev->mc.gtt_location +
 							rdev->mc.gtt_size;
 		} else {
diff --git a/drivers/gpu/drm/radeon/radeon_fb.c b/drivers/gpu/drm/radeon/radeon_fb.c
index b38c4c8e2c61..d10eb43645c8 100644
--- a/drivers/gpu/drm/radeon/radeon_fb.c
+++ b/drivers/gpu/drm/radeon/radeon_fb.c
@@ -59,7 +59,7 @@ static struct fb_ops radeonfb_ops = {
 };
 
 /**
- * Curretly it is assumed that the old framebuffer is reused.
+ * Currently it is assumed that the old framebuffer is reused.
  *
  * LOCKING
  * caller should hold the mode config lock.
diff --git a/drivers/gpu/drm/radeon/radeon_state.c b/drivers/gpu/drm/radeon/radeon_state.c
index 38537d971a3e..067167cb39ca 100644
--- a/drivers/gpu/drm/radeon/radeon_state.c
+++ b/drivers/gpu/drm/radeon/radeon_state.c
@@ -1950,7 +1950,7 @@ static void radeon_apply_surface_regs(int surf_index,
  * Note that refcount can be at most 2, since during a free refcount=3
  * might mean we have to allocate a new surface which might not always
  * be available.
- * For example : we allocate three contigous surfaces ABC. If B is
+ * For example : we allocate three contiguous surfaces ABC. If B is
  * freed, we suddenly need two surfaces to store A and C, which might
  * not always be available.
  */
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 765bd184b6fc..5a664000bf70 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -372,7 +372,7 @@ static int radeon_bo_move(struct ttm_buffer_object *bo,
 	     new_mem->mem_type == TTM_PL_SYSTEM) ||
 	    (old_mem->mem_type == TTM_PL_SYSTEM &&
 	     new_mem->mem_type == TTM_PL_TT)) {
-		/* bind is enought */
+		/* bind is enough */
 		radeon_move_null(bo, new_mem);
 		return 0;
 	}
diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c
index 595ac638039d..9e9826ace305 100644
--- a/drivers/gpu/drm/radeon/rv770.c
+++ b/drivers/gpu/drm/radeon/rv770.c
@@ -807,11 +807,11 @@ int rv770_mc_init(struct radeon_device *rdev)
 		 * AGP so that GPU can catch out of VRAM/AGP access
 		 */
 		if (rdev->mc.gtt_location > rdev->mc.mc_vram_size) {
-			/* Enought place before */
+			/* Enough place before */
 			rdev->mc.vram_location = rdev->mc.gtt_location -
 							rdev->mc.mc_vram_size;
 		} else if (tmp > rdev->mc.mc_vram_size) {
-			/* Enought place after */
+			/* Enough place after */
 			rdev->mc.vram_location = rdev->mc.gtt_location +
 							rdev->mc.gtt_size;
 		} else {
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index c70927ecda21..61c5572d2b91 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -427,7 +427,7 @@ static int ttm_bo_kmap_ttm(struct ttm_buffer_object *bo,
 
 		/*
 		 * We need to use vmap to get the desired page protection
-		 * or to make the buffer object look contigous.
+		 * or to make the buffer object look contiguous.
 		 */
 		prot = (mem->placement & TTM_PL_FLAG_CACHED) ?
 			PAGE_KERNEL :
diff --git a/drivers/hwmon/adm1029.c b/drivers/hwmon/adm1029.c
index 36718150b475..e845b75ccee4 100644
--- a/drivers/hwmon/adm1029.c
+++ b/drivers/hwmon/adm1029.c
@@ -432,7 +432,7 @@ static int adm1029_remove(struct i2c_client *client)
 }
 
 /*
-function that update the status of the chips (temperature for exemple)
+function that update the status of the chips (temperature for example)
 */
 static struct adm1029_data *adm1029_update_device(struct device *dev)
 {
diff --git a/drivers/hwmon/lm93.c b/drivers/hwmon/lm93.c
index fc36cadf36fb..c48a284b8314 100644
--- a/drivers/hwmon/lm93.c
+++ b/drivers/hwmon/lm93.c
@@ -928,7 +928,7 @@ static void lm93_update_client_common(struct lm93_data *data,
 	data->prochot_interval = lm93_read_byte(client,
 			LM93_REG_PROCHOT_INTERVAL);
 
-	/* Fan Boost Termperature registers */
+	/* Fan Boost Temperature registers */
 	for (i = 0; i < 4; i++)
 		data->boost[i] = lm93_read_byte(client, LM93_REG_BOOST(i));
 
diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c
index 2cd00b5b45b4..9fd4a0d3206e 100644
--- a/drivers/ieee1394/dv1394.c
+++ b/drivers/ieee1394/dv1394.c
@@ -125,7 +125,7 @@
    0 - no debugging messages
    1 - some debugging messages, but none during DMA frame transmission
    2 - lots of messages, including during DMA frame transmission
-       (will cause undeflows if your machine is too slow!)
+       (will cause underflows if your machine is too slow!)
 */
 
 #define DV1394_DEBUG_LEVEL 0
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c
index 4bd39c8af80f..37d12e5efa49 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba6110.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c
@@ -381,7 +381,7 @@ static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
 #define IPATH_GPIO_SCL \
 	(1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
 
-/* keep the code below somewhat more readonable; not used elsewhere */
+/* keep the code below somewhat more readable; not used elsewhere */
 #define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |	\
 				infinipath_hwe_htclnkabyte1crcerr)
 #define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr |	\
diff --git a/drivers/infiniband/hw/ipath/ipath_sd7220.c b/drivers/infiniband/hw/ipath/ipath_sd7220.c
index aa47eb549520..2a68d9f624dd 100644
--- a/drivers/infiniband/hw/ipath/ipath_sd7220.c
+++ b/drivers/infiniband/hw/ipath/ipath_sd7220.c
@@ -614,7 +614,7 @@ static int epb_trans(struct ipath_devdata *dd, u16 reg, u64 i_val, u64 *o_vp)
  * @wd: Write Data - value to set in register
  * @mask: ones where data should be spliced into reg.
  *
- * Basic register read/modify/write, with un-needed acesses elided. That is,
+ * Basic register read/modify/write, with un-needed accesses elided. That is,
  * a mask of zero will prevent write, while a mask of 0xFF will prevent read.
  * returns current (presumed, if a write was done) contents of selected
  * register, or <0 if errors.
@@ -989,7 +989,7 @@ static struct rxeq_init {
 	/* Set DFELTHFDR/HDR thresholds */
 	RXEQ_VAL(7, 8,    0, 0, 0, 0), /* FDR */
 	RXEQ_VAL(7, 0x21, 0, 0, 0, 0), /* HDR */
-	/* Set TLTHFDR/HDR theshold */
+	/* Set TLTHFDR/HDR threshold */
 	RXEQ_VAL(7, 9,    2, 2, 2, 2), /* FDR */
 	RXEQ_VAL(7, 0x23, 2, 2, 2, 2), /* HDR */
 	/* Set Preamp setting 2 (ZFR/ZCNT) */
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 219b10397b4d..256a00c6aeea 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -352,7 +352,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 	 * anymore, so we do this only if selective signaling is off.
 	 *
 	 * Further, on 32-bit platforms, we can't use vmap() to make
-	 * the QP buffer virtually contigious.  Thus we have to use
+	 * the QP buffer virtually contiguous.  Thus we have to use
 	 * constant-sized WRs to make sure a WR is always fully within
 	 * a single page-sized chunk.
 	 *
diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c
index 1c9410d1822c..bcc2d30ec245 100644
--- a/drivers/input/serio/hp_sdc.c
+++ b/drivers/input/serio/hp_sdc.c
@@ -955,7 +955,7 @@ static int __init hp_sdc_init_hppa(struct parisc_device *d)
 	INIT_DELAYED_WORK(&moduleloader_work, request_module_delayed);
 
 	ret = hp_sdc_init();
-	/* after sucessfull initialization give SDC some time to settle
+	/* after successfull initialization give SDC some time to settle
 	 * and then load the hp_sdc_mlc upper layer driver */
 	if (!ret)
 		schedule_delayed_work(&moduleloader_work,
diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c
index 820e51673b26..7d2b820ef58d 100644
--- a/drivers/input/serio/hp_sdc_mlc.c
+++ b/drivers/input/serio/hp_sdc_mlc.c
@@ -125,7 +125,7 @@ static void hp_sdc_mlc_isr (int irq, void *dev_id,
 		break;
 
 	default:
-		printk(KERN_WARNING PREFIX "Unkown HIL Error status (%x)!\n", data);
+		printk(KERN_WARNING PREFIX "Unknown HIL Error status (%x)!\n", data);
 		break;
 	}
 
diff --git a/drivers/input/touchscreen/atmel-wm97xx.c b/drivers/input/touchscreen/atmel-wm97xx.c
index 35377f583e28..a12242f77e23 100644
--- a/drivers/input/touchscreen/atmel-wm97xx.c
+++ b/drivers/input/touchscreen/atmel-wm97xx.c
@@ -59,7 +59,7 @@
 #define ATMEL_WM97XX_AC97C_IRQ		(29)
 #define ATMEL_WM97XX_GPIO_DEFAULT	(32+16) /* Pin 16 on port B. */
 #else
-#error Unkown CPU, this driver only supports AT32AP700X CPUs.
+#error Unknown CPU, this driver only supports AT32AP700X CPUs.
 #endif
 
 struct continuous {
diff --git a/drivers/input/touchscreen/mainstone-wm97xx.c b/drivers/input/touchscreen/mainstone-wm97xx.c
index 8fc3b08deb3b..6cdcf2a6e036 100644
--- a/drivers/input/touchscreen/mainstone-wm97xx.c
+++ b/drivers/input/touchscreen/mainstone-wm97xx.c
@@ -14,7 +14,7 @@
  *
  * Notes:
  *     This is a wm97xx extended touch driver to capture touch
- *     data in a continuous manner on the Intel XScale archictecture
+ *     data in a continuous manner on the Intel XScale architecture
  *
  *  Features:
  *       - codecs supported:- WM9705, WM9712, WM9713
@@ -131,7 +131,7 @@ static int wm97xx_acc_pen_down(struct wm97xx *wm)
 	/* When the AC97 queue has been drained we need to allow time
 	 * to buffer up samples otherwise we end up spinning polling
 	 * for samples.  The controller can't have a suitably low
-	 * threashold set to use the notifications it gives.
+	 * threshold set to use the notifications it gives.
 	 */
 	schedule_timeout_uninterruptible(1);
 
diff --git a/drivers/input/touchscreen/zylonite-wm97xx.c b/drivers/input/touchscreen/zylonite-wm97xx.c
index 41e4359c277c..eca54dbdf493 100644
--- a/drivers/input/touchscreen/zylonite-wm97xx.c
+++ b/drivers/input/touchscreen/zylonite-wm97xx.c
@@ -96,7 +96,7 @@ static int wm97xx_acc_pen_down(struct wm97xx *wm)
 	/* When the AC97 queue has been drained we need to allow time
 	 * to buffer up samples otherwise we end up spinning polling
 	 * for samples.  The controller can't have a suitably low
-	 * threashold set to use the notifications it gives.
+	 * threshold set to use the notifications it gives.
 	 */
 	msleep(1);
 
diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c
index 3e6d17f42a98..66b7d7a86474 100644
--- a/drivers/isdn/capi/capidrv.c
+++ b/drivers/isdn/capi/capidrv.c
@@ -830,7 +830,7 @@ static void handle_controller(_cmsg * cmsg)
 		      case 0: break;
 		      case 1: s = "unknown class"; break;
 		      case 2: s = "unknown function"; break;
-		      default: s = "unkown error"; break;
+		      default: s = "unknown error"; break;
 		   }
 		   if (s)
 	           printk(KERN_INFO "capidrv-%d: %s from controller 0x%x function %d: %s\n",
diff --git a/drivers/isdn/hardware/eicon/di.c b/drivers/isdn/hardware/eicon/di.c
index b029d130eb21..cb14ae3e7154 100644
--- a/drivers/isdn/hardware/eicon/di.c
+++ b/drivers/isdn/hardware/eicon/di.c
@@ -806,7 +806,7 @@ static void xdi_xlog_request (byte Adapter, byte Id,
           DELIVERY - indication entered isdn_rc function
           RNR=...  - application had returned RNR=... after the
                      look ahead callback
-          RNum=0   - aplication had not returned any buffer to copy
+          RNum=0   - application had not returned any buffer to copy
                      this indication and will copy it self
           COMPLETE - XDI had copied the data to the buffers provided
                      bu the application and is about to issue the
diff --git a/drivers/isdn/hardware/eicon/maintidi.c b/drivers/isdn/hardware/eicon/maintidi.c
index 23960cb6eaab..e7cfb3b5647f 100644
--- a/drivers/isdn/hardware/eicon/maintidi.c
+++ b/drivers/isdn/hardware/eicon/maintidi.c
@@ -385,7 +385,7 @@ static int SuperTraceMessageInput (void* hLib) {
                   }
                   break;
                 default:
-                  diva_mnt_internal_dprintf (0, DLI_ERR, "Unknon IDI Ind (DMA mode): %02x", Ind);
+                  diva_mnt_internal_dprintf (0, DLI_ERR, "Unknown IDI Ind (DMA mode): %02x", Ind);
               }
               p += (this_ind_length+1);
               total_length -= (4 + this_ind_length);
@@ -420,7 +420,7 @@ static int SuperTraceMessageInput (void* hLib) {
             }
             break;
           default:
-            diva_mnt_internal_dprintf (0, DLI_ERR, "Unknon IDI Ind: %02x", Ind);
+            diva_mnt_internal_dprintf (0, DLI_ERR, "Unknown IDI Ind: %02x", Ind);
         }
       }
     }
diff --git a/drivers/isdn/hardware/mISDN/hfcsusb.c b/drivers/isdn/hardware/mISDN/hfcsusb.c
index fc46a26cb14f..a64bb6c67ba7 100644
--- a/drivers/isdn/hardware/mISDN/hfcsusb.c
+++ b/drivers/isdn/hardware/mISDN/hfcsusb.c
@@ -721,7 +721,7 @@ hfcsusb_setup_bch(struct bchannel *bch, int protocol)
 	switch (protocol) {
 	case (-1):	/* used for init */
 		bch->state = -1;
-		/* fall trough */
+		/* fall through */
 	case (ISDN_P_NONE):
 		if (bch->state == ISDN_P_NONE)
 			return 0; /* already in idle state */
diff --git a/drivers/isdn/hardware/mISDN/hfcsusb.h b/drivers/isdn/hardware/mISDN/hfcsusb.h
index 43efe7358fa3..369196adae03 100644
--- a/drivers/isdn/hardware/mISDN/hfcsusb.h
+++ b/drivers/isdn/hardware/mISDN/hfcsusb.h
@@ -150,7 +150,7 @@ symbolic(struct hfcusb_symbolic_list list[], const int num)
 	for (i = 0; list[i].name != NULL; i++)
 		if (list[i].num == num)
 			return list[i].name;
-	return "<unkown USB Error>";
+	return "<unknown USB Error>";
 }
 
 /* USB descriptor need to contain one of the following EndPoint combination: */
diff --git a/drivers/isdn/hardware/mISDN/mISDNisar.c b/drivers/isdn/hardware/mISDN/mISDNisar.c
index de352a17673a..09095c747110 100644
--- a/drivers/isdn/hardware/mISDN/mISDNisar.c
+++ b/drivers/isdn/hardware/mISDN/mISDNisar.c
@@ -860,7 +860,7 @@ isar_pump_statev_modem(struct isar_ch *ch, u8 devt) {
 		pr_debug("%s: pump stev GSTN CLEAR\n", ch->is->name);
 		break;
 	default:
-		pr_info("u%s: nknown pump stev %x\n", ch->is->name, devt);
+		pr_info("u%s: unknown pump stev %x\n", ch->is->name, devt);
 		break;
 	}
 }
diff --git a/drivers/isdn/hisax/hfc_usb.c b/drivers/isdn/hisax/hfc_usb.c
index 9de54202c90c..ad5831f37d84 100644
--- a/drivers/isdn/hisax/hfc_usb.c
+++ b/drivers/isdn/hisax/hfc_usb.c
@@ -1086,7 +1086,7 @@ hfc_usb_l2l1(struct hisax_if *my_hisax_if, int pr, void *arg)
 			break;
 		default:
 			DBG(HFCUSB_DBG_STATES,
-			       "HFC_USB: hfc_usb_d_l2l1: unkown state : %#x", pr);
+			       "HFC_USB: hfc_usb_d_l2l1: unknown state : %#x", pr);
 			break;
 	}
 }
diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c
index 2d14b64202a3..0f4ea7d16a15 100644
--- a/drivers/isdn/i4l/isdn_ppp.c
+++ b/drivers/isdn/i4l/isdn_ppp.c
@@ -836,7 +836,7 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count)
 			unsigned short hl;
 			struct sk_buff *skb;
 			/*
-			 * we need to reserve enought space in front of
+			 * we need to reserve enough space in front of
 			 * sk_buff. old call to dev_alloc_skb only reserved
 			 * 16 bytes, now we are looking what the driver want
 			 */
@@ -1326,7 +1326,7 @@ isdn_ppp_xmit(struct sk_buff *skb, struct net_device *netdev)
 		struct sk_buff *new_skb;
 	        unsigned short hl;
 		/*
-		 * we need to reserve enought space in front of
+		 * we need to reserve enough space in front of
 		 * sk_buff. old call to dev_alloc_skb only reserved
 		 * 16 bytes, now we are looking what the driver want.
 		 */
@@ -1685,7 +1685,7 @@ static void isdn_ppp_mp_receive(isdn_net_dev * net_dev, isdn_net_local * lp,
 	 *
 	 * try to accomplish several tasks:
 	 * - reassemble any complete fragment sequence (non-null 'start'
-	 *   indicates there is a continguous sequence present)
+	 *   indicates there is a contiguous sequence present)
 	 * - discard any incomplete sequences that are below minseq -- due
 	 *   to the fact that sender always increment sequence number, if there
 	 *   is an incomplete sequence below minseq, no new fragments would
diff --git a/drivers/isdn/i4l/isdn_ttyfax.c b/drivers/isdn/i4l/isdn_ttyfax.c
index 78f7660c1d0e..4c41f191d4e2 100644
--- a/drivers/isdn/i4l/isdn_ttyfax.c
+++ b/drivers/isdn/i4l/isdn_ttyfax.c
@@ -470,7 +470,7 @@ isdn_tty_cmd_FCLASS2(char **p, modem_info * info)
 		}
 		return 0;
 	}
-	/* BADMUL=value - dummy 0=disable errorchk disabled (treshold multiplier) */
+	/* BADMUL=value - dummy 0=disable errorchk disabled (threshold multiplier) */
 	if (!strncmp(p[0], "BADMUL", 6)) {
 		p[0] += 6;
 		switch (*p[0]) {
diff --git a/drivers/isdn/mISDN/dsp_core.c b/drivers/isdn/mISDN/dsp_core.c
index 77ee2867c8b4..43ff4d3b046e 100644
--- a/drivers/isdn/mISDN/dsp_core.c
+++ b/drivers/isdn/mISDN/dsp_core.c
@@ -110,7 +110,7 @@
  * crossconnections and conferences via software if not possible through
  * hardware. If hardware capability is available, hardware is used.
  *
- * Echo: Is generated by CMX and is used to check performane of hard and
+ * Echo: Is generated by CMX and is used to check performance of hard and
  * software CMX.
  *
  * The CMX has special functions for conferences with one, two and more
diff --git a/drivers/isdn/mISDN/tei.c b/drivers/isdn/mISDN/tei.c
index e04bad6c5baf..6d4da6095885 100644
--- a/drivers/isdn/mISDN/tei.c
+++ b/drivers/isdn/mISDN/tei.c
@@ -725,7 +725,7 @@ tei_id_ver_tout_net(struct FsmInst *fi, int event, void *arg)
 	if (tm->rcnt == 1) {
 		if (*debug & DEBUG_L2_TEI)
 			tm->tei_m.printdebug(fi,
-			    "check req for tei %d sucessful\n", tm->l2->tei);
+			    "check req for tei %d successful\n", tm->l2->tei);
 		mISDN_FsmChangeState(fi, ST_TEI_NOP);
 	} else if (tm->rcnt > 1) {
 		/* duplicate assignment; remove */
diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c
index 8b9364434aa0..3fbe41b0ac07 100644
--- a/drivers/macintosh/therm_windtunnel.c
+++ b/drivers/macintosh/therm_windtunnel.c
@@ -15,7 +15,7 @@
  *
  *	WARNING: This driver has only been testen on Apple's
  *	1.25 MHz Dual G4 (March 03). It is tuned for a CPU
- *	temperatur around 57 C.
+ *	temperature around 57 C.
  *
  *   Copyright (C) 2003, 2004 Samuel Rydh (samuel@ibrium.se)
  *
diff --git a/drivers/media/common/saa7146_i2c.c b/drivers/media/common/saa7146_i2c.c
index 7e8f56815998..48cb154c7a46 100644
--- a/drivers/media/common/saa7146_i2c.c
+++ b/drivers/media/common/saa7146_i2c.c
@@ -98,7 +98,7 @@ static int saa7146_i2c_msg_cleanup(const struct i2c_msg *m, int num, __le32 *op)
 
 		op_count++;
 
-		/* loop throgh all bytes of message i */
+		/* loop through all bytes of message i */
 		for(j = 0; j < m[i].len; j++) {
 			/* write back all bytes that could have been read */
 			m[i].buf[j] = (le32_to_cpu(op[op_count/3]) >> ((3-(op_count%3))*8));
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.h b/drivers/media/dvb/dvb-core/dvb_frontend.h
index 810f07d63246..52e4ce4304ee 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.h
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.h
@@ -160,7 +160,7 @@ struct tuner_state {
  * search callback possible return status
  *
  * DVBFE_ALGO_SEARCH_SUCCESS
- * The frontend search algorithm completed and returned succesfully
+ * The frontend search algorithm completed and returned successfully
  *
  * DVBFE_ALGO_SEARCH_ASLEEP
  * The frontend search algorithm is sleeping
diff --git a/drivers/media/dvb/dvb-usb/anysee.c b/drivers/media/dvb/dvb-usb/anysee.c
index 2ae7f648effe..bb69f3719f9a 100644
--- a/drivers/media/dvb/dvb-usb/anysee.c
+++ b/drivers/media/dvb/dvb-usb/anysee.c
@@ -344,7 +344,7 @@ static int anysee_frontend_attach(struct dvb_usb_adapter *adap)
 	if (ret)
 		return ret;
 
-	err("Unkown Anysee version: %02x %02x %02x. "\
+	err("Unknown Anysee version: %02x %02x %02x. "\
 	    "Please report the <linux-dvb@linuxtv.org>.",
 	    hw_info[0], hw_info[1], hw_info[2]);
 
diff --git a/drivers/media/dvb/dvb-usb/dibusb-mb.c b/drivers/media/dvb/dvb-usb/dibusb-mb.c
index eeef50bff4f9..5c0126dc1ff9 100644
--- a/drivers/media/dvb/dvb-usb/dibusb-mb.c
+++ b/drivers/media/dvb/dvb-usb/dibusb-mb.c
@@ -242,7 +242,7 @@ static struct dvb_usb_device_properties dibusb1_1_properties = {
 			{ &dibusb_dib3000mb_table[9],  &dibusb_dib3000mb_table[11], NULL },
 			{ &dibusb_dib3000mb_table[10], &dibusb_dib3000mb_table[12], NULL },
 		},
-		{	"Unkown USB1.1 DVB-T device ???? please report the name to the author",
+		{	"Unknown USB1.1 DVB-T device ???? please report the name to the author",
 			{ &dibusb_dib3000mb_table[13], NULL },
 			{ &dibusb_dib3000mb_table[14], NULL },
 		},
diff --git a/drivers/media/dvb/dvb-usb/dvb-usb-remote.c b/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
index edde87c6aa3a..6b5ded9e7d5d 100644
--- a/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
+++ b/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
@@ -259,7 +259,7 @@ int dvb_usb_nec_rc_key_to_event(struct dvb_usb_device *d,
 			*state = REMOTE_KEY_REPEAT;
 			break;
 		default:
-			deb_err("unkown type of remote status: %d\n",keybuf[0]);
+			deb_err("unknown type of remote status: %d\n",keybuf[0]);
 			break;
 	}
 	return 0;
diff --git a/drivers/media/dvb/dvb-usb/usb-urb.c b/drivers/media/dvb/dvb-usb/usb-urb.c
index 9da2cc95ca13..f9702e3756b6 100644
--- a/drivers/media/dvb/dvb-usb/usb-urb.c
+++ b/drivers/media/dvb/dvb-usb/usb-urb.c
@@ -56,7 +56,7 @@ static void usb_urb_complete(struct urb *urb)
 				stream->complete(stream, b, urb->actual_length);
 			break;
 		default:
-			err("unkown endpoint type in completition handler.");
+			err("unknown endpoint type in completition handler.");
 			return;
 	}
 	usb_submit_urb(urb,GFP_ATOMIC);
@@ -228,7 +228,7 @@ int usb_urb_init(struct usb_data_stream *stream, struct usb_data_stream_properti
 		case USB_ISOC:
 			return usb_isoc_urb_init(stream);
 		default:
-			err("unkown URB-type for data transfer.");
+			err("unknown URB-type for data transfer.");
 			return -EINVAL;
 	}
 }
diff --git a/drivers/media/dvb/frontends/au8522_decoder.c b/drivers/media/dvb/frontends/au8522_decoder.c
index 74981ee923c8..7c6431fe33e0 100644
--- a/drivers/media/dvb/frontends/au8522_decoder.c
+++ b/drivers/media/dvb/frontends/au8522_decoder.c
@@ -315,7 +315,7 @@ static void setup_decoder_defaults(struct au8522_state *state, u8 input_mode)
 	if (input_mode == AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH13 ||
 	    input_mode == AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH24) {
 		/* Despite what the table says, for the HVR-950q we still need
-		   to be in CVBS mode for the S-Video input (reason uknown). */
+		   to be in CVBS mode for the S-Video input (reason unknown). */
 		/* filter_coef_type = 3; */
 		filter_coef_type = 5;
 	} else {
diff --git a/drivers/media/dvb/frontends/cx24110.c b/drivers/media/dvb/frontends/cx24110.c
index ffbcfabd83f0..00a4e8f03304 100644
--- a/drivers/media/dvb/frontends/cx24110.c
+++ b/drivers/media/dvb/frontends/cx24110.c
@@ -1,4 +1,4 @@
-	/*
+/*
     cx24110 - Single Chip Satellite Channel Receiver driver module
 
     Copyright (C) 2002 Peter Hettkamp <peter.hettkamp@htp-tel.de> based on
@@ -96,7 +96,7 @@ static struct {u8 reg; u8 data;} cx24110_regdata[]=
 	 {0x42,0x00}, /* @ middle bytes " */
 	 {0x43,0x00}, /* @ LSB          " */
 		      /* leave the carrier tracking loop parameters on default */
-		      /* leave the bit timing loop parameters at gefault */
+		      /* leave the bit timing loop parameters at default */
 	 {0x56,0x4d}, /* set the filtune voltage to 2.7V, as recommended by */
 		      /* the cx24108 data sheet for symbol rates above 15MS/s */
 	 {0x57,0x00}, /* @ Filter sigma delta enabled, positive */
diff --git a/drivers/media/dvb/frontends/cx24113.c b/drivers/media/dvb/frontends/cx24113.c
index 075b2b57cf09..e9ee55592fd3 100644
--- a/drivers/media/dvb/frontends/cx24113.c
+++ b/drivers/media/dvb/frontends/cx24113.c
@@ -589,7 +589,7 @@ struct dvb_frontend *cx24113_attach(struct dvb_frontend *fe,
 		info("detected CX24113 variant\n");
 		break;
 	case REV_CX24113:
-		info("sucessfully detected\n");
+		info("successfully detected\n");
 		break;
 	default:
 		err("unsupported device id: %x\n", state->rev);
diff --git a/drivers/media/dvb/frontends/dib3000mb.c b/drivers/media/dvb/frontends/dib3000mb.c
index 136b9d2164d7..ad4c8cfd8090 100644
--- a/drivers/media/dvb/frontends/dib3000mb.c
+++ b/drivers/media/dvb/frontends/dib3000mb.c
@@ -154,7 +154,7 @@ static int dib3000mb_set_frontend(struct dvb_frontend* fe,
 			case BANDWIDTH_AUTO:
 				return -EOPNOTSUPP;
 			default:
-				err("unkown bandwidth value.");
+				err("unknown bandwidth value.");
 				return -EINVAL;
 		}
 	}
diff --git a/drivers/media/dvb/frontends/lgdt330x.c b/drivers/media/dvb/frontends/lgdt330x.c
index 056387b41a8f..43971e63baa7 100644
--- a/drivers/media/dvb/frontends/lgdt330x.c
+++ b/drivers/media/dvb/frontends/lgdt330x.c
@@ -479,7 +479,7 @@ static int lgdt3302_read_status(struct dvb_frontend* fe, fe_status_t* status)
 	switch (state->current_modulation) {
 	case QAM_256:
 	case QAM_64:
-		/* Need to undestand why there are 3 lock levels here */
+		/* Need to understand why there are 3 lock levels here */
 		if ((buf[0] & 0x07) == 0x07)
 			*status |= FE_HAS_CARRIER;
 		break;
@@ -520,7 +520,7 @@ static int lgdt3303_read_status(struct dvb_frontend* fe, fe_status_t* status)
 	switch (state->current_modulation) {
 	case QAM_256:
 	case QAM_64:
-		/* Need to undestand why there are 3 lock levels here */
+		/* Need to understand why there are 3 lock levels here */
 		if ((buf[0] & 0x07) == 0x07)
 			*status |= FE_HAS_CARRIER;
 		else
diff --git a/drivers/media/dvb/frontends/stb0899_drv.c b/drivers/media/dvb/frontends/stb0899_drv.c
index a04c782fff8d..1570669837ea 100644
--- a/drivers/media/dvb/frontends/stb0899_drv.c
+++ b/drivers/media/dvb/frontends/stb0899_drv.c
@@ -1571,7 +1571,7 @@ static enum dvbfe_search stb0899_search(struct dvb_frontend *fe, struct dvb_fron
  * stb0899_track
  * periodically check the signal level against a specified
  * threshold level and perform derotator centering.
- * called once we have a lock from a succesful search
+ * called once we have a lock from a successful search
  * event.
  *
  * Will be called periodically called to maintain the
diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c
index 8d65c652ba50..baf3159a3aa6 100644
--- a/drivers/media/dvb/ttpci/av7110.c
+++ b/drivers/media/dvb/ttpci/av7110.c
@@ -2425,7 +2425,7 @@ static int __devinit av7110_attach(struct saa7146_dev* dev,
 		 * use 0x15 to track VPE  interrupts - increase by 1 every vpeirq() is called
 		 */
 		saa7146_write(dev, EC1SSR, (0x03<<2) | 3 );
-		/* set event counter 1 treshold to maximum allowed value        (rEC p55) */
+		/* set event counter 1 threshold to maximum allowed value        (rEC p55) */
 		saa7146_write(dev, ECT1R,  0x3fff );
 #endif
 		/* Set RPS1 Address register to point to RPS code               (r108 p42) */
@@ -2559,7 +2559,7 @@ static int __devinit av7110_attach(struct saa7146_dev* dev,
 		 * use 0x15 to track VPE  interrupts - increase by 1 every vpeirq() is called
 		 */
 		saa7146_write(dev, EC1SSR, (0x03<<2) | 3 );
-		/* set event counter 1 treshold to maximum allowed value        (rEC p55) */
+		/* set event counter 1 threshold to maximum allowed value        (rEC p55) */
 		saa7146_write(dev, ECT1R,  0x3fff );
 #endif
 		/* Setup BUDGETPATCH MAIN RPS1 "program" (p35) */
diff --git a/drivers/media/dvb/ttpci/budget-patch.c b/drivers/media/dvb/ttpci/budget-patch.c
index 60136688a9a4..9c92f9ddd223 100644
--- a/drivers/media/dvb/ttpci/budget-patch.c
+++ b/drivers/media/dvb/ttpci/budget-patch.c
@@ -456,7 +456,7 @@ static int budget_patch_attach (struct saa7146_dev* dev, struct saa7146_pci_exte
 	// use 0x03 to track RPS1 interrupts - increase by 1 every gpio3 is toggled
 	// use 0x15 to track VPE  interrupts - increase by 1 every vpeirq() is called
 	saa7146_write(dev, EC1SSR, (0x03<<2) | 3 );
-	// set event counter 1 treshold to maximum allowed value        (rEC p55)
+	// set event counter 1 threshold to maximum allowed value        (rEC p55)
 	saa7146_write(dev, ECT1R,  0x3fff );
 #endif
 	// Fix VSYNC level
diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c
index a1239083472d..5f79acb56e48 100644
--- a/drivers/media/radio/radio-mr800.c
+++ b/drivers/media/radio/radio-mr800.c
@@ -28,7 +28,7 @@
  * http://av-usbradio.sourceforge.net/index.php
  * http://sourceforge.net/projects/av-usbradio/
  * Latest release of theirs project was in 2005.
- * Probably, this driver could be improved trough using their
+ * Probably, this driver could be improved through using their
  * achievements (specifications given).
  * Also, Faidon Liambotis <paravoid@debian.org> wrote nice driver for this radio
  * in 2007. He allowed to use his driver to improve current mr800 radio driver.
diff --git a/drivers/media/video/cx231xx/cx231xx-avcore.c b/drivers/media/video/cx231xx/cx231xx-avcore.c
index 28f48f41f218..c2174413ab29 100644
--- a/drivers/media/video/cx231xx/cx231xx-avcore.c
+++ b/drivers/media/video/cx231xx/cx231xx-avcore.c
@@ -2414,9 +2414,11 @@ int cx231xx_gpio_i2c_read_ack(struct cx231xx *dev)
 		cx231xx_info("No ACK after %d msec -GPIO I2C failed!",
 			     nInit * 10);
 
-	/* readAck
-	   throuth clock stretch ,slave has given a SCL signal,
-	   so the SDA data can be directly read.  */
+	/*
+	 * readAck
+	 * through clock stretch, slave has given a SCL signal,
+	 * so the SDA data can be directly read.
+	 */
 	status = cx231xx_get_gpio_bit(dev, dev->gpio_dir, (u8 *)&dev->gpio_val);
 
 	if ((dev->gpio_val & 1 << dev->board.tuner_sda_gpio) == 0) {
diff --git a/drivers/media/video/cx23885/cx23885-dvb.c b/drivers/media/video/cx23885/cx23885-dvb.c
index 45e13ee66dc7..16c6a921f40b 100644
--- a/drivers/media/video/cx23885/cx23885-dvb.c
+++ b/drivers/media/video/cx23885/cx23885-dvb.c
@@ -940,7 +940,7 @@ int cx23885_dvb_register(struct cx23885_tsport *port)
 	int err, i;
 
 	/* Here we need to allocate the correct number of frontends,
-	 * as reflected in the cards struct. The reality is that currrently
+	 * as reflected in the cards struct. The reality is that currently
 	 * no cx23885 boards support this - yet. But, if we don't modify this
 	 * code then the second frontend would never be allocated (later)
 	 * and fail with error before the attach in dvb_register().
diff --git a/drivers/media/video/cx88/cx88-core.c b/drivers/media/video/cx88/cx88-core.c
index cf634606ba9a..b35411160f04 100644
--- a/drivers/media/video/cx88/cx88-core.c
+++ b/drivers/media/video/cx88/cx88-core.c
@@ -624,7 +624,7 @@ int cx88_reset(struct cx88_core *core)
 	/* setup image format */
 	cx_andor(MO_COLOR_CTRL, 0x4000, 0x4000);
 
-	/* setup FIFO Threshholds */
+	/* setup FIFO Thresholds */
 	cx_write(MO_PDMA_STHRSH,   0x0807);
 	cx_write(MO_PDMA_DTHRSH,   0x0807);
 
diff --git a/drivers/media/video/davinci/dm355_ccdc.c b/drivers/media/video/davinci/dm355_ccdc.c
index 56fbefe036ae..314390016370 100644
--- a/drivers/media/video/davinci/dm355_ccdc.c
+++ b/drivers/media/video/davinci/dm355_ccdc.c
@@ -285,7 +285,7 @@ static int validate_ccdc_param(struct ccdc_config_params_raw *ccdcparam)
 
 	if ((ccdcparam->med_filt_thres < 0) ||
 	   (ccdcparam->med_filt_thres > CCDC_MED_FILT_THRESH)) {
-		dev_dbg(dev, "Invalid value of median filter thresold\n");
+		dev_dbg(dev, "Invalid value of median filter threshold\n");
 		return -EINVAL;
 	}
 
diff --git a/drivers/media/video/davinci/vpss.c b/drivers/media/video/davinci/vpss.c
index 6d709ca8cfb0..453236bd7559 100644
--- a/drivers/media/video/davinci/vpss.c
+++ b/drivers/media/video/davinci/vpss.c
@@ -53,7 +53,7 @@ struct vpss_hw_ops {
 	int (*enable_clock)(enum vpss_clock_sel clock_sel, int en);
 	/* select input to ccdc */
 	void (*select_ccdc_source)(enum vpss_ccdc_source_sel src_sel);
-	/* clear wbl overlflow bit */
+	/* clear wbl overflow bit */
 	int (*clear_wbl_overflow)(enum vpss_wbl_sel wbl_sel);
 };
 
diff --git a/drivers/media/video/gspca/sonixb.c b/drivers/media/video/gspca/sonixb.c
index cf3af8de6e97..e39efb45fa1c 100644
--- a/drivers/media/video/gspca/sonixb.c
+++ b/drivers/media/video/gspca/sonixb.c
@@ -304,7 +304,7 @@ static const __u8 initOv6650[] = {
 };
 static const __u8 ov6650_sensor_init[][8] =
 {
-	/* Bright, contrast, etc are set througth SCBB interface.
+	/* Bright, contrast, etc are set through SCBB interface.
 	 * AVCAP on win2 do not send any data on this 	controls. */
 	/* Anyway, some registers appears to alter bright and constrat */
 
diff --git a/drivers/media/video/gspca/spca500.c b/drivers/media/video/gspca/spca500.c
index fab7ef85a6c1..7dbd5eea6cc0 100644
--- a/drivers/media/video/gspca/spca500.c
+++ b/drivers/media/video/gspca/spca500.c
@@ -589,7 +589,7 @@ static void spca500_reinit(struct gspca_dev *gspca_dev)
 	int err;
 	__u8 Data;
 
-	/* some unknow command from Aiptek pocket dv and family300 */
+	/* some unknown command from Aiptek pocket dv and family300 */
 
 	reg_w(gspca_dev, 0x00, 0x0d01, 0x01);
 	reg_w(gspca_dev, 0x00, 0x0d03, 0x00);
diff --git a/drivers/media/video/gspca/spca501.c b/drivers/media/video/gspca/spca501.c
index b74a34218da0..66f9f0056146 100644
--- a/drivers/media/video/gspca/spca501.c
+++ b/drivers/media/video/gspca/spca501.c
@@ -1636,7 +1636,7 @@ static const __u16 spca501c_arowana_init_data[][3] = {
 	{}
 };
 
-/* Unknow camera from Ori Usbid 0x0000:0x0000 */
+/* Unknown camera from Ori Usbid 0x0000:0x0000 */
 /* Based on snoops from Ori Cohen */
 static const __u16 spca501c_mysterious_open_data[][3] = {
 	{0x02, 0x000f, 0x0005},
@@ -1945,7 +1945,7 @@ static int sd_init(struct gspca_dev *gspca_dev)
 			goto error;
 		break;
 	case MystFromOriUnknownCamera:
-		/* UnKnow Ori CMOS Camera data */
+		/* Unknown Ori CMOS Camera data */
 		if (write_vector(gspca_dev, spca501c_mysterious_open_data))
 			goto error;
 		break;
@@ -1978,7 +1978,7 @@ static int sd_start(struct gspca_dev *gspca_dev)
 		write_vector(gspca_dev, spca501c_arowana_open_data);
 		break;
 	case MystFromOriUnknownCamera:
-		/* UnKnow  CMOS Camera data */
+		/* Unknown CMOS Camera data */
 		write_vector(gspca_dev, spca501c_mysterious_init_data);
 		break;
 	default:
diff --git a/drivers/media/video/gspca/sunplus.c b/drivers/media/video/gspca/sunplus.c
index aa8f995ce04e..1a9af2ebdbef 100644
--- a/drivers/media/video/gspca/sunplus.c
+++ b/drivers/media/video/gspca/sunplus.c
@@ -631,7 +631,7 @@ static void spca504A_acknowledged_command(struct gspca_dev *gspca_dev,
 	count = 200;
 	while (--count > 0) {
 		msleep(10);
-		/* gsmart mini2 write a each wait setting 1 ms is enought */
+		/* gsmart mini2 write a each wait setting 1 ms is enough */
 /*		reg_w_riv(dev, req, idx, val); */
 		status = reg_r_12(gspca_dev, 0x01, 0x0001, 1);
 		if (status == endcode) {
diff --git a/drivers/media/video/gspca/zc3xx.c b/drivers/media/video/gspca/zc3xx.c
index cdf3357b4c9f..49c3c1226e0e 100644
--- a/drivers/media/video/gspca/zc3xx.c
+++ b/drivers/media/video/gspca/zc3xx.c
@@ -7065,7 +7065,7 @@ static int sd_config(struct gspca_dev *gspca_dev,
 				break;
 			default:
 				PDEBUG(D_PROBE,
-					"Sensor UNKNOW_0 force Tas5130");
+					"Sensor UNKNOWN_0 force Tas5130");
 				sd->sensor = SENSOR_TAS5130CXX;
 			}
 			break;
diff --git a/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h b/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h
index 5b152ff20bd0..5fcad28211d2 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h
+++ b/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h
@@ -332,7 +332,7 @@ struct pvr2_hdw {
 
 	/* Bit mask of PVR2_CVAL_INPUT choices which are valid for the hardware */
 	unsigned int input_avail_mask;
-	/* Bit mask of PVR2_CVAL_INPUT choices which are currenly allowed */
+	/* Bit mask of PVR2_CVAL_INPUT choices which are currently allowed */
 	unsigned int input_allowed_mask;
 
 	/* Location of eeprom or a negative number if none */
diff --git a/drivers/media/video/s2255drv.c b/drivers/media/video/s2255drv.c
index 9e3262c0ba37..a4c84368eb10 100644
--- a/drivers/media/video/s2255drv.c
+++ b/drivers/media/video/s2255drv.c
@@ -1985,7 +1985,7 @@ static int save_frame(struct s2255_dev *dev, struct s2255_pipeinfo *pipe_info)
 					wake_up(&dev->fw_data->wait_fw);
 					break;
 				default:
-					printk(KERN_INFO "s2255 unknwn resp\n");
+					printk(KERN_INFO "s2255 unknown resp\n");
 				}
 			default:
 				pdata++;
diff --git a/drivers/media/video/zoran/zoran.h b/drivers/media/video/zoran/zoran.h
index d439c76b27e1..cb1de7ea197a 100644
--- a/drivers/media/video/zoran/zoran.h
+++ b/drivers/media/video/zoran/zoran.h
@@ -106,7 +106,7 @@ struct zoran_params {
 	unsigned long jpeg_markers;	/* Which markers should go into the JPEG output.
 					 * Unless you exactly know what you do, leave them untouched.
 					 * Inluding less markers will make the resulting code
-					 * smaller, but there will be fewer aplications
+					 * smaller, but there will be fewer applications
 					 * which can read it.
 					 * The presence of the APP and COM marker is
 					 * influenced by APP0_len and COM_len ONLY! */
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index d505b68cd372..e39986a78273 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -940,7 +940,7 @@ static const struct block_device_operations i2o_block_fops = {
  *	Allocate memory for the i2o_block_device struct, gendisk and request
  *	queue and initialize them as far as no additional information is needed.
  *
- *	Returns a pointer to the allocated I2O Block device on succes or a
+ *	Returns a pointer to the allocated I2O Block device on success or a
  *	negative error code on failure.
  */
 static struct i2o_block_device *i2o_block_device_alloc(void)
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 27cf4af0e13d..e5ab62141503 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -132,7 +132,7 @@ u32 i2o_cntxt_list_add(struct i2o_controller * c, void *ptr)
  *	Removes a previously added pointer from the context list and returns
  *	the matching context id.
  *
- *	Returns context id on succes or 0 on failure.
+ *	Returns context id on success or 0 on failure.
  */
 u32 i2o_cntxt_list_remove(struct i2o_controller * c, void *ptr)
 {
@@ -198,7 +198,7 @@ void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
  *	@c: controller to which the context list belong
  *	@ptr: pointer to which the context id should be fetched
  *
- *	Returns context id which matches to the pointer on succes or 0 on
+ *	Returns context id which matches to the pointer on success or 0 on
  *	failure.
  */
 u32 i2o_cntxt_list_get_ptr(struct i2o_controller * c, void *ptr)
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index 41c8fe2a928c..ce5eda985ab0 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -92,7 +92,7 @@ static void gru_vma_close(struct vm_area_struct *vma)
 /*
  * gru_file_mmap
  *
- * Called when mmaping the device.  Initializes the vma with a fault handler
+ * Called when mmapping the device.  Initializes the vma with a fault handler
  * and private data structure necessary to allocate, track, and free the
  * underlying pages.
  */
diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c
index 99b74a351020..941a4d35ef8d 100644
--- a/drivers/mmc/host/s3cmci.c
+++ b/drivers/mmc/host/s3cmci.c
@@ -1360,7 +1360,7 @@ static struct mmc_host_ops s3cmci_ops = {
 
 static struct s3c24xx_mci_pdata s3cmci_def_pdata = {
 	/* This is currently here to avoid a number of if (host->pdata)
-	 * checks. Any zero fields to ensure reaonable defaults are picked. */
+	 * checks. Any zero fields to ensure reasonable defaults are picked. */
 };
 
 #ifdef CONFIG_CPU_FREQ
diff --git a/drivers/mtd/devices/slram.c b/drivers/mtd/devices/slram.c
index 3aa05cd18ea1..592016a0668f 100644
--- a/drivers/mtd/devices/slram.c
+++ b/drivers/mtd/devices/slram.c
@@ -18,7 +18,7 @@
                 to specify the offset instead of the absolute address
 
   NOTE:
-  With slram it's only possible to map a contigous memory region. Therfore
+  With slram it's only possible to map a contiguous memory region. Therefore
   if there's a device mapped somewhere in the region specified slram will
   fail to load (see kernel log if modprobe fails).
 
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index e51c1ed7ac18..b126cf887476 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -1056,7 +1056,7 @@ static struct nand_ecclayout doc200x_oobinfo = {
 };
 
 /* Find the (I)NFTL Media Header, and optionally also the mirror media header.
-   On sucessful return, buf will contain a copy of the media header for
+   On successful return, buf will contain a copy of the media header for
    further processing.  id is the string to scan for, and will presumably be
    either "ANAND" or "BNAND".  If findmirror=1, also look for the mirror media
    header.  The page #s of the found media headers are placed in mh0_page and
diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c
index db7ae9d6a296..92320a643275 100644
--- a/drivers/mtd/nand/nand_ecc.c
+++ b/drivers/mtd/nand/nand_ecc.c
@@ -475,7 +475,7 @@ int __nand_correct_data(unsigned char *buf,
 		 *
 		 * The b2 shift is there to get rid of the lowest two bits.
 		 * We could also do addressbits[b2] >> 1 but for the
-		 * performace it does not make any difference
+		 * performance it does not make any difference
 		 */
 		if (eccsize_mult == 1)
 			byte_addr = (addressbits[b1] << 4) + addressbits[b0];
diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c
index 11dc7e69c4fb..68b5b3a486a9 100644
--- a/drivers/mtd/nand/s3c2410.c
+++ b/drivers/mtd/nand/s3c2410.c
@@ -875,7 +875,7 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info,
  * @info: The controller instance.
  * @nmtd: The driver version of the MTD instance.
  *
- * This routine is called after the chip probe has succesfully completed
+ * This routine is called after the chip probe has successfully completed
  * and the relevant per-chip information updated. This call ensure that
  * we update the internal state accordingly.
  *
diff --git a/drivers/net/82596.c b/drivers/net/82596.c
index ea6b139b812c..1663bc9e45de 100644
--- a/drivers/net/82596.c
+++ b/drivers/net/82596.c
@@ -19,7 +19,7 @@
    TBD:
    * look at deferring rx frames rather than discarding (as per tulip)
    * handle tx ring full as per tulip
-   * performace test to tune rx_copybreak
+   * performance test to tune rx_copybreak
 
    Most of my modifications relate to the braindead big-endian
    implementation by Intel.  When the i596 is operating in
diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c
index 4e6359fff0e1..766aabfdfc75 100644
--- a/drivers/net/amd8111e.c
+++ b/drivers/net/amd8111e.c
@@ -1633,8 +1633,13 @@ static int amd8111e_enable_link_change(struct amd8111e_priv* lp)
 	readl(lp->mmio + CMD7);
 	return 0;
 }
-/* This function is called when a packet transmission fails to complete within a  resonable period, on the assumption that an interrupts have been failed or the  interface is locked up. This function will reinitialize the hardware */
 
+/*
+ * This function is called when a packet transmission fails to complete
+ * within a reasonable period, on the assumption that an interrupt have
+ * failed or the interface is locked up. This function will reinitialize
+ * the hardware.
+ */
 static void amd8111e_tx_timeout(struct net_device *dev)
 {
 	struct amd8111e_priv* lp = netdev_priv(dev);
diff --git a/drivers/net/appletalk/cops.c b/drivers/net/appletalk/cops.c
index b5dc7f550725..9d828aed968a 100644
--- a/drivers/net/appletalk/cops.c
+++ b/drivers/net/appletalk/cops.c
@@ -120,7 +120,7 @@ static int irq = 5;		/* Default IRQ */
  *      DAYNA driver mode:
  *              Dayna DL2000/DaynaTalk PC (Half Length), COPS LT-95, 
  *		Farallon PhoneNET PC III, Farallon PhoneNET PC II
- *	Other cards possibly supported mode unkown though:
+ *	Other cards possibly supported mode unknown though:
  *		Dayna DL2000 (Full length), COPS LT/M (Micro-Channel)
  *
  *	Cards NOT supported by this driver but supported by the ltpc.c
diff --git a/drivers/net/ariadne.h b/drivers/net/ariadne.h
index bb613f292e04..727be5cdd1ea 100644
--- a/drivers/net/ariadne.h
+++ b/drivers/net/ariadne.h
@@ -244,7 +244,7 @@ struct Am79C960 {
 #define DLNKTST		0x0010	/* Disable Link Status */
 #define DAPC		0x0008	/* Disable Automatic Polarity Correction */
 #define MENDECL		0x0004	/* MENDEC Loopback Mode */
-#define LRTTSEL		0x0002	/* Low Receive Treshold/Transmit Mode Select */
+#define LRTTSEL		0x0002	/* Low Receive Threshold/Transmit Mode Select */
 #define PORTSEL1	0x0001	/* Port Select Bits */
 #define PORTSEL2	0x8000	/* Port Select Bits */
 #define INTL		0x4000	/* Internal Loopback */
diff --git a/drivers/net/atl1c/atl1c_main.c b/drivers/net/atl1c/atl1c_main.c
index 1372e9a99f5b..96506eacc131 100644
--- a/drivers/net/atl1c/atl1c_main.c
+++ b/drivers/net/atl1c/atl1c_main.c
@@ -1543,7 +1543,7 @@ static irqreturn_t atl1c_intr(int irq, void *data)
 		if (status & ISR_OVER)
 			if (netif_msg_intr(adapter))
 				dev_warn(&pdev->dev,
-					"TX/RX over flow (status = 0x%x)\n",
+					"TX/RX overflow (status = 0x%x)\n",
 					status & ISR_OVER);
 
 		/* link event */
diff --git a/drivers/net/benet/be_cmds.h b/drivers/net/benet/be_cmds.h
index 49953787e41c..f0bb62b5ca9e 100644
--- a/drivers/net/benet/be_cmds.h
+++ b/drivers/net/benet/be_cmds.h
@@ -435,7 +435,7 @@ enum be_if_flags {
  * filtering capabilities. */
 struct be_cmd_req_if_create {
 	struct be_cmd_req_hdr hdr;
-	u32 version;		/* ignore currntly */
+	u32 version;		/* ignore currently */
 	u32 capability_flags;
 	u32 enable_flags;
 	u8 mac_addr[ETH_ALEN];
diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c
index 1f941f027718..02a0908707ed 100644
--- a/drivers/net/benet/be_main.c
+++ b/drivers/net/benet/be_main.c
@@ -1876,7 +1876,7 @@ int be_load_fw(struct be_adapter *adapter, u8 *func)
 		goto fw_exit;
 	}
 
-	dev_info(&adapter->pdev->dev, "Firmware flashed succesfully\n");
+	dev_info(&adapter->pdev->dev, "Firmware flashed successfully\n");
 
 fw_exit:
 	release_firmware(fw);
diff --git a/drivers/net/bnx2x_reg.h b/drivers/net/bnx2x_reg.h
index aa76cbada5e2..732eafdeb0f2 100644
--- a/drivers/net/bnx2x_reg.h
+++ b/drivers/net/bnx2x_reg.h
@@ -2536,7 +2536,7 @@
 /* [RC 1] A flag to indicate that overflow error occurred in one of the
    queues. */
 #define QM_REG_OVFERROR 					 0x16805c
-/* [RC 7] the Q were the qverflow occurs */
+/* [RC 7] the Q where the overflow occurs */
 #define QM_REG_OVFQNUM						 0x168058
 /* [R 16] Pause state for physical queues 15-0 */
 #define QM_REG_PAUSESTATE0					 0x168410
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index f86612857a73..56ba872be9c1 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -1285,7 +1285,7 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	/*
 	 * We do not use Tx completion interrupts to free DMAd Tx packets.
-	 * This is good for performamce but means that we rely on new Tx
+	 * This is good for performance but means that we rely on new Tx
 	 * packets arriving to run the destructors of completed packets,
 	 * which open up space in their sockets' send queues.  Sometimes
 	 * we do not get such new packets causing Tx to stall.  A single
diff --git a/drivers/net/ehea/ehea_ethtool.c b/drivers/net/ehea/ehea_ethtool.c
index d76885223366..75b099ce49c9 100644
--- a/drivers/net/ehea/ehea_ethtool.c
+++ b/drivers/net/ehea/ehea_ethtool.c
@@ -118,7 +118,7 @@ doit:
 	ret = ehea_set_portspeed(port, sp);
 
 	if (!ret)
-		ehea_info("%s: Port speed succesfully set: %dMbps "
+		ehea_info("%s: Port speed successfully set: %dMbps "
 			  "%s Duplex",
 			  port->netdev->name, port->port_speed,
 			  port->full_duplex == 1 ? "Full" : "Half");
@@ -134,7 +134,7 @@ static int ehea_nway_reset(struct net_device *dev)
 	ret = ehea_set_portspeed(port, EHEA_SPEED_AUTONEG);
 
 	if (!ret)
-		ehea_info("%s: Port speed succesfully set: %dMbps "
+		ehea_info("%s: Port speed successfully set: %dMbps "
 			  "%s Duplex",
 			  port->netdev->name, port->port_speed,
 			  port->full_duplex == 1 ? "Full" : "Half");
diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c
index ed60fd664273..0cab992b3d1a 100644
--- a/drivers/net/hamradio/baycom_ser_fdx.c
+++ b/drivers/net/hamradio/baycom_ser_fdx.c
@@ -35,7 +35,7 @@
  *          driver only supports standard serial hardware (8250, 16450, 16550A)
  *
  *          This modem usually draws its supply current out of the otherwise unused
- *          TXD pin of the serial port. Thus a contignuous stream of 0x00-bytes
+ *          TXD pin of the serial port. Thus a contiguous stream of 0x00-bytes
  *          is transmitted to achieve a positive supply voltage.
  *
  *  hsk:    This is a 4800 baud FSK modem, designed for TNC use. It works fine
diff --git a/drivers/net/iseries_veth.c b/drivers/net/iseries_veth.c
index aa7286bc4364..8739ba850f82 100644
--- a/drivers/net/iseries_veth.c
+++ b/drivers/net/iseries_veth.c
@@ -1384,7 +1384,7 @@ static inline void veth_build_dma_list(struct dma_chunk *list,
 	unsigned long done;
 	int i = 1;
 
-	/* FIXME: skbs are continguous in real addresses.  Do we
+	/* FIXME: skbs are contiguous in real addresses.  Do we
 	 * really need to break it into PAGE_SIZE chunks, or can we do
 	 * it just at the granularity of iSeries real->absolute
 	 * mapping?  Indeed, given the way the allocator works, can we
diff --git a/drivers/net/lasi_82596.c b/drivers/net/lasi_82596.c
index a0c578585a50..b77238dbafb8 100644
--- a/drivers/net/lasi_82596.c
+++ b/drivers/net/lasi_82596.c
@@ -47,7 +47,7 @@
    TBD:
    * look at deferring rx frames rather than discarding (as per tulip)
    * handle tx ring full as per tulip
-   * performace test to tune rx_copybreak
+   * performance test to tune rx_copybreak
 
    Most of my modifications relate to the braindead big-endian
    implementation by Intel.  When the i596 is operating in
diff --git a/drivers/net/lib82596.c b/drivers/net/lib82596.c
index 51e11c3e53e1..c0dbfc185b53 100644
--- a/drivers/net/lib82596.c
+++ b/drivers/net/lib82596.c
@@ -47,7 +47,7 @@
    TBD:
    * look at deferring rx frames rather than discarding (as per tulip)
    * handle tx ring full as per tulip
-   * performace test to tune rx_copybreak
+   * performance test to tune rx_copybreak
 
    Most of my modifications relate to the braindead big-endian
    implementation by Intel.  When the i596 is operating in
diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c
index 03b781a7a182..829b9ec9ff67 100644
--- a/drivers/net/mlx4/en_rx.c
+++ b/drivers/net/mlx4/en_rx.c
@@ -204,7 +204,7 @@ static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
 		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
 		dma = be64_to_cpu(rx_desc->data[nr].addr);
 
-		en_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma);
+		en_dbg(DRV, priv, "Unmapping buffer at dma:0x%llx\n", (u64) dma);
 		pci_unmap_single(mdev->pdev, dma, skb_frags[nr].size,
 				 PCI_DMA_FROMDEVICE);
 		put_page(skb_frags[nr].page);
diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c
index 8c7279965b44..3d1396af9462 100644
--- a/drivers/net/mlx4/en_tx.c
+++ b/drivers/net/mlx4/en_tx.c
@@ -47,7 +47,7 @@ enum {
 static int inline_thold __read_mostly = MAX_INLINE;
 
 module_param_named(inline_thold, inline_thold, int, 0444);
-MODULE_PARM_DESC(inline_thold, "treshold for using inline data");
+MODULE_PARM_DESC(inline_thold, "threshold for using inline data");
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 			   struct mlx4_en_tx_ring *ring, u32 size,
diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h
index 4376147b0ea0..82c3ebc584e3 100644
--- a/drivers/net/mlx4/mlx4_en.h
+++ b/drivers/net/mlx4/mlx4_en.h
@@ -162,7 +162,7 @@ enum {
 #define MLX4_EN_DEF_RX_PAUSE	1
 #define MLX4_EN_DEF_TX_PAUSE	1
 
-/* Interval between sucessive polls in the Tx routine when polling is used
+/* Interval between successive polls in the Tx routine when polling is used
    instead of interrupts (in per-core Tx rings) - should be power of 2 */
 #define MLX4_EN_TX_POLL_MODER	16
 #define MLX4_EN_TX_POLL_TIMEOUT	(HZ / 4)
diff --git a/drivers/net/ps3_gelic_net.c b/drivers/net/ps3_gelic_net.c
index b211613e9dbd..86fde1a90a5a 100644
--- a/drivers/net/ps3_gelic_net.c
+++ b/drivers/net/ps3_gelic_net.c
@@ -296,7 +296,7 @@ static void gelic_card_reset_chain(struct gelic_card *card,
  * @card: card structure
  * @descr: descriptor to re-init
  *
- * return 0 on succes, <0 on failure
+ * return 0 on success, <0 on failure
  *
  * allocates a new rx skb, iommu-maps it and attaches it to the descriptor.
  * Activate the descriptor state-wise
diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c
index c072f7f36acf..9d94a141555c 100644
--- a/drivers/net/sis900.c
+++ b/drivers/net/sis900.c
@@ -1760,7 +1760,7 @@ static int sis900_rx(struct net_device *net_dev)
 				sis_priv->rx_ring[entry].bufptr, RX_BUF_SIZE,
 				PCI_DMA_FROMDEVICE);
 
-			/* refill the Rx buffer, what if there is not enought
+			/* refill the Rx buffer, what if there is not enough
 			 * memory for new socket buffer ?? */
 			if ((skb = dev_alloc_skb(RX_BUF_SIZE)) == NULL) {
 				/*
@@ -1775,7 +1775,7 @@ static int sis900_rx(struct net_device *net_dev)
 			}
 
 			/* This situation should never happen, but due to
-			   some unknow bugs, it is possible that
+			   some unknown bugs, it is possible that
 			   we are working on NULL sk_buff :-( */
 			if (sis_priv->rx_skbuff[entry] == NULL) {
 				if (netif_msg_rx_err(sis_priv))
diff --git a/drivers/net/skfp/h/smc.h b/drivers/net/skfp/h/smc.h
index 1758d9548361..026a83b9f743 100644
--- a/drivers/net/skfp/h/smc.h
+++ b/drivers/net/skfp/h/smc.h
@@ -393,10 +393,10 @@ struct smt_config {
 					 */
 	u_long	mac_d_max ;		/* MAC : D_Max timer value */
 
-	u_long lct_short ;		/* LCT : error threshhold */
-	u_long lct_medium ;		/* LCT : error threshhold */
-	u_long lct_long ;		/* LCT : error threshhold */
-	u_long lct_extended ;		/* LCT : error threshhold */
+	u_long lct_short ;		/* LCT : error threshold */
+	u_long lct_medium ;		/* LCT : error threshold */
+	u_long lct_long ;		/* LCT : error threshold */
+	u_long lct_extended ;		/* LCT : error threshold */
 } ;
 
 #ifdef	DEBUG
diff --git a/drivers/net/skfp/skfddi.c b/drivers/net/skfp/skfddi.c
index b27156eaf267..db216a728503 100644
--- a/drivers/net/skfp/skfddi.c
+++ b/drivers/net/skfp/skfddi.c
@@ -1002,7 +1002,7 @@ static int skfp_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		}
 		break;
 	default:
-		printk("ioctl for %s: unknow cmd: %04x\n", dev->name, ioc.cmd);
+		printk("ioctl for %s: unknown cmd: %04x\n", dev->name, ioc.cmd);
 		status = -EOPNOTSUPP;
 
 	}			// switch
diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index ccdd196f5297..4a00940d0a54 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -816,7 +816,7 @@ static int smsc911x_mii_probe(struct net_device *dev)
 	SMSC_TRACE(HW, "Passed Loop Back Test");
 #endif				/* USE_PHY_WORK_AROUND */
 
-	SMSC_TRACE(HW, "phy initialised succesfully");
+	SMSC_TRACE(HW, "phy initialised successfully");
 	return 0;
 }
 
diff --git a/drivers/net/smsc911x.h b/drivers/net/smsc911x.h
index b5716bd8a597..016360c65ce2 100644
--- a/drivers/net/smsc911x.h
+++ b/drivers/net/smsc911x.h
@@ -30,7 +30,7 @@
 #define SMSC_NAPI_WEIGHT	16
 
 /* implements a PHY loopback test at initialisation time, to ensure a packet
- * can be succesfully looped back */
+ * can be successfully looped back */
 #define USE_PHY_WORK_AROUND
 
 #define DPRINTK(nlevel, klevel, fmt, args...) \
diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c
index 90e663f4515c..40b51e6bc77b 100644
--- a/drivers/net/spider_net.c
+++ b/drivers/net/spider_net.c
@@ -409,7 +409,7 @@ spider_net_free_rx_chain_contents(struct spider_net_card *card)
  * @card: card structure
  * @descr: descriptor to re-init
  *
- * Return 0 on succes, <0 on failure.
+ * Return 0 on success, <0 on failure.
  *
  * Allocates a new rx skb, iommu-maps it and attaches it to the
  * descriptor. Mark the descriptor as activated, ready-to-use.
diff --git a/drivers/net/stmmac/gmac.c b/drivers/net/stmmac/gmac.c
index b624bb5bae0a..52586ee68953 100644
--- a/drivers/net/stmmac/gmac.c
+++ b/drivers/net/stmmac/gmac.c
@@ -112,7 +112,7 @@ static void gmac_dma_operation_mode(unsigned long ioaddr, int txmode,
 			      " (threshold = %d)\n", txmode);
 		csr6 &= ~DMA_CONTROL_TSF;
 		csr6 &= DMA_CONTROL_TC_TX_MASK;
-		/* Set the transmit threashold */
+		/* Set the transmit threshold */
 		if (txmode <= 32)
 			csr6 |= DMA_CONTROL_TTC_32;
 		else if (txmode <= 64)
diff --git a/drivers/net/stmmac/gmac.h b/drivers/net/stmmac/gmac.h
index 684a363120a9..2e82d6c9a148 100644
--- a/drivers/net/stmmac/gmac.h
+++ b/drivers/net/stmmac/gmac.h
@@ -154,14 +154,14 @@ enum rx_tx_priority_ratio {
 #define DMA_CONTROL_DT		0x04000000 /* Disable Drop TCP/IP csum error */
 #define DMA_CONTROL_RSF		0x02000000 /* Receive Store and Forward */
 #define DMA_CONTROL_DFF		0x01000000 /* Disaable flushing */
-/* Theshold for Activating the FC */
+/* Threshold for Activating the FC */
 enum rfa {
 	act_full_minus_1 = 0x00800000,
 	act_full_minus_2 = 0x00800200,
 	act_full_minus_3 = 0x00800400,
 	act_full_minus_4 = 0x00800600,
 };
-/* Theshold for Deactivating the FC */
+/* Threshold for Deactivating the FC */
 enum rfd {
 	deac_full_minus_1 = 0x00400000,
 	deac_full_minus_2 = 0x00400800,
diff --git a/drivers/net/tokenring/smctr.c b/drivers/net/tokenring/smctr.c
index ebda61bc4c2f..78e12b5e3ac7 100644
--- a/drivers/net/tokenring/smctr.c
+++ b/drivers/net/tokenring/smctr.c
@@ -426,7 +426,7 @@ static int smctr_alloc_shared_memory(struct net_device *dev)
         smctr_malloc(dev, 1L);
 
         /* Allocate Non-MAC receive data buffers.
-         * To guarantee a minimum of 256 contigous memory to
+         * To guarantee a minimum of 256 contiguous memory to
          * UM_Receive_Packet's lookahead pointer, before a page
          * change or ring end is encountered, place each rx buffer on
          * a 256 byte boundary.
diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
index 4469f2451a6f..5e9adbaf6745 100644
--- a/drivers/net/ucc_geth.c
+++ b/drivers/net/ucc_geth.c
@@ -3798,7 +3798,7 @@ static int ucc_geth_probe(struct of_device* ofdev, const struct of_device_id *ma
 		prop = of_get_property(np, "tx-clock", NULL);
 		if (!prop) {
 			printk(KERN_ERR
-				"ucc_geth: mising tx-clock-name property\n");
+				"ucc_geth: missing tx-clock-name property\n");
 			return -EINVAL;
 		}
 		if ((*prop < QE_CLK_NONE) || (*prop > QE_CLK24)) {
diff --git a/drivers/net/ucc_geth.h b/drivers/net/ucc_geth.h
index 03a6ca016d5a..a007e2acf651 100644
--- a/drivers/net/ucc_geth.h
+++ b/drivers/net/ucc_geth.h
@@ -80,16 +80,16 @@ struct ucc_geth {
 				   frames) received that were between 128
 				   (Including FCS length==4) and 255 octets */
 	u32 txok;		/* Total number of octets residing in frames
-				   that where involved in succesfull
+				   that where involved in successfull
 				   transmission */
 	u16 txcf;		/* Total number of PAUSE control frames
 				   transmitted by this MAC */
 	u8 res4[0x2];
 	u32 tmca;		/* Total number of frames that were transmitted
-				   succesfully with the group address bit set
+				   successfully with the group address bit set
 				   that are not broadcast frames */
 	u32 tbca;		/* Total number of frames transmitted
-				   succesfully that had destination address
+				   successfully that had destination address
 				   field equal to the broadcast address */
 	u32 rxfok;		/* Total number of frames received OK */
 	u32 rxbok;		/* Total number of octets received OK */
@@ -98,9 +98,9 @@ struct ucc_geth {
 				   HW because it includes octets in frames that
 				   never even reach the UCC */
 	u32 rmca;		/* Total number of frames that were received
-				   succesfully with the group address bit set
+				   successfully with the group address bit set
 				   that are not broadcast frames */
-	u32 rbca;		/* Total number of frames received succesfully
+	u32 rbca;		/* Total number of frames received successfully
 				   that had destination address equal to the
 				   broadcast address */
 	u32 scar;		/* Statistics carry register */
@@ -759,15 +759,15 @@ struct ucc_geth_hardware_statistics {
 				   frames) received that were between 128
 				   (Including FCS length==4) and 255 octets */
 	u32 txok;		/* Total number of octets residing in frames
-				   that where involved in succesfull
+				   that where involved in successfull
 				   transmission */
 	u16 txcf;		/* Total number of PAUSE control frames
 				   transmitted by this MAC */
 	u32 tmca;		/* Total number of frames that were transmitted
-				   succesfully with the group address bit set
+				   successfully with the group address bit set
 				   that are not broadcast frames */
 	u32 tbca;		/* Total number of frames transmitted
-				   succesfully that had destination address
+				   successfully that had destination address
 				   field equal to the broadcast address */
 	u32 rxfok;		/* Total number of frames received OK */
 	u32 rxbok;		/* Total number of octets received OK */
@@ -776,9 +776,9 @@ struct ucc_geth_hardware_statistics {
 				   HW because it includes octets in frames that
 				   never even reach the UCC */
 	u32 rmca;		/* Total number of frames that were received
-				   succesfully with the group address bit set
+				   successfully with the group address bit set
 				   that are not broadcast frames */
-	u32 rbca;		/* Total number of frames received succesfully
+	u32 rbca;		/* Total number of frames received successfully
 				   that had destination address equal to the
 				   broadcast address */
 } __attribute__ ((packed));
diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index c6c922247d05..0c3c738d7419 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -748,7 +748,7 @@ static int smsc95xx_phy_initialize(struct usbnet *dev)
 	mii_nway_restart(&dev->mii);
 
 	if (netif_msg_ifup(dev))
-		devdbg(dev, "phy initialised succesfully");
+		devdbg(dev, "phy initialised successfully");
 	return 0;
 }
 
diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c
index 7ea71b33d2e9..ee784e091f67 100644
--- a/drivers/net/wan/lmc/lmc_main.c
+++ b/drivers/net/wan/lmc/lmc_main.c
@@ -927,7 +927,7 @@ static int __devinit lmc_init_one(struct pci_dev *pdev,
         sc->lmc_media = &lmc_t1_media;
         break;
     default:
-	printk(KERN_WARNING "%s: LMC UNKOWN CARD!\n", dev->name);
+	printk(KERN_WARNING "%s: LMC UNKNOWN CARD!\n", dev->name);
         break;
     }
 
diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c
index 07c32e68909f..99d27473ba3f 100644
--- a/drivers/net/wimax/i2400m/rx.c
+++ b/drivers/net/wimax/i2400m/rx.c
@@ -1114,7 +1114,7 @@ error:
  * device. See the file header for the format. Run all checks on the
  * buffer header, then run over each payload's descriptors, verify
  * their consistency and act on each payload's contents.  If
- * everything is succesful, update the device's statistics.
+ * everything is successful, update the device's statistics.
  *
  * Note: You need to set the skb to contain only the length of the
  * received buffer; for that, use skb_trim(skb, RECEIVED_SIZE).
diff --git a/drivers/net/wireless/ath/ath5k/phy.c b/drivers/net/wireless/ath/ath5k/phy.c
index 1a039f2bd732..6f04cc758dcc 100644
--- a/drivers/net/wireless/ath/ath5k/phy.c
+++ b/drivers/net/wireless/ath/ath5k/phy.c
@@ -117,7 +117,7 @@ static unsigned int ath5k_hw_rfb_op(struct ath5k_hw *ah,
 
 /*
  * This code is used to optimize rf gain on different environments
- * (temprature mostly) based on feedback from a power detector.
+ * (temperature mostly) based on feedback from a power detector.
  *
  * It's only used on RF5111 and RF5112, later RF chips seem to have
  * auto adjustment on hw -notice they have a much smaller BANK 7 and
@@ -2675,7 +2675,7 @@ ath5k_setup_channel_powertable(struct ath5k_hw *ah,
 		/* Fill curves in reverse order
 		 * from lower power (max gain)
 		 * to higher power. Use curve -> idx
-		 * backmaping we did on eeprom init */
+		 * backmapping we did on eeprom init */
 		u8 idx = pdg_curve_to_idx[pdg];
 
 		/* Grab the needed curves by index */
@@ -2777,7 +2777,7 @@ ath5k_setup_channel_powertable(struct ath5k_hw *ah,
 	/* Now we have a set of curves for this
 	 * channel on tmpL (x range is table_max - table_min
 	 * and y values are tmpL[pdg][]) sorted in the same
-	 * order as EEPROM (because we've used the backmaping).
+	 * order as EEPROM (because we've used the backmapping).
 	 * So for RF5112 it's from higher power to lower power
 	 * and for RF2413 it's from lower power to higher power.
 	 * For RF5111 we only have one curve. */
diff --git a/drivers/net/wireless/ath/ath9k/rc.c b/drivers/net/wireless/ath/ath9k/rc.c
index 1895d63aad0a..0a35ee62a02a 100644
--- a/drivers/net/wireless/ath/ath9k/rc.c
+++ b/drivers/net/wireless/ath/ath9k/rc.c
@@ -969,7 +969,7 @@ static bool ath_rc_update_per(struct ath_softc *sc,
 				 * Since this probe succeeded, we allow the next
 				 * probe twice as soon.  This allows the maxRate
 				 * to move up faster if the probes are
-				 * succesful.
+				 * successful.
 				 */
 				ath_rc_priv->probe_time =
 					now_msec - rate_table->probe_interval / 2;
diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
index a741d37fd96f..e1b330023200 100644
--- a/drivers/net/wireless/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/ipw2x00/ipw2100.c
@@ -551,7 +551,7 @@ static int ipw2100_get_ordinal(struct ipw2100_priv *priv, u32 ord,
 		/* get number of entries */
 		field_count = *(((u16 *) & field_info) + 1);
 
-		/* abort if no enought memory */
+		/* abort if no enough memory */
 		total_length = field_len * field_count;
 		if (total_length > *len) {
 			*len = total_length;
@@ -3044,7 +3044,7 @@ static void ipw2100_tx_send_data(struct ipw2100_priv *priv)
 			     IPW_MAX_BDS)) {
 			/* TODO: Support merging buffers if more than
 			 * IPW_MAX_BDS are used */
-			IPW_DEBUG_INFO("%s: Maximum BD theshold exceeded.  "
+			IPW_DEBUG_INFO("%s: Maximum BD threshold exceeded.  "
 				       "Increase fragmentation level.\n",
 				       priv->net_dev->name);
 		}
@@ -6823,7 +6823,7 @@ static int ipw2100_wx_get_range(struct net_device *dev,
 	range->max_qual.updated = 7;	/* Updated all three */
 
 	range->avg_qual.qual = 70;	/* > 8% missed beacons is 'bad' */
-	/* TODO: Find real 'good' to 'bad' threshol value for RSSI */
+	/* TODO: Find real 'good' to 'bad' threshold value for RSSI */
 	range->avg_qual.level = 20 + IPW2100_RSSI_TO_DBM;
 	range->avg_qual.noise = 0;
 	range->avg_qual.updated = 7;	/* Updated all three */
diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index 9b0f2c0646e0..b2aa960b8346 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -787,7 +787,7 @@ static int ipw_get_ordinal(struct ipw_priv *priv, u32 ord, void *val, u32 * len)
 		/* get number of entries */
 		field_count = *(((u16 *) & field_info) + 1);
 
-		/* abort if not enought memory */
+		/* abort if not enough memory */
 		total_len = field_len * field_count;
 		if (total_len > *len) {
 			*len = total_len;
@@ -7751,7 +7751,7 @@ static void ipw_rebuild_decrypted_skb(struct ipw_priv *priv,
 	case SEC_LEVEL_0:
 		break;
 	default:
-		printk(KERN_ERR "Unknow security level %d\n",
+		printk(KERN_ERR "Unknown security level %d\n",
 		       priv->ieee->sec.level);
 		break;
 	}
@@ -8917,7 +8917,7 @@ static int ipw_wx_get_range(struct net_device *dev,
 	range->max_qual.updated = 7;	/* Updated all three */
 
 	range->avg_qual.qual = 70;
-	/* TODO: Find real 'good' to 'bad' threshol value for RSSI */
+	/* TODO: Find real 'good' to 'bad' threshold value for RSSI */
 	range->avg_qual.level = 0;	/* FIXME to real average level */
 	range->avg_qual.noise = 0;
 	range->avg_qual.updated = 7;	/* Updated all three */
@@ -10290,7 +10290,7 @@ static int ipw_tx_skb(struct ipw_priv *priv, struct libipw_txb *txb,
 		case SEC_LEVEL_0:
 			break;
 		default:
-			printk(KERN_ERR "Unknow security level %d\n",
+			printk(KERN_ERR "Unknown security level %d\n",
 			       priv->ieee->sec.level);
 			break;
 		}
diff --git a/drivers/net/wireless/ipw2x00/libipw_module.c b/drivers/net/wireless/ipw2x00/libipw_module.c
index be5b809ec97a..20b8a8a20644 100644
--- a/drivers/net/wireless/ipw2x00/libipw_module.c
+++ b/drivers/net/wireless/ipw2x00/libipw_module.c
@@ -199,7 +199,7 @@ struct net_device *alloc_ieee80211(int sizeof_priv, int monitor)
 	ieee->host_decrypt = 1;
 	ieee->host_mc_decrypt = 1;
 
-	/* Host fragementation in Open mode. Default is enabled.
+	/* Host fragmentation in Open mode. Default is enabled.
 	 * Note: host fragmentation is always enabled if host encryption
 	 * is enabled. For cards can do hardware encryption, they must do
 	 * hardware fragmentation as well. So we don't need a variable
diff --git a/drivers/net/wireless/iwmc3200wifi/hal.c b/drivers/net/wireless/iwmc3200wifi/hal.c
index c430418248b4..d13c8853ee82 100644
--- a/drivers/net/wireless/iwmc3200wifi/hal.c
+++ b/drivers/net/wireless/iwmc3200wifi/hal.c
@@ -411,7 +411,7 @@ static void iwm_build_lmac_hdr(struct iwm_priv *iwm, struct iwm_lmac_hdr *hdr,
 /*
  * iwm_hal_send_host_cmd(): sends commands to the UMAC or the LMAC.
  * Sending command to the LMAC is equivalent to sending a
- * regular UMAC command with the LMAC passtrough or the LMAC
+ * regular UMAC command with the LMAC passthrough or the LMAC
  * wrapper UMAC command IDs.
  */
 int iwm_hal_send_host_cmd(struct iwm_priv *iwm,
diff --git a/drivers/net/wireless/iwmc3200wifi/rx.c b/drivers/net/wireless/iwmc3200wifi/rx.c
index 771a301003c9..8ddb51a2a977 100644
--- a/drivers/net/wireless/iwmc3200wifi/rx.c
+++ b/drivers/net/wireless/iwmc3200wifi/rx.c
@@ -1448,7 +1448,7 @@ static void iwm_rx_process_packet(struct iwm_priv *iwm,
 		kfree_skb(packet->skb);
 		break;
 	default:
-		IWM_ERR(iwm, "Unknow ticket action: %d\n",
+		IWM_ERR(iwm, "Unknown ticket action: %d\n",
 			le16_to_cpu(ticket_node->ticket->action));
 		kfree_skb(packet->skb);
 	}
diff --git a/drivers/net/wireless/libertas/if_sdio.c b/drivers/net/wireless/libertas/if_sdio.c
index 485a8d406525..afe6abecc044 100644
--- a/drivers/net/wireless/libertas/if_sdio.c
+++ b/drivers/net/wireless/libertas/if_sdio.c
@@ -934,7 +934,7 @@ static int if_sdio_probe(struct sdio_func *func,
 	}
 
 	if (i == ARRAY_SIZE(if_sdio_models)) {
-		lbs_pr_err("unkown card model 0x%x\n", card->model);
+		lbs_pr_err("unknown card model 0x%x\n", card->model);
 		ret = -ENODEV;
 		goto free;
 	}
diff --git a/drivers/net/wireless/prism54/isl_ioctl.c b/drivers/net/wireless/prism54/isl_ioctl.c
index bc08464d8323..f7f5c793514b 100644
--- a/drivers/net/wireless/prism54/isl_ioctl.c
+++ b/drivers/net/wireless/prism54/isl_ioctl.c
@@ -1897,7 +1897,7 @@ prism54_get_mac(struct net_device *ndev, struct iw_request_info *info,
 	return 0;
 }
 
-/* Setting policy also clears the MAC acl, even if we don't change the defaut
+/* Setting policy also clears the MAC acl, even if we don't change the default
  * policy
  */
 
@@ -2323,7 +2323,7 @@ prism54_process_trap_helper(islpci_private *priv, enum oid_num_t oid,
 
 	case DOT11_OID_BEACON:
 		send_formatted_event(priv,
-				     "Received a beacon from an unkown AP",
+				     "Received a beacon from an unknown AP",
 				     mlme, 0);
 		break;
 
diff --git a/drivers/net/wireless/rt2x00/rt2400pci.h b/drivers/net/wireless/rt2x00/rt2400pci.h
index ccd644104ad1..aced05775693 100644
--- a/drivers/net/wireless/rt2x00/rt2400pci.h
+++ b/drivers/net/wireless/rt2x00/rt2400pci.h
@@ -35,7 +35,7 @@
 
 /*
  * Signal information.
- * Defaul offset is required for RSSI <-> dBm conversion.
+ * Default offset is required for RSSI <-> dBm conversion.
  */
 #define DEFAULT_RSSI_OFFSET		100
 
diff --git a/drivers/net/wireless/rt2x00/rt2500pci.h b/drivers/net/wireless/rt2x00/rt2500pci.h
index 54d37957883c..3db9041838a4 100644
--- a/drivers/net/wireless/rt2x00/rt2500pci.h
+++ b/drivers/net/wireless/rt2x00/rt2500pci.h
@@ -46,7 +46,7 @@
 
 /*
  * Signal information.
- * Defaul offset is required for RSSI <-> dBm conversion.
+ * Default offset is required for RSSI <-> dBm conversion.
  */
 #define DEFAULT_RSSI_OFFSET		121
 
diff --git a/drivers/net/wireless/rt2x00/rt2500usb.h b/drivers/net/wireless/rt2x00/rt2500usb.h
index b01edca42583..d3000827883a 100644
--- a/drivers/net/wireless/rt2x00/rt2500usb.h
+++ b/drivers/net/wireless/rt2x00/rt2500usb.h
@@ -46,7 +46,7 @@
 
 /*
  * Signal information.
- * Defaul offset is required for RSSI <-> dBm conversion.
+ * Default offset is required for RSSI <-> dBm conversion.
  */
 #define DEFAULT_RSSI_OFFSET		120
 
diff --git a/drivers/net/wireless/rt2x00/rt61pci.h b/drivers/net/wireless/rt2x00/rt61pci.h
index 93eb699165cc..77b5116f549b 100644
--- a/drivers/net/wireless/rt2x00/rt61pci.h
+++ b/drivers/net/wireless/rt2x00/rt61pci.h
@@ -37,7 +37,7 @@
 
 /*
  * Signal information.
- * Defaul offset is required for RSSI <-> dBm conversion.
+ * Default offset is required for RSSI <-> dBm conversion.
  */
 #define DEFAULT_RSSI_OFFSET		120
 
diff --git a/drivers/net/wireless/rt2x00/rt73usb.h b/drivers/net/wireless/rt2x00/rt73usb.h
index 81fe0be51c42..e194332dac5f 100644
--- a/drivers/net/wireless/rt2x00/rt73usb.h
+++ b/drivers/net/wireless/rt2x00/rt73usb.h
@@ -37,7 +37,7 @@
 
 /*
  * Signal information.
- * Defaul offset is required for RSSI <-> dBm conversion.
+ * Default offset is required for RSSI <-> dBm conversion.
  */
 #define DEFAULT_RSSI_OFFSET		120
 
diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c
index 431a20ec6db6..b3b0b5b685c6 100644
--- a/drivers/net/wireless/wavelan_cs.c
+++ b/drivers/net/wireless/wavelan_cs.c
@@ -4011,7 +4011,7 @@ wavelan_interrupt(int		irq,
 #endif
 
   /* Prevent reentrancy. We need to do that because we may have
-   * multiple interrupt handler running concurently.
+   * multiple interrupt handler running concurrently.
    * It is safe because interrupts are disabled before aquiring
    * the spinlock. */
   spin_lock(&lp->spinlock);
diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index 6d666359a42f..2b7f96594373 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -312,7 +312,7 @@ static void tx_status(struct ieee80211_hw *hw, struct sk_buff *skb,
  * zd_mac_tx_failed - callback for failed frames
  * @dev: the mac80211 wireless device
  *
- * This function is called if a frame couldn't be succesfully be
+ * This function is called if a frame couldn't be successfully be
  * transferred. The first frame from the tx queue, will be selected and
  * reported as error to the upper layers.
  */
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index a6b4a5a53d40..f511e70d454c 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -650,7 +650,7 @@ ccio_clear_io_tlb(struct ioc *ioc, dma_addr_t iovp, size_t byte_cnt)
  * Mark the I/O Pdir entries invalid and blow away the corresponding I/O
  * TLB entries.
  *
- * FIXME: at some threshhold it might be "cheaper" to just blow
+ * FIXME: at some threshold it might be "cheaper" to just blow
  *        away the entire I/O TLB instead of individual entries.
  *
  * FIXME: Uturn has 256 TLB entries. We don't need to purge every
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index d93108d148fc..36db20a8f892 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -6533,7 +6533,7 @@ static struct ibm_struct volume_driver_data = {
  * 	The speeds are stored on handles
  * 	(FANA:FAN9), (FANC:FANB), (FANE:FAND).
  *
- * 	There are three default speed sets, acessible as handles:
+ * 	There are three default speed sets, accessible as handles:
  * 	FS1L,FS1M,FS1H; FS2L,FS2M,FS2H; FS3L,FS3M,FS3H
  *
  * 	ACPI DSDT switches which set is in use depending on various
diff --git a/drivers/pnp/pnpbios/rsparser.c b/drivers/pnp/pnpbios/rsparser.c
index 87b4f49a5251..a5135ebe5f07 100644
--- a/drivers/pnp/pnpbios/rsparser.c
+++ b/drivers/pnp/pnpbios/rsparser.c
@@ -191,7 +191,7 @@ static unsigned char *pnpbios_parse_allocated_resource_data(struct pnp_dev *dev,
 			return (unsigned char *)p;
 			break;
 
-		default:	/* an unkown tag */
+		default:	/* an unknown tag */
 len_err:
 			dev_err(&dev->dev, "unknown tag %#x length %d\n",
 				tag, len);
@@ -405,7 +405,7 @@ pnpbios_parse_resource_option_data(unsigned char *p, unsigned char *end,
 		case SMALL_TAG_END:
 			return p + 2;
 
-		default:	/* an unkown tag */
+		default:	/* an unknown tag */
 len_err:
 			dev_err(&dev->dev, "unknown tag %#x length %d\n",
 				tag, len);
@@ -475,7 +475,7 @@ static unsigned char *pnpbios_parse_compatible_ids(unsigned char *p,
 			return (unsigned char *)p;
 			break;
 
-		default:	/* an unkown tag */
+		default:	/* an unknown tag */
 len_err:
 			dev_err(&dev->dev, "unknown tag %#x length %d\n",
 				tag, len);
@@ -744,7 +744,7 @@ static unsigned char *pnpbios_encode_allocated_resource_data(struct pnp_dev
 			return (unsigned char *)p;
 			break;
 
-		default:	/* an unkown tag */
+		default:	/* an unknown tag */
 len_err:
 			dev_err(&dev->dev, "unknown tag %#x length %d\n",
 				tag, len);
diff --git a/drivers/ps3/ps3-sys-manager.c b/drivers/ps3/ps3-sys-manager.c
index 88cb74088611..3cbaf1811bd0 100644
--- a/drivers/ps3/ps3-sys-manager.c
+++ b/drivers/ps3/ps3-sys-manager.c
@@ -46,7 +46,7 @@
 /**
  * struct ps3_sys_manager_header - System manager message header.
  * @version: Header version, currently 1.
- * @size: Header size in bytes, curently 16.
+ * @size: Header size in bytes, currently 16.
  * @payload_size: Message payload size in bytes.
  * @service_id: Message type, one of enum ps3_sys_manager_service_id.
  * @request_tag: Unique number to identify reply.
diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c
index ad164056feb6..434e92fdb966 100644
--- a/drivers/rtc/rtc-v3020.c
+++ b/drivers/rtc/rtc-v3020.c
@@ -335,7 +335,7 @@ static int rtc_probe(struct platform_device *pdev)
 		goto err_io;
 	}
 
-	/* Make sure frequency measurment mode, test modes, and lock
+	/* Make sure frequency measurement mode, test modes, and lock
 	 * are all disabled */
 	v3020_set_reg(chip, V3020_STATUS_0, 0x0);
 
diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c
index 097d3846a828..f54b1eec6ddf 100644
--- a/drivers/s390/char/fs3270.c
+++ b/drivers/s390/char/fs3270.c
@@ -74,7 +74,7 @@ fs3270_do_io(struct raw3270_view *view, struct raw3270_request *rq)
 		}
 		rc = raw3270_start(view, rq);
 		if (rc == 0) {
-			/* Started sucessfully. Now wait for completion. */
+			/* Started successfully. Now wait for completion. */
 			wait_event(fp->wait, raw3270_request_final(rq));
 		}
 	} while (rc == -EACCES);
diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index 8ab51608da55..c268a2e5b7c3 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -65,7 +65,7 @@ static void set_chp_logically_online(struct chp_id chpid, int onoff)
 	chpid_to_chp(chpid)->state = onoff;
 }
 
-/* On succes return 0 if channel-path is varied offline, 1 if it is varied
+/* On success return 0 if channel-path is varied offline, 1 if it is varied
  * online. Return -ENODEV if channel-path is not registered. */
 int chp_get_status(struct chp_id chpid)
 {
diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c
index 30f516111307..2985eb439485 100644
--- a/drivers/s390/cio/cmf.c
+++ b/drivers/s390/cio/cmf.c
@@ -462,7 +462,7 @@ static struct cmb_area cmb_area = {
  * block of memory, which can not be moved as long as any channel
  * is active. Therefore, a maximum number of subchannels needs to
  * be defined somewhere. This is a module parameter, defaulting to
- * a resonable value of 1024, or 32 kb of memory.
+ * a reasonable value of 1024, or 32 kb of memory.
  * Current kernels don't allow kmalloc with more than 128kb, so the
  * maximum is 4096.
  */
diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c
index 58e583b61e60..aa2b60a868ba 100644
--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -92,11 +92,11 @@
 #define ENVCTRL_CPUTEMP_MON			1    /* cpu temperature monitor */
 #define ENVCTRL_CPUVOLTAGE_MON	  	2    /* voltage monitor         */
 #define ENVCTRL_FANSTAT_MON  		3    /* fan status monitor      */
-#define ENVCTRL_ETHERTEMP_MON		4    /* ethernet temperarture */
+#define ENVCTRL_ETHERTEMP_MON		4    /* ethernet temperature */
 					     /* monitor                     */
 #define ENVCTRL_VOLTAGESTAT_MON	  	5    /* voltage status monitor  */
 #define ENVCTRL_MTHRBDTEMP_MON		6    /* motherboard temperature */
-#define ENVCTRL_SCSITEMP_MON		7    /* scsi temperarture */
+#define ENVCTRL_SCSITEMP_MON		7    /* scsi temperature */
 #define ENVCTRL_GLOBALADDR_MON		8    /* global address */
 
 /* Child device type.
diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c
index f5a9addb7050..07ce9bfcdf06 100644
--- a/drivers/scsi/53c700.c
+++ b/drivers/scsi/53c700.c
@@ -1491,7 +1491,7 @@ NCR_700_intr(int irq, void *dev_id)
 	unsigned long flags;
 	int handled = 0;
 
-	/* Use the host lock to serialise acess to the 53c700
+	/* Use the host lock to serialise access to the 53c700
 	 * hardware.  Note: In future, we may need to take the queue
 	 * lock to enter the done routines.  When that happens, we
 	 * need to ensure that for this driver, the host lock and the
diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index cdbdec9f4fb2..83986ed86556 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -526,10 +526,10 @@ struct aac_driver_ident
 
 /*
  *	The adapter interface specs all queues to be located in the same
- *	physically contigous block. The host structure that defines the
+ *	physically contiguous block. The host structure that defines the
  *	commuication queues will assume they are each a separate physically
- *	contigous memory region that will support them all being one big
- *	contigous block.
+ *	contiguous memory region that will support them all being one big
+ *	contiguous block.
  *	There is a command and response queue for each level and direction of
  *	commuication. These regions are accessed by both the host and adapter.
  */
diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c
index d598eba630d0..666d5151d628 100644
--- a/drivers/scsi/aacraid/comminit.c
+++ b/drivers/scsi/aacraid/comminit.c
@@ -226,7 +226,7 @@ static int aac_comm_init(struct aac_dev * dev)
 	spin_lock_init(&dev->fib_lock);
 
 	/*
-	 *	Allocate the physically contigous space for the commuication
+	 *	Allocate the physically contiguous space for the commuication
 	 *	queue headers. 
 	 */
 
diff --git a/drivers/scsi/aic7xxx/aic79xx.seq b/drivers/scsi/aic7xxx/aic79xx.seq
index 3b66b5ae3d9f..2fb78e35a9e5 100644
--- a/drivers/scsi/aic7xxx/aic79xx.seq
+++ b/drivers/scsi/aic7xxx/aic79xx.seq
@@ -217,7 +217,7 @@ BEGIN_CRITICAL;
 scbdma_tohost_done:
 	test	CCSCBCTL, CCARREN jz fill_qoutfifo_dmadone;
 	/*
-	 * An SCB has been succesfully uploaded to the host.
+	 * An SCB has been successfully uploaded to the host.
 	 * If the SCB was uploaded for some reason other than
 	 * bad SCSI status (currently only for underruns), we
 	 * queue the SCB for normal completion.  Otherwise, we
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index 63b521d615f2..4d419c155ce9 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -2487,7 +2487,7 @@ ahd_handle_scsiint(struct ahd_softc *ahd, u_int intstat)
 		/*
 		 * Although the driver does not care about the
 		 * 'Selection in Progress' status bit, the busy
-		 * LED does.  SELINGO is only cleared by a sucessfull
+		 * LED does.  SELINGO is only cleared by a successfull
 		 * selection, so we must manually clear it to insure
 		 * the LED turns off just incase no future successful
 		 * selections occur (e.g. no devices on the bus).
diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
index 8dfb59d58992..45aa728a76b2 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_core.c
@@ -1733,7 +1733,7 @@ ahc_handle_scsiint(struct ahc_softc *ahc, u_int intstat)
 		/*
 		 * Although the driver does not care about the
 		 * 'Selection in Progress' status bit, the busy
-		 * LED does.  SELINGO is only cleared by a sucessfull
+		 * LED does.  SELINGO is only cleared by a successfull
 		 * selection, so we must manually clear it to insure
 		 * the LED turns off just incase no future successful
 		 * selections occur (e.g. no devices on the bus).
diff --git a/drivers/scsi/bfa/include/defs/bfa_defs_pport.h b/drivers/scsi/bfa/include/defs/bfa_defs_pport.h
index a000bc4e2d4a..bf320412ee24 100644
--- a/drivers/scsi/bfa/include/defs/bfa_defs_pport.h
+++ b/drivers/scsi/bfa/include/defs/bfa_defs_pport.h
@@ -61,7 +61,7 @@ enum bfa_pport_speed {
  * 		Port operational type (in sync with SNIA port type).
  */
 enum bfa_pport_type {
-	BFA_PPORT_TYPE_UNKNOWN = 1,	/*  port type is unkown */
+	BFA_PPORT_TYPE_UNKNOWN = 1,	/*  port type is unknown */
 	BFA_PPORT_TYPE_TRUNKED = 2,	/*  Trunked mode */
 	BFA_PPORT_TYPE_NPORT   = 5,	/*  P2P with switched fabric */
 	BFA_PPORT_TYPE_NLPORT  = 6,	/*  public loop */
diff --git a/drivers/scsi/bfa/include/defs/bfa_defs_tsensor.h b/drivers/scsi/bfa/include/defs/bfa_defs_tsensor.h
index 31881d218515..ade763dbc8ce 100644
--- a/drivers/scsi/bfa/include/defs/bfa_defs_tsensor.h
+++ b/drivers/scsi/bfa/include/defs/bfa_defs_tsensor.h
@@ -25,7 +25,7 @@
  * Temperature sensor status values
  */
 enum bfa_tsensor_status {
-	BFA_TSENSOR_STATUS_UNKNOWN   = 1,   /*  unkown status */
+	BFA_TSENSOR_STATUS_UNKNOWN   = 1,   /*  unknown status */
 	BFA_TSENSOR_STATUS_FAULTY    = 2,   /*  sensor is faulty */
 	BFA_TSENSOR_STATUS_BELOW_MIN = 3,   /*  temperature below mininum */
 	BFA_TSENSOR_STATUS_NOMINAL   = 4,   /*  normal temperature */
diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c
index a0e7e711ff9d..5be67a6fca93 100644
--- a/drivers/scsi/hptiop.c
+++ b/drivers/scsi/hptiop.c
@@ -834,7 +834,7 @@ static int hptiop_reset_hba(struct hptiop_hba *hba)
 			atomic_read(&hba->resetting) == 0, 60 * HZ);
 
 	if (atomic_read(&hba->resetting)) {
-		/* IOP is in unkown state, abort reset */
+		/* IOP is in unknown state, abort reset */
 		printk(KERN_ERR "scsi%d: reset failed\n", hba->host->host_no);
 		return -1;
 	}
diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c
index bd2f77197447..6486ae4591b8 100644
--- a/drivers/scsi/libfc/fc_lport.c
+++ b/drivers/scsi/libfc/fc_lport.c
@@ -56,7 +56,7 @@
  * at the same time.
  *
  * When discovery succeeds or fails a callback is made to the lport as
- * notification. Currently, succesful discovery causes the lport to take no
+ * notification. Currently, successful discovery causes the lport to take no
  * action. A failure will cause the lport to reset. There is likely a circular
  * locking problem with this implementation.
  */
diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
index 2e0746d70303..ca25ee5190b0 100644
--- a/drivers/scsi/libiscsi_tcp.c
+++ b/drivers/scsi/libiscsi_tcp.c
@@ -1004,7 +1004,7 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task)
  * iscsi_tcp_task_xmit - xmit normal PDU task
  * @task: iscsi command task
  *
- * We're expected to return 0 when everything was transmitted succesfully,
+ * We're expected to return 0 when everything was transmitted successfully,
  * -EAGAIN if there's still data in the queue, or != 0 for any other kind
  * of error.
  */
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index e1a30a16a9fa..9bd19aa14249 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -654,7 +654,7 @@ lpfc_selective_reset(struct lpfc_hba *phba)
  * Notes:
  * Assumes any error from lpfc_selective_reset() will be negative.
  * If lpfc_selective_reset() returns zero then the length of the buffer
- * is returned which indicates succcess
+ * is returned which indicates success
  *
  * Returns:
  * -EINVAL if the buffer does not contain the string "selective"
@@ -3147,7 +3147,7 @@ sysfs_ctlreg_write(struct kobject *kobj, struct bin_attribute *bin_attr,
  * sysfs_ctlreg_read - Read method for reading from ctlreg
  * @kobj: kernel kobject that contains the kernel class device.
  * @bin_attr: kernel attributes passed to us.
- * @buf: if succesful contains the data from the adapter IOREG space.
+ * @buf: if successful contains the data from the adapter IOREG space.
  * @off: offset into buffer to beginning of data.
  * @count: bytes to transfer.
  *
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index 45337cd23feb..a14ab4580d4e 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -802,7 +802,7 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 
 	/* FLOGI completes successfully */
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_ELS,
-			 "0101 FLOGI completes sucessfully "
+			 "0101 FLOGI completes successfully "
 			 "Data: x%x x%x x%x x%x\n",
 			 irsp->un.ulpWord[4], sp->cmn.e_d_tov,
 			 sp->cmn.w2.r_a_tov, sp->cmn.edtovResolution);
@@ -4133,7 +4133,7 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 	/* Indicate we are walking fc_rscn_id_list on this vport */
 	vport->fc_rscn_flush = 1;
 	spin_unlock_irq(shost->host_lock);
-	/* Get the array count after sucessfully have the token */
+	/* Get the array count after successfully have the token */
 	rscn_cnt = vport->fc_rscn_id_cnt;
 	/* If we are already processing an RSCN, save the received
 	 * RSCN payload buffer, cmdiocb->context2 to process later.
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 562d8cee874b..82f8ab5c72cd 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -645,7 +645,7 @@ lpfc_hba_down_prep(struct lpfc_hba *phba)
  * down the SLI Layer.
  *
  * Return codes
- *   0 - sucess.
+ *   0 - success.
  *   Any other value - error.
  **/
 static int
@@ -700,7 +700,7 @@ lpfc_hba_down_post_s3(struct lpfc_hba *phba)
  * down the SLI Layer.
  *
  * Return codes
- *   0 - sucess.
+ *   0 - success.
  *   Any other value - error.
  **/
 static int
@@ -755,7 +755,7 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba)
  * uninitialization after the HBA is reset when bring down the SLI Layer.
  *
  * Return codes
- *   0 - sucess.
+ *   0 - success.
  *   Any other value - error.
  **/
 int
@@ -1254,7 +1254,7 @@ lpfc_handle_eratt_s4(struct lpfc_hba *phba)
  * routine from the API jump table function pointer from the lpfc_hba struct.
  *
  * Return codes
- *   0 - sucess.
+ *   0 - success.
  *   Any other value - error.
  **/
 void
@@ -3124,7 +3124,7 @@ static void lpfc_log_intr_mode(struct lpfc_hba *phba, uint32_t intr_mode)
  * PCI devices.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  **/
 static int
@@ -3220,7 +3220,7 @@ lpfc_reset_hba(struct lpfc_hba *phba)
  * support the SLI-3 HBA device it attached to.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  **/
 static int
@@ -3321,7 +3321,7 @@ lpfc_sli_driver_resource_unset(struct lpfc_hba *phba)
  * support the SLI-4 HBA device it attached to.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  **/
 static int
@@ -3642,7 +3642,7 @@ lpfc_init_api_table_setup(struct lpfc_hba *phba, uint8_t dev_grp)
  * device specific resource setup to support the HBA device it attached to.
  *
  * Return codes
- *	0 - sucessful
+ *	0 - successful
  *	other values - error
  **/
 static int
@@ -3688,7 +3688,7 @@ lpfc_setup_driver_resource_phase1(struct lpfc_hba *phba)
  * device specific resource setup to support the HBA device it attached to.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  **/
 static int
@@ -3753,7 +3753,7 @@ lpfc_free_iocb_list(struct lpfc_hba *phba)
  * list and set up the IOCB tag array accordingly.
  *
  * Return codes
- *	0 - sucessful
+ *	0 - successful
  *	other values - error
  **/
 static int
@@ -3872,7 +3872,7 @@ lpfc_free_active_sgl(struct lpfc_hba *phba)
  * list and set up the sgl xritag tag array accordingly.
  *
  * Return codes
- *	0 - sucessful
+ *	0 - successful
  *	other values - error
  **/
 static int
@@ -3986,7 +3986,7 @@ out_free_mem:
  * enabled and the driver is reinitializing the device.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	ENOMEM - No availble memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -4146,7 +4146,7 @@ lpfc_sli4_remove_rpi_hdrs(struct lpfc_hba *phba)
  * PCI device data structure is set.
  *
  * Return codes
- *      pointer to @phba - sucessful
+ *      pointer to @phba - successful
  *      NULL - error
  **/
 static struct lpfc_hba *
@@ -4202,7 +4202,7 @@ lpfc_hba_free(struct lpfc_hba *phba)
  * host with it.
  *
  * Return codes
- *      0 - sucessful
+ *      0 - successful
  *      other values - error
  **/
 static int
@@ -4365,7 +4365,7 @@ lpfc_post_init_setup(struct lpfc_hba *phba)
  * with SLI-3 interface spec.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  **/
 static int
@@ -4662,7 +4662,7 @@ lpfc_sli4_bar2_register_memmap(struct lpfc_hba *phba, uint32_t vf)
  * this routine.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	ENOMEM - could not allocated memory.
  **/
 static int
@@ -4761,7 +4761,7 @@ lpfc_destroy_bootstrap_mbox(struct lpfc_hba *phba)
  * allocation for the port.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	ENOMEM - No availble memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -4861,7 +4861,7 @@ lpfc_sli4_read_config(struct lpfc_hba *phba)
  * HBA consistent with the SLI-4 interface spec.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	ENOMEM - No availble memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -4910,7 +4910,7 @@ lpfc_setup_endian_order(struct lpfc_hba *phba)
  * we just use some constant number as place holder.
  *
  * Return codes
- *      0 - sucessful
+ *      0 - successful
  *      ENOMEM - No availble memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -5218,7 +5218,7 @@ out_error:
  * operation.
  *
  * Return codes
- *      0 - sucessful
+ *      0 - successful
  *      ENOMEM - No availble memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -5286,7 +5286,7 @@ lpfc_sli4_queue_destroy(struct lpfc_hba *phba)
  * operation.
  *
  * Return codes
- *      0 - sucessful
+ *      0 - successful
  *      ENOMEM - No availble memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -5552,7 +5552,7 @@ out_error:
  * operation.
  *
  * Return codes
- *      0 - sucessful
+ *      0 - successful
  *      ENOMEM - No availble memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -5599,7 +5599,7 @@ lpfc_sli4_queue_unset(struct lpfc_hba *phba)
  * Later, this can be used for all the slow-path events.
  *
  * Return codes
- *      0 - sucessful
+ *      0 - successful
  *      -ENOMEM - No availble memory
  **/
 static int
@@ -5760,7 +5760,7 @@ lpfc_sli4_cq_event_release_all(struct lpfc_hba *phba)
  * all resources assigned to the PCI function which originates this request.
  *
  * Return codes
- *      0 - sucessful
+ *      0 - successful
  *      ENOMEM - No availble memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -5923,7 +5923,7 @@ lpfc_sli4_fcfi_unreg(struct lpfc_hba *phba, uint16_t fcfi)
  * with SLI-4 interface spec.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  **/
 static int
@@ -6052,7 +6052,7 @@ lpfc_sli4_pci_mem_unset(struct lpfc_hba *phba)
  * will be left with MSI-X enabled and leaks its vectors.
  *
  * Return codes
- *   0 - sucessful
+ *   0 - successful
  *   other values - error
  **/
 static int
@@ -6184,7 +6184,7 @@ lpfc_sli_disable_msix(struct lpfc_hba *phba)
  * is done in this function.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  */
 static int
@@ -6243,7 +6243,7 @@ lpfc_sli_disable_msi(struct lpfc_hba *phba)
  * MSI-X -> MSI -> IRQ.
  *
  * Return codes
- *   0 - sucessful
+ *   0 - successful
  *   other values - error
  **/
 static uint32_t
@@ -6333,7 +6333,7 @@ lpfc_sli_disable_intr(struct lpfc_hba *phba)
  * enabled and leaks its vectors.
  *
  * Return codes
- * 0 - sucessful
+ * 0 - successful
  * other values - error
  **/
 static int
@@ -6443,7 +6443,7 @@ lpfc_sli4_disable_msix(struct lpfc_hba *phba)
  * which is done in this function.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  **/
 static int
@@ -6508,7 +6508,7 @@ lpfc_sli4_disable_msi(struct lpfc_hba *phba)
  * MSI-X -> MSI -> IRQ.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	other values - error
  **/
 static uint32_t
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 43cbe336f1f8..42d4f3dae1d6 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -1794,7 +1794,7 @@ lpfc_sli_handle_mb_event(struct lpfc_hba *phba)
 		 */
 		if (lpfc_sli_chk_mbx_command(pmbox->mbxCommand) ==
 		    MBX_SHUTDOWN) {
-			/* Unknow mailbox command compl */
+			/* Unknown mailbox command compl */
 			lpfc_printf_log(phba, KERN_ERR, LOG_MBOX | LOG_SLI,
 					"(%d):0323 Unknown Mailbox command "
 					"x%x (x%x) Cmpl\n",
@@ -4163,7 +4163,7 @@ lpfc_sli4_read_fcoe_params(struct lpfc_hba *phba,
  * addition, this routine gets the port vpd data.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	ENOMEM - could not allocated memory.
  **/
 static int
@@ -11091,7 +11091,7 @@ lpfc_sli4_handle_received_buffer(struct lpfc_hba *phba)
  * sequential.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  *      EIO - The mailbox failed to complete successfully.
  * 	When this error occurs, the driver is not guaranteed
  *	to have any rpi regions posted to the device and
@@ -11129,7 +11129,7 @@ lpfc_sli4_post_all_rpi_hdrs(struct lpfc_hba *phba)
  * maps up to 64 rpi context regions.
  *
  * Return codes
- * 	0 - sucessful
+ * 	0 - successful
  * 	ENOMEM - No available memory
  *      EIO - The mailbox failed to complete successfully.
  **/
@@ -11191,7 +11191,7 @@ lpfc_sli4_post_rpi_hdr(struct lpfc_hba *phba, struct lpfc_rpi_hdr *rpi_page)
  * PAGE_SIZE modulo 64 rpi context headers.
  *
  * Returns
- * 	A nonzero rpi defined as rpi_base <= rpi < max_rpi if sucessful
+ * 	A nonzero rpi defined as rpi_base <= rpi < max_rpi if successful
  * 	LPFC_RPI_ALLOC_ERROR if no rpis are available.
  **/
 int
diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h
index 512c2cc1a33f..d310f49d077e 100644
--- a/drivers/scsi/megaraid.h
+++ b/drivers/scsi/megaraid.h
@@ -381,7 +381,7 @@ typedef struct {
 	u8	battery_status;	/*
 				 * BIT 0: battery module missing
 				 * BIT 1: VBAD
-				 * BIT 2: temprature high
+				 * BIT 2: temperature high
 				 * BIT 3: battery pack missing
 				 * BIT 4,5:
 				 *   00 - charge complete
diff --git a/drivers/scsi/megaraid/mbox_defs.h b/drivers/scsi/megaraid/mbox_defs.h
index b25b74764ec3..ce2487a888ed 100644
--- a/drivers/scsi/megaraid/mbox_defs.h
+++ b/drivers/scsi/megaraid/mbox_defs.h
@@ -497,7 +497,7 @@ typedef struct {
  * @inserted_drive		: channel:Id of inserted drive
  * @battery_status		: bit 0: battery module missing
  *				bit 1: VBAD
- *				bit 2: temprature high
+ *				bit 2: temperature high
  *				bit 3: battery pack missing
  *				bit 4,5:
  *					00 - charge complete
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index 234f0b7eb21c..f9ae8037a710 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -2704,7 +2704,7 @@ megaraid_reset_handler(struct scsi_cmnd *scp)
 	}
 	else {
 		con_log(CL_ANN, (KERN_NOTICE
-		"megaraid mbox: reset sequence completed sucessfully\n"));
+		"megaraid mbox: reset sequence completed successfully\n"));
 	}
 
 
diff --git a/drivers/scsi/mpt2sas/mpt2sas_scsih.c b/drivers/scsi/mpt2sas/mpt2sas_scsih.c
index 86ab32d7ab15..756e509d495c 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_scsih.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_scsih.c
@@ -2894,7 +2894,7 @@ _scsih_normalize_sense(char *sense_buffer, struct sense_info *data)
 
 #ifdef CONFIG_SCSI_MPT2SAS_LOGGING
 /**
- * _scsih_scsi_ioc_info - translated non-succesfull SCSI_IO request
+ * _scsih_scsi_ioc_info - translated non-successfull SCSI_IO request
  * @ioc: per adapter object
  * @scmd: pointer to scsi command object
  * @mpi_reply: reply mf payload returned from firmware
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index e3c482aa87b5..a2d569828308 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -6495,7 +6495,7 @@ static void ncr_int_ma (struct ncb *np)
 	**	we force a SIR_NEGO_PROTO interrupt (it is a hack that avoids 
 	**	bloat for such a should_not_happen situation).
 	**	In all other situation, we reset the BUS.
-	**	Are these assumptions reasonnable ? (Wait and see ...)
+	**	Are these assumptions reasonable ? (Wait and see ...)
 	*/
 unexpected_phase:
 	dsp -= 8;
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index f7c70e2a8224..d3d39f86fcf7 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -3342,7 +3342,7 @@ static int pmcraid_chr_fasync(int fd, struct file *filep, int mode)
  * @direction : data transfer direction
  *
  * Return value
- *  0 on sucess, non-zero error code on failure
+ *  0 on success, non-zero error code on failure
  */
 static int pmcraid_build_passthrough_ioadls(
 	struct pmcraid_cmd *cmd,
@@ -3401,7 +3401,7 @@ static int pmcraid_build_passthrough_ioadls(
  * @direction: data transfer direction
  *
  * Return value
- *  0 on sucess, non-zero error code on failure
+ *  0 on success, non-zero error code on failure
  */
 static void pmcraid_release_passthrough_ioadls(
 	struct pmcraid_cmd *cmd,
@@ -3429,7 +3429,7 @@ static void pmcraid_release_passthrough_ioadls(
  * @arg: pointer to pmcraid_passthrough_buffer user buffer
  *
  * Return value
- *  0 on sucess, non-zero error code on failure
+ *  0 on success, non-zero error code on failure
  */
 static long pmcraid_ioctl_passthrough(
 	struct pmcraid_instance *pinstance,
diff --git a/drivers/scsi/pmcraid.h b/drivers/scsi/pmcraid.h
index 3441b3f90827..2752b56cad56 100644
--- a/drivers/scsi/pmcraid.h
+++ b/drivers/scsi/pmcraid.h
@@ -771,11 +771,11 @@ static struct pmcraid_ioasc_error pmcraid_ioasc_error_table[] = {
 	{0x01180600, IOASC_LOG_LEVEL_MUST,
 	 "Recovered Error, soft media error, sector reassignment suggested"},
 	{0x015D0000, IOASC_LOG_LEVEL_MUST,
-	 "Recovered Error, failure prediction thresold exceeded"},
+	 "Recovered Error, failure prediction threshold exceeded"},
 	{0x015D9200, IOASC_LOG_LEVEL_MUST,
-	 "Recovered Error, soft Cache Card Battery error thresold"},
+	 "Recovered Error, soft Cache Card Battery error threshold"},
 	{0x015D9200, IOASC_LOG_LEVEL_MUST,
-	 "Recovered Error, soft Cache Card Battery error thresold"},
+	 "Recovered Error, soft Cache Card Battery error threshold"},
 	{0x02048000, IOASC_LOG_LEVEL_MUST,
 	 "Not Ready, IOA Reset Required"},
 	{0x02408500, IOASC_LOG_LEVEL_MUST,
diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c
index 723fdecd91bd..0fd6ae6911ad 100644
--- a/drivers/scsi/scsi_netlink.c
+++ b/drivers/scsi/scsi_netlink.c
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(scsi_nl_send_transport_msg);
  * @data_buf:		pointer to vendor unique data buffer
  *
  * Returns:
- *   0 on succesful return
+ *   0 on successful return
  *   otherwise, failing error code
  *
  * Notes:
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index fd47cb1bee1b..f27e52d963d3 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -666,7 +666,7 @@ EXPORT_SYMBOL(sas_phy_add);
  *
  * Note:
  *   This function must only be called on a PHY that has not
- *   sucessfully been added using sas_phy_add().
+ *   successfully been added using sas_phy_add().
  */
 void sas_phy_free(struct sas_phy *phy)
 {
@@ -896,7 +896,7 @@ EXPORT_SYMBOL(sas_port_add);
  *
  * Note:
  *   This function must only be called on a PORT that has not
- *   sucessfully been added using sas_port_add().
+ *   successfully been added using sas_port_add().
  */
 void sas_port_free(struct sas_port *port)
 {
@@ -1476,7 +1476,7 @@ EXPORT_SYMBOL(sas_rphy_add);
  *
  * Note:
  *   This function must only be called on a remote
- *   PHY that has not sucessfully been added using
+ *   PHY that has not successfully been added using
  *   sas_rphy_add() (or has been sas_rphy_remove()'d)
  */
 void sas_rphy_free(struct sas_rphy *rphy)
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index 45374d66d26a..2b38f6ad6e11 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -1864,7 +1864,7 @@ static pci_ers_result_t sym2_io_slot_dump(struct pci_dev *pdev)
  *
  * This routine is similar to sym_set_workarounds(), except
  * that, at this point, we already know that the device was
- * succesfully intialized at least once before, and so most
+ * successfully intialized at least once before, and so most
  * of the steps taken there are un-needed here.
  */
 static void sym2_reset_workarounds(struct pci_dev *pdev)
diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c
index 297deb817a5d..a7bc8b7b09ac 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.c
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c
@@ -2692,7 +2692,7 @@ static void sym_int_ma (struct sym_hcb *np)
 	 *  we force a SIR_NEGO_PROTO interrupt (it is a hack that avoids 
 	 *  bloat for such a should_not_happen situation).
 	 *  In all other situation, we reset the BUS.
-	 *  Are these assumptions reasonnable ? (Wait and see ...)
+	 *  Are these assumptions reasonable ? (Wait and see ...)
 	 */
 unexpected_phase:
 	dsp -= 8;
diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.h b/drivers/scsi/sym53c8xx_2/sym_hipd.h
index 053e63c86822..5a80cbac3f92 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.h
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.h
@@ -54,7 +54,7 @@
  *
  *    SYM_OPT_LIMIT_COMMAND_REORDERING
  *        When this option is set, the driver tries to limit tagged 
- *        command reordering to some reasonnable value.
+ *        command reordering to some reasonable value.
  *        (set for Linux)
  */
 #if 0
diff --git a/drivers/serial/8250_pnp.c b/drivers/serial/8250_pnp.c
index d71dfe398940..36ede02ceacf 100644
--- a/drivers/serial/8250_pnp.c
+++ b/drivers/serial/8250_pnp.c
@@ -361,9 +361,9 @@ static const struct pnp_device_id pnp_dev_table[] = {
 	{	"LTS0001",		0       },
 	/* Rockwell's (PORALiNK) 33600 INT PNP */
 	{	"WCI0003",		0	},
-	/* Unkown PnP modems */
+	/* Unknown PnP modems */
 	{	"PNPCXXX",		UNKNOWN_DEV	},
-	/* More unkown PnP modems */
+	/* More unknown PnP modems */
 	{	"PNPDXXX",		UNKNOWN_DEV	},
 	{	"",			0	}
 };
diff --git a/drivers/serial/pmac_zilog.h b/drivers/serial/pmac_zilog.h
index 570b0d925e83..f6e77f12acd5 100644
--- a/drivers/serial/pmac_zilog.h
+++ b/drivers/serial/pmac_zilog.h
@@ -73,7 +73,7 @@ static inline struct uart_pmac_port *pmz_get_port_A(struct uart_pmac_port *uap)
 }
 
 /*
- * Register acessors. Note that we don't need to enforce a recovery
+ * Register accessors. Note that we don't need to enforce a recovery
  * delay on PCI PowerMac hardware, it's dealt in HW by the MacIO chip,
  * though if we try to use this driver on older machines, we might have
  * to add it back
diff --git a/drivers/serial/ucc_uart.c b/drivers/serial/ucc_uart.c
index 0c08f286a2ef..46de564aaea0 100644
--- a/drivers/serial/ucc_uart.c
+++ b/drivers/serial/ucc_uart.c
@@ -313,7 +313,7 @@ static void qe_uart_stop_tx(struct uart_port *port)
  * This function will attempt to stuff of all the characters from the
  * kernel's transmit buffer into TX BDs.
  *
- * A return value of non-zero indicates that it sucessfully stuffed all
+ * A return value of non-zero indicates that it successfully stuffed all
  * characters from the kernel buffer.
  *
  * A return value of zero indicates that there are still characters in the
diff --git a/drivers/telephony/ixj.c b/drivers/telephony/ixj.c
index 40de151f2789..e89304c72568 100644
--- a/drivers/telephony/ixj.c
+++ b/drivers/telephony/ixj.c
@@ -4190,7 +4190,7 @@ static void ixj_aec_start(IXJ *j, int level)
 				ixj_WriteDSPCommand(0x1224, j);
 
 			ixj_WriteDSPCommand(0xE014, j);
-			ixj_WriteDSPCommand(0x0003, j);	/* Lock threashold at 3dB */
+			ixj_WriteDSPCommand(0x0003, j);	/* Lock threshold at 3dB */
 
 			ixj_WriteDSPCommand(0xE338, j);	/* Set Echo Suppresser Attenuation to 0dB */
 
@@ -4235,7 +4235,7 @@ static void ixj_aec_start(IXJ *j, int level)
 				ixj_WriteDSPCommand(0x1224, j);
 
 			ixj_WriteDSPCommand(0xE014, j);
-			ixj_WriteDSPCommand(0x0003, j);	/* Lock threashold at 3dB */
+			ixj_WriteDSPCommand(0x0003, j);	/* Lock threshold at 3dB */
 
 			ixj_WriteDSPCommand(0xE338, j);	/* Set Echo Suppresser Attenuation to 0dB */
 
diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c
index d171b563e94c..bba4d3eabe0f 100644
--- a/drivers/usb/atm/ueagle-atm.c
+++ b/drivers/usb/atm/ueagle-atm.c
@@ -1958,7 +1958,7 @@ static void uea_dispatch_cmv_e1(struct uea_softc *sc, struct intr_pkt *intr)
 		goto bad1;
 
 	/* FIXME : ADI930 reply wrong preambule (func = 2, sub = 2) to
-	 * the first MEMACESS cmv. Ignore it...
+	 * the first MEMACCESS cmv. Ignore it...
 	 */
 	if (cmv->bFunction != dsc->function) {
 		if (UEA_CHIP_VERSION(sc) == ADI930
diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 2473cf0c6b1d..b4bd2411c666 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -1,5 +1,5 @@
 /**
- * drivers/usb/class/usbtmc.c - USB Test & Measurment class driver
+ * drivers/usb/class/usbtmc.c - USB Test & Measurement class driver
  *
  * Copyright (C) 2007 Stefan Kopp, Gechingen, Germany
  * Copyright (C) 2008 Novell, Inc.
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index da718e84d58d..e80f1af438c8 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1890,7 +1890,7 @@ static void cancel_async_set_config(struct usb_device *udev)
  * routine gets around the normal restrictions by using a work thread to
  * submit the change-config request.
  *
- * Returns 0 if the request was succesfully queued, error code otherwise.
+ * Returns 0 if the request was successfully queued, error code otherwise.
  * The caller has no way to know whether the queued request will eventually
  * succeed.
  */
diff --git a/drivers/usb/gadget/f_acm.c b/drivers/usb/gadget/f_acm.c
index 7953948bfe4a..4e3657808b0f 100644
--- a/drivers/usb/gadget/f_acm.c
+++ b/drivers/usb/gadget/f_acm.c
@@ -432,7 +432,7 @@ static void acm_disable(struct usb_function *f)
  * @length: size of data
  * Context: irqs blocked, acm->lock held, acm_notify_req non-null
  *
- * Returns zero on sucess or a negative errno.
+ * Returns zero on success or a negative errno.
  *
  * See section 6.3.5 of the CDC 1.1 specification for information
  * about the only notification we issue:  SerialState change.
diff --git a/drivers/usb/gadget/pxa27x_udc.c b/drivers/usb/gadget/pxa27x_udc.c
index 1937d8c7b433..adda1208a1ec 100644
--- a/drivers/usb/gadget/pxa27x_udc.c
+++ b/drivers/usb/gadget/pxa27x_udc.c
@@ -1524,7 +1524,7 @@ static int pxa_udc_get_frame(struct usb_gadget *_gadget)
  * pxa_udc_wakeup - Force udc device out of suspend
  * @_gadget: usb gadget
  *
- * Returns 0 if succesfull, error code otherwise
+ * Returns 0 if successfull, error code otherwise
  */
 static int pxa_udc_wakeup(struct usb_gadget *_gadget)
 {
diff --git a/drivers/usb/host/fhci-sched.c b/drivers/usb/host/fhci-sched.c
index 62a226b61670..00a29855d0c4 100644
--- a/drivers/usb/host/fhci-sched.c
+++ b/drivers/usb/host/fhci-sched.c
@@ -627,7 +627,7 @@ irqreturn_t fhci_irq(struct usb_hcd *hcd)
 
 
 /*
- * Process normal completions(error or sucess) and clean the schedule.
+ * Process normal completions(error or success) and clean the schedule.
  *
  * This is the main path for handing urbs back to drivers. The only other patth
  * is process_del_list(),which unlinks URBs by scanning EDs,instead of scanning
diff --git a/drivers/usb/wusbcore/crypto.c b/drivers/usb/wusbcore/crypto.c
index 9ec7fd5da489..9579cf4c38bf 100644
--- a/drivers/usb/wusbcore/crypto.c
+++ b/drivers/usb/wusbcore/crypto.c
@@ -111,7 +111,7 @@ struct aes_ccm_b1 {
  *
  * CCM uses Ax blocks to generate a keystream with which the MIC and
  * the message's payload are encoded. A0 always encrypts/decrypts the
- * MIC. Ax (x>0) are used for the sucesive payload blocks.
+ * MIC. Ax (x>0) are used for the successive payload blocks.
  *
  * The x is the counter, and is increased for each block.
  */
diff --git a/drivers/usb/wusbcore/wa-xfer.c b/drivers/usb/wusbcore/wa-xfer.c
index 613a5fc490d3..489b47833e2c 100644
--- a/drivers/usb/wusbcore/wa-xfer.c
+++ b/drivers/usb/wusbcore/wa-xfer.c
@@ -558,7 +558,7 @@ static void wa_seg_dto_cb(struct urb *urb)
 /*
  * Callback for the segment request
  *
- * If succesful transition state (unless already transitioned or
+ * If successful transition state (unless already transitioned or
  * outbound transfer); otherwise, take a note of the error, mark this
  * segment done and try completion.
  *
@@ -1364,7 +1364,7 @@ segment_aborted:
 /*
  * Callback for the IN data phase
  *
- * If succesful transition state; otherwise, take a note of the
+ * If successful transition state; otherwise, take a note of the
  * error, mark this segment done and try completion.
  *
  * Note we don't access until we are sure that the transfer hasn't
diff --git a/drivers/uwb/i1480/dfu/usb.c b/drivers/uwb/i1480/dfu/usb.c
index c7080d497311..0bb665a0c024 100644
--- a/drivers/uwb/i1480/dfu/usb.c
+++ b/drivers/uwb/i1480/dfu/usb.c
@@ -229,7 +229,7 @@ void i1480_usb_neep_cb(struct urb *urb)
  * will verify it.
  *
  * Set i1480->evt_result with the result of getting the event or its
- * size (if succesful).
+ * size (if successful).
  *
  * Delivers the data directly to i1480->evt_buf
  */
diff --git a/drivers/uwb/wlp/txrx.c b/drivers/uwb/wlp/txrx.c
index 86a853b84119..7350ed6909f8 100644
--- a/drivers/uwb/wlp/txrx.c
+++ b/drivers/uwb/wlp/txrx.c
@@ -282,7 +282,7 @@ EXPORT_SYMBOL_GPL(wlp_receive_frame);
  *         and transmission will be done by the calling function.
  * @dst:   On return this will contain the device address to which the
  *         frame is destined.
- * @returns: 0 on success no tx : WLP header sucessfully applied to skb buffer,
+ * @returns: 0 on success no tx : WLP header successfully applied to skb buffer,
  *                                calling function can proceed with tx
  *           1 on success with tx : WLP will take over transmission of this
  *                                  frame
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c
index 913b4a47ae52..1ddeb4c34763 100644
--- a/drivers/video/aty/atyfb_base.c
+++ b/drivers/video/aty/atyfb_base.c
@@ -3276,7 +3276,7 @@ static void __devinit aty_init_lcd(struct atyfb_par *par, u32 bios_base)
 				txtformat = "24 bit interface";
 				break;
 			default:
-				txtformat = "unkown format";
+				txtformat = "unknown format";
 			}
 		} else {
 			switch (format & 7) {
@@ -3299,7 +3299,7 @@ static void __devinit aty_init_lcd(struct atyfb_par *par, u32 bios_base)
 				txtformat = "262144 colours (FDPI-2 mode)";
 				break;
 			default:
-				txtformat = "unkown format";
+				txtformat = "unknown format";
 			}
 		}
 		PRINTKI("%s%s %s monitor detected: %s\n",
diff --git a/drivers/video/backlight/atmel-pwm-bl.c b/drivers/video/backlight/atmel-pwm-bl.c
index 505c0823a105..2cf7ba52f67c 100644
--- a/drivers/video/backlight/atmel-pwm-bl.c
+++ b/drivers/video/backlight/atmel-pwm-bl.c
@@ -158,7 +158,7 @@ static int atmel_pwm_bl_probe(struct platform_device *pdev)
 			goto err_free_pwm;
 		}
 
-		/* Turn display off by defatult. */
+		/* Turn display off by default. */
 		retval = gpio_direction_output(pwmbl->gpio_on,
 				0 ^ pdata->on_active_low);
 		if (retval)
diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c
index 50ec17dfc517..fa32b94a4546 100644
--- a/drivers/video/backlight/tosa_lcd.c
+++ b/drivers/video/backlight/tosa_lcd.c
@@ -177,7 +177,7 @@ static int __devinit tosa_lcd_probe(struct spi_device *spi)
 	if (!data)
 		return -ENOMEM;
 
-	data->is_vga = true; /* defaut to VGA mode */
+	data->is_vga = true; /* default to VGA mode */
 
 	/*
 	 * bits_per_word cannot be configured in platform data
diff --git a/drivers/video/console/sticore.c b/drivers/video/console/sticore.c
index 857b3668b3ba..6468a297e341 100644
--- a/drivers/video/console/sticore.c
+++ b/drivers/video/console/sticore.c
@@ -436,7 +436,7 @@ sti_init_glob_cfg(struct sti_struct *sti,
 			    (offs < PCI_BASE_ADDRESS_0 ||
 			     offs > PCI_BASE_ADDRESS_5)) {
 				printk (KERN_WARNING
-					"STI pci region maping for region %d (%02x) can't be mapped\n",
+					"STI pci region mapping for region %d (%02x) can't be mapped\n",
 					i,sti->rm_entry[i]);
 				continue;
 			}
diff --git a/drivers/video/gbefb.c b/drivers/video/gbefb.c
index 1a83709f9611..492e6e64b653 100644
--- a/drivers/video/gbefb.c
+++ b/drivers/video/gbefb.c
@@ -701,7 +701,7 @@ static int gbefb_set_par(struct fb_info *info)
 	   blocks of 512x128, 256x128 or 128x128 pixels, respectively for 8bit,
 	   16bit and 32 bit modes (64 kB). They cover the screen with partial
 	   tiles on the right and/or bottom of the screen if needed.
-	   For exemple in 640x480 8 bit mode the mapping is:
+	   For example in 640x480 8 bit mode the mapping is:
 
 	   <-------- 640 ----->
 	   <---- 512 ----><128|384 offscreen>
diff --git a/drivers/video/stifb.c b/drivers/video/stifb.c
index 6120f0c526fe..876648e15e9d 100644
--- a/drivers/video/stifb.c
+++ b/drivers/video/stifb.c
@@ -756,9 +756,9 @@ hyperResetPlanes(struct stifb_info *fb, int enable)
 		if (fb->info.var.bits_per_pixel == 32)
 			controlPlaneReg = 0x04000F00;
 		else
-			controlPlaneReg = 0x00000F00;   /* 0x00000800 should be enought, but lets clear all 4 bits */
+			controlPlaneReg = 0x00000F00;   /* 0x00000800 should be enough, but lets clear all 4 bits */
 	else
-		controlPlaneReg = 0x00000F00; /* 0x00000100 should be enought, but lets clear all 4 bits */
+		controlPlaneReg = 0x00000F00; /* 0x00000100 should be enough, but lets clear all 4 bits */
 
 	switch (enable) {
 	case ENABLE:
diff --git a/drivers/video/tdfxfb.c b/drivers/video/tdfxfb.c
index ff43c8885028..980548390048 100644
--- a/drivers/video/tdfxfb.c
+++ b/drivers/video/tdfxfb.c
@@ -52,7 +52,7 @@
  *
  * 0.1.3 (released 1999-11-02)	added Attila's panning support, code
  *				reorg, hwcursor address page size alignment
- *				(for mmaping both frame buffer and regs),
+ *				(for mmapping both frame buffer and regs),
  *				and my changes to get rid of hardcoded
  *				VGA i/o register locations (uses PCI
  *				configuration info now)
diff --git a/drivers/video/via/dvi.c b/drivers/video/via/dvi.c
index c5c32b6b6e6c..67b36932212b 100644
--- a/drivers/video/via/dvi.c
+++ b/drivers/video/via/dvi.c
@@ -467,7 +467,7 @@ static int dvi_get_panel_size_from_DDCv1(void)
 	default:
 		viaparinfo->tmds_setting_info->dvi_panel_size =
 			VIA_RES_1024X768;
-		DEBUG_MSG(KERN_INFO "Unknow panel size max resolution = %d !\
+		DEBUG_MSG(KERN_INFO "Unknown panel size max resolution = %d !\
 					 set default panel size.\n", max_h);
 		break;
 	}
@@ -534,7 +534,7 @@ static int dvi_get_panel_size_from_DDCv2(void)
 	default:
 		viaparinfo->tmds_setting_info->dvi_panel_size =
 			VIA_RES_1024X768;
-		DEBUG_MSG(KERN_INFO "Unknow panel size max resolution = %d!\
+		DEBUG_MSG(KERN_INFO "Unknown panel size max resolution = %d!\
 					set default panel size.\n", HSize);
 		break;
 	}
diff --git a/drivers/video/vt8623fb.c b/drivers/video/vt8623fb.c
index 3df17dc8c3d7..65ccd215d496 100644
--- a/drivers/video/vt8623fb.c
+++ b/drivers/video/vt8623fb.c
@@ -446,7 +446,7 @@ static int vt8623fb_set_par(struct fb_info *info)
 
 	svga_wseq_mask(0x1E, 0xF0, 0xF0); // DI/DVP bus
 	svga_wseq_mask(0x2A, 0x0F, 0x0F); // DI/DVP bus
-	svga_wseq_mask(0x16, 0x08, 0xBF); // FIFO read treshold
+	svga_wseq_mask(0x16, 0x08, 0xBF); // FIFO read threshold
 	vga_wseq(NULL, 0x17, 0x1F);       // FIFO depth
 	vga_wseq(NULL, 0x18, 0x4E);
 	svga_wseq_mask(0x1A, 0x08, 0x08); // enable MMIO ?
diff --git a/drivers/watchdog/coh901327_wdt.c b/drivers/watchdog/coh901327_wdt.c
index 381026c0bd7b..923cc68dba26 100644
--- a/drivers/watchdog/coh901327_wdt.c
+++ b/drivers/watchdog/coh901327_wdt.c
@@ -508,7 +508,7 @@ void coh901327_watchdog_reset(void)
 	 * deactivating the watchdog before it is shut down by it.
 	 *
 	 * NOTE: on future versions of the watchdog, this restriction is
-	 * gone: the watchdog will be reloaded with a defaul value (1 min)
+	 * gone: the watchdog will be reloaded with a default value (1 min)
 	 * instead of last value, and you can conveniently set the watchdog
 	 * timeout to 10ms (value = 1) without any problems.
 	 */
diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c
index b6b3f59ab446..47d719717a3b 100644
--- a/drivers/watchdog/machzwd.c
+++ b/drivers/watchdog/machzwd.c
@@ -21,7 +21,7 @@
  *      wd#1 - 2 seconds;
  *      wd#2 - 7.2 ms;
  *  After the expiration of wd#1, it can generate a NMI, SCI, SMI, or
- *  a system RESET and it starts wd#2 that unconditionaly will RESET
+ *  a system RESET and it starts wd#2 that unconditionally will RESET
  *  the system when the counter reaches zero.
  *
  *  14-Dec-2001 Matt Domsch <Matt_Domsch@dell.com>
diff --git a/drivers/watchdog/wdrtas.c b/drivers/watchdog/wdrtas.c
index 3bde56bce63a..5bfb1f2c5319 100644
--- a/drivers/watchdog/wdrtas.c
+++ b/drivers/watchdog/wdrtas.c
@@ -542,7 +542,7 @@ static struct notifier_block wdrtas_notifier = {
 /**
  * wdrtas_get_tokens - reads in RTAS tokens
  *
- * returns 0 on succes, <0 on failure
+ * returns 0 on success, <0 on failure
  *
  * wdrtas_get_tokens reads in the tokens for the RTAS calls used in
  * this watchdog driver. It tolerates, if "get-sensor-state" and
@@ -598,7 +598,7 @@ static void wdrtas_unregister_devs(void)
 /**
  * wdrtas_register_devs - registers the misc dev handlers
  *
- * returns 0 on succes, <0 on failure
+ * returns 0 on success, <0 on failure
  *
  * wdrtas_register_devs registers the watchdog and temperature watchdog
  * misc devs
@@ -630,7 +630,7 @@ static int wdrtas_register_devs(void)
 /**
  * wdrtas_init - init function of the watchdog driver
  *
- * returns 0 on succes, <0 on failure
+ * returns 0 on success, <0 on failure
  *
  * registers the file handlers and the reboot notifier
  */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb51b1e4..d15ea1790bfb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -767,7 +767,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	
 	current->mm->start_stack = bprm->p;
 
-	/* Now we do a little grungy work by mmaping the ELF image into
+	/* Now we do a little grungy work by mmapping the ELF image into
 	   the correct location in memory. */
 	for(i = 0, elf_ppnt = elf_phdata;
 	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..2bd671a08e3b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -272,7 +272,7 @@ EXPORT_SYMBOL(bio_init);
  *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
  *   fall back to just using @kmalloc to allocate the required memory.
  *
- *   Note that the caller must set ->bi_destructor on succesful return
+ *   Note that the caller must set ->bi_destructor on successful return
  *   of a bio, to do the appropriate freeing of the bio once the reference
  *   count drops to zero.
  **/
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2c726b7b9faa..6815d2a84b94 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -256,7 +256,7 @@ out:
  * Insert @em into @tree or perform a simple forward/backward merge with
  * existing mappings.  The extent_map struct passed in will be inserted
  * into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was sucessfull.
+ * reference dropped if the merge attempt was successfull.
  */
 int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em)
diff --git a/fs/cifs/README b/fs/cifs/README
index 79c1a93400be..a727b7cb075f 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -423,7 +423,7 @@ A partial list of the supported mount options follows:
 		source name to use to represent the client netbios machine 
 		name when doing the RFC1001 netbios session initialize.
   direct        Do not do inode data caching on files opened on this mount.
-		This precludes mmaping files on this mount. In some cases
+		This precludes mmapping files on this mount. In some cases
 		with fast networks and little or no caching benefits on the
 		client (e.g. when the application is doing large sequential
 		reads bigger than page size without rereading the same data) 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d0fde18039c..4b35f7ec0583 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -39,7 +39,7 @@
 
 /*
  * MAX_REQ is the maximum number of requests that WE will send
- * on one socket concurently. It also matches the most common
+ * on one socket concurrently. It also matches the most common
  * value of max multiplex returned by servers.  We may
  * eventually want to use the negotiated value (in case
  * future servers can handle more) when we are more confident that
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5e2492535daa..83580213fcac 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -917,8 +917,8 @@ undo_setattr:
 /*
  * If dentry->d_inode is null (usually meaning the cached dentry
  * is a negative dentry) then we would attempt a standard SMB delete, but
- * if that fails we can not attempt the fall back mechanisms on EACESS
- * but will return the EACESS to the caller.  Note that the VFS does not call
+ * if that fails we can not attempt the fall back mechanisms on EACCESS
+ * but will return the EACCESS to the caller. Note that the VFS does not call
  * unlink on negative dentries currently.
  */
 int cifs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 224a1f478966..b6b6dcb500bf 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -371,7 +371,7 @@ E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
 	smbhash(p24 + 16, c8, p21 + 14, 1);
 }
 
-#if 0 /* currently unsued */
+#if 0 /* currently unused */
 static void
 D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 16f682e26c07..b540aa5d1f61 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -143,7 +143,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(dlm_posix_lock);
 
-/* Returns failure iff a succesful lock operation should be canceled */
+/* Returns failure iff a successful lock operation should be canceled */
 static int dlm_plock_callback(struct plock_op *op)
 {
 	struct file *file;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 618ca95cbb59..0282ec78cf8f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2932,7 +2932,7 @@ retry:
 		ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
 					&mpd);
 		/*
-		 * If we have a contigous extent of pages and we
+		 * If we have a contiguous extent of pages and we
 		 * haven't done the I/O yet, map the blocks and submit
 		 * them for I/O.
 		 */
@@ -5370,7 +5370,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  * worse case, the indexs blocks spread over different block groups
  *
  * If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiugous, with flexbg,
+ * different block groups too. If they are contiuguous, with flexbg,
  * they could still across block group boundary.
  *
  * Also account for superblock, inode, quota and xattr blocks
@@ -5446,7 +5446,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
  * Calculate the journal credits for a chunk of data modification.
  *
  * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
  *
  * journal buffers for data blocks are not included here, as DIO
  * and fallocate do no need to journal data buffers.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba12824defa..74e495dabe09 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -142,7 +142,7 @@
  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  * value of s_mb_order2_reqs can be tuned via
  * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
- * stripe size (sbi->s_stripe), we try to search for contigous block in
+ * stripe size (sbi->s_stripe), we try to search for contiguous block in
  * stripe size. This should result in better allocation on RAID setups. If
  * not, we search in the specific group using bitmap for best extents. The
  * tunable min_to_scan and max_to_scan control the behaviour here.
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f25e70c1b51c..f0294410868d 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -177,7 +177,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		spin_unlock(&jffs2_compressor_list_lock);
 		break;
 	default:
-		printk(KERN_ERR "JFFS2: unknow compression mode.\n");
+		printk(KERN_ERR "JFFS2: unknown compression mode.\n");
 	}
  out:
 	if (ret == JFFS2_COMPR_NONE) {
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1a80301004b8..378991cfe40f 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -931,7 +931,7 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
  * Helper function for jffs2_get_inode_nodes().
  * The function detects whether more data should be read and reads it if yes.
  *
- * Returns: 0 on succes;
+ * Returns: 0 on success;
  * 	    negative error code on failure.
  */
 static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 082e844ab2db..4b107881acd5 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
  *   is used to release xattr name/value pair and detach from c->xattrindex.
  * reclaim_xattr_datum(c)
  *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
- *   memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold 
  *   is hard coded as 32KiB.
  * do_verify_xattr_datum(c, xd)
  *   is used to load the xdatum informations without name/value pair from the medium.
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2bc7d8aa5740..d9b031cf69f5 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -755,7 +755,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	 * allocation group.
 	 */
 	if ((blkno & (bmp->db_agsize - 1)) == 0)
-		/* check if the AG is currenly being written to.
+		/* check if the AG is currently being written to.
 		 * if so, call dbNextAG() to find a non-busy
 		 * AG with sufficient free space.
 		 */
@@ -3337,7 +3337,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 	for (i = 0, n = 0; i < agno; n++) {
 		bmp->db_agfree[n] = 0;	/* init collection point */
 
-		/* coalesce cotiguous k AGs; */
+		/* coalesce contiguous k AGs; */
 		for (j = 0; j < k && i < agno; j++, i++) {
 			/* merge AGi to AGn */
 			bmp->db_agfree[n] += bmp->db_agfree[i];
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 0d58caf4a6e1..ec8f45f12e05 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -835,7 +835,7 @@ static int ncp_ioctl_need_write(unsigned int cmd)
 	case NCP_IOC_SETROOT:
 		return 0;
 	default:
-		/* unkown IOCTL command, assume write */
+		/* unknown IOCTL command, assume write */
 		return 1;
 	}
 }
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 9669541d0119..08f7530e9341 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -927,7 +927,7 @@ lock_retry_remap:
 		return 0;
 
 	ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
-			"EOVERFLOW" : (!err ? "EIO" : "unkown error"));
+			"EOVERFLOW" : (!err ? "EIO" : "unknown error"));
 	return err < 0 ? err : -EIO;
 
 read_err:
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 663c0e341f8b..43179ddd336f 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -399,7 +399,7 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
  * @cached_page: allocated but as yet unused page
  * @lru_pvec:	lru-buffering pagevec of caller
  *
- * Obtain @nr_pages locked page cache pages from the mapping @maping and
+ * Obtain @nr_pages locked page cache pages from the mapping @mapping and
  * starting at index @index.
  *
  * If a page is newly created, increment its refcount and add it to the
@@ -1281,7 +1281,7 @@ rl_not_mapped_enoent:
 
 /*
  * Copy as much as we can into the pages and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the pages
+ * were successfully copied.  If a fault is encountered then clear the pages
  * out to (ofs + bytes) and return the number of bytes which were copied.
  */
 static inline size_t ntfs_copy_from_user(struct page **pages,
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 89b02985c054..4dadcdf3d451 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -338,7 +338,7 @@ err_out:
  * copy of the complete multi sector transfer deprotected page.  On failure,
  * *@wrp is undefined.
  *
- * Simillarly, if @lsn is not NULL, on succes *@lsn will be set to the current
+ * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
  * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
  *
  * The following error codes are defined:
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59ff..7c7198a5bc90 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2398,7 +2398,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
  *
  * The array is assumed to be large enough to hold an entire path (tree depth).
  *
- * Upon succesful return from this function:
+ * Upon successful return from this function:
  *
  * - The 'right_path' array will contain a path to the leaf block
  *   whose range contains e_cpos.
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b35..03ccf9a7b1f4 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2586,7 +2586,7 @@ fail:
 	 * is complete everywhere.  if the target dies while this is
 	 * going on, some nodes could potentially see the target as the
 	 * master, so it is important that my recovery finds the migration
-	 * mle and sets the master to UNKNONWN. */
+	 * mle and sets the master to UNKNOWN. */
 
 
 	/* wait for new node to assert master */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0d38d67194cb..c5e4a49e3a12 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1855,7 +1855,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 		 * outstanding lock request, so a cancel convert is
 		 * required. We intentionally overwrite 'ret' - if the
 		 * cancel fails and the lock was granted, it's easier
-		 * to just bubble sucess back up to the user.
+		 * to just bubble success back up to the user.
 		 */
 		ret = ocfs2_flock_handle_signal(lockres, level);
 	} else if (!ret && (level > lockres->l_level)) {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 54c16b66327e..bf34c491ae96 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -659,7 +659,7 @@ static int __ocfs2_journal_access(handle_t *handle,
 
 	default:
 		status = -EINVAL;
-		mlog(ML_ERROR, "Uknown access type!\n");
+		mlog(ML_ERROR, "Unknown access type!\n");
 	}
 	if (!status && ocfs2_meta_ecc(osb) && triggers)
 		jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 60287fc56bcb..d00c658ca150 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2431,7 +2431,7 @@ out:
  * we gonna touch and whether we need to create new blocks.
  *
  * Normally the refcount blocks store these refcount should be
- * continguous also, so that we can get the number easily.
+ * contiguous also, so that we can get the number easily.
  * As for meta_ac, we will at most add split 2 refcount record and
  * 2 more refcount block, so just check it in a rough way.
  *
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index e1c0ec0ae989..082234581d05 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -85,7 +85,7 @@ out:
 }
 
 /*
- * Tries to allocate exactly one block.  Returns true if sucessful.
+ * Tries to allocate exactly one block.  Returns true if successful.
  */
 int omfs_allocate_block(struct super_block *sb, u64 block)
 {
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index f94ddf7efba0..868a55ee080f 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -23,7 +23,7 @@
 /*
  * This file implements functions needed to recover from unclean un-mounts.
  * When UBIFS is mounted, it checks a flag on the master node to determine if
- * an un-mount was completed sucessfully. If not, the process of mounting
+ * an un-mount was completed successfully. If not, the process of mounting
  * incorparates additional checking and fixing of on-flash data structures.
  * UBIFS always cleans away all remnants of an unclean un-mount, so that
  * errors do not accumulate. However UBIFS defers recovery if it is mounted
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 6533ead9b889..a2c16bcee90b 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -98,7 +98,7 @@ typedef struct xfs_dquot {
 #define dq_flags	q_lists.dqm_flags
 
 /*
- * Lock hierachy for q_qlock:
+ * Lock hierarchy for q_qlock:
  *	XFS_QLOCK_NORMAL is the implicit default,
  * 	XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
  */
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 4c8d0afae711..fb2d63f13f4c 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -47,7 +47,7 @@
 
 #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
 
-/* memmap is virtually contigious.  */
+/* memmap is virtually contiguous.  */
 #define __pfn_to_page(pfn)	(vmemmap + (pfn))
 #define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
 
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index d76b66acea95..7c38c147e5e6 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -631,7 +631,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
  * these are provided for both review and as a porting
  * help for the C library version.
 *
- * Last chance: are any of these important enought to
+ * Last chance: are any of these important enough to
  * enable by default?
  */
 #ifdef __ARCH_WANT_SYSCALL_NO_AT
diff --git a/include/linux/chio.h b/include/linux/chio.h
index 519248d8b2b6..d9bac7f97282 100644
--- a/include/linux/chio.h
+++ b/include/linux/chio.h
@@ -21,7 +21,7 @@
  *    query vendor-specific element types
  *
  *    accessing elements works by specifing type and unit of the element.
- *    for eample, storage elements are addressed with type = CHET_ST and
+ *    for example, storage elements are addressed with type = CHET_ST and
  *    unit = 0 .. cp_nslots-1
  *
  */
diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h
index e5124ceea769..3402042ddc31 100644
--- a/include/linux/mfd/ezx-pcap.h
+++ b/include/linux/mfd/ezx-pcap.h
@@ -45,7 +45,7 @@ void pcap_set_ts_bits(struct pcap_chip *, u32);
 #define PCAP_CLEAR_INTERRUPT_REGISTER	0x01ffffff
 #define PCAP_MASK_ALL_INTERRUPT		0x01ffffff
 
-/* registers acessible by both pcap ports */
+/* registers accessible by both pcap ports */
 #define PCAP_REG_ISR		0x0	/* Interrupt Status */
 #define PCAP_REG_MSR		0x1	/* Interrupt Mask */
 #define PCAP_REG_PSTAT		0x2	/* Processor Status */
@@ -67,7 +67,7 @@ void pcap_set_ts_bits(struct pcap_chip *, u32);
 #define PCAP_REG_VENDOR_TEST1	0x1e
 #define PCAP_REG_VENDOR_TEST2	0x1f
 
-/* registers acessible by pcap port 1 only (a1200, e2 & e6) */
+/* registers accessible by pcap port 1 only (a1200, e2 & e6) */
 #define PCAP_REG_INT_SEL	0x3	/* Interrupt Select */
 #define PCAP_REG_SWCTRL		0x4	/* Switching Regulator Control */
 #define PCAP_REG_VREG1		0x5	/* Regulator Bank 1 Control */
diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index d745f5b6c7b0..76e5053e1fac 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -30,7 +30,7 @@
 
 /*
  * use drive write caching -- we need deferred error handling to be
- * able to sucessfully recover with this option (drive will return good
+ * able to successfully recover with this option (drive will return good
  * status as soon as the cdb is validated).
  */
 #if defined(CONFIG_CDROM_PKTCDVD_WCACHE)
diff --git a/include/linux/serial_reg.h b/include/linux/serial_reg.h
index 850db2e80510..cf9327c051ad 100644
--- a/include/linux/serial_reg.h
+++ b/include/linux/serial_reg.h
@@ -216,10 +216,10 @@
 
 #define UART_IIR_TOD	0x08	/* Character Timeout Indication Detected */
 
-#define UART_FCR_PXAR1	0x00	/* receive FIFO treshold = 1 */
-#define UART_FCR_PXAR8	0x40	/* receive FIFO treshold = 8 */
-#define UART_FCR_PXAR16	0x80	/* receive FIFO treshold = 16 */
-#define UART_FCR_PXAR32	0xc0	/* receive FIFO treshold = 32 */
+#define UART_FCR_PXAR1	0x00	/* receive FIFO threshold = 1 */
+#define UART_FCR_PXAR8	0x40	/* receive FIFO threshold = 8 */
+#define UART_FCR_PXAR16	0x80	/* receive FIFO threshold = 16 */
+#define UART_FCR_PXAR32	0xc0	/* receive FIFO threshold = 32 */
 
 
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index b59e78c57161..dfd4745a955f 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -490,7 +490,7 @@ struct v4l2_jpegcompression {
 				 * you do, leave them untouched.
 				 * Inluding less markers will make the
 				 * resulting code smaller, but there will
-				 * be fewer aplications which can read it.
+				 * be fewer applications which can read it.
 				 * The presence of the APP and COM marker
 				 * is influenced by APP_len and COM_len
 				 * ONLY, not by this property! */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 6e5f0e0c7967..ca4789e4f1e1 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -988,7 +988,7 @@ struct sctp_transport {
 	int init_sent_count;
 
 	/* state       : The current state of this destination,
-	 *             : i.e. SCTP_ACTIVE, SCTP_INACTIVE, SCTP_UNKOWN.
+	 *             : i.e. SCTP_ACTIVE, SCTP_INACTIVE, SCTP_UNKNOWN.
 	 */
 	int state;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 03a49c703377..1827e7f217d1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1260,7 +1260,7 @@ static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct sk_bu
 	skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
 
 /* This function calculates a "timeout" which is equivalent to the timeout of a
- * TCP connection after "boundary" unsucessful, exponentially backed-off
+ * TCP connection after "boundary" unsuccessful, exponentially backed-off
  * retransmissions with an initial RTO of TCP_RTO_MIN.
  */
 static inline bool retransmits_timed_out(const struct sock *sk,
diff --git a/include/net/wimax.h b/include/net/wimax.h
index 2af7bf839f23..3b07f6aad102 100644
--- a/include/net/wimax.h
+++ b/include/net/wimax.h
@@ -79,7 +79,7 @@
  * drivers have to only report state changes due to external
  * conditions.
  *
- * All API operations are 'atomic', serialized thorough a mutex in the
+ * All API operations are 'atomic', serialized through a mutex in the
  * `struct wimax_dev`.
  *
  * EXPORTING TO USER SPACE THROUGH GENERIC NETLINK
diff --git a/include/sound/wm8993.h b/include/sound/wm8993.h
index 9c661f2f8cda..eee19f63c0d8 100644
--- a/include/sound/wm8993.h
+++ b/include/sound/wm8993.h
@@ -36,7 +36,7 @@ struct wm8993_platform_data {
 	unsigned int micbias1_lvl:1;
 	unsigned int micbias2_lvl:1;
 
-	/* Jack detect threashold levels, see datasheet for values */
+	/* Jack detect threshold levels, see datasheet for values */
 	unsigned int jd_scthr:2;
 	unsigned int jd_thr:2;
 };
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7f29643c8985..759a629cc4bc 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -419,7 +419,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
 	if (!task) {
 		/*
 		 * Per cpu events are removed via an smp call and
-		 * the removal is always sucessful.
+		 * the removal is always successful.
 		 */
 		smp_call_function_single(event->cpu,
 					 __perf_event_remove_from_context,
@@ -827,7 +827,7 @@ perf_install_in_context(struct perf_event_context *ctx,
 	if (!task) {
 		/*
 		 * Per cpu events are installed via an smp call and
-		 * the install is always sucessful.
+		 * the install is always successful.
 		 */
 		smp_call_function_single(cpu, __perf_install_in_context,
 					 event, 1);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 234ceb10861f..456b2bc6b1ff 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -105,7 +105,7 @@ config DEBUG_SECTION_MISMATCH
 	bool "Enable full Section mismatch analysis"
 	depends on UNDEFINED
 	# This option is on purpose disabled for now.
-	# It will be enabled when we are down to a resonable number
+	# It will be enabled when we are down to a reasonable number
 	# of section mismatch warnings (< 10 for an allyesconfig build)
 	help
 	  The section mismatch analysis checks if there are illegal
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c
index 600f473a5610..76074209f9a2 100644
--- a/lib/decompress_bunzip2.c
+++ b/lib/decompress_bunzip2.c
@@ -299,7 +299,7 @@ static int INIT get_next_block(struct bunzip_data *bd)
 		   again when using them (during symbol decoding).*/
 		base = hufGroup->base-1;
 		limit = hufGroup->limit-1;
-		/* Calculate permute[].  Concurently, initialize
+		/* Calculate permute[].  Concurrently, initialize
 		 * temp[] and limit[]. */
 		pp = 0;
 		for (i = minLen; i <= maxLen; i++) {
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index ce6b7eabf674..d9b08e0f7f55 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -259,7 +259,7 @@ static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket,
 		 * times. Without a hardware IOMMU this results in the
 		 * same device addresses being put into the dma-debug
 		 * hash multiple times too. This can result in false
-		 * positives being reported. Therfore we implement a
+		 * positives being reported. Therefore we implement a
 		 * best-fit algorithm here which returns the entry from
 		 * the hash which fits best to the reference value
 		 * instead of the first-fit.
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index ac25cd28e807..853907e45868 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -453,7 +453,7 @@ do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 
 	/*
 	 * Return the buffer to the free list by setting the corresponding
-	 * entries to indicate the number of contigous entries available.
+	 * entries to indicate the number of contiguous entries available.
 	 * While returning the entries to the free list, we merge the entries
 	 * with slots below and above the pool being returned.
 	 */
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..c3d3506ecaba 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1844,7 +1844,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 
 /*
  * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then return the number of
+ * were successfully copied.  If a fault is encountered then return the number of
  * bytes which were copied.
  */
 size_t iov_iter_copy_from_user_atomic(struct page *page,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7226e60e52af..c31a310aa146 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -209,7 +209,7 @@ struct mem_cgroup {
 	int	prev_priority;	/* for recording reclaim priority */
 
 	/*
-	 * While reclaiming in a hiearchy, we cache the last child we
+	 * While reclaiming in a hierarchy, we cache the last child we
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
@@ -2466,7 +2466,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 
 	cgroup_lock();
 	/*
-	 * If parent's use_hiearchy is set, we can't make any modifications
+	 * If parent's use_hierarchy is set, we can't make any modifications
 	 * in the child subtrees. If it is unset, then the change can
 	 * occur, provided the current cgroup has no children.
 	 *
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index dacc64183874..1ac49fef95ab 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -174,7 +174,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
 	list_for_each_entry_safe (tk, next, to_kill, nd) {
 		if (doit) {
 			/*
-			 * In case something went wrong with munmaping
+			 * In case something went wrong with munmapping
 			 * make sure the process doesn't catch the
 			 * signal and then access the memory. Just kill it.
 			 * the signal handlers
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index f7e2fa0974dc..16ad251c9725 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -50,7 +50,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
 	struct tcphdr _tcph, *tcph;
 	__be16 oldval;
 
-	/* Not enought header? */
+	/* Not enough header? */
 	tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
 	if (!tcph)
 		return false;
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
index 356e65b1dc42..783c5f367d29 100644
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -450,10 +450,10 @@ void irlap_disconnect_request(struct irlap_cb *self)
 
 	/* Check if we are in the right state for disconnecting */
 	switch (self->state) {
-	case LAP_XMIT_P:        /* FALLTROUGH */
-	case LAP_XMIT_S:        /* FALLTROUGH */
-	case LAP_CONN:          /* FALLTROUGH */
-	case LAP_RESET_WAIT:    /* FALLTROUGH */
+	case LAP_XMIT_P:        /* FALLTHROUGH */
+	case LAP_XMIT_S:        /* FALLTHROUGH */
+	case LAP_CONN:          /* FALLTHROUGH */
+	case LAP_RESET_WAIT:    /* FALLTHROUGH */
 	case LAP_RESET_CHECK:
 		irlap_do_event(self, DISCONNECT_REQUEST, NULL, NULL);
 		break;
@@ -485,9 +485,9 @@ void irlap_disconnect_indication(struct irlap_cb *self, LAP_REASON reason)
 		IRDA_DEBUG(1, "%s(), Sending reset request!\n", __func__);
 		irlap_do_event(self, RESET_REQUEST, NULL, NULL);
 		break;
-	case LAP_NO_RESPONSE:	   /* FALLTROUGH */
-	case LAP_DISC_INDICATION:  /* FALLTROUGH */
-	case LAP_FOUND_NONE:       /* FALLTROUGH */
+	case LAP_NO_RESPONSE:	   /* FALLTHROUGH */
+	case LAP_DISC_INDICATION:  /* FALLTHROUGH */
+	case LAP_FOUND_NONE:       /* FALLTHROUGH */
 	case LAP_MEDIA_BUSY:
 		irlmp_link_disconnect_indication(self->notify.instance, self,
 						 reason, NULL);
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
index c5c51959e3ce..94a9884d7146 100644
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -1741,7 +1741,7 @@ static int irlap_state_reset(struct irlap_cb *self, IRLAP_EVENT event,
  * Function irlap_state_xmit_s (event, skb, info)
  *
  *   XMIT_S, The secondary station has been given the right to transmit,
- *   and we therefor do not expect to receive any transmissions from other
+ *   and we therefore do not expect to receive any transmissions from other
  *   stations.
  */
 static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 7bf5b913828b..0e7d8bde145d 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -105,7 +105,7 @@ int __init irlmp_init(void)
 
 	init_timer(&irlmp->discovery_timer);
 
-	/* Do discovery every 3 seconds, conditionaly */
+	/* Do discovery every 3 seconds, conditionally */
 	if (sysctl_discovery)
 		irlmp_start_discovery_timer(irlmp,
 					    sysctl_discovery_timeout*HZ);
@@ -1842,7 +1842,7 @@ LM_REASON irlmp_convert_lap_reason( LAP_REASON lap_reason)
 		reason = LM_CONNECT_FAILURE;
 		break;
 	default:
-		IRDA_DEBUG(1, "%s(), Unknow IrLAP disconnect reason %d!\n",
+		IRDA_DEBUG(1, "%s(), Unknown IrLAP disconnect reason %d!\n",
 			   __func__, lap_reason);
 		reason = LM_LAP_DISCONNECT;
 		break;
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 751c4d0e2b36..719ddbc9e48c 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -244,7 +244,7 @@ struct mesh_path *mesh_path_lookup_by_idx(int idx, struct ieee80211_sub_if_data
  * @addr: destination address of the path (ETH_ALEN length)
  * @sdata: local subif
  *
- * Returns: 0 on sucess
+ * Returns: 0 on success
  *
  * State: the initial state of the new path is set to 0
  */
@@ -530,7 +530,7 @@ static void mesh_path_node_reclaim(struct rcu_head *rp)
  * @addr: dst address (ETH_ALEN length)
  * @sdata: local subif
  *
- * Returns: 0 if succesful
+ * Returns: 0 if successful
  */
 int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata)
 {
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index 7a10bbe02c13..c5d9f97ef217 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -682,7 +682,7 @@ struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain,
  * buckets and @skip_chain entries.  For each entry in the table call
  * @callback, if @callback returns a negative value stop 'walking' through the
  * table and return.  Updates the values in @skip_bkt and @skip_chain on
- * return.  Returns zero on succcess, negative values on failure.
+ * return.  Returns zero on success, negative values on failure.
  *
  */
 int netlbl_domhsh_walk(u32 *skip_bkt,
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 8674d4919556..29d8501bf156 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -719,7 +719,7 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
 
 	if (sctp_style(sk, TCP)) {
 		/* Change the sk->sk_state of a TCP-style socket that has
-		 * sucessfully completed a connect() call.
+		 * successfully completed a connect() call.
 		 */
 		if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
 			sk->sk_state = SCTP_SS_ESTABLISHED;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index f11be72a1a80..b15e1ebb2bfa 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -54,7 +54,7 @@
  * Assumptions:
  * - head[0] is physically contiguous.
  * - tail[0] is physically contiguous.
- * - pages[] is not physically or virtually contigous and consists of
+ * - pages[] is not physically or virtually contiguous and consists of
  *   PAGE_SIZE elements.
  *
  * Output:
diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c
index ca269178c4d4..35f370091f4f 100644
--- a/net/wimax/op-reset.c
+++ b/net/wimax/op-reset.c
@@ -62,7 +62,7 @@
  * Called when wanting to reset the device for any reason. Device is
  * taken back to power on status.
  *
- * This call blocks; on succesful return, the device has completed the
+ * This call blocks; on successful return, the device has completed the
  * reset process and is ready to operate.
  */
 int wimax_reset(struct wimax_dev *wimax_dev)
diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c
index d82953573588..8413cf38ed27 100644
--- a/scripts/kconfig/mconf.c
+++ b/scripts/kconfig/mconf.c
@@ -213,7 +213,7 @@ load_config_help[] = N_(
 	"to modify that configuration.\n"
 	"\n"
 	"If you are uncertain, then you have probably never used alternate\n"
-	"configuration files.  You should therefor leave this blank to abort.\n"),
+	"configuration files. You should therefore leave this blank to abort.\n"),
 save_config_text[] = N_(
 	"Enter a filename to which this configuration should be saved "
 	"as an alternate.  Leave blank to abort."),
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index e68823741ad5..2534400317c5 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -204,7 +204,7 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
  *
  * Description
  * Call the NetLabel mechanism to set the label of a packet using @sid.
- * Returns zero on auccess, negative values on failure.
+ * Returns zero on success, negative values on failure.
  *
  */
 int selinux_netlbl_skbuff_setsid(struct sk_buff *skb,
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index ff17820d35ec..5914eeb0b339 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -741,7 +741,7 @@ int security_bounded_transition(u32 old_sid, u32 new_sid)
 		goto out;
 	}
 
-	/* type/domain unchaned */
+	/* type/domain unchanged */
 	if (old_context->type == new_context->type) {
 		rc = 0;
 		goto out;
diff --git a/sound/Kconfig b/sound/Kconfig
index 4b5365ad6b46..fcad760f5691 100644
--- a/sound/Kconfig
+++ b/sound/Kconfig
@@ -55,7 +55,7 @@ config SOUND_OSS_CORE_PRECLAIM
 	  Please read Documentation/feature-removal-schedule.txt for
 	  details.
 
-	  If unusre, say Y.
+	  If unsure, say Y.
 
 source "sound/oss/dmasound/Kconfig"
 
diff --git a/sound/isa/cs423x/cs4236.c b/sound/isa/cs423x/cs4236.c
index a076a6ce8071..a828baaab636 100644
--- a/sound/isa/cs423x/cs4236.c
+++ b/sound/isa/cs423x/cs4236.c
@@ -177,7 +177,7 @@ static struct pnp_card_device_id snd_cs423x_pnpids[] = {
 	{ .id = "CSC0437", .devs = { { "CSC0000" }, { "CSC0010" }, { "CSC0003" } } },
 	/* Digital PC 5000 Onboard - CS4236B */
 	{ .id = "CSC0735", .devs = { { "CSC0000" }, { "CSC0010" } } },
-	/* some uknown CS4236B */
+	/* some unknown CS4236B */
 	{ .id = "CSC0b35", .devs = { { "CSC0000" }, { "CSC0010" }, { "CSC0003" } } },
 	/* Intel PR440FX Onboard sound */
 	{ .id = "CSC0b36", .devs = { { "CSC0000" }, { "CSC0010" }, { "CSC0003" } } },
diff --git a/sound/isa/opti9xx/miro.c b/sound/isa/opti9xx/miro.c
index 02e30d7c6a93..ddad60ef3f37 100644
--- a/sound/isa/opti9xx/miro.c
+++ b/sound/isa/opti9xx/miro.c
@@ -137,7 +137,7 @@ struct snd_miro {
 static void snd_miro_proc_init(struct snd_miro * miro);
 
 static char * snd_opti9xx_names[] = {
-	"unkown",
+	"unknown",
 	"82C928", "82C929",
 	"82C924", "82C925",
 	"82C930", "82C931", "82C933"
diff --git a/sound/isa/opti9xx/opti92x-ad1848.c b/sound/isa/opti9xx/opti92x-ad1848.c
index 5cd555325b9d..848007508ffd 100644
--- a/sound/isa/opti9xx/opti92x-ad1848.c
+++ b/sound/isa/opti9xx/opti92x-ad1848.c
@@ -185,7 +185,7 @@ MODULE_DEVICE_TABLE(pnp_card, snd_opti9xx_pnpids);
 #endif
 
 static char * snd_opti9xx_names[] = {
-	"unkown",
+	"unknown",
 	"82C928",	"82C929",
 	"82C924",	"82C925",
 	"82C930",	"82C931",	"82C933"
diff --git a/sound/oss/dmasound/dmasound_paula.c b/sound/oss/dmasound/dmasound_paula.c
index 06e9e88e4c05..bb14e4c67e89 100644
--- a/sound/oss/dmasound/dmasound_paula.c
+++ b/sound/oss/dmasound/dmasound_paula.c
@@ -657,7 +657,7 @@ static int AmiStateInfo(char *buffer, size_t space)
 	len += sprintf(buffer+len, "\tsound.volume_right = %d [0...64]\n",
 		       dmasound.volume_right);
 	if (len >= space) {
-		printk(KERN_ERR "dmasound_paula: overlowed state buffer alloc.\n") ;
+		printk(KERN_ERR "dmasound_paula: overflowed state buffer alloc.\n") ;
 		len = space ;
 	}
 	return len;
diff --git a/sound/pci/ca0106/ca0106_proc.c b/sound/pci/ca0106/ca0106_proc.c
index c62b7d10ec61..8d13092300da 100644
--- a/sound/pci/ca0106/ca0106_proc.c
+++ b/sound/pci/ca0106/ca0106_proc.c
@@ -233,7 +233,7 @@ static void snd_ca0106_proc_dump_iec958( struct snd_info_buffer *buffer, u32 val
 			snd_iprintf(buffer, "user-defined\n");
 			break;
 		default:
-			snd_iprintf(buffer, "unkown\n");
+			snd_iprintf(buffer, "unknown\n");
 			break;
 		}
 		snd_iprintf(buffer, "Sample Bits: ");
diff --git a/sound/pci/cs46xx/imgs/cwcdma.asp b/sound/pci/cs46xx/imgs/cwcdma.asp
index 09d24c76f034..a65e1193c89a 100644
--- a/sound/pci/cs46xx/imgs/cwcdma.asp
+++ b/sound/pci/cs46xx/imgs/cwcdma.asp
@@ -26,10 +26,11 @@
 //
 //
 // The purpose of this code is very simple: make it possible to tranfser
-// the samples 'as they are' with no alteration from a PCMreader SCB (DMA from host)
-// to any other SCB. This is useful for AC3 throug SPDIF. SRC (source rate converters) 
-// task always alters the samples in some how, however it's from 48khz -> 48khz. The
-// alterations are not audible, but AC3 wont work. 
+// the samples 'as they are' with no alteration from a PCMreader
+// SCB (DMA from host) to any other SCB. This is useful for AC3 through SPDIF.
+// SRC (source rate converters) task always alters the samples in somehow,
+// however it's from 48khz -> 48khz.
+// The alterations are not audible, but AC3 wont work. 
 //
 //        ...
 //         |
diff --git a/sound/pci/emu10k1/emu10k1x.c b/sound/pci/emu10k1/emu10k1x.c
index 36e08bd2b3cc..360e3809a60b 100644
--- a/sound/pci/emu10k1/emu10k1x.c
+++ b/sound/pci/emu10k1/emu10k1x.c
@@ -184,7 +184,7 @@ MODULE_PARM_DESC(enable, "Enable the EMU10K1X soundcard.");
  * The hardware has 3 channels for playback and 1 for capture.
  *  - channel 0 is the front channel
  *  - channel 1 is the rear channel
- *  - channel 2 is the center/lfe chanel
+ *  - channel 2 is the center/lfe channel
  * Volume is controlled by the AC97 for the front and rear channels by
  * the PCM Playback Volume, Sigmatel Surround Playback Volume and 
  * Surround Playback Volume. The Sigmatel 4-Speaker Stereo switch affects
diff --git a/sound/pci/hda/patch_cmedia.c b/sound/pci/hda/patch_cmedia.c
index 780e1a72114a..8917071d5b6a 100644
--- a/sound/pci/hda/patch_cmedia.c
+++ b/sound/pci/hda/patch_cmedia.c
@@ -66,7 +66,7 @@ struct cmi_spec {
 
 	struct hda_pcm pcm_rec[2];	/* PCM information */
 
-	/* pin deafault configuration */
+	/* pin default configuration */
 	hda_nid_t pin_nid[NUM_PINS];
 	unsigned int def_conf[NUM_PINS];
 	unsigned int pin_def_confs;
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index ff20048504b6..872731eb49e8 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6619,7 +6619,7 @@ static struct hda_input_mux alc889A_mb31_capture_source = {
 		/* Front Mic (0x01) unused */
 		{ "Line", 0x2 },
 		/* Line 2 (0x03) unused */
-		/* CD (0x04) unsused? */
+		/* CD (0x04) unused? */
 	},
 };
 
diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c
index 0dce331a2a3b..a1b10d1a384d 100644
--- a/sound/pci/rme9652/hdspm.c
+++ b/sound/pci/rme9652/hdspm.c
@@ -3017,7 +3017,7 @@ snd_hdspm_proc_read_madi(struct snd_info_entry * entry,
 		insel = "Coaxial";
 		break;
 	default:
-		insel = "Unkown";
+		insel = "Unknown";
 	}
 
 	switch (hdspm->control_register & HDSPM_SyncRefMask) {
@@ -3028,7 +3028,7 @@ snd_hdspm_proc_read_madi(struct snd_info_entry * entry,
 		syncref = "MADI";
 		break;
 	default:
-		syncref = "Unkown";
+		syncref = "Unknown";
 	}
 	snd_iprintf(buffer, "Inputsel = %s, SyncRef = %s\n", insel,
 		    syncref);
diff --git a/sound/soc/codecs/uda134x.c b/sound/soc/codecs/uda134x.c
index c33b92edbded..8ce1c9b2e5b8 100644
--- a/sound/soc/codecs/uda134x.c
+++ b/sound/soc/codecs/uda134x.c
@@ -101,7 +101,7 @@ static int uda134x_write(struct snd_soc_codec *codec, unsigned int reg,
 	pr_debug("%s reg: %02X, value:%02X\n", __func__, reg, value);
 
 	if (reg >= UDA134X_REGS_NUM) {
-		printk(KERN_ERR "%s unkown register: reg: %u",
+		printk(KERN_ERR "%s unknown register: reg: %u",
 		       __func__, reg);
 		return -EINVAL;
 	}
@@ -552,7 +552,7 @@ static int uda134x_soc_probe(struct platform_device *pdev)
 					ARRAY_SIZE(uda1341_snd_controls));
 	break;
 	default:
-		printk(KERN_ERR "%s unkown codec type: %d",
+		printk(KERN_ERR "%s unknown codec type: %d",
 			__func__, pd->model);
 	return -EINVAL;
 	}
diff --git a/sound/soc/codecs/wm8903.c b/sound/soc/codecs/wm8903.c
index fe1307b500cf..d72347d90b70 100644
--- a/sound/soc/codecs/wm8903.c
+++ b/sound/soc/codecs/wm8903.c
@@ -607,7 +607,7 @@ SOC_SINGLE("Right Input PGA Common Mode Switch", WM8903_ANALOGUE_RIGHT_INPUT_1,
 SOC_SINGLE("DRC Switch", WM8903_DRC_0, 15, 1, 0),
 SOC_ENUM("DRC Compressor Slope R0", drc_slope_r0),
 SOC_ENUM("DRC Compressor Slope R1", drc_slope_r1),
-SOC_SINGLE_TLV("DRC Compressor Threashold Volume", WM8903_DRC_3, 5, 124, 1,
+SOC_SINGLE_TLV("DRC Compressor Threshold Volume", WM8903_DRC_3, 5, 124, 1,
 	       drc_tlv_thresh),
 SOC_SINGLE_TLV("DRC Volume", WM8903_DRC_3, 0, 30, 1, drc_tlv_amp),
 SOC_SINGLE_TLV("DRC Minimum Gain Volume", WM8903_DRC_1, 2, 3, 1, drc_tlv_min),
@@ -617,11 +617,11 @@ SOC_ENUM("DRC Decay Rate", drc_decay),
 SOC_ENUM("DRC FF Delay", drc_ff_delay),
 SOC_SINGLE("DRC Anticlip Switch", WM8903_DRC_0, 1, 1, 0),
 SOC_SINGLE("DRC QR Switch", WM8903_DRC_0, 2, 1, 0),
-SOC_SINGLE_TLV("DRC QR Threashold Volume", WM8903_DRC_0, 6, 3, 0, drc_tlv_max),
+SOC_SINGLE_TLV("DRC QR Threshold Volume", WM8903_DRC_0, 6, 3, 0, drc_tlv_max),
 SOC_ENUM("DRC QR Decay Rate", drc_qr_decay),
 SOC_SINGLE("DRC Smoothing Switch", WM8903_DRC_0, 3, 1, 0),
 SOC_SINGLE("DRC Smoothing Hysteresis Switch", WM8903_DRC_0, 0, 1, 0),
-SOC_ENUM("DRC Smoothing Threashold", drc_smoothing),
+SOC_ENUM("DRC Smoothing Threshold", drc_smoothing),
 SOC_SINGLE_TLV("DRC Startup Volume", WM8903_DRC_0, 6, 18, 0, drc_tlv_startup),
 
 SOC_DOUBLE_R_TLV("Digital Capture Volume", WM8903_ADC_DIGITAL_VOLUME_LEFT,
diff --git a/sound/soc/codecs/wm8993.c b/sound/soc/codecs/wm8993.c
index d9987999e92c..bc033687b220 100644
--- a/sound/soc/codecs/wm8993.c
+++ b/sound/soc/codecs/wm8993.c
@@ -689,7 +689,7 @@ SOC_DOUBLE_TLV("Digital Sidetone Volume", WM8993_DIGITAL_SIDE_TONE,
 
 SOC_SINGLE("DRC Switch", WM8993_DRC_CONTROL_1, 15, 1, 0),
 SOC_ENUM("DRC Path", drc_path),
-SOC_SINGLE_TLV("DRC Compressor Threashold Volume", WM8993_DRC_CONTROL_2,
+SOC_SINGLE_TLV("DRC Compressor Threshold Volume", WM8993_DRC_CONTROL_2,
 	       2, 60, 1, drc_comp_threash),
 SOC_SINGLE_TLV("DRC Compressor Amplitude Volume", WM8993_DRC_CONTROL_3,
 	       11, 30, 1, drc_comp_amp),
@@ -709,7 +709,7 @@ SOC_SINGLE_TLV("DRC Quick Release Volume", WM8993_DRC_CONTROL_3, 2, 3, 0,
 SOC_ENUM("DRC Quick Release Rate", drc_qr_rate),
 SOC_SINGLE("DRC Smoothing Switch", WM8993_DRC_CONTROL_1, 11, 1, 0),
 SOC_SINGLE("DRC Smoothing Hysteresis Switch", WM8993_DRC_CONTROL_1, 8, 1, 0),
-SOC_ENUM("DRC Smoothing Hysteresis Threashold", drc_smooth),
+SOC_ENUM("DRC Smoothing Hysteresis Threshold", drc_smooth),
 SOC_SINGLE_TLV("DRC Startup Volume", WM8993_DRC_CONTROL_4, 8, 18, 0,
 	       drc_startup_tlv),
 
diff --git a/sound/soc/s3c24xx/s3c24xx_simtec.c b/sound/soc/s3c24xx/s3c24xx_simtec.c
index 1966e0d5652d..3c7ccb78b6ab 100644
--- a/sound/soc/s3c24xx/s3c24xx_simtec.c
+++ b/sound/soc/s3c24xx/s3c24xx_simtec.c
@@ -270,7 +270,7 @@ static int attach_gpio_amp(struct device *dev,
 		gpio_direction_output(pd->amp_gain[1], 0);
 	}
 
-	/* note, curently we assume GPA0 isn't valid amp */
+	/* note, currently we assume GPA0 isn't valid amp */
 	if (pdata->amp_gpio > 0) {
 		ret = gpio_request(pd->amp_gpio, "gpio-amp");
 		if (ret) {
diff --git a/sound/soc/s6000/s6000-pcm.c b/sound/soc/s6000/s6000-pcm.c
index 83b8028e209d..81d6f983f51e 100644
--- a/sound/soc/s6000/s6000-pcm.c
+++ b/sound/soc/s6000/s6000-pcm.c
@@ -196,7 +196,7 @@ static int s6000_pcm_start(struct snd_pcm_substream *substream)
 			   0 /* destination skip after chunk (impossible) */,
 			   4 /* 16 byte burst size */,
 			   -1 /* don't conserve bandwidth */,
-			   0 /* low watermark irq descriptor theshold */,
+			   0 /* low watermark irq descriptor threshold */,
 			   0 /* disable hardware timestamps */,
 			   1 /* enable channel */);
 
diff --git a/sound/sound_core.c b/sound/sound_core.c
index 49c998186592..dbca7c909a31 100644
--- a/sound/sound_core.c
+++ b/sound/sound_core.c
@@ -353,7 +353,7 @@ static struct sound_unit *chains[SOUND_STEP];
  *      @dev: device pointer
  *
  *	Allocate a special sound device by minor number from the sound
- *	subsystem. The allocated number is returned on succes. On failure
+ *	subsystem. The allocated number is returned on success. On failure
  *	a negative error code is returned.
  */
  
-- 
cgit v1.2.3-71-gd317


From b69f2292063d2caf37ca9aec7d63ded203701bf3 Mon Sep 17 00:00:00 2001
From: Louis Rilling <louis.rilling@kerlabs.com>
Date: Fri, 4 Dec 2009 14:52:42 +0100
Subject: block: Fix io_context leak after failure of clone with CLONE_IO

With CLONE_IO, parent's io_context->nr_tasks is incremented, but never
decremented whenever copy_process() fails afterwards, which prevents
exit_io_context() from calling IO schedulers exit functions.

Give a task_struct to exit_io_context(), and call exit_io_context() instead of
put_io_context() in copy_process() cleanup path.

Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-ioc.c           | 10 +++++-----
 include/linux/iocontext.h |  5 +++--
 kernel/exit.c             |  2 +-
 kernel/fork.c             |  3 ++-
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index dcd041290b28..cbdabb0dd6d7 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -66,14 +66,14 @@ static void cfq_exit(struct io_context *ioc)
 }
 
 /* Called by the exitting task */
-void exit_io_context(void)
+void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;
 
-	task_lock(current);
-	ioc = current->io_context;
-	current->io_context = NULL;
-	task_unlock(current);
+	task_lock(task);
+	ioc = task->io_context;
+	task->io_context = NULL;
+	task_unlock(task);
 
 	if (atomic_dec_and_test(&ioc->nr_tasks)) {
 		if (ioc->aic && ioc->aic->exit)
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index d61b0b8b5cd1..a63235996309 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -98,14 +98,15 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
 	return NULL;
 }
 
+struct task_struct;
 #ifdef CONFIG_BLOCK
 int put_io_context(struct io_context *ioc);
-void exit_io_context(void);
+void exit_io_context(struct task_struct *task);
 struct io_context *get_io_context(gfp_t gfp_flags, int node);
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 #else
-static inline void exit_io_context(void)
+static inline void exit_io_context(struct task_struct *task)
 {
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..2544000125d9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1004,7 +1004,7 @@ NORET_TYPE void do_exit(long code)
 	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
-		exit_io_context();
+		exit_io_context(tsk);
 
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c49257c..607353425bb0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1310,7 +1310,8 @@ bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
 bad_fork_cleanup_io:
-	put_io_context(p->io_context);
+	if (p->io_context)
+		exit_io_context(p);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-- 
cgit v1.2.3-71-gd317


From 0414f2ec03d72dc4e569627e6112fa6dafc99a79 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <nigel@tuxonice.net>
Date: Sun, 6 Dec 2009 16:15:53 +0100
Subject: PM / Hibernate: Move swap functions to kernel/power/swap.c.

Move hibernation code's functions for allocating and freeing swap
from swsusp.c to swap.c, which is where you'd expect to find them.

Signed-off-by: Nigel Cunningham <nigel@tuxonice.net>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/swap.c   | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/swsusp.c | 101 --------------------------------------------------
 2 files changed, 101 insertions(+), 101 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 890f6b11b1d3..0ce9b00f5d33 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -38,6 +38,107 @@ struct swsusp_header {
 
 static struct swsusp_header *swsusp_header;
 
+/**
+ *	The following functions are used for tracing the allocated
+ *	swap pages, so that they can be freed in case of an error.
+ */
+
+struct swsusp_extent {
+	struct rb_node node;
+	unsigned long start;
+	unsigned long end;
+};
+
+static struct rb_root swsusp_extents = RB_ROOT;
+
+static int swsusp_extents_insert(unsigned long swap_offset)
+{
+	struct rb_node **new = &(swsusp_extents.rb_node);
+	struct rb_node *parent = NULL;
+	struct swsusp_extent *ext;
+
+	/* Figure out where to put the new node */
+	while (*new) {
+		ext = container_of(*new, struct swsusp_extent, node);
+		parent = *new;
+		if (swap_offset < ext->start) {
+			/* Try to merge */
+			if (swap_offset == ext->start - 1) {
+				ext->start--;
+				return 0;
+			}
+			new = &((*new)->rb_left);
+		} else if (swap_offset > ext->end) {
+			/* Try to merge */
+			if (swap_offset == ext->end + 1) {
+				ext->end++;
+				return 0;
+			}
+			new = &((*new)->rb_right);
+		} else {
+			/* It already is in the tree */
+			return -EINVAL;
+		}
+	}
+	/* Add the new node and rebalance the tree. */
+	ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
+	if (!ext)
+		return -ENOMEM;
+
+	ext->start = swap_offset;
+	ext->end = swap_offset;
+	rb_link_node(&ext->node, parent, new);
+	rb_insert_color(&ext->node, &swsusp_extents);
+	return 0;
+}
+
+/**
+ *	alloc_swapdev_block - allocate a swap page and register that it has
+ *	been allocated, so that it can be freed in case of an error.
+ */
+
+sector_t alloc_swapdev_block(int swap)
+{
+	unsigned long offset;
+
+	offset = swp_offset(get_swap_page_of_type(swap));
+	if (offset) {
+		if (swsusp_extents_insert(offset))
+			swap_free(swp_entry(swap, offset));
+		else
+			return swapdev_block(swap, offset);
+	}
+	return 0;
+}
+
+/**
+ *	free_all_swap_pages - free swap pages allocated for saving image data.
+ *	It also frees the extents used to register which swap entres had been
+ *	allocated.
+ */
+
+void free_all_swap_pages(int swap)
+{
+	struct rb_node *node;
+
+	while ((node = swsusp_extents.rb_node)) {
+		struct swsusp_extent *ext;
+		unsigned long offset;
+
+		ext = container_of(node, struct swsusp_extent, node);
+		rb_erase(node, &swsusp_extents);
+		for (offset = ext->start; offset <= ext->end; offset++)
+			swap_free(swp_entry(swap, offset));
+
+		kfree(ext);
+	}
+}
+
+int swsusp_swap_in_use(void)
+{
+	return (swsusp_extents.rb_node != NULL);
+}
+
 /*
  * General things
  */
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 6a07f4dbf2f8..57222d2089b8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -57,107 +57,6 @@
 
 int in_suspend __nosavedata = 0;
 
-/**
- *	The following functions are used for tracing the allocated
- *	swap pages, so that they can be freed in case of an error.
- */
-
-struct swsusp_extent {
-	struct rb_node node;
-	unsigned long start;
-	unsigned long end;
-};
-
-static struct rb_root swsusp_extents = RB_ROOT;
-
-static int swsusp_extents_insert(unsigned long swap_offset)
-{
-	struct rb_node **new = &(swsusp_extents.rb_node);
-	struct rb_node *parent = NULL;
-	struct swsusp_extent *ext;
-
-	/* Figure out where to put the new node */
-	while (*new) {
-		ext = container_of(*new, struct swsusp_extent, node);
-		parent = *new;
-		if (swap_offset < ext->start) {
-			/* Try to merge */
-			if (swap_offset == ext->start - 1) {
-				ext->start--;
-				return 0;
-			}
-			new = &((*new)->rb_left);
-		} else if (swap_offset > ext->end) {
-			/* Try to merge */
-			if (swap_offset == ext->end + 1) {
-				ext->end++;
-				return 0;
-			}
-			new = &((*new)->rb_right);
-		} else {
-			/* It already is in the tree */
-			return -EINVAL;
-		}
-	}
-	/* Add the new node and rebalance the tree. */
-	ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
-	if (!ext)
-		return -ENOMEM;
-
-	ext->start = swap_offset;
-	ext->end = swap_offset;
-	rb_link_node(&ext->node, parent, new);
-	rb_insert_color(&ext->node, &swsusp_extents);
-	return 0;
-}
-
-/**
- *	alloc_swapdev_block - allocate a swap page and register that it has
- *	been allocated, so that it can be freed in case of an error.
- */
-
-sector_t alloc_swapdev_block(int swap)
-{
-	unsigned long offset;
-
-	offset = swp_offset(get_swap_page_of_type(swap));
-	if (offset) {
-		if (swsusp_extents_insert(offset))
-			swap_free(swp_entry(swap, offset));
-		else
-			return swapdev_block(swap, offset);
-	}
-	return 0;
-}
-
-/**
- *	free_all_swap_pages - free swap pages allocated for saving image data.
- *	It also frees the extents used to register which swap entres had been
- *	allocated.
- */
-
-void free_all_swap_pages(int swap)
-{
-	struct rb_node *node;
-
-	while ((node = swsusp_extents.rb_node)) {
-		struct swsusp_extent *ext;
-		unsigned long offset;
-
-		ext = container_of(node, struct swsusp_extent, node);
-		rb_erase(node, &swsusp_extents);
-		for (offset = ext->start; offset <= ext->end; offset++)
-			swap_free(swp_entry(swap, offset));
-
-		kfree(ext);
-	}
-}
-
-int swsusp_swap_in_use(void)
-{
-	return (swsusp_extents.rb_node != NULL);
-}
-
 /**
  *	swsusp_show_speed - print the time elapsed between two events represented by
  *	@start and @stop
-- 
cgit v1.2.3-71-gd317


From 8e60c6a1348e17e68ad73589a52a03876e7059be Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <nigel@tuxonice.net>
Date: Sun, 6 Dec 2009 16:16:07 +0100
Subject: PM / Hibernate: Shift remaining code from swsusp.c to hibernate.c

Shift the remaining declaration of the variable in_suspend and the
function swsusp_show_speed from swsusp.c to hibernate.c, and delete
swsusp.c.

Signed-off-by: Nigel Cunningham <nigel@tuxonice.net>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/Makefile    |  2 +-
 kernel/power/hibernate.c | 30 ++++++++++++++++++++++++++++++
 kernel/power/swsusp.c    | 29 -----------------------------
 3 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c3b81c30e5d5..43191815f874 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP)		+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
-obj-$(CONFIG_HIBERNATION)	+= swsusp.o hibernate.o snapshot.o swap.o user.o
+obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o
 obj-$(CONFIG_HIBERNATION_NVS)	+= hibernate_nvs.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04a9e90d248f..bbfe472d7524 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -32,6 +32,7 @@ static int noresume = 0;
 static char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
+int in_suspend __nosavedata = 0;
 
 enum {
 	HIBERNATION_INVALID,
@@ -201,6 +202,35 @@ static void platform_recover(int platform_mode)
 		hibernation_ops->recover();
 }
 
+/**
+ *	swsusp_show_speed - print the time elapsed between two events.
+ *	@start: Starting event.
+ *	@stop: Final event.
+ *	@nr_pages -	number of pages processed between @start and @stop
+ *	@msg -		introductory message to print
+ */
+
+void swsusp_show_speed(struct timeval *start, struct timeval *stop,
+			unsigned nr_pages, char *msg)
+{
+	s64 elapsed_centisecs64;
+	int centisecs;
+	int k;
+	int kps;
+
+	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+	centisecs = elapsed_centisecs64;
+	if (centisecs == 0)
+		centisecs = 1;	/* avoid div-by-zero */
+	k = nr_pages * (PAGE_SIZE / 1024);
+	kps = (k * 100) / centisecs;
+	printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
+			msg, k,
+			centisecs / 100, centisecs % 100,
+			kps / 1000, (kps % 1000) / 10);
+}
+
 /**
  *	create_image - freeze devices that need to be frozen with interrupts
  *	off, create the hibernation image and thaw those devices.  Control
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 57222d2089b8..5b3601bd1893 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -56,32 +56,3 @@
 #include "power.h"
 
 int in_suspend __nosavedata = 0;
-
-/**
- *	swsusp_show_speed - print the time elapsed between two events represented by
- *	@start and @stop
- *
- *	@nr_pages -	number of pages processed between @start and @stop
- *	@msg -		introductory message to print
- */
-
-void swsusp_show_speed(struct timeval *start, struct timeval *stop,
-			unsigned nr_pages, char *msg)
-{
-	s64 elapsed_centisecs64;
-	int centisecs;
-	int k;
-	int kps;
-
-	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
-	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
-	centisecs = elapsed_centisecs64;
-	if (centisecs == 0)
-		centisecs = 1;	/* avoid div-by-zero */
-	k = nr_pages * (PAGE_SIZE / 1024);
-	kps = (k * 100) / centisecs;
-	printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
-			msg, k,
-			centisecs / 100, centisecs % 100,
-			kps / 1000, (kps % 1000) / 10);
-}
-- 
cgit v1.2.3-71-gd317


From 66d0ae4d6ffa45b8e6d8bdbf85f8f1b285c8152d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Sun, 6 Dec 2009 16:16:24 +0100
Subject: PM / Hibernate: Swap, use KERN_CONT

Use KERN_CONT in save_image() for printks, so that anybody won't
try to add a loglevel.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/swap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 0ce9b00f5d33..09b2b0ae9e9d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -437,7 +437,7 @@ static int save_image(struct swap_map_handle *handle,
 		if (ret)
 			break;
 		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
+			printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
 		nr_pages++;
 	}
 	err2 = wait_on_bio_chain(&bio);
@@ -445,9 +445,9 @@ static int save_image(struct swap_map_handle *handle,
 	if (!ret)
 		ret = err2;
 	if (!ret)
-		printk("\b\b\b\bdone\n");
+		printk(KERN_CONT "\b\b\b\bdone\n");
 	else
-		printk("\n");
+		printk(KERN_CONT "\n");
 	swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
 	return ret;
 }
-- 
cgit v1.2.3-71-gd317


From 7b199ca2025f4756daceec8802f07da636c524c9 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Thu, 3 Dec 2009 20:22:21 +0100
Subject: PM / Runtime: Export the PM runtime workqueue

This patch (as1306) exports the PM runtime workqueue for use by
loadable modules.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 347d2cc88cd0..0998c7139053 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -220,6 +220,7 @@ static struct attribute_group attr_group = {
 
 #ifdef CONFIG_PM_RUNTIME
 struct workqueue_struct *pm_wq;
+EXPORT_SYMBOL_GPL(pm_wq);
 
 static int __init pm_start_workqueue(void)
 {
-- 
cgit v1.2.3-71-gd317